class TrashCansDataFlowTest(unittest.TestCase):
    """Unit tests for the smart trash cans dataflow transforms."""

    set_up()

    # One raw CSV row exactly as it arrives from the source extract.
    RECORD = ('1,74,"2017-09-14T13:24:40.35","2019-12-02T02:17:17.327",'
              '"1st Division","122 E North Ave","Pittsburgh","Pennsylvania",'
              '15212,"Central Northside",1,1,22,"1-6"')
    SCHEMA = get_schema('smart_trash_cans')
    # process() is invoked on the raw row; the test below indexes the result,
    # so it is expected to be a list of dicts.
    converted = ConvertToDicts.process(ConvertToDicts(), RECORD)

    def test_convert_to_dicts(self):
        """The CSV row should parse into a single correctly-typed dict."""
        self.assertEqual(
            self.converted,
            [{
                'container_id': 1,
                'receptacle_model_id': 74,
                'assignment_date': '2017-09-14T13:24:40.35',
                'last_updated_date': '2019-12-02T02:17:17.327',
                'group_name': '1st Division',
                'address': '122 E North Ave',
                'city': 'Pittsburgh',
                'state': 'Pennsylvania',
                'zip': 15212,
                'neighborhood': 'Central Northside',
                'dpw_division': 1,
                'council_district': 1,
                'ward': 22,
                'fire_zone': '1-6'
            }])

    def test_schema(self):
        """The converted record should validate against the avro schema."""
        self.assertTrue(validate(self.converted[0], self.SCHEMA))
class ComputronixTradesDataFlowTest(unittest.TestCase):
    """Unit tests for the Computronix trade-licenses dataflow transforms."""

    set_up()

    # One raw record as returned by the Computronix API.
    RECORD = {
        "LICENSENUMBER": "EL09927",
        "LICENSETYPENAME": "Electrical Trade",
        "NAICSCODE": "236217",
        "BUSINESSNAME": "Michael Conroy",
        "LICENSESTATE": "Active",
        "INITIALISSUEDATE": "2017-11-15T15:04:07-05:00",
        "MOSTRECENTISSUEDATE": "2019-09-13T00:00:00-04:00",
        "EFFECTIVEDATE": "2019-10-25T08:23:48-04:00",
        "EXPIRATIONDATE": "2020-10-24T00:00:00-04:00"
    }
    SCHEMA = get_schema('trade_licenses_computronix')
    # need to use next() to access dict value because dataflow steps yield generators
    formatted = next(FormatColumnNames.process(FormatColumnNames(), RECORD))
    type_converted = next(ConvertTypes.process(ConvertTypes(), formatted))

    def test_format_column_names(self):
        """Raw API field names should be mapped to snake_case columns."""
        expected = {
            "license_number": "EL09927",
            "license_type_name": "Electrical Trade",
            "naics_code": "236217",
            "business_name": "Michael Conroy",
            "license_state": "Active",
            "initial_issue_date": "2017-11-15T15:04:07-05:00",
            "most_recent_issue_date": "2019-09-13T00:00:00-04:00",
            "effective_date": "2019-10-25T08:23:48-04:00",
            "expiration_date": "2020-10-24T00:00:00-04:00"
        }
        # Compare the dicts directly: sorted(dict) yields only the sorted
        # KEYS, so the old sorted()-based comparison never checked values.
        self.assertEqual(expected, self.formatted)

    def test_convert_types(self):
        """naics_code should be converted from string to int."""
        expected = {
            "license_number": "EL09927",
            "license_type_name": "Electrical Trade",
            "naics_code": 236217,
            "business_name": "Michael Conroy",
            "license_state": "Active",
            "initial_issue_date": "2017-11-15T15:04:07-05:00",
            "most_recent_issue_date": "2019-09-13T00:00:00-04:00",
            "effective_date": "2019-10-25T08:23:48-04:00",
            "expiration_date": "2020-10-24T00:00:00-04:00"
        }
        # Full-dict comparison: the previous sorted()-keys check would have
        # passed even if naics_code were never converted to an int.
        self.assertEqual(expected, self.type_converted)

    def test_schema(self):
        """The type-converted record should validate against the avro schema."""
        self.assertTrue(validate(self.type_converted, self.SCHEMA))
class ComputronixTradesDataFlowTest(unittest.TestCase):
    """Unit tests for the Computronix contractors dataflow transforms."""

    set_up()

    # One raw record as returned by the Computronix API.
    RECORD = {
        "LICENSENUMBER": "BL008027",
        "LICENSETYPENAME": "General Contractor",
        "NAICSCODE": "236227",
        "BUSINESSNAME": "ENGINE 30 ARCHITECTURE, LLC",
        "LICENSESTATE": "Expired",
        "INITIALISSUEDATE": "2017-11-09T09:10:47-05:00",
        "MOSTRECENTISSUEDATE": "2017-11-09T09:12:14-05:00",
        "EFFECTIVEDATE": "2017-11-09T09:12:14-05:00",
        "EXPIRATIONDATE": "2018-11-09T09:12:14-05:00"
    }
    SCHEMA = get_schema('contractors_computronix')
    # need to use next() to access dict value because dataflow steps yield generators
    formatted = next(FormatColumnNames.process(FormatColumnNames(), RECORD))
    type_converted = next(ConvertTypes.process(ConvertTypes(), formatted))

    def test_format_column_names(self):
        """Raw API field names should be mapped to snake_case columns."""
        expected = {
            "license_number": "BL008027",
            "license_type_name": "General Contractor",
            "naics_code": "236227",
            "business_name": "ENGINE 30 ARCHITECTURE, LLC",
            "license_state": "Expired",
            "initial_issue_date": "2017-11-09T09:10:47-05:00",
            "most_recent_issue_date": "2017-11-09T09:12:14-05:00",
            "effective_date": "2017-11-09T09:12:14-05:00",
            "expiration_date": "2018-11-09T09:12:14-05:00"
        }
        # Compare the dicts directly: sorted(dict) yields only the sorted
        # KEYS, so the old sorted()-based comparison never checked values.
        self.assertEqual(expected, self.formatted)

    def test_convert_types(self):
        """naics_code should be converted from string to int."""
        expected = {
            "license_number": "BL008027",
            "license_type_name": "General Contractor",
            "naics_code": 236227,
            "business_name": "ENGINE 30 ARCHITECTURE, LLC",
            "license_state": "Expired",
            "initial_issue_date": "2017-11-09T09:10:47-05:00",
            "most_recent_issue_date": "2017-11-09T09:12:14-05:00",
            "effective_date": "2017-11-09T09:12:14-05:00",
            "expiration_date": "2018-11-09T09:12:14-05:00"
        }
        # Full-dict comparison: the previous sorted()-keys check would have
        # passed even if naics_code were never converted to an int.
        self.assertEqual(expected, self.type_converted)

    def test_schema(self):
        """The type-converted record should validate against the avro schema."""
        self.assertTrue(validate(self.type_converted, self.SCHEMA))
def run(argv=None):
    """Build and run the Computronix contractors dataflow pipeline.

    Reads the day's contractor-licenses JSON dump from GCS, normalizes
    column names and value types, and writes the result out as avro.

    Args:
        argv: optional list of command-line arguments (defaults to sys.argv).
    """
    dt = datetime.now()
    # Hoist the date parts shared by both default paths. Note: strftime('%m')
    # is all digits, so the previous .lower() call was a no-op and is dropped.
    year = dt.strftime('%Y')
    month = dt.strftime('%m')
    day = dt.strftime('%Y-%m-%d')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_computronix/contractors/{}/{}/{}_contractors_licenses.json'.format(
            os.environ['GCS_PREFIX'], year, month, day),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_computronix/contractors/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], year, month, day),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('computronix-trades-dataflow_scripts',
                      '{}_computronix'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('contractors_computronix')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
def run(argv=None):
    """Build and run the registered-businesses dataflow pipeline.

    Reads the day's registered-businesses CSV from GCS, parses each row
    into a dict, normalizes the address, and writes the result out as avro.

    Args:
        argv: optional list of command-line arguments (defaults to sys.argv).
    """
    dt = datetime.now()
    # Hoist the date parts shared by both default paths. Note: strftime('%m')
    # is all digits, so the previous .lower() call was a no-op and is dropped.
    year = dt.strftime('%Y')
    month = dt.strftime('%m')
    day = dt.strftime('%Y-%m-%d')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_finance/{}/{}/{}_registered_businesses.csv'.format(
            os.environ['GCS_PREFIX'], year, month, day),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_finance/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], year, month, day),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('registered-businesses-dataflow_scripts',
                      '{}_finance'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('registered_businesses')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection, dropping the header.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (lines
                | beam.ParDo(ConvertToDicts())
                | beam.ParDo(AddNormalizedAddress())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
def run(argv=None):
    """Build and run the QAlert 311-requests dataflow pipeline.

    Reads the day's 311-requests JSON dump from GCS, derives request
    status, cleans lat/long fields, and writes the result out as avro.

    Args:
        argv: optional list of command-line arguments (defaults to sys.argv).
    """
    dt = datetime.now()
    # Hoist the date parts shared by both default paths. Note: strftime('%m')
    # is all digits, so the previous .lower() call was a no-op and is dropped.
    year = dt.strftime('%Y')
    month = dt.strftime('%m')
    day = dt.strftime('%Y-%m-%d')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_311/requests/{}/{}/{}_requests.json'.format(
            os.environ['GCS_PREFIX'], year, month, day),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_311/requests/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], year, month, day),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('qalert-requests-dataflow',
                      '{}_311'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('City_of_Pittsburgh_QAlert_Requests')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(GetStatus())
                | beam.ParDo(CleanLatLong())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
class ComputronixTradesDataFlowTest(unittest.TestCase):
    """Unit tests for the Computronix businesses dataflow transforms."""

    set_up()

    # One raw record as returned by the Computronix API.
    RECORD = {
        "LICENSENUMBER": "17-SNC00002777",
        "LICENSETYPENAME": "Sign Maintenance Certification",
        "NAICSCODE": "541857",
        "BUSINESSNAME": "LAMAR ADVERTISING",
        "PARCELNUMBER": "0044C00325000900",
        "LICENSESTATE": "Expired",
        "INITIALISSUEDATE": "2017-11-09T09:59:51-05:00",
        "MOSTRECENTISSUEDATE": "2017-11-09T10:37:16-05:00",
        "EFFECTIVEDATE": "2017-11-09T10:37:16-05:00",
        "EXPIRATIONDATE": "2018-11-09T10:37:16-05:00",
        "NUMBEROFLARGESIGNS": 1,
        "NUMBEROFSMALLSIGNS": 0,
        "NUMBEROFSIGNSTOTAL": 0
    }
    SCHEMA = get_schema('businesses_computronix')
    # need to use next() to access dict value because dataflow steps yield generators
    formatted = next(FormatColumnNames.process(FormatColumnNames(), RECORD))
    type_converted = next(ConvertTypes.process(ConvertTypes(), formatted))

    def test_format_column_names(self):
        """Raw API fields map to snake_case; absent columns default to None."""
        expected = {
            u"license_number": "17-SNC00002777",
            u"license_type_name": "Sign Maintenance Certification",
            u"naics_code": "541857",
            u"business_name": "LAMAR ADVERTISING",
            u"license_state": "Expired",
            u"initial_issue_date": "2017-11-09T09:59:51-05:00",
            u"most_recent_issue_date": "2017-11-09T10:37:16-05:00",
            u"effective_date": "2017-11-09T10:37:16-05:00",
            u"expiration_date": "2018-11-09T10:37:16-05:00",
            u"insurance_expiration_date": None,
            u"number_of_employees": None,
            u"number_of_signs_total": 0,
            u"number_of_small_signs": 0,
            u"number_of_large_signs": 1,
            u"total_number_of_spaces": None,
            u"number_of_nonleased_pub_spaces": None,
            u"number_of_revgen_spaces": None,
            u"number_of_handicap_spaces": None,
            u"number_of_seats": None,
            u"number_of_nongambling_machines": None,
            u"number_of_pool_tables": None,
            u"number_of_jukeboxes": None,
            u"parcel_number": "0044C00325000900",
            u"address": None
        }
        # Compare the dicts directly: sorted(dict) yields only the sorted
        # KEYS, so the old sorted()-based comparison never checked values.
        self.assertEqual(expected, self.formatted)

    def test_convert_types(self):
        """naics_code should be converted from string to int."""
        expected = {
            u"license_number": "17-SNC00002777",
            u"license_type_name": "Sign Maintenance Certification",
            u"naics_code": 541857,
            u"business_name": "LAMAR ADVERTISING",
            u"license_state": "Expired",
            u"initial_issue_date": "2017-11-09T09:59:51-05:00",
            u"most_recent_issue_date": "2017-11-09T10:37:16-05:00",
            u"effective_date": "2017-11-09T10:37:16-05:00",
            u"expiration_date": "2018-11-09T10:37:16-05:00",
            u"insurance_expiration_date": None,
            u"number_of_employees": None,
            u"number_of_signs_total": 0,
            u"number_of_small_signs": 0,
            u"number_of_large_signs": 1,
            u"total_number_of_spaces": None,
            u"number_of_nonleased_pub_spaces": None,
            u"number_of_revgen_spaces": None,
            u"number_of_handicap_spaces": None,
            u"number_of_seats": None,
            u"number_of_nongambling_machines": None,
            u"number_of_pool_tables": None,
            u"number_of_jukeboxes": None,
            u"parcel_number": "0044C00325000900",
            u"address": None
        }
        # Full-dict comparison: the previous sorted()-keys check would have
        # passed even if naics_code were never converted to an int.
        self.assertEqual(expected, self.type_converted)

    def test_schema(self):
        """The type-converted record should validate against the avro schema."""
        self.assertTrue(validate(self.type_converted, self.SCHEMA))