def run(argv=None): """ If you want to run just this file for rapid development, pass the arg '-r DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner """ known_args, pipeline_options, avro_schema = generate_args( job_name='wprdc-fire-dataflow', bucket='{}_ems_fire'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='ems_calls') with beam.Pipeline(options=pipeline_options) as p: field_name_swaps = [("census_block_group_center__x", "long"), ("census_block_group_center__y", "lat")] lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = (lines | beam.ParDo(SwapFieldNames(field_name_swaps)) | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-domi-permits',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='domi_permits_computronix')

    with beam.Pipeline(options=pipeline_options) as p:
        field_name_swaps = [('PERMITTYPEPERMITTYPE', 'PERMITTYPE'),
                            ('TYPEOFWORKDESCRIPTION', 'WORKTYPE'),
                            ('APPLICANTCUSTOMFORMATTEDNAME', 'APPLICANTNAME'),
                            ('ALLCONTRACTORSNAME', 'CONTRACTORNAME'),
                            ('SPECIALPERMITINSTRUCTIONS', 'SPECIALINSTRUCTIONS'),
                            ('STATUSDESCRIPTION', 'STATUS')]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(GeocodeAddress())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
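# ParseNestedFields and GeocodeAddress are assumed helpers from the same shared module.
# A hypothetical sketch of ParseNestedFields, assuming it flattens one level of nested
# objects by concatenating key names (which would explain field names like
# 'PERMITTYPEPERMITTYPE' in the swaps above); the real transform likely handles more cases.

import apache_beam as beam


class ParseNestedFields(beam.DoFn):
    """Flattens one level of nested dicts into concatenated top-level field names."""

    def process(self, datum):
        flattened = {}
        for key, value in datum.items():
            if isinstance(value, dict):
                for nested_key, nested_value in value.items():
                    flattened['{}{}'.format(key, nested_key)] = nested_value
            else:
                flattened[key] = value
        yield flattened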
def run(argv=None): """ If you want to run just this file for rapid development, change runner to 'DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ """ known_args, pipeline_options, avro_schema = generate_args( job_name='parking-meters-dataflow', bucket='{}_parking'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='parking_meters' ) with beam.Pipeline(options=pipeline_options) as p: field_name_swaps = [("longitude", "long"), ("latitude", "lat")] type_changes = [("long", "float"), ("lat", "float")] lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = ( lines | beam.ParDo(SwapFieldNames(field_name_swaps)) | beam.ParDo(ChangeDataTypes(type_changes)) | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='accela-permits',
        bucket='{}_accela'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='accela_permits'
    )

    with beam.Pipeline(options=pipeline_options) as p:
        exclude_fields = [
            'module',
            'serviceProviderCode',
            'undistributedCost',
            'totalJobCost',
            'recordClass',
            'reportedChannel',
            'closedByDepartment',
            'estimatedProductionUnit',
            'actualProductionUnit',
            'createdByCloning',
            'closedByUser',
            'trackingId',
            'initiatedProduct',
            'createdBy',
            'value',
            'balance',
            'booking',
            'infraction',
            'misdemeanor',
            'offenseWitnessed',
            'defendantSignature',
            'parcels',
            'id',
            'statusDate',
            'jobValue',
            'reportedDate'
        ]
        address_field = 'address'
        field_name_swaps = [
            ('customId', 'id'),
            ('totalPay', 'total_paid')
        ]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(FilterInvalidRecord())
                | beam.ParDo(FilterFields(exclude_fields))
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(GeocodeAddress(address_field))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(ColumnsCamelToSnakeCase())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
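# FilterFields and ColumnsCamelToSnakeCase are assumed helpers from the shared module.
# Hypothetical sketches of both, assuming each record is a plain dict keyed by the
# Accela API's camelCase field names.

import re

import apache_beam as beam


class FilterFields(beam.DoFn):
    """Drops the listed fields from each record before it is written to Avro."""

    def __init__(self, exclude_fields):
        self.exclude_fields = exclude_fields

    def process(self, datum):
        for field in self.exclude_fields:
            datum.pop(field, None)
        yield datum


class ColumnsCamelToSnakeCase(beam.DoFn):
    """Converts camelCase field names (e.g. 'reportedChannel') to snake_case."""

    def process(self, datum):
        yield {re.sub(r'(?<!^)(?=[A-Z])', '_', k).lower(): v for k, v in datum.items()}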
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='twilio-311-dataflow',
        bucket='{}_twilio'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='twilio_reports'
    )

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
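# Every pipeline in this section reads newline-delimited JSON with a custom JsonCoder.
# A minimal sketch of such a coder, assuming one JSON object per line; the shared
# implementation may differ.

import json

import apache_beam as beam


class JsonCoder(beam.coders.Coder):
    """Encodes/decodes each line of the text source as a JSON object."""

    def encode(self, x):
        return json.dumps(x).encode('utf-8')

    def decode(self, x):
        return json.loads(x)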
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-trades-dataflow',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='trade_licenses_computronix')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='registered-businesses-dataflow',
        bucket='{}_finance'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='registered_businesses')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(ColumnsToLowerCase())
                | beam.ParDo(ParseAddress())
                | beam.ParDo(NormalizeAddress(StaticValueProvider(str, 'address_full')))
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
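# ColumnsToLowerCase is assumed to lower-case field names so they match the
# registered_businesses schema; NormalizeAddress is assumed to take the name of the
# address column as a ValueProvider. A hypothetical sketch of the former only.

import apache_beam as beam


class ColumnsToLowerCase(beam.DoFn):
    """Lower-cases every field name in the record."""

    def process(self, datum):
        yield {k.lower(): v for k, v in datum.items()}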
def run(argv=None): """ If you want to run just this file for rapid development, change runner to 'DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ """ known_args, pipeline_options, avro_schema = generate_args( job_name='parking-transactions-dataflow', bucket='{}_parking'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='parking_transactions' ) with beam.Pipeline(options=pipeline_options) as p: lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = ( lines | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='police-blotter-30-day-dataflow',
        bucket='{}_police'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='30_day_police_blotter')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        data_type_changes = [('CCR', 'int'), ('TRACT', 'int')]
        field_name_swaps = [('PK', 'id')]

        load = (
                lines
                | beam.ParDo(CleanPKs())
                | beam.ParDo(ChangeDataTypes(data_type_changes))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None): """ If you want to run just this file for rapid development, add the arg '-r DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner """ known_args, pipeline_options, avro_schema = generate_args( job_name='qalert-requests-dataflow', bucket='{}_qalert'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='City_of_Pittsburgh_QAlert_Requests') with beam.Pipeline(options=pipeline_options) as p: date_conversions = [('lastActionUnix', 'lastAction'), ('addDateUnix', 'createDate')] field_name_swaps = [('addDateUnix', 'createDateUnix'), ('status', 'statusCode'), ('latitude', 'lat'), ('longitude', 'long'), ('master', 'masterRequestId'), ('typeId', 'requestTypeId'), ('typeName', 'requestType')] lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = (lines | beam.ParDo(GetDateStrings(date_conversions)) | beam.ParDo(SwapFieldNames(field_name_swaps)) | beam.ParDo(GetStatus()) | beam.ParDo(GetClosedDate()) | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))