def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_computronix/contractors/{}/{}/{}_contractors_licenses.json'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m'),
            dt.strftime('%Y-%m-%d')),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_computronix/contractors/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m'),
            dt.strftime('%Y-%m-%d')),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('computronix-trades-dataflow_scripts',
                      '{}_computronix'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('contractors_computronix')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
            lines
            | beam.ParDo(FormatColumnNames())
            | beam.ParDo(ConvertTypes())
            | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                         schema=avro_schema,
                                         file_name_suffix='.avro',
                                         use_fastavro=True))
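
# FormatColumnNames and ConvertTypes are DoFns defined elsewhere in this
# script; they rename the Computronix JSON fields to the avro schema's
# column names and cast values to their schema types. A minimal sketch of
# the renaming pattern, relying on the script's own `import apache_beam as
# beam` (the class name and field mapping below are illustrative
# assumptions, not the real schema):
class FormatColumnNamesSketch(beam.DoFn):
    # Hypothetical mapping from source field names to avro column names
    FIELD_MAP = {
        'LICENSENUMBER': 'license_number',
        'LICENSETYPE': 'license_type',
    }

    def process(self, datum):
        # Emit a new dict keyed by the avro column names
        yield {new: datum.get(old) for old, new in self.FIELD_MAP.items()}
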
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_finance/{}/{}/{}_registered_businesses.csv'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m'),
            dt.strftime('%Y-%m-%d')),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_finance/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m'),
            dt.strftime('%Y-%m-%d')),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('registered-businesses-dataflow_scripts',
                      '{}_finance'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('registered_businesses')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (
            lines
            | beam.ParDo(ConvertToDicts())
            | beam.ParDo(AddNormalizedAddress())
            | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                         schema=avro_schema,
                                         file_name_suffix='.avro',
                                         use_fastavro=True))
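
# ConvertToDicts and AddNormalizedAddress are DoFns defined elsewhere in
# this script; the first turns each raw CSV line into a dict keyed by the
# avro schema's column names, the second appends a normalized address
# field. A minimal sketch of ConvertToDicts, assuming simple comma
# delimiting and illustrative column names (not the real finance schema):
class ConvertToDictsSketch(beam.DoFn):
    # Hypothetical column order matching the CSV header skipped above
    COLUMNS = ['acct_no', 'trade_name', 'address']

    def process(self, datum):
        # Pair each value in the raw line with its column name
        yield dict(zip(self.COLUMNS, datum.split(',')))
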
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_311/requests/{}/{}/{}_requests.json'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m'),
            dt.strftime('%Y-%m-%d')),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_311/requests/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m'),
            dt.strftime('%Y-%m-%d')),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('qalert-requests-dataflow',
                      '{}_311'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('City_of_Pittsburgh_QAlert_Requests')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
            lines
            | beam.ParDo(GetStatus())
            | beam.ParDo(CleanLatLong())
            | WriteToAvro(known_args.avro_output,
                          schema=avro_schema,
                          file_name_suffix='.avro',
                          use_fastavro=True))
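
# Each of these scripts would be executed directly; a typical Beam entry
# point consistent with the run(argv=None) signature above (the logging
# setup is an assumption, not taken from the source):
import logging

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()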