# NOTE: imports reconstructed for this snippet; each example below also needs its
# own pipeline-specific DoFn classes (e.g. FormatColumnNames, ConvertTypes), which
# are assumed to be defined elsewhere in the script. The import path for the shared
# helpers (generate_args, get_schema, JsonCoder) is assumed, not confirmed.
import argparse
import os
from datetime import datetime

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions

from dataflow_utils import generate_args, get_schema, JsonCoder  # assumed module path


def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_computronix/contractors/{}/{}/{}_contractors_licenses.json'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_computronix/contractors/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on the on-prem network once the route is opened
    # Use runner=DataflowRunner to run in the GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('computronix-trades-dataflow_scripts',
                      '{}_computronix'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('contractors_computronix')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the JSON input file (or pattern) into a PCollection; JsonCoder decodes each line into a dict.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
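The function above is only the pipeline entry point; a typical Beam script (assumed here, not shown in the original example) invokes it from a __main__ guard, and the date-derived GCS defaults can be overridden on the command line. Script name and bucket prefix below are hypothetical.

# Typical invocation pattern (assumed; not part of the original snippet).
if __name__ == '__main__':
    run()

# Example command line (hypothetical script name and bucket prefix), overriding the default paths:
#   python computronix_contractors_dataflow.py \
#       --input gs://my-prefix_computronix/contractors/2021/05/2021-05-01_contractors_licenses.json \
#       --avro_output gs://my-prefix_computronix/contractors/avro_output/2021/05/2021-05-01/avro_output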
Example #2
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_finance/{}/{}/{}_registered_businesses.csv'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_finance/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on the on-prem network once the route is opened
    # Use runner=DataflowRunner to run in the GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('registered-businesses-dataflow_scripts',
                      '{}_finance'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('registered_businesses')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the CSV input file (or pattern) into a PCollection of lines, skipping the header row.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (lines
                | beam.ParDo(ConvertToDicts())
                | beam.ParDo(AddNormalizedAddress())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
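get_schema is a project helper whose implementation is not shown; a minimal sketch of the kind of parsed Avro record schema WriteToAvro expects from it is below. The field names are illustrative only, not the project's actual registered_businesses schema.

# Illustrative only: the dict form of an Avro record schema accepted by WriteToAvro
# when use_fastavro=True; these field names are assumed, not the real schema.
avro_schema = {
    'type': 'record',
    'name': 'registered_businesses',
    'fields': [
        {'name': 'business_name', 'type': ['null', 'string']},
        {'name': 'address', 'type': ['null', 'string']},
        {'name': 'normalized_address', 'type': ['null', 'string']}
    ]
}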
Example #3
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_311/requests/{}/{}/{}_requests.json'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_311/requests/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on the on-prem network once the route is opened
    # Use runner=DataflowRunner to run in the GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('qalert-requests-dataflow',
                      '{}_311'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('City_of_Pittsburgh_QAlert_Requests')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the JSON input file (or pattern) into a PCollection; JsonCoder decodes each line into a dict.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(GetStatus())
                | beam.ParDo(CleanLatLong())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
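All three pipelines do their per-record work in ParDo steps wrapping project-specific DoFn classes whose bodies are not shown. As a hypothetical illustration of that pattern (not the project's actual CleanLatLong implementation), a per-record transform looks roughly like this:

# Hypothetical sketch of a Beam DoFn in the style used above; field names are assumed.
class CleanLatLongSketch(beam.DoFn):
    def process(self, datum):
        # Coerce coordinates to floats, nulling out anything unparseable.
        for field in ('latitude', 'longitude'):
            try:
                datum[field] = float(datum[field])
            except (KeyError, TypeError, ValueError):
                datum[field] = None
        yield datum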