Example #1
import argparse
import logging

import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro, WriteToAvro
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

# SCHEMA and BitcoinTxnCountDoFn are assumed to be defined elsewhere in
# this module.


def run(argv=None):
    """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
    that transforms bitcoin transactions."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://beam-avro-test/bitcoin/txns/*',
                        help='Input file(s) to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--compress',
                        dest='compress',
                        required=False,
                        action='store_true',
                        help='When set, compress the output data')
    parser.add_argument('--fastavro',
                        dest='use_fastavro',
                        required=False,
                        action='store_true',
                        help='When set, use fastavro for Avro I/O')

    opts, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the avro file[pattern] into a PCollection.
    records = \
        p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro)

    measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

    # pylint: disable=expression-not-assigned
    measured | 'write' >> \
        WriteToAvro(
            opts.output,
            schema=SCHEMA,
            codec=('deflate' if opts.compress else 'null'),
            use_fastavro=opts.use_fastavro
        )

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        metrics = result.metrics().query()

        for counter in metrics['counters']:
            logging.info("Counter: %s", counter)

        for dist in metrics['distributions']:
            logging.info("Distribution: %s", dist)
Example #2

import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro

# PROJECT, BUCKET, and table_schema are assumed to be defined elsewhere.


def run():
    argv = [
        '--project={0}'.format(PROJECT),
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--runner=DataflowRunner'
    ]

    # Apache Beam pipeline

    p = beam.Pipeline(argv=argv)

    (p
     | 'ReadAvroFromGCS' >>
     ReadFromAvro('gs://dataflow-excercise/test-dataset.avro')
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
         '{0}:apache_beam.avro_dataflow2'.format(PROJECT), schema=table_schema)
     )

    p.run()
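
The snippet above references PROJECT, BUCKET, and table_schema without defining them. A minimal sketch of what those definitions might look like follows; the project ID, bucket name, and field names are placeholders, not taken from the original:

# Hypothetical configuration for illustration only.
PROJECT = 'my-gcp-project'    # placeholder GCP project ID
BUCKET = 'my-staging-bucket'  # placeholder GCS bucket name

# WriteToBigQuery also accepts a schema string of 'name:TYPE' pairs;
# these fields are placeholders and must match the Avro records.
table_schema = 'id:STRING,amount:FLOAT,timestamp:TIMESTAMP'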