Example #1
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Clean up JSON documents.
    sanitized = (p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
                 | 'produce_resource_json' >> beam.ParDo(
                     ProduceResourceJson(options.group_by))
                 | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Joining all iam_policy objects with resources of the same name.
    merged_iam = (sanitized | 'assign_name_key' >> beam.ParDo(
        AssignGroupByKey('NAME', options.num_shards))
                  | 'group_by_name' >> beam.GroupByKey()
                  | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # split into BigQuery tables.
    keyed_assets = merged_iam | 'assign_group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by, options.num_shards))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    pvalue_schemas = beam.pvalue.AsDict(schemas)
    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets
     | 'add_load_time' >> beam.ParDo(AddLoadTime(options.load_time))
     | 'group_by_key_before_enforce' >> beam.GroupByKey()
     | 'enforce_schema' >> beam.ParDo(EnforceSchemaDataTypes(), pvalue_schemas)
     | 'group_by_key_before_write' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'delete_tables' >> beam.ParDo(
         DeleteDataSetTables(options.dataset, options.write_disposition))
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.load_time),
         beam.pvalue.AsDict(schemas)))

    return p.run()
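For reference, a minimal sketch (illustrative names, not the project's actual DoFn) of how a ParDo such as 'enforce_schema' above consumes the beam.pvalue.AsDict(schemas) side input: the materialized dict is simply passed as an extra argument to process().

import apache_beam as beam


class UsesSchemaSideInput(beam.DoFn):
    """Hypothetical DoFn showing the side-input calling convention."""

    def process(self, keyed_element, schemas_dict):
        # schemas_dict is the {table_key: schema} dict built via
        # beam.pvalue.AsDict(schemas) and passed to beam.ParDo(...) above.
        key, rows = keyed_element
        schema = schemas_dict.get(key)
        for row in rows:
            yield key, (row, schema)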
Example #2
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' %
                                           (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    result.wait_until_finish()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
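A hedged note on the last four lines: aggregators and result.aggregated_values() come from older SDK releases. On current Apache Beam the rough equivalent is the Metrics API, sketched below (the counter name is illustrative).

import logging

from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

# Inside a DoFn, a counter is incremented with:
#     Metrics.counter('wordcount', 'empty_lines').inc()


def log_empty_line_count(result):
    """Queries the finished pipeline result for the 'empty_lines' counter."""
    query = result.metrics().query(MetricsFilter().with_name('empty_lines'))
    for counter in query['counters']:
        logging.info('number of empty lines: %d', counter.result)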
Example #3
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if (not known_args.topic):
        events = (p
                | 'read' >> ReadFromText(known_args.input)
                | 'parse' >> beam.FlatMap(ParseEventFn())
                | 'add_event_timestamps' >> beam.Map(
                    lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                | 'read' >> ReadFromPubSub(
                    topic=known_args.topic,
                    timestamp_attribute='timestamp_ms')
                | 'parse' >> beam.ParDo(ParseEventFn()))

    # [START EXERCISE 6]
    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         # Extract sessions of user data, using known_args.session_gap as the
         # gap duration.
         # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
         | 'sessionize' >> ChangeMe()
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         # Re-window into fixed windows of size user_activity_window in order
         # to compute the mean session duration for that window of activity.
         | 'window_of_sessions' >> ChangeMe()
         | 'session_mean' >> ChangeMe()
         # [END EXERCISE 6]
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA)
         )

    p.run().wait_until_finish()
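One possible way to fill in the ChangeMe placeholders from EXERCISE 6 above, offered only as a hedged sketch rather than the exercise's official solution; it assumes known_args.session_gap and known_args.user_activity_window are window sizes in seconds, which the snippet does not show.

import apache_beam as beam


def exercise6_candidate_transforms(session_gap, user_activity_window):
    """Returns candidate transforms for the 'sessionize',
    'window_of_sessions' and 'session_mean' steps."""
    sessionize = beam.WindowInto(beam.window.Sessions(gap_size=session_gap))
    window_of_sessions = beam.WindowInto(
        beam.window.FixedWindows(size=user_activity_window))
    session_mean = beam.CombineGlobally(
        beam.combiners.MeanCombineFn()).without_defaults()
    return sessionize, window_of_sessions, session_mean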
Example #4
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Delete bigquery dataset on pipeline start.
    deleted_tables = (
        p | beam.Create([None])  # dummy PCollection to trigger delete tables.
        | 'delete_tables' >> beam.ParDo(
            DeleteDataSetTables(options.dataset, options.write_disposition)))

    # Clean up JSON documents.
    sanitized_assets = (
        p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
        | 'bigquery_sanitize' >> beam.ParDo(
            BigQuerySanitize(options.load_time)))

    # Joining all iam_policy objects with resources of the same name.
    merged_iam_and_asset = (
        sanitized_assets | 'name_key' >> beam.ParDo(AssignGroupByKey('NAME'))
        | 'group_by_name' >> beam.GroupByKey()
        | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # split into BigQuery tables.
    keyed_assets = merged_iam_and_asset | 'group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets | 'group_assets_by_key' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.write_disposition,
                        options.load_time), beam.pvalue.AsDict(schemas),
         beam.pvalue.AsSingleton(deleted_tables)))

    return p.run()
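Several of these examples pass coder=JsonCoder() to ReadFromText without showing its definition; it is not a Beam built-in and is usually a small custom coder. A sketch of the kind of class typically meant (an assumption, not the authors' exact implementation):

import json

import apache_beam as beam


class JsonCoder(beam.coders.Coder):
    """Decodes each text line as a JSON object and encodes dicts back."""

    def encode(self, value):
        return json.dumps(value).encode('utf-8')

    def decode(self, encoded):
        return json.loads(encoded)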
Example #5
def run(argv=None):

    p = beam.Pipeline(options=PipelineOptions())


    class Printer(beam.DoFn):
        def process(self, element):
            print(element)

    class Transaction(beam.DoFn):
        def process(self, element):
            #tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,t1,t2,t3,t4,t5 = element.split(',')
            t = element.split(',')
            print(len(t))
            # Skip the CSV header row and any rows without exactly 15 fields.
            if t[0] != 'tripduration' and len(t) == 15:
                #return [{"tripduration": tripduration,"starttime": starttime,"stoptime" : stoptime,"start_station_id" : start_station_id,"start_station_name" : start_station_name,"start_station_latitude" : start_station_latitude,"start_station_longitude" : start_station_longitude,"end_station_id" : end_station_id,"end_station_name" : end_station_name,"end_station_latitude" : end_station_latitude,"end_station_longitude": end_station_longitude ,"bikeid": bikeid,"usertype": usertype,"birth_year": birth_year,"gender": gender}]
                #return [{"tripduration": t[0],"starttime": t[1],"stoptime" : t[2],"start_station_id" : t[3],"start_station_name" : t[4],"start_station_latitude" : t[5],"start_station_longitude" : t[6],"end_station_id" : t[7],"end_station_name" : t[8],"end_station_latitude" : t[9],"end_station_longitude": t[10] ,"bikeid": t[11],"usertype": t[12],"birth_year": t[13],"gender": t[14]}]
                return[{"birth_year": int(t[13]),"birth_year_double": int(t[13])*2,"gender": t[14],"gender_reverse": t[14][::-1]}]


    data_from_source = (p
                        | 'Read the source file' >> ReadFromText('gs://group-2-ross/dataset.csv')
                        | 'Clean the items' >> beam.ParDo(Transaction())
                        )

    project_id = "pe-training"  # replace with your project ID
    dataset_id = 'rstar'  # replace with your dataset ID
    table_id = 'data1'  # replace with your table ID
    #table_schema = ('tripduration:INTEGER,starttime:TIMESTAMP,stoptime:TIMESTAMP,start_station_id:INTEGER,start_station_name:STRING	,start_station_latitude:FLOAT,start_station_longitude:FLOAT,end_station_id:INTEGER,end_station_name:STRING,end_station_latitude:FLOAT,end_station_longitude:FLOAT,bikeid:INTEGER,usertype:STRING,birth_year:INTEGER	,gender:STRING')
    table_schema = ('birth_year:INTEGER,birth_year_double:INTEGER,gender:STRING,gender_reverse:STRING')
    # Persist to BigQuery
    # WriteToBigQuery accepts the data as list of JSON objects
    data_from_source | 'Write' >> beam.io.WriteToBigQuery(
                    table=table_id,
                    dataset=dataset_id,
                    project=project_id,
                    schema=table_schema,
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                    batch_size=int(100)
                    )

    result = p.run()
    result.wait_until_finish()
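A hedged aside on the Transaction DoFn above: element.split(',') breaks if any field contains a quoted comma, whereas the standard csv module handles that case. A sketch producing the same output fields, assuming the same 15-column layout:

import csv

import apache_beam as beam


class ParseCsvLine(beam.DoFn):
    """Parses one CSV line and keeps only derived birth_year/gender fields."""

    def process(self, element):
        for row in csv.reader([element]):
            if row and row[0] != 'tripduration' and len(row) == 15:
                yield {
                    "birth_year": int(row[13]),
                    "birth_year_double": int(row[13]) * 2,
                    "gender": row[14],
                    "gender_reverse": row[14][::-1],
                }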
Example #6
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Import this here to avoid pickling the main session.
    import re

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        words = (
            lines
            | 'Split' >> beam.FlatMap(
                lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
            # Map to Row objects to generate a schema suitable for conversion
            # to a dataframe.
            | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

        df = to_dataframe(words)
        df['count'] = 1
        counted = df.groupby('word').sum()
        counted.to_csv(known_args.output)

        # Deferred DataFrames can also be converted back to schema'd PCollections
        counted_pc = to_pcollection(counted, include_indexes=True)

        # Print out every word that occurred >50 times
        _ = (counted_pc
             | beam.Filter(lambda row: row.count > 50)
             | beam.Map(lambda row: f'{row.word}: {row.count}')
             | beam.Map(print))
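A hedged note on the DataFrame example above: to_dataframe and to_pcollection are used without their imports being shown; they live in Beam's dataframe package and are presumably imported along these lines.

from apache_beam.dataframe.convert import to_dataframe
from apache_beam.dataframe.convert import to_pcollection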
Example #7
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        default=
        "gs://airflow-training-data/land_registry_price_paid_uk/*/*.json",
        help="Input file to process.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        "--runner=DataflowRunner",
        "--project=gdd-airflow-training",
        "--staging_location=gs://airflow-training-data/dataflow-staging",
        "--temp_location=gs://airflow-training-data/dataflow-temp",
        "--job_name=gcs-gzcomp-to-bq1",
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
         | WriteToBigQuery(
             "result_table",
             dataset="result_dataset",
             project="gdd-airflow-training",
             schema="city:string, "
             "county:string, "
             "district:string, "
             "duration:string, "
             "locality:string, "
             "newly_built:boolean, "
             "paon:string, "
             "postcode:string, "
             "ppd_category_type:string, "
             "price:numeric, "
             "property_type:string, "
             "record_status:string, "
             "saon:string, "
             "street:string, "
             "transaction:string, "
             "transfer_date:numeric",
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         ))
Example #8
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default=
        'gs://{}_computronix/contractors/{}/{}/{}_contractors_licenses.json'.
        format(os.environ['GCS_PREFIX'], dt.strftime('%Y'),
               dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default=
        'gs://{}_computronix/contractors/avro_output/{}/{}/{}/avro_output'.
        format(os.environ['GCS_PREFIX'], dt.strftime('%Y'),
               dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    #TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('computronix-trades-dataflow_scripts',
                      '{}_computronix'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('contractors_computronix')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
Example #9
def examples_wordcount_templated(renames):
  """Templated WordCount example snippet."""
  import re

  import apache_beam as beam
  from apache_beam.io import ReadFromText
  from apache_beam.io import WriteToText
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START example_wordcount_templated]
  class WordcountTemplatedOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      # Use add_value_provider_argument for arguments to be templatable
      # Use add_argument as usual for non-templatable arguments
      parser.add_value_provider_argument(
          '--input',
          help='Path of the file to read from')
      parser.add_argument(
          '--output',
          required=True,
          help='Output file to write results to.')
  pipeline_options = PipelineOptions(['--output', 'some/output_path'])
  with beam.Pipeline(options=pipeline_options) as p:

    wordcount_options = pipeline_options.view_as(WordcountTemplatedOptions)
    lines = p | 'Read' >> ReadFromText(wordcount_options.input)
    # [END example_wordcount_templated]

    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    (
        lines
        | 'ExtractWords' >> beam.FlatMap(
            lambda x: re.findall(r'[A-Za-z\']+', x))
        | 'PairWithOnes' >> beam.Map(lambda x: (x, 1))
        | 'Group' >> beam.GroupByKey()
        | 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1])))
        | 'Format' >> beam.Map(format_result)
        | 'Write' >> WriteToText(wordcount_options.output)
    )

    p.visit(SnippetUtils.RenameFiles(renames))
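A related sketch (illustrative, not part of the snippet above): an option declared with add_value_provider_argument arrives as a ValueProvider, so transforms that need its value resolve it at runtime with .get() rather than at pipeline-construction time.

import apache_beam as beam


class FilterByTemplatedPrefix(beam.DoFn):
    """Hypothetical DoFn that reads a templated option at runtime."""

    def __init__(self, prefix_provider):
        # prefix_provider is a ValueProvider, e.g. from a templated option.
        self.prefix_provider = prefix_provider

    def process(self, element):
        if element.startswith(self.prefix_provider.get()):
            yield element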
Example #10
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        help="Input file to process.",
    )
    parser.add_argument(
        "--table",
        dest="table",
        help="Destination BigQuery table",
    )
    parser.add_argument(
        "--dataset",
        dest="dataset",
        help="Destination BigQuery dataset",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
         | WriteToBigQuery(
             known_args.table,
             dataset=known_args.dataset,
             schema="city:string, "
                    "county:string, "
                    "district:string, "
                    "duration:string, "
                    "locality:string, "
                    "newly_built:boolean, "
                    "paon:string, "
                    "postcode:string, "
                    "ppd_category_type:string, "
                    "price:numeric, "
                    "property_type:string, "
                    "record_status:string, "
                    "saon:string, "
                    "street:string, "
                    "transaction:string, "
                    "transfer_date:numeric",
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED))
Example #11
def run(archivo, mifecha):
    gcs_path = "gs://ct-sensus"  #Definicion de la raiz del bucket
    gcs_project = "contento-bi"

    mi_runer = ("DirectRunner",
                "DataflowRunner")[socket.gethostname() == "contentobi"]
    pipeline = beam.Pipeline(
        runner=mi_runer,
        argv=[
            "--project", gcs_project, "--staging_location",
            ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location",
            ("%s/dataflow_files/temp" % gcs_path), "--output",
            ("%s/dataflow_files/output" % gcs_path), "--setup_file",
            "./setup.py", "--max_num_workers", "5", "--subnetwork",
            "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
            # "--num_workers", "30",
            # "--autoscaling_algorithm", "NONE"
        ])

    # lines = pipeline | 'Lectura de Archivo' >> ReadFromText("gs://ct-bancolombia/info-segumiento/BANCOLOMBIA_INF_SEG_20181206 1100.csv", skip_header_lines=1)
    #lines = pipeline | 'Lectura de Archivo' >> ReadFromText("gs://ct-bancolombia/info-segumiento/BANCOLOMBIA_INF_SEG_20181129 0800.csv", skip_header_lines=1)
    lines = pipeline | 'Lectura de Archivo' >> ReadFromText(
        archivo, skip_header_lines=1)

    transformed = (lines
                   | 'Formatear Data' >> beam.ParDo(formatearData(mifecha)))

    # lines | 'Escribir en Archivo' >> WriteToText("archivos/Info_carga_banco_prej_small", file_name_suffix='.csv',shard_name_template='')

    # transformed | 'Escribir en Archivo' >> WriteToText("archivos/Info_carga_banco_seg", file_name_suffix='.csv',shard_name_template='')
    #transformed | 'Escribir en Archivo' >> WriteToText("gs://ct-bancolombia/info-segumiento/info_carga_banco_seg",file_name_suffix='.csv',shard_name_template='')

    transformed | 'Escritura a BigQuery lal' >> beam.io.WriteToBigQuery(
        gcs_project + ":sensus.dcorreo",
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
    # transformed | 'Borrar Archivo' >> FileSystems.delete('gs://ct-avon/prejuridico/AVON_INF_PREJ_20181111.TXT')
    # 'Eliminar' >> FileSystems.delete (["archivos/Info_carga_avon.1.txt"])

    jobObject = pipeline.run()
    # jobID = jobObject.job_id()

    return ("Corrio Full HD")
Example #12
def run(argv=None):
    """Runs the debugging wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection, count the occurrences of
        # each word and filter by a list of words.
        filtered_words = (
            p | 'read' >> ReadFromText(known_args.input)
            | CountWords()
            | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

        # assert_that is a convenient PTransform that checks a PCollection has an
        # expected value. Asserts are best used in unit tests with small data sets
        # but is demonstrated here as a teaching tool.
        #
        # Note assert_that does not provide any output and that successful
        # completion of the Pipeline implies that the expectations were  met. Learn
        # more at https://cloud.google.com/dataflow/pipelines/testing-your-pipeline
        # on how to best test your pipeline.
        assert_that(filtered_words, equal_to([('Flourish', 3),
                                              ('stomach', 1)]))

        # Format the counts into a PCollection of strings and write the output using
        # a "Write" transform that has side effects.
        # pylint: disable=unused-variable
        output = (filtered_words
                  | 'format' >> beam.Map(
                      lambda (word, c): '%s: %s' % (word, c))
                  | 'write' >> WriteToText(known_args.output))
Example #13
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-trades-dataflow',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='trade_licenses_computronix')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example #14
def run(argv=None):
    """Main entry point; defines and runs the entity extraction pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input JSON to read')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        records = p | 'ReadRecords' >> ReadFromText(known_args.input)
        entities = records | 'ExtractEntities' >> beam.ParDo(
            EntityExtraction())
        entities | 'WriteEntities' >> WriteToText(
            known_args.output, file_name_suffix='entities.json.gz')
Example #15
def run(argv=None):
    # Argument parser. The pipeline below reads known_args.output_path, so the
    # corresponding argument must be declared here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_path',
                        dest='output_path',
                        required=True,
                        help='Output path to write results to.')

    # pipeline options, google_cloud_options
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    rec = p | "read from GCS" >> ReadFromText("gs://bucket_name/folder_path/file*.csv") \
        | "transform" >> beam.Map(lambda x: x) \
        | "write to GCS" >> WriteToText(known_args.output_path)

    result = p.run()
    result.wait_until_finish()
Example #16
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      required=True,
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  
  pipeline_options = PipelineOptions(pipeline_args)
  with beam.Pipeline(options=pipeline_options) as p:

    (p
      | 'read' >> ReadFromText(known_args.input)
      | 'copy' >> WriteToText(known_args.output))
Example #17
def main(input: str, table: str, project: str, pipeline_args):
    pipeline_args = list(pipeline_args)
    pipeline_args.extend([
        "--runner=DataflowRunner",
        f"--project={project}",
        "--temp_location=gs://recsys2020-challenge-wantedly/temp",
        "--staging_location=gs://recsys2020-challenge-wantedly/staging",
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(input)
        rows = lines | "Parse" >> beam.ParDo(ParseFn(COLUMNS))
        rows | beam.io.gcp.bigquery.WriteToBigQuery(
            table,
            schema=schema(),
            project=project,
        )
Example #18
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input file to process.',
                        default='../input/smallMat.json')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--runner',
                        dest='runner',
                        default='DirectRunner',
                        help='Runner class to use (DataflowRunner for GCP).')
    parser.add_argument('--project',
                        dest='project',
                        help='GCP project for Cloud Dataflow')
    parser.add_argument(
        '--num-results',
        dest='num_results',
        default=1000,
        type=int,
        help='Number of top results to keep per column.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner={}'.format(known_args.runner),
        '--project={}'.format(known_args.project),
        '--staging_location=gs://beam-matmul/staging',
        '--temp_location=gs://beam-matmul/temp',
        '--job_name=matmul-side-input',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        files = p | 'Read ls List' >> ReadFromText(known_args.input)
        mat = files | 'Parse Columns' >> beam.Map(json.loads)
        (mat | 'Calc Scores' >> beam.Map(calc_scores, beam.pvalue.AsIter(mat),
                                         known_args.num_results)
         | 'Serialize as JSON' >> beam.Map(json.dumps)
         | 'Write Scores' >> WriteToText(known_args.output))
Example #19
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-kga369-207722/twitter_data_3.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='$BUCKET/output',
                        help='kga369-207722:MindValley_Project.Twitter')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=$PROJECT',
        '--staging_location=$BUCKET/staging',
        '--temp_location=$BUCKET/temp',
        '--job_name=your-wordcount-job',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[#]+[A-Za-z\']+', x)).with_output_types(unicode))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)
        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
Example #20
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(unicode))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
Example #21
def run_wordcount_without_save_main_session(argv):
  """Defines and runs a simple version of wordcount pipeline.

  This pipeline is the same as wordcount example except replace customized
  DoFn class with transform function and disable save_main_session option
  due to BEAM-6158."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = beam.Pipeline(options=PipelineOptions(pipeline_args))

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  # Parse each line of input text into words.
  def extract_words(line):
    return re.findall(r'[\w\']+', line, re.UNICODE)

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  # pylint: disable=expression-not-assigned
  (p | 'read' >> ReadFromText(known_args.input)
   | 'split' >> (beam.ParDo(extract_words).with_output_types(unicode))
   | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
   | 'group' >> beam.GroupByKey()
   | 'count' >> beam.Map(count_ones)
   | 'format' >> beam.Map(format_result)
   | 'write' >> WriteToText(known_args.output))

  result = p.run()
  result.wait_until_finish()
Example #22
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_finance/{}/{}/{}_registered_businesses.csv'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_finance/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    #TODO: run on on-prem network when route is opened

    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('registered-businesses-dataflow_scripts',
                      '{}_finance'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('registered_businesses')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (lines
                | beam.ParDo(ConvertToDicts())
                | beam.ParDo(AddNormalizedAddress())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
Example #23
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        counts = (
            lines
            | 'Split Words' >>
            (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
            | 'Pair With One' >> beam.Map(lambda x: (x, 1))
            | 'Group And Sum' >> beam.CombinePerKey(sum)
            | 'Find Top 3 Most Frequent Words' >> beam.CombineGlobally(
                beam.combiners.TopCombineFn(
                    n=3, compare=lambda a, b: a[1] < b[1])).without_defaults())

        # Format the counts into a PCollection of strings.
        def format_result(word, count):
            return '%s: %d' % (word, count)

        #output = counts | 'Format' >> beam.MapTuple(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output = counts | 'Write' >> WriteToText(known_args.output)
Example #24
def run(argv=None):
    """Main entry point; defines and runs the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (p  # pylint: disable=expression-not-assigned
         | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())
         | 'compute' >> beam.FlatMap(compute)
         | 'write' >> WriteToText(known_args.output, shard_name_template=''))
Example #25
def run(argv=None):

    p = beam.Pipeline(options=PipelineOptions())


    class Printer(beam.DoFn):
        def process(self, element):
            print(element)

    class Transaction(beam.DoFn):
        def process(self, element):
            
            t = element.split(',')
            if t[0] != 'Place':  # skip the CSV header row
                return [{"Place": t[0], "Gender": t[1], "Year": t[2],
                         "Name": t[3][::-1], "Number": int(t[4]) * 22}]


    data_from_source = (p
                        | 'Read the source file' >> ReadFromText('gs://group4-bucket/Group4Data1.csv')
                        | 'Clean the items' >> beam.ParDo(Transaction())
                        )

    project_id = "pe-training"  # replace with your project ID
    dataset_id = 'group4dataset1'  # replace with your dataset ID
    table_id = 'ayushtable'  # replace with your table ID
    table_schema = ('Place:STRING,Gender:STRING,Year:INTEGER,Name:STRING,Number:INTEGER')

    # Persist to BigQuery
    # WriteToBigQuery accepts the data as list of JSON objects
    data_from_source | 'Write' >> beam.io.WriteToBigQuery(
                    table=table_id,
                    dataset=dataset_id,
                    project=project_id,
                    schema=table_schema,
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                    batch_size=int(100)
                    )

    result = p.run()
    result.wait_until_finish()
Example #26
def run():
    pipeline = beam.Pipeline()

    # Read lines from input file.
    lines = pipeline | 'read' >> ReadFromText('data/king_arthur.txt')

    # Count words.
    counts = (lines | 'split' >> beam.FlatMap(
        lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(unicode)
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format and write to output file.
    output = counts | 'format' >> beam.Map(
        lambda (word, count): '{}: {}'.format(word, count))
    output | 'write' >> WriteToText('minimal-wordcount-output', '.txt')

    pipeline.run().wait_until_finish()
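A hedged note: this snippet, like several others above, is written for Python 2; 'unicode' and tuple-unpacking lambdas such as lambda (word, ones): ... are not valid on Python 3. A rough Python 3 equivalent of the counting and formatting steps would be:

import re

import apache_beam as beam


def count_and_format(lines):
    """Python 3 version of the 'split' through 'format' steps above."""
    return (lines
            | 'split' >> beam.FlatMap(
                lambda x: re.findall(r"[A-Za-z']+", x)).with_output_types(str)
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
            | 'format' >> beam.Map(lambda kv: '{}: {}'.format(kv[0], kv[1])))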
Example #27
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    window_duration = 1 * 60  # 1 minute windows.
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if (not known_args.topic):
        events = (
            p
            | 'read' >> ReadFromText(known_args.input)
            | 'parse' >> beam.FlatMap(ParseEventFn())
            | 'add_event_timestamps' >>
            beam.Map(lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute='timestamp_ms')
                  | 'decode' >> beam.ParDo(ParseEventFn()))

    # Window team scores and write them to BigQuery.
    _ = (events
         | 'windowed_team_score' >> WindowedTeamScore(window_duration)
         | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
         | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename + '_team', known_args.output_dataset,
             project, TEAM_SCHEMA))

    # Write leaderboards to BigQuery.
    _ = (events
         | 'running_user_score' >> RunningUserScores()
         | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum())
         | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename + '_user', known_args.output_dataset,
             project, USER_SCHEMA))

    p.run().wait_until_finish()
Example #28
def main(argv=None):
    # options = PipelineOptions(flags=argv)
    # review_processing_options = options.view_as(TemplateReviewProcessingOptions)

    # #installing packages used in process
    # setup_options = options.view_as(SetupOptions)
    # setup_options.save_main_session = True

    # pipeline.run(options, review_processing_options)

    options = PipelineOptions(flags=argv)
    setup_options = options.view_as(SetupOptions)
    # setup_options.setup_file = './setup.py'
    # setup_options.save_main_session = False

    wordcount_options = options.view_as(WordcountOptions)

    with beam.Pipeline(options=options) as p:
        lines = p | 'read' >> ReadFromText(wordcount_options.input)
Example #29
def run(input_path, output_path, expansion_service_port, pipeline_args):
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | 'Read' >> ReadFromText(input_path).with_output_types(str)
        words = lines | 'Split' >> (beam.ParDo(
            WordExtractingDoFn()).with_output_types(str))

        java_output = (words
                       | 'JavaCount' >> beam.ExternalTransform(
                           'beam:transform:org.apache.beam:javacount:v1', None,
                           ('localhost:%s' % expansion_service_port)))

        def format(kv):
            key, value = kv
            return '%s:%s' % (key, value)

        output = java_output | 'Format' >> beam.Map(format)
        output | 'Write' >> WriteToText(output_path)
Example #30
def run(argv=None):
    """pipeline abandoned-carts"""
    """Definicao argumentos de entrada do pipeline"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file with the processing result.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    """Le arquivo"""
    lines = p | 'le_arquivo' >> ReadFromText(known_args.input,
                                             coder=JsonCoder())
    """Filtra todos os clientes que fizeram checkout, e traz o customer como chave"""
    filtros_checkout = (
        lines
        | 'filtra_pagina_checkout' >> beam.Filter(lambda x:
                                                  (x['page'] == 'checkout'))
        | 'gera_chave_customer' >> beam.Map(lambda x: (x['customer'], x)))
    """Traz o customer como chave para todos os registros"""
    filtros_todos = (
        lines
        |
        'gera_chave_customer_geral' >> beam.Map(lambda x: (x['customer'], x)))
    """Faz o left join trazendo todos os registros que nao tem checkout"""
    results = ((filtros_todos, filtros_checkout)
               | 'left_join' >> beam.CoGroupByKey()
               | 'filtro_sem_checkout' >> beam.Filter(lambda x: not (x[1][1]))
               | 'formata_saida' >> beam.Map(lambda x: (x[1][0][-1])))
    """Grava o resultado num arquivo json"""
    results | 'grava_resultado' >> WriteToText(
        known_args.output, coder=JsonCoder(), file_name_suffix='.json')

    result_run = p.run()
    result_run.wait_until_finish()
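A hedged illustration of the CoGroupByKey shape the lambdas above depend on: each grouped element is (customer, (all_records, checkout_records)), so x[1][1] is empty exactly when the customer never reached checkout, and x[1][0][-1] is that customer's last record. A tiny self-contained demo with made-up data:

import apache_beam as beam

with beam.Pipeline() as demo:
    todos = demo | 'todos' >> beam.Create(
        [('c1', {'page': 'home'}), ('c1', {'page': 'checkout'}),
         ('c2', {'page': 'basket'})])
    checkout = demo | 'checkout' >> beam.Create([('c1', {'page': 'checkout'})])
    (
        (todos, checkout)
        | beam.CoGroupByKey()
        | beam.Filter(lambda x: not x[1][1])  # keep customers without checkout
        | beam.Map(lambda x: x[1][0][-1])     # their last recorded page view
        | beam.Map(print))                    # prints {'page': 'basket'}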