Example No. 1
    def _create_input_data(self):
        """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value
            # and base64-encode it for the BYTES column.
            import base64
            return {'data': base64.b64encode(record[1])}

        with TestPipeline() as p:
            (  # pylint: disable=expression-not-assigned
                p
                | 'Produce rows' >> Read(
                    SyntheticSource(self.parse_synthetic_source_options()))
                | 'Format' >> Map(format_record)
                | 'Write to BigQuery' >> WriteToBigQuery(
                    dataset=self.input_dataset,
                    table=self.input_table,
                    schema=SCHEMA,
                    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=BigQueryDisposition.WRITE_EMPTY))
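The schema here is built by parsing a JSON string into a TableSchema. For a single-column table like this one, WriteToBigQuery also accepts the compact 'name:TYPE' string form used by most of the later examples; a minimal sketch (dataset and table names are placeholders):

from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery

# Equivalent write step using the compact string schema: one BYTES column
# named "data", matching the JSON schema parsed above.
write_step = WriteToBigQuery(
    table='input_table',        # placeholder table name
    dataset='input_dataset',    # placeholder dataset name
    schema='data:BYTES',
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_EMPTY)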
Example No. 2
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText' >> beam.io.ReadFromText(args.input)
         | 'ParseGameEvent' >> ParDo(ParseEventFn())
         | 'ExtractUserScore' >> ExtractAndSumScore()
         | 'FormatUserScoreSums' >> ParDo(FormatUserScoreSumsFn())
         | 'WriteUserScoreSums' >> WriteToBigQuery(
             args.output_table_name, args.output_dataset,
             options.get_all_options().get("project"), table_schema()))
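The project ID above is pulled out with options.get_all_options().get("project"). A minimal alternative sketch, assuming --project was passed among the pipeline arguments, reads it through the GoogleCloudOptions view instead:

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions(['--project=my-gcp-project'])   # placeholder flag
project = options.view_as(GoogleCloudOptions).project     # same value as get_all_options().get("project")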
def run(argv=None):
    # argument parser
    parser = argparse.ArgumentParser()

    # pipeline options, google_cloud_options
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    #  Read from a csv source and a pubsub source
    p1 = p | 'read from topic' >> beam.io.ReadFromPubSub(topic='projects/PROJECT_ID/topics/TOPIC_NAME_1') \
        | 'convert to dict' >> beam.Map(lambda x: json.loads(x))

    p2 = p | 'read from csv files' >> ReadFromText('gs://bucket_name/historical/files*.csv') \
        | 'split' >> beam.Map(lambda x: x.split(',')) \
        | 'format to dict' >> beam.Map(lambda x: {"id": x[0], "name": x[1]})

    input_rec = (p1, p2) | 'flatten' >> beam.Flatten()

    # Write the messages
    output_rec = input_rec | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    # argument parser
    parser = argparse.ArgumentParser()

    # pipeline options, google_cloud_options
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    p1 = p | 'trigger from pubsub' >> beam.io.ReadFromPubSub(topic='projects/PROJECT_ID/topics/TOPIC_NAME_1') \
        | "convert msg to dict" >> beam.Map(lambda x: json.loads(x)) \
        | "extract filename" >> beam.Map(lambda x : 'gs://{}/{}'.format(x['bucket'], x['name'])) \
        | "read file" >> ReadAllFromText() \
        | 'split' >> beam.Map(lambda x: x.split(',')) \
        | 'format to dict' >> beam.Map(lambda x: {"id": x[0], "name": x[1]})

    # Write the messages
    output_rec = p1 | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
Example No. 5
def run(argv=None):
    # argument parser
    parser = argparse.ArgumentParser()

    # pipeline options, google_cloud_options
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    # Create two PCollections by reading from two different pubsub topics
    p1 = p | 'read from topic 1' >> beam.io.ReadFromPubSub(
        topic='projects/PROJECT_ID/topics/TOPIC_NAME_1')
    p2 = p | 'read from topic 2' >> beam.io.ReadFromPubSub(
        topic='projects/PROJECT_ID/topics/TOPIC_NAME_2')

    # Merge the two PCollections
    merged = (p1, p2) | 'merge sources' >> beam.Flatten()
    # Convert to dict
    rec_dict = merged | 'convert to dict' >> beam.Map(lambda x: json.loads(x))

    # Write the messages
    rec_dict | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        default="gs://airflow-training-knab-asv/land_registry_price_paid_uk/*/*.json",
        help="Input file to process.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(
        [
            "--runner=DataflowRunner",
            "--project=gdd-990fd90d0db6efbabdc6b70f1c",
            "--staging_location=gs://airflow-training-knab-asv/dataflow-staging",
            "--temp_location=gs://airflow-training-knab-asv/dataflow-temp",
            "--job_name=gcs-gzcomp-to-bq1",
        ]
    )

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
            | WriteToBigQuery(
                "result_table",
                dataset="result_dataset",
                project="gdd-990fd90d0db6efbabdc6b70f1c",
                schema="city:string, county:string, district:string, duration:string, locality:string, newly_built:boolean, paon:string, postcode:string, ppd_category_type:string, price:numeric, property_type:string, record_status:string, saon:string, street:string, transaction:string, transfer_date:numeric",
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            )
        )
Example No. 7
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic name.")
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project.db.name")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the Google AI Platform model name.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
            or _topic_comp[2] != 'topics'):
        raise ValueError("Pub/Sub topic name has an invalid format.")

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Output table name has an invalid format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p | 'read from pub/sub' >> ReadFromPubSub(
        known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
    if os.environ.get('DEPLOY'):
        # Use p.run() instead of `with Pipeline() as p:` so the process can
        # exit after submitting the streaming job.
        p.run()
    else:
        p.run().wait_until_finish()
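As the comment on the DEPLOY branch explains, the job is launched with p.run() so the submitting process can exit while the streaming pipeline keeps running, whereas the with-block form blocks until the pipeline finishes. A minimal sketch of the two launch styles, reusing the same hypothetical DEPLOY flag:

import os

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions


def launch(build_steps):
    """Sketch: build_steps is a callable that adds transforms to a pipeline."""
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True

    if os.environ.get('DEPLOY'):
        p = beam.Pipeline(options=options)
        build_steps(p)
        # Fire-and-forget: run() submits the job and returns a PipelineResult,
        # so the launcher process can exit while the streaming job keeps running.
        p.run()
    else:
        # Blocking form: the context manager calls run() and then
        # wait_until_finish() when the block exits.
        with beam.Pipeline(options=options) as p:
            build_steps(p)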
Example No. 8
def run(argv=None):
    """
    Main entry point, define and run the pipeline
    """
    parser = argparse.ArgumentParser(
        description='Run Apache Beam to process the logs')
    parser.add_argument('--input', dest='input', help='Input file to process')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to')
    parser.add_argument(
        '--input_subscription',
        dest='input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    parser.add_argument(
        '--output_table',
        dest='output_table',
        help=('BigQuery Table to write results to, with the form '
              '<PROJECT>:<DATASET>.<TABLE>'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    print('pipeline options:', pipeline_options)

    # Specification for table in BigQuery
    table_spec = known_args.output_table
    table_schema = 'host:STRING, utc_timestamp:TIMESTAMP, action:STRING, uri:STRING, protocol:STRING, status:STRING, size:INTEGER'

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        if known_args.input_subscription:
            lines = (p
                     | ReadFromPubSub(
                         subscription=known_args.input_subscription
                     ).with_output_types(bytes))
        else:
            lines = (p
                     | ReadFromText(known_args.input,
                                    coder=coders.BytesCoder()))

        output = (lines | 'parse_filter' >> beam.ParDo(ParseAndFilterDoFn()))
        # | 'parse' >> (beam.Map(parse_one_record)))

        # output | WriteToText(known_args.output)
        output | WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
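The save_main_session comment above refers to pickling: when a DoFn body uses names defined at module level (imports, globals), the main session has to be saved so remote workers can resolve them. A minimal sketch of the kind of DoFn that depends on it (the class name and regex are illustrative, not taken from this example):

import re                       # module-level import referenced inside the DoFn

import apache_beam as beam


class SplitLogLineDoFn(beam.DoFn):
    """Hypothetical DoFn relying on a module-level name (`re`)."""

    def process(self, element):
        # With save_main_session=True the pickled main module, including the
        # `re` import above, is shipped to the workers; without it this
        # lookup can fail with a NameError on a remote runner.
        yield re.split(r'\s+', element.decode('utf-8'))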
Example No. 9
def run(argv=None):
    """Pipeline for reading data from a PubSub topic,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:

        if 'streaming' in p.options.display_data():
            # Read in the CSV file
            lines = (p
                     | 'ReadFromPubSub' >> ReadFromPubSub(
                         topic=known_args.input).with_output_types(bytes)
                     | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
                     | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))
        else:
            # Read in the CSV file
            lines = (p
                     | 'ReadFromGCS' >> ReadFromText(known_args.input)
                     | 'ParseFileFn' >> beam.ParDo(ParseFileFn()))

        # Redact PII from the 'text' column.
        redacted_rows = (
            lines
            | 'IdentifyAndRedactText' >> IdentifyAndRedactText(
                p.options.display_data()['project'], ['ALL_BASIC']))

        # Format rows and write to BigQuery.
        (redacted_rows
         | 'MapToTableRows' >> beam.Map(lambda row: {
             'id': row['id'],
             'text': row['text']
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, text:STRING',
             project=p.options.display_data()['project'],
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example No. 10
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        help="Input file to process.",
    )
    parser.add_argument(
        "--table",
        dest="table",
        help="Destination BigQuery table",
    )
    parser.add_argument(
        "--dataset",
        dest="dataset",
        help="Destination BigQuery dataset",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(
        [
            "--staging_location=gs://airflow-emgsilva/dataflow-staging",
            "--temp_location=gs://airflow-emgsilva/dataflow-temp"
        ]
    )
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
            | WriteToBigQuery(
                known_args.table,
                dataset=known_args.dataset,
                schema="city:string, "
                       "county:string, "
                       "district:string, "
                       "duration:string, "
                       "locality:string, "
                       "newly_built:boolean, "
                       "paon:string, "
                       "postcode:string, "
                       "ppd_category_type:string, "
                       "price:numeric, "
                       "property_type:string, "
                       "record_status:string, "
                       "saon:string, "
                       "street:string, "
                       "transaction:string, "
                       "transfer_date:numeric",
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED)
        )
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--schema_registry',
        dest='schema_registry',
        default='http://127.0.0.1:8081',
        help='Schema registry endpoint. Defaults to local endpoint.')
    parser.add_argument('--failed-bq-inserts',
                        dest='failed_bq_inserts',
                        required=True,
                        help='Bucket for writing failed inserts')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--job_name=dbz-test-example',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True

    project_id = 'crafty-apex-264713'
    kafka_topic = 'dbserver1.inventory.customers'
    pubsub_topic = f'projects/{project_id}/topics/{kafka_topic}'

    with beam.Pipeline(options=pipeline_options) as p:
        bq = (
            p
            | 'Read from PubSub' >> ReadFromPubSub(topic=pubsub_topic)
            | '2 Second Window' >> beam.WindowInto(window.FixedWindows(2))
            | 'Avro to Row' >> beam.FlatMap(
                avro_to_row(known_args.schema_registry))
            # | 'Write to File' >>
            #       beam.io.WriteToText('args.output')
            | 'Write to BigQuery' >> WriteToBigQuery(
                'crafty-apex-264713:inventory.customers',
                schema='id:INT64,'
                'first_name:STRING,'
                'last_name:STRING,'
                'email:STRING,'
                '__op:STRING,'
                '__source_ts_ms:INT64,'
                '__lsn:INT64',
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_APPEND))

        # Can't get this to run in Dataflow - it produces a job graph that is
        # not updatable. The direct runner doesn't surface any errors either.
Example No. 12
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        type=str,
        default='',
        help=
        'Path to the data file(s) containing game data (use either this parameter or --topic but not both).'
    )

    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help=
        'Topic to subscribe to (use either this parameter or --input but not both).'
    )

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p | 'ReadGameEvents' >> ReadGameEvents(args)
         | 'WindowedTeamScore' >> WindowedTeamScore(30)
         | 'FormatTeamScoreSums' >> ParDo(
             FormatTeamScoreSumsFn(
                 (args.topic is not None) and (args.topic != "")))
         | 'WriteTeamScoreSums' >>
         WriteToBigQuery(args.output_table_name, args.output_dataset,
                         options.get_all_options().get("project"),
                         table_schema(), BigQueryDisposition.CREATE_IF_NEEDED,
                         BigQueryDisposition.WRITE_APPEND))
def run():
    transform_events_options = PipelineOptions().view_as(
        TransformEventsOptions)
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options, )
    incoming_topic = str(transform_events_options.incoming_topic)
    logging.info("Incoming topic for events = {}".format(incoming_topic))
    output_table = str(transform_events_options.output_table)
    logging.info(
        "Output table for transformed events: {}".format(output_table))

    _ = (p
         | 'Read events from PubSub' >> ReadStringsFromPubSub(incoming_topic)
         | 'Transform PubSub events' >> beam.ParDo(TransformEvents())
         | 'Write to BigQuery' >> WriteToBigQuery(table=output_table))

    p.run()
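ReadStringsFromPubSub is an older convenience transform; in newer Beam releases the same read is usually written as ReadFromPubSub producing bytes plus an explicit decode, as several of the snippets above do. A minimal sketch of that pattern (topic name is a placeholder):

import apache_beam as beam
from apache_beam.io.gcp.pubsub import ReadFromPubSub
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True

with beam.Pipeline(options=options) as p:
    strings = (p
               | 'Read bytes from PubSub' >> ReadFromPubSub(
                   topic='projects/PROJECT_ID/topics/TOPIC_NAME')
               | 'Decode to str' >> beam.Map(lambda msg: msg.decode('utf-8')))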
Example No. 14
  def get_replacement_transform(self, ptransform):
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import io

    class WriteToBigQuery(io.WriteToBigQuery):
      override = True

      def __init__(self, transform, outputs):
        self.transform = transform
        self.outputs = outputs

      def __getattr__(self, name):
        """Returns the given attribute from the parent.

        This allows this transform to act like a WriteToBigQuery transform
        without having to construct a new WriteToBigQuery transform.
        """
        return self.transform.__getattribute__(name)

      def expand(self, pcoll):
        from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
        import json

        schema = None
        if self.schema:
          schema = parse_table_schema_from_json(json.dumps(self.schema))

        out = pcoll | io.Write(
            io.BigQuerySink(
                self.table_reference.tableId,
                self.table_reference.datasetId,
                self.table_reference.projectId,
                schema,
                self.create_disposition,
                self.write_disposition,
                kms_key=self.kms_key))

        # WriteToBigQuery can have different outputs depending on whether it
        # runs in batch or streaming mode. The output keys retrieved from the
        # replaced node are reused here so the replacement stays consistent.
        return {key: out for key in self.outputs}

    return WriteToBigQuery(ptransform, self.outputs)
Example No. 15
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument('--output',
                        dest='output',
                        help='BigQuery output dataset and table name in the format dataset.tablename')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = (p
                 # 1. Read in the file from PubSub.
                 | 'ReadFromPubSub' >> ReadFromPubSub()

                 # 2. Process the JSON message from PubSub
                 | 'ParseMessage'

                 )

        average = (lines
                   | 'ApplyWindow'
                   )
        # 3. For each Key, sum up the values
        # 4. Format the as Python dictionaries for writing to BigQuery

        (lines
         # 4. Format the as Python dictionaries for writing to BigQuery
         | 'ConvertToDictionary'
         # 5. Write the output to BigQuery
         | 'WriteToBigQuery' >> WriteToBigQuery(
                    known_args.output,
                    schema='id:INTEGER, total:INTEGER',
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
                ))
Example No. 16
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read in the CSV file
        lines = (p
                 | 'ReadFromPubSub' >> ReadFromPubSub(
                     topic=known_args.input).with_output_types(bytes)
                 | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
                 | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))

        windows = (lines
                   | 'WindowInto' >> beam.WindowInto(FixedWindows(30, 0))
                   | 'SumValues' >> beam.CombinePerKey(sum))

        # Format rows and write to BigQuery.
        (windows
         | 'ConvertToDictionary' >> beam.Map(lambda row: {
             'id': row[0],
             'total': row[1]
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, total:INTEGER',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example No. 17
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value
            # and base64-encode it for the BYTES column.
            return {'data': base64.b64encode(record[1])}

        # pylint: disable=expression-not-assigned
        (self.pipeline
         | 'ProduceRows' >> Read(
             SyntheticSource(self.parseTestPipelineOptions()))
         | 'Format' >> Map(format_record)
         | 'WriteToBigQuery' >> WriteToBigQuery(
             self.output_dataset + '.' + self.output_table,
             schema=SCHEMA,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example No. 18
def run(argv=None, save_main_session=True):
    pipeline_args = []
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        #'--runner=dataflow',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=dragon-test-270305',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://duysdf/',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://duysdf/temp',
        '--job_name=backend_log_dataflow_to_bigquery',
        '--streaming',
        '--region=asia-southeast1',
        '--max-workers=1',
    ])
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:
        elements = (p | "CreatePCollectionData" >> beam.Create([
            json.dumps({'_Dest': 'T3', 'X': 'AAA', 'Y':123 }),
            json.dumps({'_Dest': 'T4', 'X': 'BBB', 'Y':456 }),
        ]))

        # Produce multiple output PCollections: yield TaggedOutput('T3', element)
        # routes an element to the 'T3' PCollection.
        processed_tagged_log = elements | "multiplex-pcoll" >> beam.ParDo(
            ParseBackendLog()).with_outputs(*g_backend_tables)
        for key in g_backend_tables:
            processed_tagged_log[key] | "WriteBQ_%s" % key >> WriteToBigQuery(
               table=key,
               dataset=g_dataset,
               project=g_project_id,
               write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
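ParseBackendLog itself is not shown; per the 'multiplex-pcoll' comment it yields TaggedOutput values so each record lands in the PCollection named after its destination table. A hypothetical sketch of such a multi-output DoFn (the '_Dest' routing field comes from the Create step above, everything else is illustrative):

import json

import apache_beam as beam
from apache_beam import pvalue


class ParseBackendLogSketch(beam.DoFn):
    """Hypothetical stand-in for ParseBackendLog: routes each JSON record to
    the output tagged with its '_Dest' value (e.g. 'T3' or 'T4')."""

    def process(self, element):
        record = json.loads(element)
        dest = record.pop('_Dest')
        yield pvalue.TaggedOutput(dest, record)


# Usage sketch:
#   tagged = elements | beam.ParDo(ParseBackendLogSketch()).with_outputs('T3', 'T4')
#   tagged['T3'] and tagged['T4'] can then each be written to their own table.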
Example No. 19
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default=
        '/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:

        #obj = Utility()
        table_spec = bigquery.TableReference(projectId='justlikethat-294122',
                                             datasetId='log_analysis',
                                             tableId='quotes')

        table_schema = 'source:STRING,  quote:STRING'

        data_ingestion = dataingestion()
        (p | 'Read from a File' >> beam.io.ReadFromText(known_args.input)
         | 'String To BigQuery Row' >>
         beam.Map(lambda s: data_ingestion.parse_method(s))
         | 'Write to BigQuery' >> WriteToBigQuery(
             table_spec,
             schema='source:STRING,  quote:STRING',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
        """
Example No. 20
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument('--output_dataset',
                        type=str,
                        default='',
                        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument('--output_table_name',
                        type=str,
                        default='',
                        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText'          >> beam.io.ReadFromText(args.input)
           | 'ParseGameEvent'         >> ParDo(ParseEventFn())
           | 'AddEventTimestamps'     >> beam.Map(lambda element: TimestampedValue(element, element['timestamp']))
           | 'WindowedTeamScore'      >> WindowedTeamScore(3600000) # 1 hour = 3600 seconds = 3600000 milliseconds
           | 'FormatTeamScoreSums'    >> ParDo(FormatTeamScoreSumsFn())
           | 'WriteTeamScoreSums'     >> WriteToBigQuery(
                    args.output_table_name,
                    args.output_dataset,
                    options.get_all_options().get("project"),
                    table_schema()
            )
        )
Example No. 21
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value
            # and base64-encode it for the BYTES column.
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
Example No. 22
def run_pipeline(source, target):
    header = get_header(source)
    fields = header.split(CSV_DELIMITER)

    (bq_schema, schema) = get_schema(target)

    input_path = 'gs://dotz-hiring-datalake/raw/{}.csv'.format(source)
    output_path = 'gs://dotz-hiring-datalake/processed/{}.json/part'.format(
        target)

    pipeline_args = [
        '--job_name={}-{}'.format(target,
                                  str(time.time()).replace('.', '-')),
        '--input={}'.format(input_path), '--output={}'.format(output_path)
    ]

    pipeline_args.extend(BASE_PIPELINE_ARGS)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        lines = pipeline | ReadFromText(input_path)

        # not so bright way to remove a CSV header
        lines = lines | 'RemoveHeader' >> beam.Filter(
            lambda line: line != header)
        objs = lines | 'CSV2JSON' >> beam.Map(csv2json(fields))
        proc_objs = objs | 'ProcessJSONs' >> beam.Map(process(schema))
        filtered_proc_objs = proc_objs | 'FilterEmpties' >> beam.Filter(
            lambda x: x)

        dumped_objs = filtered_proc_objs | 'DumpJSONs' >> beam.Map(json.dumps)
        dumped_objs | WriteToText(output_path)

        filtered_proc_objs | WriteToBigQuery(
            'dotz-hiring:tubulation.{}'.format(target),
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=BigQueryDisposition.CREATE_NEVER)
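The 'RemoveHeader' step above drops the header by comparing every line against it. When the header is simply the first line of each matched file, ReadFromText can skip it at read time instead; a minimal drop-in sketch, reusing the pipeline and input_path names from run_pipeline:

from apache_beam.io import ReadFromText

# Read and skip the header in one step, replacing the ReadFromText plus
# 'RemoveHeader' Filter pair above (pipeline and input_path as defined in
# run_pipeline).
lines = pipeline | 'ReadSkippingHeader' >> ReadFromText(
    input_path, skip_header_lines=1)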
def run(argv=None):
    """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='File to read in.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:

        user_request_header = [
            'id', 'first_name', 'last_name', 'email', 'gender'
        ]

        input_rows = (
            p
            | 'ReadFile' >> ReadFromText(known_args.input, skip_header_lines=1)
            | 'ParseFile' >> beam.ParDo(ParseFileFn(user_request_header)))

        (input_rows
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema=
             'id:INTEGER, first_name:STRING, last_name:STRING, email:STRING, gender:STRING',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        # No explicit run() call is needed: the with-block runs the pipeline
        # and waits for completion on exit.
Example No. 24
def run(argv=None):
    """Main entry point. It defines and runs the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://meetup-batch-processing/input/googleplaystore.csv',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        default='gs://meetup-batch-processing/output/googleplaystore.csv',
        help='Output file to process.')
    parser.add_argument(
        '--table-output',
        dest='table_output',
        default=
        'meetup-hands-on-gcp-2019:googleplaystore_batch_dataflow.play_store',
        help='Bigquery table name for output.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        raw_lines = pipeline | 'ReadFromCsv' >> ReadFromText(
            known_args.input, skip_header_lines=1)

        lines = raw_lines | 'processCsv' >> beam.ParDo(ProcessCSV())

        output = lines | 'parseRecord' >> beam.ParDo(ParseRecord())

        output | 'writeBigQuery' >> WriteToBigQuery(
            known_args.table_output,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER)

        logging.info('Finished.')
Example No. 25
def run(argv=None):
    """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='File to read in.')
    parser.add_argument('--output',
                        dest='output',
                        help='BigQuery output dataset and table name in the format dataset.tablename')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:

        retail_header = ['department', 'value']

        input_rows = (p
                      | 'ReadFile' >> ReadFromText(known_args.input, skip_header_lines=1)
                      | 'ParseFile' >> beam.ParDo(ParseFileFn(retail_header))
                      | 'CreateKVPairs' >> beam.Map(lambda x: (x['department'], float(x['value'])))
                      | 'SumValues' >> beam.CombinePerKey(sum)
                      | 'Format' >> beam.Map(lambda x: {'department': x[0], 'value': float(x[1])})
                      )

        (input_rows
         | 'WriteToBigQuery' >> WriteToBigQuery(
                    known_args.output,
                    schema='department:STRING, value:FLOAT',
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
         )

        # No explicit run() call is needed: the with-block runs the pipeline
        # and waits for completion on exit.
Example No. 26
def run(argv=None):
    """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='File to read in.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # 1. Read in the file from Google Cloud Storage. Hint: remember there is a header line in the CSV.
        input_rows = p | 'ReadFile' >> ReadFromText()

        # 2. Convert the rows into Key, Value pairs. Hint: use tuples

        # 3. For each Key, sum up the values. Hint: CombinePerKey(sum)

        # 4. Format the as Python dictionaries for writing to BigQuery

        # 5. Write the output to BigQuery
        (input_rows
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='department:STRING, value:FLOAT',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # No explicit run() call is needed: the with-block runs the pipeline
        # and waits for completion on exit.
Example No. 27
def main(argv):
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic',
                        type=str,
                        default='',
                        help='Topic to subscribe to (use either this parameter or --input but not both).')

    parser.add_argument('--output_dataset',
                        type=str,
                        default='',
                        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument('--output_table_name',
                        type=str,
                        default='',
                        help='The BigQuery table name where to write all the data.')


    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p | 'ReadMessages'           >> ReadFromPubSub(args.topic)             
           | 'FormatRecord'           >> beam.Map(lambda element: {"data": element})  
#          | "PrintBeforeInsert"      >> beam.Map(lambda record: print str(element))
           | 'WriteDataElementBQ'     >> WriteToBigQuery(                                                        
                     args.output_table_name,
                     args.output_dataset,
                     options.get_all_options().get("project"),
                     table_schema(),
                     BigQueryDisposition.CREATE_IF_NEEDED,
                     BigQueryDisposition.WRITE_APPEND
                 )
           )
                |
                "Decode and format json" >> beam.Map(lambda x: json.loads(x)))

    order_product = messages | "Extract id" >> beam.Map(lambda x:
                                                        (x.get("order_id"), x))

    def group_products(order_products):
        order_id, products = order_products
        output = {"order_id": str(order_id), "product": []}
        logging.info("order_id: {}".format(str(order_id)))
        for product in products:
            output["device_id"] = product.pop("device_id")
            product.pop("order_id")
            output["product"] = output["product"] + [product]
        return output

    orders = (order_product
              | beam.WindowInto(window.Sessions(500))
              | "Group by order" >> beam.GroupByKey()
              | "Join orders" >> beam.Map(group_products))

    # output = (orders
    #           | "Format orders" >> beam.Map(format_orders))

    orders | WriteToBigQuery(
        args.table,
        args.dataset,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    p.run()
def run(argv=None):
    import random
    import datetime

    currentDT = datetime.datetime.now()

    # Your GCP Project ID and GCS locations
    # are passed through as part of your Dataflow job command

    # BigQuery output info
    dataset = 'lab_dev'
    table = 'play_by_play'

    # Dataflow job name (don't edit)
    job_name = 'play-by-play-{}'.format(
        currentDT.strftime("%Y-%m-%d-%H-%M-%S"))
    filepath = 'gs://cloud-training-demos/ncaa/next-bootcamp/2018-19/play_by_play/*'

    pipeline_args = [
        # change these
        '--runner=DataflowRunner',
        '--project={}'.format(argv['project_id']),
        '--dataset={}'.format(dataset),
        '--table={}'.format(table),
        '--staging_location={}'.format(argv['staging']),
        '--temp_location={}'.format(argv['temp_location']),
        '--num_workers=5',
        '--max_num_workers=20',
        '--region={}'.format(argv['region']),
        '--job_name={}'.format(job_name)
    ]

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        files = p | ReadFromText(filepath)

        keyed = files | 'Key' >> beam.Map(lambda x:
                                          (random.randint(1, 101), x))
        grouped = keyed | 'GBK' >> beam.GroupByKey()
        flattened = grouped | 'Expand' >> beam.FlatMap(lambda x: x[1])

        to_insert = flattened | 'Format' >> beam.ParDo(Format())

        # to_insert | beam.ParDo(Check())

        table_schema = bigquery.TableSchema()
        for col, col_type in play_by_play_schema.items():
            this_schema = bigquery.TableFieldSchema()
            this_schema.name = col
            this_schema.type = col_type
            this_schema.mode = 'nullable'
            table_schema.fields.append(this_schema)

        to_insert | WriteToBigQuery(
            table='{_project_}:{_dataset_}.{_table_}'.format(
                _dataset_=dataset, _project_=argv['project_id'],
                _table_=table),
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
Example No. 30
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--config',
                        dest='config',
                        required=True,
                        help='Configuration file with bigquery project settings and date range.')
    parser.add_argument('--oauth_file',
                        dest='oauth_file',
                        required=True,
                        help='File to authorize process.')
    parser.add_argument('--schema',
                        dest='schema',
                        required=True,
                        help='File with schema of table.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = known_args.oauth_file

    data = {}
    if os.path.isfile(known_args.schema):
        with open(known_args.schema) as file:
            data = json.load(file)
    else:
        print('Missing schema file: ' + known_args.schema)
        return
    schema = data['schema']

    data = {}
    if os.path.isfile(known_args.config):
        with open(known_args.config) as file:
            data = json.load(file)
    else:
        print('Missing configuration file: ' + known_args.config)
        return
    project = data['project_id']
    dataset = data['dataset_id']
    table = data['table']
    date_start = datetime.datetime.strptime(data['start_date'], '%Y-%m-%d').date()
    date_end = datetime.datetime.strptime(data['end_date'], '%Y-%m-%d').date()
    keys = schema.replace(':STRING','').replace(':INTEGER','').replace(':FLOAT','').split(',')

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | 'readFromGCS' >> ReadFromText(
        known_args.input, skip_header_lines=1).with_output_types(str)

    while date_end - date_start >= datetime.timedelta(days=0):
        date = date_start.strftime('%Y-%m-%d')
        date_start += datetime.timedelta(days=1)
        output = lines | 'splitCSV_'+date >> beam.ParDo(Split(date, keys))

        output | 'writeToBQ_'+date >> WriteToBigQuery(table=table+'_'+date.replace('-',''),
                                                        dataset=dataset,
                                                        project=project,
                                                        schema=schema,
                                                        create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                                                        write_disposition=BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
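The loop above emits one labelled write per day, producing date-suffixed tables. WriteToBigQuery also accepts a callable for the table argument, which can route every element in a single write step; a hedged sketch, assuming each row carries a 'date' field (the rows in this example may not, since Split receives the date separately):

from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery


def date_sharded_table(element):
    # Hypothetical router: place each row in <table>_<YYYYMMDD> based on a
    # 'date' field in the row; project/dataset/table as loaded from the config.
    return '{}:{}.{}_{}'.format(
        project, dataset, table, element['date'].replace('-', ''))


write_to_daily_tables = WriteToBigQuery(
    table=date_sharded_table,
    schema=schema,  # same schema string loaded from the schema file above
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_APPEND)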