def expand(self, input):
        # [START EXERCISE 3]:
        # Docs: https://beam.apache.org/documentation/sdks/pydoc/2.5.0/apache_beam.io.gcp.pubsub.html

        # Determine whether to use files or topic based on options.
        if self.args.input is not None and self.args.input != "":
            return (
                input
                # Read game events from files. See exercise2.
                # Don't forget to parse the events and to apply the TimestampedValue
                # transform to assign event timestamps.
                | beam.io.ReadFromText(self.args.input)
                | ParDo(ParseEventFn())
                | beam.Map(lambda element: TimestampedValue(
                    element, element[self.TIMESTAMP_ATTRIBUTE])))
        else:
            return (
                input
                # Read game events from the Pub/Sub topic self.args.topic using custom timestamps,
                # which are extracted from the Pub/Sub attribute TIMESTAMP_ATTRIBUTE.
                # Use ReadFromPubSub() and use parameters topic and timestamp_attribute.
                # https://beam.apache.org/documentation/sdks/python-streaming/
                | ReadFromPubSub(self.args.topic,
                                 timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

                # Parse the messages the same way as when they come from the text file. Note that we no
                # longer have to run WithTimestamps transform, as the timestamps are already set by
                # ReadFromPubSub.
                | ParDo(ParseEventFn()))
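
ParseEventFn is referenced throughout these exercises but not shown in this excerpt. A minimal hypothetical sketch, assuming the game events are CSV lines of the form user,team,score,timestamp and that TIMESTAMP_ATTRIBUTE names the 'timestamp' field:

import logging

import apache_beam as beam


class ParseEventFn(beam.DoFn):
    """Hypothetical sketch of the parser used above; field names are assumptions."""

    def process(self, element):
        try:
            user, team, score, timestamp = element.split(',')[:4]
            yield {
                'user': user.strip(),
                'team': team.strip(),
                'score': int(score),
                'timestamp': int(timestamp),  # read back via element[self.TIMESTAMP_ATTRIBUTE]
            }
        except (ValueError, IndexError):
            logging.warning('Failed to parse event: %s', element)
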
Example No. 2
    def test(self):
        def format_record(record):
            import base64
            return base64.b64encode(record[1])

        def make_insert_mutations(element):
            import uuid  # pylint: disable=reimported
            from apache_beam.io.gcp.experimental.spannerio import WriteMutation
            ins_mutation = WriteMutation.insert(table='test',
                                                columns=('id', 'data'),
                                                values=[(str(uuid.uuid1()),
                                                         element)])
            return [ins_mutation]

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Make mutations' >> FlatMap(make_insert_mutations)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to Spanner' >> WriteToSpanner(
                project_id=self.project,
                instance_id=self.spanner_instance,
                database_id=self.TEST_DATABASE,
                max_batch_size_bytes=5120))
Example No. 3
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText' >> beam.io.ReadFromText(args.input)
         | 'ParseGameEvent' >> ParDo(ParseEventFn())
         | 'ExtractUserScore' >> ExtractAndSumScore()
         | 'FormatUserScoreSums' >> ParDo(FormatUserScoreSumsFn())
         | 'WriteUserScoreSums' >> WriteToBigQuery(
             args.output_table_name, args.output_dataset,
             options.get_all_options().get("project"), table_schema()))
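
ExtractAndSumScore is used above but not defined in this excerpt. A minimal sketch, assuming the parsed events are dicts carrying 'user' and 'score' keys as produced by the ParseEventFn sketch earlier:

import apache_beam as beam


class ExtractAndSumScore(beam.PTransform):
    """Hypothetical sketch: sums scores per user."""

    def expand(self, pcoll):
        return (
            pcoll
            | 'ToUserScoreKV' >> beam.Map(lambda event: (event['user'], event['score']))
            | 'SumPerUser' >> beam.CombinePerKey(sum))
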
Example No. 4
  def expand(self, pbegin):
    if self._read_operations is not None and isinstance(pbegin, PBegin):
      pcoll = pbegin.pipeline | Create(self._read_operations)
    elif not isinstance(pbegin, PBegin):
      if self._read_operations is not None:
        raise ValueError(
            "Read operation in the constructor only works with "
            "the root of the pipeline.")
      pcoll = pbegin
    else:
      raise ValueError(
          "Spanner required read operation, sql or table "
          "with columns.")

    if self._transaction is None:
      # reading as batch read using the spanner partitioning query to create
      # batches.
      p = (
          pcoll
          | 'Generate Partitions' >> ParDo(
              _CreateReadPartitions(spanner_configuration=self._configuration))
          | 'Reshuffle' >> Reshuffle()
          | 'Read From Partitions' >> ParDo(
              _ReadFromPartitionFn(spanner_configuration=self._configuration)))
    else:
      # reading as naive read, in which we don't make batches and execute the
      # queries as a single read.
      p = (
          pcoll
          | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
          | 'Perform Read' >> ParDo(
              _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
              AsSingleton(self._transaction)))
    return p
Example No. 5
 def test(self):
   self.result = (self.pipeline
                  | 'Read from BigQuery' >> Read(BigQuerySource(
                      dataset=self.input_dataset, table=self.input_table))
                  | 'Count messages' >> ParDo(CountMessages(
                      self.metrics_namespace))
                  | 'Measure time' >> ParDo(MeasureTime(
                      self.metrics_namespace))
                  | 'Count' >> Count.Globally())
Example No. 6
 def test(self):
   output = (
       self.pipeline
       | 'Read from BigQuery' >> Read(
           BigQuerySource(dataset=self.input_dataset, table=self.input_table))
       | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
       | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
       | 'Count' >> Count.Globally())
   assert_that(output, equal_to([self.input_options['num_records']]))
Example No. 7
 def test(self):
     output = (
         self.pipeline
         | 'Read from Spanner' >> ReadFromSpanner(
             self.project,
             self.spanner_instance,
             self.spanner_database,
             sql="select data from test_data")
         | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
         | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
         | 'Count' >> Count.Globally())
     assert_that(output, equal_to([self.input_options['num_records']]))
Example No. 8
    def test_translate_portable_job_step_name(self):
        mock_client, mock_job_result = self.setup_mock_client_result(
            self.ONLY_COUNTERS_LIST)

        pipeline_options = PipelineOptions([
            '--experiments=use_runner_v2',
            '--experiments=use_portable_job_submission',
            '--temp_location=gs://any-location/temp',
            '--project=dummy_project',
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        job = apiclient.Job(pipeline_options, proto_pipeline)
        dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result,
                                              job)
        self.assertEqual(
            'MyTestParDo',
            dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
Example No. 9
 def expand(self, pcoll):
     return (pcoll
             | "make batches" >> _WriteGroup(
                 max_batch_size_bytes=self._max_batch_size_bytes,
                 max_number_rows=self._max_number_rows,
                 max_number_cells=self._max_number_cells)
             | 'Writing to spanner' >> ParDo(
                 _WriteToSpannerDoFn(self._configuration)))
Example No. 10
 def expand(self, pcoll):
   input_coder = coders.registry.get_coder(pcoll)
   return pcoll | ParDo(
       _pardo_group_into_batches_with_multi_bags(
           input_coder,
           self.batch_size,
           self.max_buffering_duration_secs,
           self.clock))
Example No. 11
 def expand(self, pvalue):
     beam_options = self.beam_options
     return (
         pvalue
         | ParDo(
             _BigtableWriteFn(
                 beam_options['project_id'], beam_options['instance_id'],
                 beam_options['table_id'], beam_options['flush_count'],
                 beam_options['max_row_bytes'])))
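
The elements flowing into _BigtableWriteFn are expected to be google.cloud.bigtable DirectRow mutations. A hedged sketch of how one such element might be built (column family and qualifier are illustrative assumptions):

import datetime
import uuid

from google.cloud.bigtable import row as bigtable_row


def make_direct_row(value):
    # One DirectRow per element, keyed by a random UUID; the row carries a
    # single cell in the (assumed) column family 'cf1'.
    direct_row = bigtable_row.DirectRow(row_key=str(uuid.uuid4()).encode('utf-8'))
    direct_row.set_cell('cf1', b'data', value, datetime.datetime.utcnow())
    return direct_row
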
Example No. 12
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  p = Pipeline(options=options)

  start = 1
  end = 10

  (p
   | 'From {} to {}'.format(start, end)
   >> Create(list(range(start, end + 1)))
   | 'ToXml' >> ParDo(ToXmlDoFn())
   # If a job finishes too quickly, worker VMs can be shut down before they ship
   # logs from local files to Cloud Logging. Add a 30-second sleep to avoid this.
   | 'Sleep30s' >> ParDo(Sleep(30))
   | 'Print' >> ParDo(lambda xml: logging.info(xml))
   )

  p.run()
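
Sleep is referenced above but not defined in this excerpt. A minimal hypothetical sketch that simply delays each element:

import time

import apache_beam as beam


class Sleep(beam.DoFn):
    """Hypothetical sketch: sleeps for a fixed number of seconds per element."""

    def __init__(self, seconds):
        self.seconds = seconds

    def process(self, element):
        time.sleep(self.seconds)
        yield element
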
Example No. 13
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument('--output_dataset',
                        type=str,
                        default='',
                        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument('--output_table_name',
                        type=str,
                        default='',
                        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText'          >> beam.io.ReadFromText(args.input)
           | 'ParseGameEvent'         >> ParDo(ParseEventFn())
           | 'AddEventTimestamps'     >> beam.Map(lambda element: TimestampedValue(element, element['timestamp']))
           | 'WindowedTeamScore'      >> WindowedTeamScore(3600000) # 1 hour = 3600 seconds = 3600000 milliseconds
           | 'FormatTeamScoreSums'    >> ParDo(FormatTeamScoreSumsFn())
           | 'WriteTeamScoreSums'     >> WriteToBigQuery(
                    args.output_table_name,
                    args.output_dataset,
                    options.get_all_options().get("project"),
                    table_schema()
            )
        )
Example No. 14
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)
    (p
     | Create(list(range(NUM_SHARDS)))
     | FlatMap(lambda _:
               (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
     | WithKeys('')
     | ParDo(BigBagDoFn()))

    p.run()
Example No. 15
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True
    project = options.view_as(GoogleCloudOptions).project

    p = Pipeline(options=options)
    (p
     | Create(EN_TEXTS)
     | ParDo(TranslateDoFn(project, SOURCE_LANGUAGE_CODE,
                           TARGET_LANGUAGE_CODE))
     | Map(print_translation))

    p.run()
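
TranslateDoFn is referenced above but not defined in this excerpt. A hedged sketch, assuming the Cloud Translation v3 client and that the DoFn emits (source_text, translated_text) pairs:

import apache_beam as beam


class TranslateDoFn(beam.DoFn):
    """Hypothetical sketch: translates text with the Cloud Translation API."""

    def __init__(self, project, source_language_code, target_language_code):
        self.project = project
        self.source_language_code = source_language_code
        self.target_language_code = target_language_code
        self.client = None

    def setup(self):
        # Create the client once per worker, not once per element.
        from google.cloud import translate_v3
        self.client = translate_v3.TranslationServiceClient()

    def process(self, text):
        response = self.client.translate_text(
            parent=f'projects/{self.project}/locations/global',
            contents=[text],
            mime_type='text/plain',
            source_language_code=self.source_language_code,
            target_language_code=self.target_language_code)
        yield text, response.translations[0].translated_text
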
Example No. 16
    def expand(self, pcoll):
        filter_batchable_mutations = (
            pcoll
            | 'Making mutation groups' >> ParDo(_MakeMutationGroupsFn())
            | 'Filtering Batchable Mutations' >> ParDo(
                _BatchableFilterFn(
                    max_batch_size_bytes=self._max_batch_size_bytes,
                    max_number_rows=self._max_number_rows,
                    max_number_cells=self._max_number_cells)).with_outputs(
                        _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE,
                        main='batchable'))

        batching_batchables = (
            filter_batchable_mutations['batchable']
            | ParDo(
                _BatchFn(max_batch_size_bytes=self._max_batch_size_bytes,
                         max_number_rows=self._max_number_rows,
                         max_number_cells=self._max_number_cells)))

        return ((batching_batchables, filter_batchable_mutations[
            _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE])
                | 'Merging batchable and unbatchable' >> Flatten())
Example No. 17
 def test_convert_variant_to_bigquery_row_omit_empty_calls(self):
   variant, row, header_num_dict = self._get_sample_variant_with_empty_calls()
   header_fields = vcf_header_util.make_header(header_num_dict)
   proc_var = processed_variant.ProcessedVariantFactory(
       header_fields).create_processed_variant(variant)
   pipeline = TestPipeline(blocking=True)
   bigquery_rows = (
       pipeline
       | Create([proc_var])
       | 'ConvertToRow' >> ParDo(ConvertToBigQueryTableRow(
           self._row_generator, omit_empty_sample_calls=True)))
   assert_that(bigquery_rows, equal_to([row]))
   pipeline.run()
Example No. 18
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value
            # and base64-encode it for the BYTES column.
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
Example No. 19
    def _generate_data(self, p, output_prefix, init_size, data_size):
        init_data = [x for x in range(init_size)]

        lines = (p
                 | 'create' >> Create(init_data)
                 | 'produce' >> ParDo(ProducerFn(data_size)))

        schema = pa.schema([('name', pa.binary()), ('number', pa.int64())])

        files = lines | 'write' >> WriteToParquet(
            output_prefix, schema, codec='snappy', file_name_suffix='.parquet')

        return files
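
For completeness, the Parquet shards produced above can be read back for verification. A minimal hedged sketch (the file pattern is illustrative, matching whatever output_prefix was used):

import apache_beam as beam
from apache_beam.io.parquetio import ReadFromParquet

with beam.Pipeline() as p:
    _ = (
        p
        # Read every shard written by _generate_data (hypothetical prefix).
        | 'read' >> ReadFromParquet('gs://my-bucket/output_prefix*.parquet')
        # Each record arrives as a dict with the schema's 'name' and 'number' fields.
        | 'log' >> beam.Map(print))
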
Example No. 20
 def expand(self, pcoll):
     return (
         pcoll
         # Bind window info to each element using element timestamp (or publish time).
         | "Window into fixed intervals" >> WindowInto(
             FixedWindows(self.window_size))
         | "Add timestamp to windowed elements" >> ParDo(AddTimestamp())
         # Assign a random key to each windowed element based on the number of shards.
         | "Add key" >>
         WithKeys(lambda _: random.randint(0, self.num_shards - 1))
          # Group windowed elements by key. All elements in the same window must fit
          # in memory for this. If not, use `apache_beam.transforms.util.BatchElements`.
         | "Group by key" >> GroupByKey())
Example No. 21
 def test_convert_variant_to_bigquery_row_allow_incompatible_records(self):
     variant, row = self._get_sample_variant_with_incompatible_records()
     header_fields = vcf_header_io.VcfHeader()
     proc_var = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant)
     pipeline = TestPipeline(blocking=True)
     bigquery_rows = (
         pipeline
         | Create([proc_var])
         | 'ConvertToRow' >> ParDo(
             ConvertToBigQueryTableRow(self._row_generator,
                                       allow_incompatible_records=True)))
     assert_that(bigquery_rows, equal_to([row]))
     pipeline.run()
Example No. 22
  def _materialize_transform(self, pipeline):
    result = _allocate_materialized_result(pipeline)

    # Need to define _MaterializeValuesDoFn here to avoid circular
    # dependencies.
    from apache_beam import DoFn
    from apache_beam import ParDo

    class _MaterializeValuesDoFn(DoFn):
      def process(self, element):
        result.elements.append(element)

    materialization_label = '_MaterializeValues%d' % result._result_id
    return (materialization_label >> ParDo(_MaterializeValuesDoFn()), result)
Example No. 23
  def expand(self, pbegin):
    """Expands the TestStream into the DirectRunner implementation.


    Takes the TestStream transform and creates a _TestStream -> multiplexer ->
    _WatermarkController.
    """

    assert isinstance(pbegin, pvalue.PBegin)

    # If there is only one tag there is no need to add the multiplexer.
    if len(self.test_stream.output_tags) == 1:
      return (
          pbegin
          | _TestStream(
              self.test_stream.output_tags,
              events=self.test_stream._events,
              coder=self.test_stream.coder,
              endpoint=self.test_stream._endpoint)
          | _WatermarkController(list(self.test_stream.output_tags)[0]))

    # Multiplex to the correct PCollection based upon the event tag.
    def mux(event):
      if event.tag:
        yield pvalue.TaggedOutput(event.tag, event)
      else:
        yield event

    mux_output = (
        pbegin
        | _TestStream(
            self.test_stream.output_tags,
            events=self.test_stream._events,
            coder=self.test_stream.coder,
            endpoint=self.test_stream._endpoint)
        | 'TestStream Multiplexer' >> ParDo(mux).with_outputs())

    # Apply a way to control the watermark per output. It is necessary to
    # have an individual _WatermarkController per PCollection because the
    # calculation of the input watermark of a transform is based on the event
    # timestamp of the elements flowing through it. Meaning, it is impossible
    # to control the output watermarks of the individual PCollections solely
    # on the event timestamps.
    outputs = {}
    for tag in self.test_stream.output_tags:
      label = '_WatermarkController[{}]'.format(tag)
      outputs[tag] = (mux_output[tag] | label >> _WatermarkController(tag))

    return outputs
Example No. 24
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        type=str,
        default='',
        help='Path to the data file(s) containing game data '
        '(use either this parameter or --topic but not both).')

    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help='Topic to subscribe to '
        '(use either this parameter or --input but not both).')

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p | 'ReadGameEvents' >> ReadGameEvents(args)
         | 'WindowedTeamScore' >> WindowedTeamScore(30)
         | 'FormatTeamScoreSums' >> ParDo(
             FormatTeamScoreSumsFn(
                 args.topic is not None and args.topic != ""))
         | 'WriteTeamScoreSums' >>
         WriteToBigQuery(args.output_table_name, args.output_dataset,
                         options.get_all_options().get("project"),
                         table_schema(), BigQueryDisposition.CREATE_IF_NEEDED,
                         BigQueryDisposition.WRITE_APPEND))
Example No. 25
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)

    start = 1
    end = 100
    (p
     | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
     | 'Sum' >> CombineGlobally(sum)
     | 'Print' >> ParDo(
         lambda total: logging.info('Sum from 1 to 100 is %s', total)))

    p.run()
Example No. 26
def create_transaction(
    pbegin,
    project_id,
    instance_id,
    database_id,
    credentials=None,
    pool=None,
    read_timestamp=None,
    exact_staleness=None):
  """
  A PTransform method to create a batch transaction.

  Args:
    pbegin: Root of the pipeline
    project_id: Cloud spanner project id. Be sure to use the Project ID,
      not the Project Number.
    instance_id: Cloud spanner instance id.
    database_id: Cloud spanner database id.
    credentials: (optional) The authorization credentials to attach to requests.
      These credentials identify this application to the service.
      If none are specified, the client will attempt to ascertain
      the credentials from the environment.
    pool: (optional) session pool to be used by database. If not passed,
      Spanner Cloud SDK uses the BurstyPool by default.
      `google.cloud.spanner.BurstyPool`. Ref:
      https://googleapis.dev/python/spanner/latest/database-api.html?#google.
      cloud.spanner_v1.database.Database
    read_timestamp: (optional) An instance of the `datetime.datetime` object to
      execute all reads at the given timestamp.
    exact_staleness: (optional) An instance of the `datetime.timedelta`
      object. These timestamp bounds execute reads at a user-specified
      timestamp.
  """

  assert isinstance(pbegin, PBegin)

  return (
      pbegin | Create([1]) | ParDo(
          _CreateTransactionFn(
              project_id,
              instance_id,
              database_id,
              credentials,
              pool,
              read_timestamp,
              exact_staleness)))
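
A hedged usage sketch of create_transaction together with ReadFromSpanner's transaction parameter (project, instance, database, and query are placeholders; assumes the @ptransform_fn-style application, where the pipeline root is piped in as pbegin):

import datetime

import apache_beam as beam
from apache_beam.io.gcp.experimental.spannerio import ReadFromSpanner
from apache_beam.io.gcp.experimental.spannerio import create_transaction

with beam.Pipeline() as p:
    # The resulting transaction is consumed as a singleton side input by the
    # naive read path shown earlier.
    transaction = p | 'MakeTransaction' >> create_transaction(
        project_id='my-project',
        instance_id='my-instance',
        database_id='my-db',
        exact_staleness=datetime.timedelta(seconds=60))

    _ = (
        p
        | 'ReadInTransaction' >> ReadFromSpanner(
            'my-project',
            'my-instance',
            'my-db',
            sql='SELECT data FROM test_data',
            transaction=transaction)
        | 'LogRows' >> beam.Map(print))
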
Example No. 27
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    project = options.view_as(GoogleCloudOptions).project
    assert project is not None, '"project" is not specified.'

    source_code = 'en-US'
    target_code = 'ja'
    texts = ['Hello', 'Thank you', 'Goodbye']

    p = Pipeline(options=options)
    (p
     | 'Texts' >> Create(texts)
     | 'Translate' >> ParDo(Translate(project, source_code, target_code))
     | 'Print' >> Map(lambda pair: logging.info('%s -> %s', pair[0], pair[1])))

    p.run()
Example No. 28
 def test_convert_variant_to_bigquery_row(self):
     variant_1, row_1 = self._get_sample_variant_1()
     variant_2, row_2 = self._get_sample_variant_2()
     variant_3, row_3 = self._get_sample_variant_3()
     header_fields = vcf_header_io.VcfHeader()
     proc_var_1 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_1)
     proc_var_2 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_2)
     proc_var_3 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_3)
     pipeline = TestPipeline(blocking=True)
     bigquery_rows = (pipeline
                      | Create([proc_var_1, proc_var_2, proc_var_3])
                      | 'ConvertToRow' >> ParDo(
                          ConvertToBigQueryTableRow(self._row_generator)))
     assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
     pipeline.run()
Example No. 29
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)  # big batch size with a 10-minute buffering trigger
   | Map(lambda kv: logging.info('key: %s, value count: %s',
                                 kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
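
make_large_elements is referenced above but not defined in this excerpt. A minimal hypothetical sketch that emits one ~128 KiB payload per input element, matching the inline comment:

def make_large_elements(element):
    # Hypothetical: produce a single ~128 KiB bytes payload for each input.
    # ParDo treats the returned list as the elements to emit.
    return [b'x' * (128 * 1024)]
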
Example No. 30
def run(input_topic, num_shards, window_size, pipeline_args=None):

    # Set `save_main_session` to True so DoFns can access globally imported modules.
    pipeline_options = PipelineOptions(pipeline_args,
                                       streaming=True,
                                       save_main_session=True)

    custom_options = pipeline_options.view_as(CustomPipelineOptions)

    with Pipeline(options=custom_options) as pipeline:
        (pipeline
         # Because `timestamp_attribute` is unspecified in `ReadFromPubSub`, Beam
         # binds the publish time returned by the Pub/Sub server for each message
         # to the element's timestamp parameter, accessible via `DoFn.TimestampParam`.
         # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
         | "Read from Pub/Sub" >> io.ReadFromPubSub(topic=input_topic)
         | "Window into" >> GroupMessagesByFixedWindows(window_size, num_shards)
         | "Write to GCS" >> ParDo(WriteToGCS(custom_options.output_path)))