def expand(self, input):
        # [START EXERCISE 3]:
        # Docs: https://beam.apache.org/documentation/sdks/pydoc/2.5.0/apache_beam.io.gcp.pubsub.html

        # Determine whether to use files or topic based on options.
        if self.args.input is not None and self.args.input != "":
            return (
                input
                # Read game events from files. See exercise2.
                # Don't forget to parse the events and to apply the TimestampedValue
                # transform to assign event timestamps.
                | beam.io.ReadFromText(self.args.input)
                | ParDo(ParseEventFn())
                | beam.Map(lambda element: TimestampedValue(
                    element, element[self.TIMESTAMP_ATTRIBUTE])))
        else:
            return (
                input
                # Read game events from the Pub/Sub topic self.args.topic using custom timestamps,
                # which are extracted from the Pub/Sub attribute TIMESTAMP_ATTRIBUTE.
                # Use ReadFromPubSub() and use parameters topic and timestamp_attribute.
                # https://beam.apache.org/documentation/sdks/python-streaming/
                | ReadFromPubSub(self.args.topic,
                                 timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

                # Parse the messages the same way as when they come from the text file. Note that we no
                # longer have to run WithTimestamps transform, as the timestamps are already set by
                # ReadFromPubSub.
                | ParDo(ParseEventFn()))
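
ParseEventFn is referenced throughout these exercises but not shown in this excerpt. A minimal hypothetical sketch, assuming the game events are CSV lines of the form user,team,score,timestamp and that TIMESTAMP_ATTRIBUTE names the 'timestamp' field:

import logging

import apache_beam as beam


class ParseEventFn(beam.DoFn):
    """Hypothetical sketch of the parser used above; field names are assumptions."""

    def process(self, element):
        try:
            user, team, score, timestamp = element.split(',')[:4]
            yield {
                'user': user.strip(),
                'team': team.strip(),
                'score': int(score),
                'timestamp': int(timestamp),  # read back via element[self.TIMESTAMP_ATTRIBUTE]
            }
        except (ValueError, IndexError):
            logging.warning('Failed to parse event: %s', element)
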
Example No. 2
    def test(self):
        def format_record(record):
            import base64
            return base64.b64encode(record[1])

        def make_insert_mutations(element):
            import uuid  # pylint: disable=reimported
            from apache_beam.io.gcp.experimental.spannerio import WriteMutation
            ins_mutation = WriteMutation.insert(table='test',
                                                columns=('id', 'data'),
                                                values=[(str(uuid.uuid1()),
                                                         element)])
            return [ins_mutation]

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Make mutations' >> FlatMap(make_insert_mutations)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to Spanner' >> WriteToSpanner(
                project_id=self.project,
                instance_id=self.spanner_instance,
                database_id=self.TEST_DATABASE,
                max_batch_size_bytes=5120))
Example No. 3
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText' >> beam.io.ReadFromText(args.input)
         | 'ParseGameEvent' >> ParDo(ParseEventFn())
         | 'ExtractUserScore' >> ExtractAndSumScore()
         | 'FormatUserScoreSums' >> ParDo(FormatUserScoreSumsFn())
         | 'WriteUserScoreSums' >> WriteToBigQuery(
             args.output_table_name, args.output_dataset,
             options.get_all_options().get("project"), table_schema()))
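
ExtractAndSumScore is used above but not defined in this excerpt. A minimal sketch, assuming the parsed events are dicts carrying 'user' and 'score' keys as produced by the ParseEventFn sketch earlier:

import apache_beam as beam


class ExtractAndSumScore(beam.PTransform):
    """Hypothetical sketch: sums scores per user."""

    def expand(self, pcoll):
        return (
            pcoll
            | 'ToUserScoreKV' >> beam.Map(lambda event: (event['user'], event['score']))
            | 'SumPerUser' >> beam.CombinePerKey(sum))
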
Example No. 4
  def expand(self, pbegin):
    if self._read_operations is not None and isinstance(pbegin, PBegin):
      pcoll = pbegin.pipeline | Create(self._read_operations)
    elif not isinstance(pbegin, PBegin):
      if self._read_operations is not None:
        raise ValueError(
            "Read operation in the constructor only works with "
            "the root of the pipeline.")
      pcoll = pbegin
    else:
      raise ValueError(
          "Spanner required read operation, sql or table "
          "with columns.")

    if self._transaction is None:
      # reading as batch read using the spanner partitioning query to create
      # batches.
      p = (
          pcoll
          | 'Generate Partitions' >> ParDo(
              _CreateReadPartitions(spanner_configuration=self._configuration))
          | 'Reshuffle' >> Reshuffle()
          | 'Read From Partitions' >> ParDo(
              _ReadFromPartitionFn(spanner_configuration=self._configuration)))
    else:
      # reading as naive read, in which we don't make batches and execute the
      # queries as a single read.
      p = (
          pcoll
          | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
          | 'Perform Read' >> ParDo(
              _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
              AsSingleton(self._transaction)))
    return p
Example No. 5
 def test(self):
   self.result = (self.pipeline
                  | 'Read from BigQuery' >> Read(BigQuerySource(
                      dataset=self.input_dataset, table=self.input_table))
                  | 'Count messages' >> ParDo(CountMessages(
                      self.metrics_namespace))
                  | 'Measure time' >> ParDo(MeasureTime(
                      self.metrics_namespace))
                  | 'Count' >> Count.Globally())
Example No. 6
 def test(self):
   output = (
       self.pipeline
       | 'Read from BigQuery' >> Read(
           BigQuerySource(dataset=self.input_dataset, table=self.input_table))
       | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
       | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
       | 'Count' >> Count.Globally())
   assert_that(output, equal_to([self.input_options['num_records']]))
Example No. 7
 def test(self):
     output = (
         self.pipeline
         | 'Read from Spanner' >> ReadFromSpanner(
             self.project,
             self.spanner_instance,
             self.spanner_database,
             sql="select data from test_data")
         | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
         | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
         | 'Count' >> Count.Globally())
     assert_that(output, equal_to([self.input_options['num_records']]))
Example No. 8
    def test_translate_portable_job_step_name(self):
        mock_client, mock_job_result = self.setup_mock_client_result(
            self.ONLY_COUNTERS_LIST)

        pipeline_options = PipelineOptions([
            '--experiments=use_runner_v2',
            '--experiments=use_portable_job_submission',
            '--temp_location=gs://any-location/temp',
            '--project=dummy_project',
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        job = apiclient.Job(pipeline_options, proto_pipeline)
        dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result,
                                              job)
        self.assertEqual(
            'MyTestParDo',
            dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
Example No. 9
 def expand(self, pcoll):
     return (pcoll
             | "make batches" >> _WriteGroup(
                 max_batch_size_bytes=self._max_batch_size_bytes,
                 max_number_rows=self._max_number_rows,
                 max_number_cells=self._max_number_cells)
             | 'Writing to spanner' >> ParDo(
                 _WriteToSpannerDoFn(self._configuration)))
Example No. 10
 def expand(self, pcoll):
   input_coder = coders.registry.get_coder(pcoll)
   return pcoll | ParDo(
       _pardo_group_into_batches_with_multi_bags(
           input_coder,
           self.batch_size,
           self.max_buffering_duration_secs,
           self.clock))
Example No. 11
 def expand(self, pvalue):
     beam_options = self.beam_options
     return (
         pvalue
         | ParDo(
             _BigtableWriteFn(
                 beam_options['project_id'], beam_options['instance_id'],
                 beam_options['table_id'], beam_options['flush_count'],
                 beam_options['max_row_bytes'])))
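
The elements flowing into _BigtableWriteFn are expected to be google.cloud.bigtable DirectRow mutations. A hedged sketch of how one such element might be built (column family and qualifier are illustrative assumptions):

import datetime
import uuid

from google.cloud.bigtable import row as bigtable_row


def make_direct_row(value):
    # One DirectRow per element, keyed by a random UUID; the row carries a
    # single cell in the (assumed) column family 'cf1'.
    direct_row = bigtable_row.DirectRow(row_key=str(uuid.uuid4()).encode('utf-8'))
    direct_row.set_cell('cf1', b'data', value, datetime.datetime.utcnow())
    return direct_row
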
Example No. 12
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  p = Pipeline(options=options)

  start = 1
  end = 10

  (p
   | 'From {} to {}'.format(start, end)
   >> Create(list(range(start, end + 1)))
   | 'ToXml' >> ParDo(ToXmlDoFn())
   # If a job finishes too quickly, worker VMs can be shut down before they ship
   # logs from local files to Cloud Logging. Add a 30-second sleep to avoid this.
   | 'Sleep30s' >> ParDo(Sleep(30))
   | 'Print' >> ParDo(lambda xml: logging.info(xml))
   )

  p.run()
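
Sleep is referenced above but not defined in this excerpt. A minimal hypothetical sketch that simply delays each element:

import time

import apache_beam as beam


class Sleep(beam.DoFn):
    """Hypothetical sketch: sleeps for a fixed number of seconds per element."""

    def __init__(self, seconds):
        self.seconds = seconds

    def process(self, element):
        time.sleep(self.seconds)
        yield element
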
Example No. 13
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument('--output_dataset',
                        type=str,
                        default='',
                        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument('--output_table_name',
                        type=str,
                        default='',
                        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText'          >> beam.io.ReadFromText(args.input)
           | 'ParseGameEvent'         >> ParDo(ParseEventFn())
           | 'AddEventTimestamps'     >> beam.Map(lambda element: TimestampedValue(element, element['timestamp']))
           | 'WindowedTeamScore'      >> WindowedTeamScore(3600000) # 1 hour = 3600 seconds = 3600000 milliseconds
           | 'FormatTeamScoreSums'    >> ParDo(FormatTeamScoreSumsFn())
           | 'WriteTeamScoreSums'     >> WriteToBigQuery(
                    args.output_table_name,
                    args.output_dataset,
                    options.get_all_options().get("project"),
                    table_schema()
            )
        )
Example No. 14
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)
    (p
     | Create(list(range(NUM_SHARDS)))
     | FlatMap(lambda _:
               (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
     | WithKeys('')
     | ParDo(BigBagDoFn()))

    p.run()
Example No. 15
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True
    project = options.view_as(GoogleCloudOptions).project

    p = Pipeline(options=options)
    (p
     | Create(EN_TEXTS)
     | ParDo(TranslateDoFn(project, SOURCE_LANGUAGE_CODE,
                           TARGET_LANGUAGE_CODE))
     | Map(print_translation))

    p.run()
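
TranslateDoFn is referenced above but not defined in this excerpt. A hedged sketch, assuming the Cloud Translation v3 client and that the DoFn emits (source_text, translated_text) pairs:

import apache_beam as beam


class TranslateDoFn(beam.DoFn):
    """Hypothetical sketch: translates text with the Cloud Translation API."""

    def __init__(self, project, source_language_code, target_language_code):
        self.project = project
        self.source_language_code = source_language_code
        self.target_language_code = target_language_code
        self.client = None

    def setup(self):
        # Create the client once per worker, not once per element.
        from google.cloud import translate_v3
        self.client = translate_v3.TranslationServiceClient()

    def process(self, text):
        response = self.client.translate_text(
            parent=f'projects/{self.project}/locations/global',
            contents=[text],
            mime_type='text/plain',
            source_language_code=self.source_language_code,
            target_language_code=self.target_language_code)
        yield text, response.translations[0].translated_text
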
Example No. 16
    def expand(self, pcoll):
        filter_batchable_mutations = (
            pcoll
            | 'Making mutation groups' >> ParDo(_MakeMutationGroupsFn())
            | 'Filtering Batchable Mutations' >> ParDo(
                _BatchableFilterFn(
                    max_batch_size_bytes=self._max_batch_size_bytes,
                    max_number_rows=self._max_number_rows,
                    max_number_cells=self._max_number_cells)).with_outputs(
                        _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE,
                        main='batchable'))

        batching_batchables = (
            filter_batchable_mutations['batchable']
            | ParDo(
                _BatchFn(max_batch_size_bytes=self._max_batch_size_bytes,
                         max_number_rows=self._max_number_rows,
                         max_number_cells=self._max_number_cells)))

        return ((batching_batchables, filter_batchable_mutations[
            _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE])
                | 'Merging batchable and unbatchable' >> Flatten())
Example No. 17
 def test_convert_variant_to_bigquery_row_omit_empty_calls(self):
   variant, row, header_num_dict = self._get_sample_variant_with_empty_calls()
   header_fields = vcf_header_util.make_header(header_num_dict)
   proc_var = processed_variant.ProcessedVariantFactory(
       header_fields).create_processed_variant(variant)
   pipeline = TestPipeline(blocking=True)
   bigquery_rows = (
       pipeline
       | Create([proc_var])
       | 'ConvertToRow' >> ParDo(ConvertToBigQueryTableRow(
           self._row_generator, omit_empty_sample_calls=True)))
   assert_that(bigquery_rows, equal_to([row]))
   pipeline.run()
Example No. 18
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value
            # and base64-encode it for the BYTES column.
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
Example No. 19
    def _generate_data(self, p, output_prefix, init_size, data_size):
        init_data = [x for x in range(init_size)]

        lines = (p
                 | 'create' >> Create(init_data)
                 | 'produce' >> ParDo(ProducerFn(data_size)))

        schema = pa.schema([('name', pa.binary()), ('number', pa.int64())])

        files = lines | 'write' >> WriteToParquet(
            output_prefix, schema, codec='snappy', file_name_suffix='.parquet')

        return files
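
For completeness, the Parquet shards produced above can be read back for verification. A minimal hedged sketch (the file pattern is illustrative, matching whatever output_prefix was used):

import apache_beam as beam
from apache_beam.io.parquetio import ReadFromParquet

with beam.Pipeline() as p:
    _ = (
        p
        # Read every shard written by _generate_data (hypothetical prefix).
        | 'read' >> ReadFromParquet('gs://my-bucket/output_prefix*.parquet')
        # Each record arrives as a dict with the schema's 'name' and 'number' fields.
        | 'log' >> beam.Map(print))
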
Example No. 20
 def expand(self, pcoll):
     return (
         pcoll
         # Bind window info to each element using element timestamp (or publish time).
         | "Window into fixed intervals" >> WindowInto(
             FixedWindows(self.window_size))
         | "Add timestamp to windowed elements" >> ParDo(AddTimestamp())
         # Assign a random key to each windowed element based on the number of shards.
         | "Add key" >>
         WithKeys(lambda _: random.randint(0, self.num_shards - 1))
          # Group windowed elements by key. All elements in the same window must fit
          # in memory for this. If not, use `apache_beam.transforms.util.BatchElements`.
         | "Group by key" >> GroupByKey())
Example No. 21
 def test_convert_variant_to_bigquery_row_allow_incompatible_records(self):
     variant, row = self._get_sample_variant_with_incompatible_records()
     header_fields = vcf_header_io.VcfHeader()
     proc_var = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant)
     pipeline = TestPipeline(blocking=True)
     bigquery_rows = (
         pipeline
         | Create([proc_var])
         | 'ConvertToRow' >> ParDo(
             ConvertToBigQueryTableRow(self._row_generator,
                                       allow_incompatible_records=True)))
     assert_that(bigquery_rows, equal_to([row]))
     pipeline.run()
Example No. 22
  def _materialize_transform(self, pipeline):
    result = _allocate_materialized_result(pipeline)

    # Need to define _MaterializeValuesDoFn here to avoid circular
    # dependencies.
    from apache_beam import DoFn
    from apache_beam import ParDo

    class _MaterializeValuesDoFn(DoFn):
      def process(self, element):
        result.elements.append(element)

    materialization_label = '_MaterializeValues%d' % result._result_id
    return (materialization_label >> ParDo(_MaterializeValuesDoFn()), result)
Example No. 23
  def expand(self, pbegin):
    """Expands the TestStream into the DirectRunner implementation.


    Takes the TestStream transform and creates a _TestStream -> multiplexer ->
    _WatermarkController.
    """

    assert isinstance(pbegin, pvalue.PBegin)

    # If there is only one tag there is no need to add the multiplexer.
    if len(self.test_stream.output_tags) == 1:
      return (
          pbegin
          | _TestStream(
              self.test_stream.output_tags,
              events=self.test_stream._events,
              coder=self.test_stream.coder,
              endpoint=self.test_stream._endpoint)
          | _WatermarkController(list(self.test_stream.output_tags)[0]))

    # Multiplex to the correct PCollection based upon the event tag.
    def mux(event):
      if event.tag:
        yield pvalue.TaggedOutput(event.tag, event)
      else:
        yield event

    mux_output = (
        pbegin
        | _TestStream(
            self.test_stream.output_tags,
            events=self.test_stream._events,
            coder=self.test_stream.coder,
            endpoint=self.test_stream._endpoint)
        | 'TestStream Multiplexer' >> ParDo(mux).with_outputs())

    # Apply a way to control the watermark per output. It is necessary to
    # have an individual _WatermarkController per PCollection because the
    # calculation of the input watermark of a transform is based on the event
    # timestamp of the elements flowing through it. Meaning, it is impossible
    # to control the output watermarks of the individual PCollections solely
    # on the event timestamps.
    outputs = {}
    for tag in self.test_stream.output_tags:
      label = '_WatermarkController[{}]'.format(tag)
      outputs[tag] = (mux_output[tag] | label >> _WatermarkController(tag))

    return outputs
Example No. 24
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        type=str,
        default='',
        help='Path to the data file(s) containing game data '
        '(use either this parameter or --topic but not both).')

    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help='Topic to subscribe to '
        '(use either this parameter or --input but not both).')

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p | 'ReadGameEvents' >> ReadGameEvents(args)
         | 'WindowedTeamScore' >> WindowedTeamScore(30)
         | 'FormatTeamScoreSums' >> ParDo(
             FormatTeamScoreSumsFn(
                 args.topic is not None and args.topic != ""))
         | 'WriteTeamScoreSums' >>
         WriteToBigQuery(args.output_table_name, args.output_dataset,
                         options.get_all_options().get("project"),
                         table_schema(), BigQueryDisposition.CREATE_IF_NEEDED,
                         BigQueryDisposition.WRITE_APPEND))
Example No. 25
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)

    start = 1
    end = 100
    (p
     | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
     | 'Sum' >> CombineGlobally(sum)
     | 'Print' >> ParDo(
         lambda total: logging.info('Sum from 1 to 100 is %s', total)))

    p.run()
Example No. 26
def create_transaction(
    pbegin,
    project_id,
    instance_id,
    database_id,
    credentials=None,
    pool=None,
    read_timestamp=None,
    exact_staleness=None):
  """
  A PTransform method to create a batch transaction.

  Args:
    pbegin: Root of the pipeline
    project_id: Cloud spanner project id. Be sure to use the Project ID,
      not the Project Number.
    instance_id: Cloud spanner instance id.
    database_id: Cloud spanner database id.
    credentials: (optional) The authorization credentials to attach to requests.
      These credentials identify this application to the service.
      If none are specified, the client will attempt to ascertain
      the credentials from the environment.
    pool: (optional) session pool to be used by database. If not passed,
      Spanner Cloud SDK uses the BurstyPool by default.
      `google.cloud.spanner.BurstyPool`. Ref:
      https://googleapis.dev/python/spanner/latest/database-api.html?#google.
      cloud.spanner_v1.database.Database
    read_timestamp: (optional) An instance of the `datetime.datetime` object to
      execute all reads at the given timestamp.
    exact_staleness: (optional) An instance of the `datetime.timedelta`
      object. These timestamp bounds execute reads at a user-specified
      timestamp.
  """

  assert isinstance(pbegin, PBegin)

  return (
      pbegin | Create([1]) | ParDo(
          _CreateTransactionFn(
              project_id,
              instance_id,
              database_id,
              credentials,
              pool,
              read_timestamp,
              exact_staleness)))
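
A hedged usage sketch of create_transaction together with ReadFromSpanner's transaction parameter (project, instance, database, and query are placeholders; assumes the @ptransform_fn-style application, where the pipeline root is piped in as pbegin):

import datetime

import apache_beam as beam
from apache_beam.io.gcp.experimental.spannerio import ReadFromSpanner
from apache_beam.io.gcp.experimental.spannerio import create_transaction

with beam.Pipeline() as p:
    # The resulting transaction is consumed as a singleton side input by the
    # naive read path shown earlier.
    transaction = p | 'MakeTransaction' >> create_transaction(
        project_id='my-project',
        instance_id='my-instance',
        database_id='my-db',
        exact_staleness=datetime.timedelta(seconds=60))

    _ = (
        p
        | 'ReadInTransaction' >> ReadFromSpanner(
            'my-project',
            'my-instance',
            'my-db',
            sql='SELECT data FROM test_data',
            transaction=transaction)
        | 'LogRows' >> beam.Map(print))
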
Example No. 27
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    project = options.view_as(GoogleCloudOptions).project
    assert project is not None, '"project" is not specified.'

    source_code = 'en-US'
    target_code = 'ja'
    texts = ['Hello', 'Thank you', 'Goodbye']

    p = Pipeline(options=options)
    (p
     | 'Texts' >> Create(texts)
     | 'Translate' >> ParDo(Translate(project, source_code, target_code))
     | 'Print' >> Map(lambda pair: logging.info('%s -> %s', pair[0], pair[1])))

    p.run()
Example No. 28
 def test_convert_variant_to_bigquery_row(self):
     variant_1, row_1 = self._get_sample_variant_1()
     variant_2, row_2 = self._get_sample_variant_2()
     variant_3, row_3 = self._get_sample_variant_3()
     header_fields = vcf_header_io.VcfHeader()
     proc_var_1 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_1)
     proc_var_2 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_2)
     proc_var_3 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_3)
     pipeline = TestPipeline(blocking=True)
     bigquery_rows = (pipeline
                      | Create([proc_var_1, proc_var_2, proc_var_3])
                      | 'ConvertToRow' >> ParDo(
                          ConvertToBigQueryTableRow(self._row_generator)))
     assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
     pipeline.run()
Example No. 29
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)  # big batch size with a 10-minute buffering trigger
   | Map(lambda kv: logging.info('key: %s, value count: %s',
                                 kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
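
make_large_elements is referenced above but not defined in this excerpt. A minimal hypothetical sketch that emits one ~128 KiB payload per input element, matching the inline comment:

def make_large_elements(element):
    # Hypothetical: produce a single ~128 KiB bytes payload for each input.
    # ParDo treats the returned list as the elements to emit.
    return [b'x' * (128 * 1024)]
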
Example No. 30
def run(input_topic, num_shards, window_size, pipeline_args=None):

    # Set `save_main_session` to True so DoFns can access globally imported modules.
    pipeline_options = PipelineOptions(pipeline_args,
                                       streaming=True,
                                       save_main_session=True)

    custom_options = pipeline_options.view_as(CustomPipelineOptions)

    with Pipeline(options=custom_options) as pipeline:
        (pipeline
         # Because `timestamp_attribute` is unspecified in `ReadFromPubSub`, Beam
         # binds the publish time returned by the Pub/Sub server for each message
         # to the element's timestamp parameter, accessible via `DoFn.TimestampParam`.
         # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
         | "Read from Pub/Sub" >> io.ReadFromPubSub(topic=input_topic)
         | "Window into" >> GroupMessagesByFixedWindows(window_size, num_shards)
         | "Write to GCS" >> ParDo(WriteToGCS(custom_options.output_path)))