Example #1
    def _run_group_into_batches_and_get_step_properties(
            self, with_sharded_key, additional_properties):
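        """Runs a streaming GroupIntoBatches pipeline on the DataflowRunner
        and returns the properties of the generated batching step."""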
        self.default_properties.append('--streaming')
        self.default_properties.append(
            '--experiment=enable_streaming_auto_sharding')
        for property in additional_properties:
            self.default_properties.append(property)

        runner = DataflowRunner()
        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            input = p | beam.Create([('a', 1), ('a', 1), ('b', 3), ('b', 4)])
            if with_sharded_key:
                (input | beam.GroupIntoBatches.WithShardedKey(2)
                 | beam.Map(lambda key_values:
                            (key_values[0].key, key_values[1])))
                step_name = (
                    u'WithShardedKey/GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'
                )
            else:
                input | beam.GroupIntoBatches(2)
                step_name = u'GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'

        return self._find_step(runner.job, step_name)['properties']
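
The batching transforms exercised by the test above can also be run locally. Below is a minimal sketch (assuming a recent Beam SDK; runners without native support for key sharding simply fall back to a single shard per key) showing that GroupIntoBatches.WithShardedKey emits (ShardedKey, batch) pairs, which is why the test unwraps the key with key_values[0].key:

import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create([('a', 1), ('a', 2), ('b', 3), ('b', 4)])
     | beam.GroupIntoBatches.WithShardedKey(2)
     # Each element is a (ShardedKey, batch) pair; .key recovers the
     # original key.
     | beam.Map(lambda kv: (kv[0].key, list(kv[1])))
     | beam.Map(print))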
Example #2
    def expand(
        self, results: beam.PCollection[job_run_result.JobRunResult]
    ) -> beam.pvalue.PDone:
        """Writes the given job results to the NDB datastore.

        This overrides expand from parent class.

        Args:
            results: PCollection. The job results to write; may contain just a
                single result.

        Returns:
            PDone. An empty output value marking that the write has been
            applied.
        """
        return (
            results
            # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
            # different signature than the one it's defined with.
            | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
            # GroupIntoBatches() requires (key, value) pairs as input, so we
            # give everything None keys and then immediately discard them.
            | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
            | beam.Values()  # pylint: disable=no-value-for-parameter
            | beam.FlatMap(job_run_result.JobRunResult.accumulate)
            | beam.Map(self.create_beam_job_run_result_model,
                       results.pipeline.options.namespace)
            | ndb_io.PutModels())
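
The None-key pattern used above also works on its own. A minimal, self-contained sketch (the element values are made up for illustration) of keying every element by None, batching, and discarding the dummy keys again:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    (pipeline
     | beam.Create(['a', 'b', 'c', 'd', 'e'])
     # GroupIntoBatches() needs (key, value) pairs, so give every element a
     # None key, batch, and then drop the dummy keys.
     | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
     | beam.GroupIntoBatches(2)
     | beam.Values()  # pylint: disable=no-value-for-parameter
     | beam.Map(print))

This should print batches of at most two elements, e.g. ['a', 'b'], ['c', 'd'] and ['e'], though the exact grouping may vary by runner.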
Example #3
    def expand(self, results):
        """Writes the given job results to the NDB datastore."""
        return (
            results
            # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
            # different signature than the one it's defined with.
            | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
            # GroupIntoBatches() requires (key, value) pairs as input, so we
            # give everything None keys and then immediately discard them.
            | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
            | beam.Values()
            | beam.FlatMap(job_run_result.JobRunResult.accumulate)
            | beam.Map(self.create_beam_job_run_result_model)
            | ndb_io.PutModels(self.datastoreio_stub))
Example #4
def groupintobatches(test=None):
    # [START groupintobatches]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        batches_with_keys = (pipeline
                             | 'Create produce' >> beam.Create([
                                 ('spring', '🍓'),
                                 ('spring', '🥕'),
                                 ('spring', '🍆'),
                                 ('spring', '🍅'),
                                 ('summer', '🥕'),
                                 ('summer', '🍅'),
                                 ('summer', '🌽'),
                                 ('fall', '🥕'),
                                 ('fall', '🍅'),
                                 ('winter', '🍆'),
                             ])
                             | 'Group into batches' >> beam.GroupIntoBatches(3)
                             | beam.Map(print))
        # [END groupintobatches]
        if test:
            test(batches_with_keys)
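
For reference, with a batch size of 3 the pipeline above is expected to print one batch per group of up to three values per key, along the lines of ('spring', ['🍓', '🥕', '🍆']), ('spring', ['🍅']), ('summer', ['🥕', '🍅', '🌽']), ('fall', ['🥕', '🍅']) and ('winter', ['🍆']); the ordering of batches is not guaranteed.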