Example #1
  def test(self):
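    # Re-iterate over the entire side input `iterations` times; only the
    # last pass emits the joined {key: element[1] + value} dicts.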
    def join_fn(element, side_input, iterations):
      result = []
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            result.append({key: element[1] + value})
      yield result

    main_input = (
        self.pipeline
        | "Read pcoll 1" >> beam.io.Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Measure time: Start pcoll 1' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    side_input = (
        self.pipeline
        | "Read pcoll 2" >> beam.io.Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Measure time: Start pcoll 2' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))
    # pylint: disable=expression-not-assigned
    (
        main_input
        | "Merge" >> beam.ParDo(join_fn, AsIter(side_input), self.iterations)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))
Example #2
    def test(self):
        class CounterOperation(beam.DoFn):
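            """Increments each of the configured counters a fixed number of
            times for every element processed."""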
            def __init__(self, number_of_counters, number_of_operations):
                self.number_of_operations = number_of_operations
                self.counters = []
                for i in range(number_of_counters):
                    self.counters.append(
                        Metrics.counter('do-not-publish', 'name-{}'.format(i)))

            def process(self, element):
                for _ in range(self.number_of_operations):
                    for counter in self.counters:
                        counter.inc()
                yield element

        pc = (self.pipeline
              | 'Read synthetic' >> beam.io.Read(
                  SyntheticSource(self.parse_synthetic_source_options()))
              | 'Measure time: Start' >> beam.ParDo(
                  MeasureTime(self.metrics_namespace)))

        for i in range(self.iterations):
            pc = (pc
                  | 'Step: %d' % i >> beam.ParDo(
                      CounterOperation(self.number_of_counters,
                                       self.number_of_operations)))

        # pylint: disable=expression-not-assigned
        (pc
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))
Example #3
    def _create_input_data(self):
        """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value,
            # base64-encoded to match the BigQuery BYTES column.
            import base64
            return {'data': base64.b64encode(record[1])}

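        # WRITE_EMPTY makes the job fail at runtime if the destination
        # table already contains data.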
        with TestPipeline() as p:
            (  # pylint: disable=expression-not-assigned
                p
                | 'Produce rows' >> Read(
                    SyntheticSource(self.parse_synthetic_source_options()))
                | 'Format' >> Map(format_record)
                | 'Write to BigQuery' >> WriteToBigQuery(
                    dataset=self.input_dataset,
                    table=self.input_table,
                    schema=SCHEMA,
                    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #4
    def test(self):
        if self.get_option_or_default('use_stateful_load_generator', False):
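            # Streaming path: elements get event timestamps and 20-second
            # fixed windows so the global combines below can fire per window.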
            source = (
                self.pipeline
                | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options)
                | beam.ParDo(AssignTimestamps())
                | beam.WindowInto(window.FixedWindows(20)))
        else:
            source = (
                self.pipeline
                | 'Read synthetic' >> beam.io.Read(
                    SyntheticSource(self.parse_synthetic_source_options())))

        pc = (source
              | 'Measure time: Start' >> beam.ParDo(
                  MeasureTime(self.metrics_namespace)))

        for branch in range(self.fanout):
            (  # pylint: disable=expression-not-assigned
                pc
                | 'Combine with Top %i' % branch >> beam.CombineGlobally(
                    beam.combiners.TopCombineFn(
                        self.top_count)).without_defaults()
                | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
                | 'Measure time: End %i' % branch >> beam.ParDo(
                    MeasureTime(self.metrics_namespace)))
Example #5
    def test(self):
        def format_record(record):
            import base64
            return base64.b64encode(record[1])

        def make_insert_mutations(element):
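            # Wrap each element in a single-row INSERT mutation keyed by a
            # fresh UUID.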
            import uuid  # pylint: disable=reimported
            from apache_beam.io.gcp.experimental.spannerio import WriteMutation
            ins_mutation = WriteMutation.insert(table='test',
                                                columns=('id', 'data'),
                                                values=[(str(uuid.uuid1()),
                                                         element)])
            return [ins_mutation]

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Make mutations' >> FlatMap(make_insert_mutations)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to Spanner' >> WriteToSpanner(
                project_id=self.project,
                instance_id=self.spanner_instance,
                database_id=self.TEST_DATABASE,
                max_batch_size_bytes=5120))
Example #6
    def _create_input_data(self):
        """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
        def format_record(record):
            import base64
            return base64.b64encode(record[1])

        def make_insert_mutations(element):
            import uuid
            from apache_beam.io.gcp.experimental.spannerio import WriteMutation
            ins_mutation = WriteMutation.insert(table='test_data',
                                                columns=('id', 'data'),
                                                values=[(str(uuid.uuid1()),
                                                         element)])
            return [ins_mutation]

        with TestPipeline() as p:
            (  # pylint: disable=expression-not-assigned
                p
                | 'Produce rows' >> Read(
                    SyntheticSource(self.parse_synthetic_source_options()))
                | 'Format' >> Map(format_record)
                | 'Make mutations' >> FlatMap(make_insert_mutations)
                | 'Write to Spanner' >> WriteToSpanner(
                    project_id=self.project,
                    instance_id=self.spanner_instance,
                    database_id=self.spanner_database,
                    max_batch_size_bytes=5120))
Example #7
  def test(self):
    class CounterOperation(beam.DoFn):
      def __init__(self, number_of_counters, number_of_operations):
        self.number_of_operations = number_of_operations
        self.counters = []
        for i in range(number_of_counters):
          self.counters.append(
              Metrics.counter('do-not-publish', 'name-{}'.format(i)))

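      # The class body executes inside test(), so `self` here is the
      # enclosing test instance: stateful runs get a combining state cell,
      # non-stateful runs pass state=None.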
      state_param = beam.DoFn.StateParam(
          userstate.CombiningValueStateSpec(
              'count',
              beam.coders.IterableCoder(beam.coders.VarIntCoder()),
              sum)) if self.stateful else None

      def process(self, element, state=state_param):
        for _ in range(self.number_of_operations):
          for counter in self.counters:
            counter.inc()
          if state:
            state.add(1)
        yield element

    if self.get_option_or_default('streaming', False):
      source = (
          self.pipeline
          | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options))
    else:
      source = (
          self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              SyntheticSource(self.parse_synthetic_source_options())))

    pc = (
        source
        | 'Measure time: Start' >> beam.ParDo(
            MeasureTime(self.metrics_namespace))
        | 'Assign timestamps' >> beam.ParDo(AssignTimestamps()))

    for i in range(self.iterations):
      pc = (
          pc
          | 'Step: %d' % i >> beam.ParDo(
              CounterOperation(
                  self.number_of_counters, self.number_of_operations)))

    # pylint: disable=expression-not-assigned
    (
        pc
        | 'Measure latency' >> beam.ParDo(
            MeasureLatency(self.metrics_namespace))
        | 'Measure time: End' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))
Example #8
    def test(self):
        input_pc = (self.pipeline
                    | beam.io.Read(
                        SyntheticSource(self.parse_synthetic_source_options()))
                    | 'Measure time: Start' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace)))

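        # Fan out: every branch runs its own global Top combine over the
        # same measured input.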
        for branch in range(self.fanout):
            (  # pylint: disable=expression-not-assigned
                input_pc
                | 'Combine with Top %i' % branch >> beam.CombineGlobally(
                    beam.combiners.TopCombineFn(self.top_count))
                | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
                | 'Measure time: End %i' % branch >> beam.ParDo(
                    MeasureTime(self.metrics_namespace)))
Example #9
    def test(self):
        pc = (self.pipeline
              | beam.io.Read(
                  SyntheticSource(self.parse_synthetic_source_options()))
              | 'Measure time: Start' >> beam.ParDo(
                  MeasureTime(self.metrics_namespace)))

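        # Every branch groups the input by key and then re-iterates the
        # grouped values a configurable number of times.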
        for branch in range(self.fanout):
            (  # pylint: disable=expression-not-assigned
                pc
                | 'GroupByKey %i' % branch >> beam.GroupByKey()
                | 'Ungroup %i' % branch >> beam.ParDo(
                    self._UngroupAndReiterate(), self.iterations)
                | 'Measure time: End %i' % branch >> beam.ParDo(
                    MeasureTime(self.metrics_namespace)))
Example #10
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value,
            # base64-encoded to match the BigQuery BYTES column.
            return {'data': base64.b64encode(record[1])}

        # pylint: disable=expression-not-assigned
        (self.pipeline
         | 'ProduceRows' >> Read(
             SyntheticSource(self.parseTestPipelineOptions()))
         | 'Format' >> Map(format_record)
         | 'WriteToBigQuery' >> WriteToBigQuery(
             self.output_dataset + '.' + self.output_table,
             schema=SCHEMA,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #11
  def test(self):
    def to_pubsub_message(element):
      import uuid
      from apache_beam.io import PubsubMessage
      return PubsubMessage(
          data=element[1],
          attributes={'id': str(uuid.uuid1()).encode('utf-8')},
      )

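    # Each message carries a unique 'id' attribute, and id_label names it
    # so that Pub/Sub readers can deduplicate messages.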
    _ = (
        self.pipeline
        | 'Create input' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format to pubsub message in bytes' >> beam.Map(to_pubsub_message)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(
            self.topic_name,
            with_attributes=True,
            id_label='id',
        ))
Example #12
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource emits (key, value) records; keep only the value,
            # base64-encoded to match the BigQuery BYTES column.
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
Example #13
    def test(self):
        class SequenceSideInputTestDoFn(beam.DoFn):
            """Iterate over first n side_input elements."""
            def __init__(self, first_n):
                self._first_n = first_n

            def process(self, unused_element, side_input):
                i = 0
                it = iter(side_input)
                while i < self._first_n:
                    i += 1
                    try:
                        # No-op. We only make sure that the element is accessed.
                        next(it)
                    except StopIteration:
                        return

        class MappingSideInputTestDoFn(beam.DoFn):
            """Take a sequence of keys as an additional side input and for each
      key in the sequence checks the value for key in the dictionary."""
            def process(self, unused_element, dict_side_input, keys_to_check):
                for key in keys_to_check:
                    # No-op. We only make sure that the element is accessed.
                    dict_side_input[key]

        class GetRandomKeys(beam.DoFn):
            def __init__(self, n):
                self._n = n

            def process(self, unused_element, dict_side_input):
                import random
                n = min(self._n, len(dict_side_input))
                # random.sample requires a sequence in Python 3, so
                # materialize the dict keys first.
                return random.sample(list(dict_side_input), n)

        class AddEventTimestamps(beam.DoFn):
            """Assign timestamp to each element of PCollection."""
            def setup(self):
                self._timestamp = 0

            def process(self, element):
                from apache_beam.transforms import window
                yield window.TimestampedValue(element, self._timestamp)
                self._timestamp += 1

        input_pc = (self.pipeline
                    | 'Read synthetic' >> beam.io.Read(
                        SyntheticSource(self.parse_synthetic_source_options()))
                    | 'Collect start time metrics' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace)))

        if self.side_input_size != self.input_options.get('num_records'):
            side_input = (
                input_pc
                | 'Sample {} elements'.format(self.side_input_size) >>
                beam.combiners.Sample.FixedSizeGlobally(self.side_input_size)
                | 'Flatten a sequence' >> beam.FlatMap(lambda x: x))
        else:
            side_input = input_pc

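        # Optionally assign synthetic event timestamps and apply fixed
        # windows, splitting the side input into `self.windows` windows.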
        if self.windows > 0:
            window_size = self.side_input_size / self.windows
            logging.info('Fixed windows of %s seconds will be applied',
                         window_size)
            side_input = (
                side_input
                | 'Add event timestamps' >> beam.ParDo(AddEventTimestamps())
                | 'Apply windows' >> beam.WindowInto(
                    beam.transforms.window.FixedWindows(window_size)))

        side_input_type = self.materialize_as()
        elements_to_access = self.side_input_size * self.access_percentage // 100
        logging.info(
            '%s out of %s total elements in the side input will be '
            'accessed.', elements_to_access, self.side_input_size)
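        # Dict-typed side inputs are exercised via random-key lookups; all
        # other materializations by iterating over the first N elements.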
        if side_input_type is beam.pvalue.AsDict:
            random_keys = (self.pipeline
                           | beam.Impulse()
                           | 'Get random keys' >> beam.ParDo(
                               GetRandomKeys(elements_to_access),
                               beam.pvalue.AsDict(side_input)))
            pc = input_pc | beam.ParDo(MappingSideInputTestDoFn(),
                                       side_input_type(side_input),
                                       beam.pvalue.AsList(random_keys))
        else:
            pc = input_pc | beam.ParDo(
                SequenceSideInputTestDoFn(elements_to_access),
                side_input_type(side_input))

        _ = pc | 'Collect end time metrics' >> beam.ParDo(
            MeasureTime(self.metrics_namespace))