Example #1
    def find_all(pcoll, regex, group=0, outputEmpty=True):
        """
    Returns the matches if a portion of the line matches the Regex. By default,
    list of group 0 will return with empty items. To get all groups, pass the
    `Regex.ALL` flag in the `group` parameter which returns all the groups in
    the tuple format.

    Args:
      regex: the regular expression string or (re.compile) pattern.
      group: (optional) name of the group, it can be integer or a string value.
      outputEmpty: (optional) Should empty be output. True to output empties
        and false if not.
    """
        regex = Regex._regex_compile(regex)

        def _process(element):
            matches = regex.finditer(element)
            if group == Regex.ALL:
                yield [(m.group(), m.groups()[0]) for m in matches
                       if outputEmpty or m.groups()[0]]
            else:
                yield [
                    m.group(group) for m in matches
                    if outputEmpty or m.group(group)
                ]

        return pcoll | FlatMap(_process)
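
A minimal usage sketch (not part of the original snippet): the input strings and pattern below are made up, and the standard apache_beam package is assumed.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['a-1, b-2', 'c-3'])       # hypothetical input lines
        | beam.Regex.find_all(r'[a-z]-\d')       # one list of matches per line
        | beam.Map(print))                       # ['a-1', 'b-2'] then ['c-3']
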
Example #2
    def expand(self, pcoll):
        class ReifyTimestamps(DoFn):
            def process(self, element, timestamp=DoFn.TimestampParam):
                yield element[0], TimestampedValue(element[1], timestamp)

        class RestoreTimestamps(DoFn):
            def process(self, element, window=DoFn.WindowParam):
                # Pass the current window since _IdentityWindowFn wouldn't know how
                # to generate it.
                yield windowed_value.WindowedValue(
                    (element[0], element[1].value), element[1].timestamp,
                    [window])

        windowing_saved = pcoll.windowing
        # The linter is confused.
        # pylint: disable=abstract-class-instantiated
        result = (
            pcoll
            | ParDo(ReifyTimestamps())
            | 'IdentityWindow' >> WindowInto(
                _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
                trigger=AfterCount(1),
                accumulation_mode=AccumulationMode.DISCARDING,
                timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
            )
            | GroupByKey()
            | 'ExpandIterable' >> FlatMap(lambda e: [(e[0], value)
                                                     for value in e[1]])
            | ParDo(RestoreTimestamps()))
        result._windowing = windowing_saved
        return result
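
This expand reifies each element's timestamp into the value, groups under an identity window, then restores the timestamps, so a GroupByKey can redistribute elements without disturbing windowing. A hedged sketch of how such a reshuffle is normally reached from user code, assuming this is the internals of a ReshufflePerKey-style transform:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('k', 1), ('k', 2), ('j', 3)])
        | beam.Reshuffle()     # redistributes elements; timestamps and
        | beam.Map(print))     # windowing are preserved internally
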
Example #3
    def expand(self, pcoll):
        windowing_saved = pcoll.windowing
        if windowing_saved.is_default():
            # In this (common) case we can use a trivial trigger driver
            # and avoid the (expensive) window param.
            globally_windowed = window.GlobalWindows.windowed_value(None)
            MIN_TIMESTAMP = window.MIN_TIMESTAMP

            def reify_timestamps(element, timestamp=DoFn.TimestampParam):
                key, value = element
                if timestamp == MIN_TIMESTAMP:
                    timestamp = None
                return key, (value, timestamp)

            def restore_timestamps(element):
                key, values = element
                return [
                    globally_windowed.with_value((key, value)) if
                    timestamp is None else window.GlobalWindows.windowed_value(
                        (key, value), timestamp)
                    for (value, timestamp) in values
                ]
        else:

            # typing: All conditional function variants must have identical signatures
            def reify_timestamps(  # type: ignore[misc]
                    element,
                    timestamp=DoFn.TimestampParam,
                    window=DoFn.WindowParam):
                key, value = element
                # Transport the window as part of the value and restore it later.
                return key, windowed_value.WindowedValue(
                    value, timestamp, [window])

            def restore_timestamps(element):
                key, windowed_values = element
                return [
                    wv.with_value((key, wv.value)) for wv in windowed_values
                ]

        ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

        # TODO(BEAM-8104) Using the global window as one of the standard windows.
        # This is to mitigate the Dataflow Java Runner Harness limitation to
        # accept only standard coders.
        ungrouped._windowing = Windowing(
            window.GlobalWindows(),
            triggerfn=Always(),
            accumulation_mode=AccumulationMode.DISCARDING,
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        result = (ungrouped
                  | GroupByKey()
                  | FlatMap(restore_timestamps).with_output_types(Any))
        result._windowing = windowing_saved
        return result
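
The key trick above is carrying the timestamp (and, in the non-default-windowing branch, the window) inside the value across the GroupByKey. A small, self-contained illustration of that reify step, using made-up data:

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

def reify(kv, ts=beam.DoFn.TimestampParam):
    key, value = kv
    return key, (value, ts)

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('k', 'a'), ('k', 'b')])
        | beam.Map(lambda kv: TimestampedValue(kv, 10))  # stamp elements at t=10
        | beam.Map(reify)
        | beam.Map(print))   # ('k', ('a', Timestamp(10))), ('k', ('b', Timestamp(10)))
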
Example #4
  def expand(self, pcoll):
    windowing_saved = pcoll.windowing
    if windowing_saved.is_default():
      # In this (common) case we can use a trivial trigger driver
      # and avoid the (expensive) window param.
      globally_windowed = window.GlobalWindows.windowed_value(None)
      window_fn = window.GlobalWindows()
      MIN_TIMESTAMP = window.MIN_TIMESTAMP

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        if timestamp == MIN_TIMESTAMP:
          timestamp = None
        return key, (value, timestamp)

      def restore_timestamps(element):
        key, values = element
        return [
            globally_windowed.with_value((key, value))
            if timestamp is None
            else window.GlobalWindows.windowed_value((key, value), timestamp)
            for (value, timestamp) in values]

    else:
      # The linter is confused.
      # hash(1) is used to force "runtime" selection of _IdentityWindowFn
      # pylint: disable=abstract-class-instantiated
      cls = hash(1) and _IdentityWindowFn
      window_fn = cls(
          windowing_saved.windowfn.get_window_coder())

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        return key, TimestampedValue(value, timestamp)

      def restore_timestamps(element, window=DoFn.WindowParam):
        # Pass the current window since _IdentityWindowFn wouldn't know how
        # to generate it.
        key, values = element
        return [
            windowed_value.WindowedValue(
                (key, value.value), value.timestamp, [window])
            for value in values]

    ungrouped = pcoll | Map(reify_timestamps)
    ungrouped._windowing = Windowing(
        window_fn,
        triggerfn=AfterCount(1),
        accumulation_mode=AccumulationMode.DISCARDING,
        timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
    result = (ungrouped
              | GroupByKey()
              | FlatMap(restore_timestamps))
    result._windowing = windowing_saved
    return result
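
The `cls = hash(1) and _IdentityWindowFn` line exists only to defeat pylint's abstract-class-instantiated warning: `hash(1)` is truthy, so the expression evaluates to the class itself while the choice looks "dynamic" to the linter. A tiny, self-contained illustration with a made-up class:

class _Widget(object):
    pass

cls = hash(1) and _Widget   # hash(1) is truthy, so `and` yields _Widget
assert cls is _Widget
instance = cls()            # instantiation the linter cannot flag statically
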
Example #5
    def expand(self, pvalue):
        keyed_pc = (pvalue | 'AssignKey' >> Map(lambda x: (uuid.uuid4(), x)))
        if keyed_pc.windowing.windowfn.is_merging():
            raise ValueError(
                'Transform ReadAllFiles cannot be used in the presence '
                'of merging windows')
        if not isinstance(keyed_pc.windowing.triggerfn, DefaultTrigger):
            raise ValueError(
                'Transform ReadAllFiles cannot be used in the presence '
                'of non-trivial triggers')

        return (keyed_pc | 'GroupByKey' >> GroupByKey()
                # Using FlatMap below due to the possibility of key collisions.
                | 'DropKey' >> FlatMap(lambda k_values: k_values[1]))
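
The transform above redistributes elements by attaching a random key, grouping, and dropping the key again. A self-contained sketch of the same pattern with plain Beam primitives (the data is illustrative):

import uuid
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['file-1', 'file-2', 'file-3'])
        | 'AssignKey' >> beam.Map(lambda x: (str(uuid.uuid4()), x))
        | 'GroupByKey' >> beam.GroupByKey()
        | 'DropKey' >> beam.FlatMap(lambda kv: kv[1])   # FlatMap in case of key collisions
        | beam.Map(print))
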
Example #6
    def all_matches(pcoll, regex):
        """
    Returns all matches (groups) if zero or more characters at the beginning
    of string match the regular expression.

    Args:
      regex: the regular expression string or (re.compile) pattern.
    """
        regex = Regex._regex_compile(regex)

        def _process(element):
            m = regex.match(element)
            if m:
                yield [m.group(ix) for ix in range(m.lastindex + 1)]

        return pcoll | FlatMap(_process)
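
Usage sketch (made-up pattern and data); each matching line yields a list of groups 0..n:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['x=1;rest', 'y=2'])
        | beam.Regex.all_matches(r'(\w+)=(\d+)')
        | beam.Map(print))   # ['x=1', 'x', '1'] then ['y=2', 'y', '2']
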
Example #7
  def find(pcoll, regex, group=0):
    """
    Returns the matches if a portion of the line matches the regex. Returns
    the entire group (group 0 by default). Group can be an integer or a
    string value.

    Args:
      regex: the regular expression string or (re.compile) pattern.
      group: (optional) name of the group, it can be integer or a string value.
    """
    regex = Regex._regex_compile(regex)

    def _process(element):
      r = regex.search(element)
      if r:
        yield r.group(group)
    return pcoll | FlatMap(_process)
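
Usage sketch (illustrative strings and pattern); lines without a match produce no output:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['order id: 42', 'no digits here'])
        | beam.Regex.find(r'\d+')
        | beam.Map(print))   # prints '42' only
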
Example #8
  def matches_kv(pcoll, regex, keyGroup, valueGroup=0):
    """
    Returns the KV pairs if the string matches the regular expression, deriving
    the key & value from the specified group of the regular expression.

    Args:
      regex: the regular expression string or (re.compile) pattern.
      keyGroup: The Regex group to use as the key. Can be int or str.
      valueGroup: (optional) Regex group to use as the value. Can be int or
        str. The default value "0" returns the entire matched string.
    """
    regex = Regex._regex_compile(regex)

    def _process(element):
      match = regex.match(element)
      if match:
        yield (match.group(keyGroup), match.group(valueGroup))
    return pcoll | FlatMap(_process)
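
Usage sketch with named groups (hypothetical data and pattern):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['apple=3', 'pear=7'])
        | beam.Regex.matches_kv(r'(?P<fruit>\w+)=(?P<qty>\d+)', 'fruit', 'qty')
        | beam.Map(print))   # ('apple', '3') then ('pear', '7')
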
Example #9
    def split(pcoll, regex, outputEmpty=False):
        """
    Returns the list string which was splitted on the basis of regular
    expression. It will not output empty items (by defaults).

    Args:
      regex: the regular expression string or (re.compile) pattern.
      outputEmpty: (optional) Should empty be output. True to output empties
          and false if not.
    """
        regex = Regex._regex_compile(regex)
        outputEmpty = bool(outputEmpty)

        def _process(element):
            r = regex.split(element)
            if r and not outputEmpty:
                r = list(filter(None, r))
            yield r

        return pcoll | FlatMap(_process)
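
Usage sketch (illustrative); with the default outputEmpty=False, the empty item between the two commas is dropped:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['a,,b,c'])
        | beam.Regex.split(r',')
        | beam.Map(print))   # ['a', 'b', 'c']
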
Example #10
    def find_kv(pcoll, regex, keyGroup, valueGroup=0):
        """
    Returns the matches if a portion of the line matches the Regex. Returns the
    specified groups as the key and value pair.

    Args:
      regex: the regular expression string or (re.compile) pattern.
      keyGroup: The Regex group to use as the key. Can be int or str.
      valueGroup: (optional) Regex group to use the value. Can be int or str.
        The default value "0" returns entire matched string.
    """
        regex = Regex._regex_compile(regex)

        def _process(element):
            matches = regex.finditer(element)
            if matches:
                for match in matches:
                    yield (match.group(keyGroup), match.group(valueGroup))

        return pcoll | FlatMap(_process)
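
Usage sketch (made-up data); one key/value pair is emitted per match found in the line:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['a=1 b=2'])
        | beam.Regex.find_kv(r'(\w+)=(\d+)', 1, 2)
        | beam.Map(print))   # ('a', '1') then ('b', '2')
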
Example #11
  def matches(pcoll, regex, group=0):
    """
    Returns the matches (group 0 by default) if zero or more characters at the
    beginning of string match the regular expression. To match the entire
    string, add "$" sign at the end of regex expression.

    Group can be integer value or a string value.

    Args:
      regex: the regular expression string or (re.compile) pattern.
      group: (optional) name/number of the group, it can be integer or a string
        value. Defaults to 0, meaning the entire matched string will be
        returned.
    """
    regex = Regex._regex_compile(regex)

    def _process(element):
      m = regex.match(element)
      if m:
        yield m.group(group)
    return pcoll | FlatMap(_process)
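
Usage sketch (illustrative); the trailing "$" anchors the pattern so only full-line matches are kept:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['cat123', 'dog'])
        | beam.Regex.matches(r'[a-z]+\d+$')
        | beam.Map(print))   # 'cat123' only
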
Example #12
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=range-builtin-not-iterating
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=range-builtin-not-iterating
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        fastavro_output = '/'.join([self.output, 'fastavro'])
        avro_output = '/'.join([self.output, 'avro'])

        self.addCleanup(delete_files, [self.output + '*'])

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            self.SCHEMA,
            use_fastavro=True
        )

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_avro' >> WriteToAvro(
            avro_output,
            self.SCHEMA,
            use_fastavro=False
        )

        result = self.test_pipeline.run()
        result.wait_until_finish()
        assert result.state == PipelineState.DONE

        fastavro_read_pipeline = TestPipeline(is_integration_test=True)

        fastavro_records = \
            fastavro_read_pipeline \
            | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
            | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
            | Map(lambda rec: (rec['number'], rec))

        avro_records = \
            fastavro_read_pipeline \
            | 'create-avro' >> Create(['%s*' % avro_output]) \
            | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
            | Map(lambda rec: (rec['number'], rec))

        def check(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(sorted(v.keys()), ['avro', 'fastavro'])
            avro_values = v['avro']
            fastavro_values = v['fastavro']
            assertEqual(avro_values, fastavro_values)
            assertEqual(len(avro_values), 1)

        # pylint: disable=expression-not-assigned
        {
            'avro': avro_records,
            'fastavro': fastavro_records
        } \
        | CoGroupByKey() \
        | Map(check)

        fastavro_read_pipeline.run().wait_until_finish()
        assert result.state == PipelineState.DONE
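
A condensed, hypothetical round-trip of the same write/read idea, stripped of the integration-test plumbing (the paths and schema below are placeholders; a recent SDK with fastavro is assumed):

import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro, WriteToAvro
from fastavro import parse_schema

schema = parse_schema({
    'namespace': 'example', 'type': 'record', 'name': 'Rec',
    'fields': [{'name': 'number', 'type': 'long'}],
})

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'number': i} for i in range(3)])
        | WriteToAvro('/tmp/avro_it/out', schema))

with beam.Pipeline() as p:
    _ = (
        p
        | ReadFromAvro('/tmp/avro_it/out*')
        | beam.Map(print))
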
Example #13
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000
        fastavro_output = '/'.join([self.output, 'fastavro'])

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=bad-option-value
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=bad-option-value
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            parse_schema(json.loads(self.SCHEMA_STRING)),
        )
        result = self.test_pipeline.run()
        result.wait_until_finish()
        fastavro_pcoll = self.test_pipeline \
                         | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
                         | 'read-fastavro' >> ReadAllFromAvro()

        mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(
            lambda x: (x['number'], x))
        mapped_record_pcoll = records_pcoll | "map_record" >> Map(
            lambda x: (x['number'], x))

        def validate_record(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
            record_pcoll_values = v['record_pcoll']
            fastavro_values = v['fastavro']
            assertEqual(record_pcoll_values, fastavro_values)
            assertEqual(len(record_pcoll_values), 1)

        {
            "record_pcoll": mapped_record_pcoll,
            "fastavro": mapped_fastavro_pcoll
        } | CoGroupByKey() | Map(validate_record)

        result = self.test_pipeline.run()
        result.wait_until_finish()

        self.addCleanup(delete_files, [self.output])
        assert result.state == PipelineState.DONE
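
The validation step above joins the written-and-read-back records against the originals with CoGroupByKey. A toy version of that comparison with in-memory data (names and values here are hypothetical):

import apache_beam as beam

def check(kv):
    key, grouped = kv
    if list(grouped['expected']) != list(grouped['actual']):
        raise AssertionError('Mismatch for key %r: %r' % (key, grouped))

with beam.Pipeline() as p:
    expected = p | 'expected' >> beam.Create([(1, 'a'), (2, 'b')])
    actual = p | 'actual' >> beam.Create([(1, 'a'), (2, 'b')])
    _ = ({'expected': expected, 'actual': actual}
         | beam.CoGroupByKey()
         | beam.Map(check))
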
Example #14
def main():
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.rows_test`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   test STRING NOT NULL,
   master STRING,
   bot STRING,
   properties STRING)
  PARTITION BY DATE(`timestamp`);
  """  # pylint: disable=pointless-string-statement
  bq_row_schema = {'fields': [
      {'name': 'revision', 'type': 'INT64', 'mode': 'REQUIRED'},
      {'name': 'value', 'type': 'FLOAT', 'mode': 'REQUIRED'},
      {'name': 'std_error', 'type': 'FLOAT', 'mode': 'NULLABLE'},
      {'name': 'timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
      {'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED'},
      {'name': 'master', 'type': 'STRING', 'mode': 'NULLABLE'},
      {'name': 'bot', 'type': 'STRING', 'mode': 'NULLABLE'},
      {'name': 'properties', 'type': 'STRING', 'mode': 'NULLABLE'},
  ]}
  def RowEntityToRowDict(entity):
    entities_read.inc()
    try:
      d = {
          'revision': entity.key.id,
          'value': FloatHack(entity['value']),
          'std_error': FloatHack(entity.get('error')),
          'timestamp': entity['timestamp'].isoformat(),
          'test': entity.key.parent.name,
      }
      # Add the expando properties as a JSON-encoded dict.
      properties = {}
      for key, value in entity.items():
        if key in d or key in ['parent_test', 'error']:
          # skip properties with dedicated columns.
          continue
        if isinstance(value, float):
          value = FloatHack(value)
        properties[key] = value
      d['properties'] = json.dumps(properties) if properties else None
      # Add columns derived from test: master, bot.
      test_path_parts = d['test'].split('/', 2)
      if len(test_path_parts) >= 3:
        d['master'] = test_path_parts[0]
        d['bot'] = test_path_parts[1]
      return [d]
    except KeyError:
      logging.getLogger().exception('Failed to convert Row')
      failed_entity_transforms.inc()
      return []

  row_query_params = dict(project=project, kind='Row')
  row_entities = (
      p
      | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
          row_query_params,
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          step=datetime.timedelta(minutes=5)))

  row_dicts = (
      row_entities | 'ConvertEntityToRow(Row)' >> FlatMap(RowEntityToRowDict))

  table_name = '{}:chromeperf_dashboard_data.rows{}'.format(
      project, bq_export_options.table_suffix)
  _ = row_dicts | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
      table_name, bq_row_schema)

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
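
RowEntityToRowDict follows a convert-or-drop pattern: return a one-element list on success, count the failure and return [] otherwise. A stripped-down sketch of that pattern with hypothetical field names:

import apache_beam as beam
from apache_beam.metrics import Metrics

failed_rows = Metrics.counter('main', 'failed_rows')

def to_row(raw):
    try:
        return [{'revision': int(raw['rev']), 'value': float(raw['val'])}]
    except (KeyError, ValueError):
        failed_rows.inc()
        return []                  # drop the bad record, keep the pipeline going

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'rev': '1', 'val': '2.5'}, {'rev': 'oops'}])
        | beam.FlatMap(to_row)
        | beam.Map(print))         # only the first record survives
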
Example #15
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')
    row_conflicts = Metrics.counter('main', 'row_conflicts')
    multiple_histograms_for_row = Metrics.counter(
        'main', 'multiple_histograms_for_row')
    orphaned_histogram = Metrics.counter('main', 'orphaned_histogram')

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_rows.<MASTER>`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   master STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING,
   test STRING NOT NULL,
   properties STRING,
   sample_values ARRAY<FLOAT64>)
  PARTITION BY DATE(`timestamp`)
  CLUSTER BY master, bot, measurement;
  """  # pylint: disable=pointless-string-statement
    bq_row_schema = {
        'fields': [
            {
                'name': 'revision',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'value',
                'type': 'FLOAT',
                'mode': 'REQUIRED'
            },
            {
                'name': 'std_error',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            },
            {
                'name': 'timestamp',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'master',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'properties',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'sample_values',
                'type': 'FLOAT',
                'mode': 'REPEATED'
            },
        ]
    }

    def RowEntityToRowDict(entity):
        entities_read.inc()
        try:
            d = {
                'revision': entity.key.id,
                'value': FloatHack(entity['value']),
                'std_error': FloatHack(entity.get('error')),
                'timestamp': entity['timestamp'].isoformat(),
                'test': entity.key.parent.name,
            }
            # Add the expando properties as a JSON-encoded dict.
            properties = {}
            for key, value in entity.items():
                if key in d or key in ['parent_test', 'error']:
                    # skip properties with dedicated columns.
                    continue
                if isinstance(value, float):
                    value = FloatHack(value)
                properties[key] = value
            d['properties'] = json.dumps(properties) if properties else None
            # Add columns derived from test: master, bot.
            test_path_parts = d['test'].split('/', 2)
            if len(test_path_parts) >= 3:
                d['master'] = test_path_parts[0]
                d['bot'] = test_path_parts[1]
                d['measurement'] = '/'.join(test_path_parts[2:])
            return [d]
        except KeyError:
            logging.getLogger().exception('Failed to convert Row')
            failed_entity_transforms.inc()
            return []

    row_query_params = dict(project=project, kind='Row')
    row_entities = (
        p
        | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
            row_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    row_dicts = (row_entities
                 | 'ConvertEntityToDict(Row)' >> FlatMap(RowEntityToRowDict))

    # The sample_values are not found in the Row entity.  So we have to fetch all
    # the corresponding Histogram entities and join them with our collection of
    # Rows (by using test + revision as the join key).  We also need to unpack the
    # sample values arrays out of the zlib-compressed JSON stored in the
    # Histogram's "data" property.
    def HistogramEntityToDict(entity):
        """Returns dicts with keys: 'test', 'revision', 'sample_values'."""
        entities_read.inc()
        try:
            data = entity['data']
        except KeyError:
            logging.getLogger().exception('Histogram missing "data" field')
            failed_entity_transforms.inc()
            return []
        try:
            json_str = zlib.decompress(data)
        except zlib.error:
            logging.getLogger().exception('Histogram data not valid zlib: %r',
                                          data)
            failed_entity_transforms.inc()
            return []
        try:
            data_dict = json.loads(json_str)
        except json.JSONDecodeError:
            logging.getLogger().exception('Histogram data not valid json.')
            failed_entity_transforms.inc()
            return []
        sample_values = data_dict.get('sampleValues', [])
        if not isinstance(sample_values, list):
            logging.getLogger().exception(
                'Histogram data.sampleValues not valid list.')
            failed_entity_transforms.inc()
            return []
        count = len(sample_values)
        sample_values = [v for v in sample_values if v is not None]
        if len(sample_values) != count:
            logging.getLogger().warn(
                'Histogram data.sampleValues contains null: %r', entity.key)
        for v in sample_values:
            if not isinstance(v, (int, float)):
                logging.getLogger().exception(
                    'Histogram data.sampleValues contains non-numeric: %r', v)
                failed_entity_transforms.inc()
                return []
        try:
            return [{
                'test': entity['test'].name,
                'revision': entity['revision'],
                'sample_values': sample_values,
            }]
        except KeyError:
            logging.getLogger().exception(
                'Histogram missing test or revision field/s')
            failed_entity_transforms.inc()
            return []

    histogram_query_params = dict(project=project, kind='Histogram')
    histogram_entities = (
        p
        | 'ReadFromDatastore(Histogram)' >> ReadTimestampRangeFromDatastore(
            histogram_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    histogram_dicts = (
        histogram_entities
        | 'ConvertEntityToDict(Histogram)' >> FlatMap(HistogramEntityToDict))

    def TestRevision(element):
        return (element['test'], element['revision'])

    rows_with_key = (row_dicts
                     | 'WithKeys(Row)' >> beam.WithKeys(TestRevision))
    histograms_with_key = (
        histogram_dicts | 'WithKeys(Histogram)' >> beam.WithKeys(TestRevision))

    def MergeRowAndSampleValues(element):
        group_key, join_values = element
        rows, histograms = join_values
        if len(rows) == 0:
            orphaned_histogram.inc()
            logging.getLogger().error("No Row for Histogram(s) (%r)",
                                      group_key)
            return []
        elif len(rows) > 1:
            row_conflicts.inc()
            logging.getLogger().error("Multiple rows (%d) for %r", len(rows),
                                      group_key)
            return rows
        row = rows[0]
        if len(histograms) > 1:
            # We'll merge these, so this isn't an error.
            multiple_histograms_for_row.inc()
        elif len(histograms) == 0:
            # No sample values to annotate the row with.  This is common.
            return [row]
        # Merge multiple histogram's values into a single row.
        row['sample_values'] = list(
            itertools.chain.from_iterable(h['sample_values']
                                          for h in histograms))
        return [row]

    joined_and_annotated = ((rows_with_key, histograms_with_key)
                            | beam.CoGroupByKey()
                            | beam.FlatMap(MergeRowAndSampleValues))

    def TableNameFn(unused_element):
        return '{project}:{dataset}.rows{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (joined_and_annotated
         | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
             TableNameFn,
             bq_row_schema,
             additional_bq_parameters={
                 'clustering': {
                     'fields': ['master', 'bot', 'measurement']
                 }
             }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
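
The join above keys both Row dicts and Histogram dicts by (test, revision), CoGroupByKeys them, and folds the histograms' sample values into the row. A toy rendering of that merge with made-up in-memory data instead of Datastore entities:

import apache_beam as beam

def merge(element):
    (test, revision), (rows, hists) = element
    if not rows:
        return []                                   # orphaned histogram
    row = dict(rows[0])
    row['sample_values'] = [v for h in hists for v in h['sample_values']]
    return [row]

with beam.Pipeline() as p:
    rows = p | 'rows' >> beam.Create(
        [(('suite/test', 1), {'test': 'suite/test', 'revision': 1, 'value': 2.0})])
    hists = p | 'hists' >> beam.Create(
        [(('suite/test', 1), {'sample_values': [1.0, 3.0]})])
    _ = ((rows, hists)
         | beam.CoGroupByKey()
         | beam.FlatMap(merge)
         | beam.Map(print))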