Example #1
 def test_topk_empty(self):
   examples = []
   expected_result = []
   generator = top_k_stats_generator.TopKStatsGenerator(
       num_top_values=4, num_rank_histogram_buckets=3)
   self.assertSlicingAwareTransformOutputEqual(examples, generator,
                                               expected_result)
Example #2
 def test_single_string_feature_manual(self):
     # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
     batches = [{
         'fa':
         np.array([
             np.array(['a', 'b', 'c', 'e']),
             np.array(['a', 'c', 'd', 'a'])
         ],
                  dtype=np.object)
     }, {
         'fa':
         np.array([np.array(['a', 'b', 'c', 'd'])], dtype=np.object)
     }, {}]
     expected_result = text_format.Parse(
         """
   features {
     name: 'fa'
     type: STRING
     string_stats {
       top_values {
         value: 'a'
         frequency: 4
       }
       top_values {
         value: 'c'
         frequency: 3
       }
       top_values {
         value: 'd'
         frequency: 2
       }
       top_values {
         value: 'b'
         frequency: 2
       }
       rank_histogram {
         buckets {
           low_rank: 0
           high_rank: 0
           label: "a"
           sample_count: 4.0
         }
         buckets {
           low_rank: 1
           high_rank: 1
           label: "c"
           sample_count: 3.0
         }
         buckets {
           low_rank: 2
           high_rank: 2
           label: "d"
           sample_count: 2.0
         }
       }
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     generator = top_k_stats_generator.TopKStatsGenerator(
         num_top_values=4, num_rank_histogram_buckets=3)
     self.assertTransformOutputEqual(batches, generator, [expected_result])
Example #3
 def test_with_categorical_feature(self):
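      # fa (categorical int): 4 '12', 2 '23', 2 '34', 2 '45'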
     batches = [{
         'fa':
         np.array([np.array([12, 23, 34, 12]),
                   np.array([45, 23])])
     }, {
         'fa': np.array([np.array([12, 12, 34, 45])])
     }]
     expected_result_fa = text_format.Parse(
         """
   features {
     name: 'fa'
     type: INT
     string_stats {
       top_values {
         value: '12'
         frequency: 4
       }
       top_values {
         value: '45'
         frequency: 2
       }
       rank_histogram {
         buckets {
           low_rank: 0
           high_rank: 0
           label: "12"
           sample_count: 4.0
         }
         buckets {
           low_rank: 1
           high_rank: 1
           label: "45"
           sample_count: 2.0
         }
         buckets {
           low_rank: 2
           high_rank: 2
           label: "34"
           sample_count: 2.0
         }
       }
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     schema = text_format.Parse(
         """
     feature {
       name: "fa"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = top_k_stats_generator.TopKStatsGenerator(
         schema=schema, num_top_values=2, num_rank_histogram_buckets=3)
     self.assertTransformOutputEqual(batches, generator,
                                     [expected_result_fa])
Example #4
  def test_topk_with_numeric_feature(self):
    # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
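    # fb is numeric, so only fa produces top-k string statistics below.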
    examples = [{'fa': np.array(['a', 'b', 'c', 'e']),
                 'fb': np.array([1.0, 2.0, 3.0])},
                {'fa': None,
                 'fb': np.array([4.0, 5.0])},
                {'fa': np.array(['a', 'c', 'd']),
                 'fb': None},
                {'fa': np.array(['a', 'a', 'b', 'c', 'd']),
                 'fb': None}]

    expected_result = [
        text_format.Parse(
            """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
    }""", statistics_pb2.DatasetFeatureStatistics())
    ]

    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=2, num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        examples,
        generator,
        expected_result,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
Example #5
  def test_topk_with_single_string_feature(self):
    # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
    examples = [{'fa': np.array(['a', 'b', 'c', 'e'])},
                {'fa': np.array(['a', 'c', 'd', 'a'])},
                {'fa': np.array(['a', 'b', 'c', 'd'])}]

    # Note that if two feature values have the same frequency, the one with the
    # lexicographically larger feature value will be higher in the order.
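    # Here 'b' and 'd' both occur twice, so 'd' is ranked above 'b'.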
    expected_result = text_format.Parse(
        """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatistics())
    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=4, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(examples, generator, [expected_result])
Example #6
 def test_topk_with_empty_dict(self):
   examples = [{}]
   expected_result = []
   generator = top_k_stats_generator.TopKStatsGenerator(
       num_top_values=4, num_rank_histogram_buckets=3)
   self.assertSlicingAwareTransformOutputEqual(
       examples,
       generator,
       expected_result,
       add_default_slice_key_to_input=True,
       add_default_slice_key_to_output=True)
Example #7
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile and then batch input examples.
        batched_dataset = (dataset
                           | 'Profile' >> profile_util.Profile()
                           | 'BatchInputs' >> batch_util.BatchExamples())

        # If a set of whitelist features are provided, keep only those features.
        filtered_dataset = batched_dataset
        if self._options.feature_whitelist:
            filtered_dataset = (
                batched_dataset | 'RemoveNonWhitelistedFeatures' >> beam.Map(
                    _filter_features,
                    feature_whitelist=self._options.feature_whitelist))

        return (filtered_dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
Example #8
def _get_default_generators(options, in_memory=False):
  """Initialize default list of stats generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=\
            options.num_quantiles_histogram_buckets,
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(
          schema=options.schema)
  ]
  if in_memory:
    stats_generators.append(
        top_k_uniques_combiner_stats_generator.
        TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
  else:
    stats_generators.extend([
        top_k_stats_generator.TopKStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets),
        uniques_stats_generator.UniquesStatsGenerator(schema=options.schema)
    ])
  return stats_generators
Example #9
  def test_topk_with_invalid_utf8_value(self):
    # fa: 3 'a', 2 invalid-UTF-8 bytes values (reported as '__BYTES_VALUE__')
    examples = [{'fa': np.array(['a', b'\x80abc', 'a', b'\x80abc', 'a'],
                                dtype=np.object)}]
    expected_result = [
        text_format.Parse(
            """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: '__BYTES_VALUE__'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "__BYTES_VALUE__"
              sample_count: 2.0
            }
          }
        }
    }""", statistics_pb2.DatasetFeatureStatistics())
    ]

    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=4, num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        examples,
        generator,
        expected_result,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
Example #10
 def test_with_empty_list(self):
     batches = []
     expected_result = []
     generator = top_k_stats_generator.TopKStatsGenerator(
         num_top_values=4, num_rank_histogram_buckets=3)
     self.assertTransformOutputEqual(batches, generator, expected_result)
Example #11
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile the input examples.
        dataset |= 'ProfileExamples' >> profile_util.Profile()

        # Sample input data if sample_count option is provided.
        if self._options.sample_count is not None:
            # beam.combiners.Sample.FixedSizeGlobally returns a
            # PCollection[List[types.Example]], which we then flatten to get a
            # PCollection[types.Example].
            dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
                        beam.combiners.Sample.FixedSizeGlobally(
                            self._options.sample_count)
                        | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
        elif self._options.sample_rate is not None:
            dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate
                        >> beam.FlatMap(_sample_at_rate,
                                        sample_rate=self._options.sample_rate))

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features are provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        return (dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
Example #12
 def test_topk_with_weights(self):
   # non-weighted ordering
   # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
   # weighted ordering
   # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
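   # (weighted counts: 'a' = 3 * 5 = 15, 'b' = 1 * 5 = 5, 'c' = 2 * 5 = 10,
   #  'd' = 5 + 15 = 20, 'e' = 5 + 15 = 20)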
   batches = [{'fa': np.array([np.array(['a', 'b', 'c', 'e']),
                               np.array(['a', 'c', 'd', 'a'])],
                              dtype=np.object),
               'w': np.array([np.array([5.0]), np.array([5.0])])},
              {'fa': np.array([np.array(['d', 'e'])], dtype=np.object),
               'w': np.array([np.array([15.0])])}]
   expected_result = [
       text_format.Parse(
           """
           features {
             name: 'fa'
             type: STRING
             string_stats {
               top_values {
                 value: 'a'
                 frequency: 3.0
               }
               top_values {
                 value: 'e'
                 frequency: 2.0
               }
               top_values {
                 value: 'd'
                 frequency: 2.0
               }
               top_values {
                 value: 'c'
                 frequency: 2.0
               }
               rank_histogram {
                 buckets {
                   low_rank: 0
                   high_rank: 0
                   label: "a"
                   sample_count: 3.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "e"
                   sample_count: 2.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "d"
                   sample_count: 2.0
                 }
               }
             }
           }""", statistics_pb2.DatasetFeatureStatistics()),
       text_format.Parse(
           """
           features {
             name: 'fa'
             type: STRING
             string_stats {
               weighted_string_stats {
                 top_values {
                   value: 'e'
                   frequency: 20.0
                 }
                 top_values {
                   value: 'd'
                   frequency: 20.0
                 }
                 top_values {
                   value: 'a'
                   frequency: 15.0
                 }
                 top_values {
                   value: 'c'
                   frequency: 10.0
                 }
                 rank_histogram {
                   buckets {
                     low_rank: 0
                     high_rank: 0
                     label: "e"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 1
                     high_rank: 1
                     label: "d"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 2
                     high_rank: 2
                     label: "a"
                     sample_count: 15.0
                   }
                 }
               }
             }
       }""", statistics_pb2.DatasetFeatureStatistics())]
   generator = top_k_stats_generator.TopKStatsGenerator(
       weight_feature='w',
       num_top_values=4, num_rank_histogram_buckets=3)
   self.assertTransformOutputEqual(batches, generator, expected_result)
Example #13
  def test_topk_with_slicing(self):
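    # slice1 fa: 3 'a', 2 'c', 1 'b', 1 'd', 1 'e'; slice1 fb: 2 '1', 1 '0'
    # slice2 fa: 3 'b', 2 'e', 1 'a', 1 'c', 1 'd'; slice2 fb: 2 '0', 1 '1'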
    examples = [('slice1', {
        'fa': np.array(['a', 'b', 'c', 'e']),
        'fb': np.array(['1', '1', '0'])
    }),
                ('slice2', {
                    'fa': np.array(['b', 'a', 'e', 'c']),
                    'fb': np.array(['0', '0', '1'])
                }),
                ('slice1', {
                    'fa': np.array(['a', 'c', 'd', 'a']),
                    'fb': None
                }),
                ('slice2', {
                    'fa': np.array(['b', 'e', 'd', 'b']),
                    'fb': None
                })]

    # Note that if two feature values have the same frequency, the one with the
    # lexicographically larger feature value will be higher in the order.
    expected_result = [('slice1',
                        text_format.Parse(
                            """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: 'c'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 2.0
            }
          }
        }
      }
    """, statistics_pb2.DatasetFeatureStatistics())),
                       ('slice1',
                        text_format.Parse(
                            """
      features {
        name: 'fb'
        type: STRING
        string_stats {
          top_values {
            value: '1'
            frequency: 2
          }
          top_values {
            value: '0'
            frequency: 1
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "1"
              sample_count: 2.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "0"
              sample_count: 1.0
            }
          }
        }
      }
    """, statistics_pb2.DatasetFeatureStatistics())),
                       ('slice2',
                        text_format.Parse(
                            """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'b'
            frequency: 3
          }
          top_values {
            value: 'e'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "b"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "e"
              sample_count: 2.0
            }
          }
        }
      }
    """, statistics_pb2.DatasetFeatureStatistics())),
                       ('slice2',
                        text_format.Parse(
                            """
      features {
        name: 'fb'
        type: STRING
        string_stats {
          top_values {
            value: '0'
            frequency: 2
          }
          top_values {
            value: '1'
            frequency: 1
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "0"
              sample_count: 2.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "1"
              sample_count: 1.0
            }
          }
        }
      }
    """, statistics_pb2.DatasetFeatureStatistics()))]

    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=2, num_rank_histogram_buckets=2)
    self.assertSlicingAwareTransformOutputEqual(examples, generator,
                                                expected_result)
Example #14
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features are provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        result_protos = []
        # Iterate over the stats generators. For each generator,
        #   a) if it is a CombinerStatsGenerator, wrap it as a beam.CombineFn
        #      and run it.
        #   b) if it is a TransformStatsGenerator, wrap it as a beam.PTransform
        #      and run it.
        for generator in stats_generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> beam.CombineGlobally(
                                         _CombineFnWrapper(generator)))
            elif isinstance(generator,
                            stats_generator.TransformStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> generator.ptransform)
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % generator.__class__.__name__)

        # Each stats generator will output a PCollection of DatasetFeatureStatistics
        # protos. We now flatten the list of PCollections into a single PCollection,
        # then merge the DatasetFeatureStatistics protos in the PCollection into a
        # single DatasetFeatureStatisticsList proto.
        return (result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
                | 'MergeDatasetFeatureStatisticsProtos' >>
                beam.CombineGlobally(_merge_dataset_feature_stats_protos)
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
Example #15
 def test_topk_with_missing_feature(self):
   # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
   # fb: 1 'a', 1 'b', 2 'c'
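   # Examples where a feature is None or absent contribute nothing to its stats.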
   examples = [{'fa': np.array(['a', 'b', 'c', 'e']),
                'fb': np.array(['a', 'c', 'c'])},
               {'fa': None,
                'fb': np.array(['b'])},
               {'fa': np.array(['a', 'c', 'd']),
                'fb': None},
               {'fa': np.array(['a', 'a', 'b', 'c', 'd'])},
               {'fa': None}]
   expected_result_fa = text_format.Parse(
       """
     features {
       name: 'fa'
       type: STRING
       string_stats {
         top_values {
           value: 'a'
           frequency: 4
         }
         top_values {
           value: 'c'
           frequency: 3
         }
         top_values {
           value: 'd'
           frequency: 2
         }
         top_values {
           value: 'b'
           frequency: 2
         }
         rank_histogram {
           buckets {
             low_rank: 0
             high_rank: 0
             label: "a"
             sample_count: 4.0
           }
           buckets {
             low_rank: 1
             high_rank: 1
             label: "c"
             sample_count: 3.0
           }
           buckets {
             low_rank: 2
             high_rank: 2
             label: "d"
             sample_count: 2.0
           }
         }
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   expected_result_fb = text_format.Parse(
       """
     features {
       name: 'fb'
       type: STRING
       string_stats {
         top_values {
           value: 'c'
           frequency: 2
         }
         top_values {
           value: 'b'
           frequency: 1
         }
         top_values {
           value: 'a'
           frequency: 1
         }
         rank_histogram {
           buckets {
             low_rank: 0
             high_rank: 0
             label: "c"
             sample_count: 2.0
           }
           buckets {
             low_rank: 1
             high_rank: 1
             label: "b"
             sample_count: 1.0
           }
           buckets {
             low_rank: 2
             high_rank: 2
             label: "a"
             sample_count: 1.0
           }
         }
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   generator = top_k_stats_generator.TopKStatsGenerator(
       num_top_values=4, num_rank_histogram_buckets=3)
   self.assertTransformOutputEqual(examples, generator,
                                   [expected_result_fa, expected_result_fb])