示例#1
0
    def expand(self, dataset):
        """Wires the built-in and custom stats generators onto `dataset`.

        The input is profiled and batched, optionally reduced to the
        whitelisted features, and then handed to
        `stats_impl.GenerateStatisticsImpl` together with the generators.

        Args:
          dataset: The input the 'Profile' transform is applied to.
            NOTE(review): presumably a PCollection of examples -- confirm
            against the caller.

        Returns:
          The result of applying `stats_impl.GenerateStatisticsImpl` to the
          batched (and possibly filtered) input.
        """
        opts = self._options

        # Built-in generators, each configured from the stats options.
        common = common_stats_generator.CommonStatsGenerator(
            schema=opts.schema,
            num_values_histogram_buckets=opts.num_values_histogram_buckets,
            epsilon=opts.epsilon)
        numeric = numeric_stats_generator.NumericStatsGenerator(
            schema=opts.schema,
            num_histogram_buckets=opts.num_histogram_buckets,
            num_quantiles_histogram_buckets=(
                opts.num_quantiles_histogram_buckets),
            epsilon=opts.epsilon)
        strings = string_stats_generator.StringStatsGenerator(
            schema=opts.schema)
        top_k = top_k_stats_generator.TopKStatsGenerator(
            schema=opts.schema,
            num_top_values=opts.num_top_values,
            num_rank_histogram_buckets=opts.num_rank_histogram_buckets)
        uniques = uniques_stats_generator.UniquesStatsGenerator(
            schema=opts.schema)
        stats_generators = [common, numeric, strings, top_k, uniques]

        # Custom generators, when supplied, are appended after the built-ins.
        custom_generators = opts.generators
        if custom_generators is not None:
            stats_generators.extend(custom_generators)

        # Profile first, then batch the profiled examples.
        profiled = dataset | 'Profile' >> profile_util.Profile()
        batched = profiled | 'BatchInputs' >> batch_util.BatchExamples()

        # A non-empty whitelist restricts every batch to the listed features.
        if opts.feature_whitelist:
            keep_whitelisted = 'RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features, feature_whitelist=opts.feature_whitelist)
            batched = batched | keep_whitelisted

        run_generators = ('RunStatsGenerators' >>
                          stats_impl.GenerateStatisticsImpl(stats_generators))
        return batched | run_generators
示例#2
0
def _get_default_generators(options, in_memory=False):
  """Builds the default stats generators for the given options.

  The common, numeric and string generators are always created. Top-k and
  uniques statistics come from a single combined generator when `in_memory`
  is truthy, and from two separate generators otherwise.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  common = common_stats_generator.CommonStatsGenerator(
      schema=options.schema,
      weight_feature=options.weight_feature,
      num_values_histogram_buckets=options.num_values_histogram_buckets,
      epsilon=options.epsilon)
  numeric = numeric_stats_generator.NumericStatsGenerator(
      schema=options.schema,
      weight_feature=options.weight_feature,
      num_histogram_buckets=options.num_histogram_buckets,
      num_quantiles_histogram_buckets=(
          options.num_quantiles_histogram_buckets),
      epsilon=options.epsilon)
  strings = string_stats_generator.StringStatsGenerator(
      schema=options.schema)

  if not in_memory:
    # Beam path: top-k and uniques are computed by separate generators.
    top_k = top_k_stats_generator.TopKStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_top_values=options.num_top_values,
        num_rank_histogram_buckets=options.num_rank_histogram_buckets)
    uniques = uniques_stats_generator.UniquesStatsGenerator(
        schema=options.schema)
    return [common, numeric, strings, top_k, uniques]

  # In-memory path: one combiner covers both top-k and uniques.
  top_k_uniques = (
      top_k_uniques_combiner_stats_generator.TopKUniquesCombinerStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_top_values=options.num_top_values,
          num_rank_histogram_buckets=options.num_rank_histogram_buckets))
  return [common, numeric, strings, top_k_uniques]
示例#3
0
 def test_numeric_stats_generator_categorical_feature(self):
     """Checks that a categorical INT feature produces an empty result.

     The schema declares feature 'a' as INT with `is_categorical: true`, and
     the expected combiner output for the two input batches is an empty dict.
     """
     # Input with two batches: first batch has two examples and second batch
     # has a single example.
     batches = [{
         # The two value lists have different lengths (2 and 3), so the outer
         # array must be built with dtype=object explicitly: NumPy >= 1.24
         # raises ValueError when asked to infer a ragged array, while older
         # versions produce the same object array either way.
         'a': np.array([np.array([1, 0]),
                        np.array([0, 1, 0])], dtype=object)
     }, {
         'a': np.array([np.array([1])])
     }]
     expected_result = {}
     schema = text_format.Parse(
         """
     feature {
       name: "a"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = numeric_stats_generator.NumericStatsGenerator(
         schema=schema)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
示例#4
0
    def expand(self, dataset):
        """Profiles, optionally samples, batches and summarises `dataset`.

        Stages, in order: profile the input; sample it by fixed count if
        `sample_count` is set, otherwise by rate if `sample_rate` is set;
        batch it; drop non-whitelisted features if a whitelist is given; run
        the built-in and custom stats generators.

        Args:
          dataset: The input the 'ProfileExamples' transform is applied to.
            NOTE(review): presumably a PCollection of examples -- confirm
            against the caller.

        Returns:
          The result of applying `stats_impl.GenerateStatisticsImpl`.
        """
        opts = self._options

        # Built-in generators, each configured from the stats options.
        common = common_stats_generator.CommonStatsGenerator(
            schema=opts.schema,
            num_values_histogram_buckets=opts.num_values_histogram_buckets,
            epsilon=opts.epsilon)
        numeric = numeric_stats_generator.NumericStatsGenerator(
            schema=opts.schema,
            num_histogram_buckets=opts.num_histogram_buckets,
            num_quantiles_histogram_buckets=(
                opts.num_quantiles_histogram_buckets),
            epsilon=opts.epsilon)
        strings = string_stats_generator.StringStatsGenerator(
            schema=opts.schema)
        top_k = top_k_stats_generator.TopKStatsGenerator(
            schema=opts.schema,
            num_top_values=opts.num_top_values,
            num_rank_histogram_buckets=opts.num_rank_histogram_buckets)
        uniques = uniques_stats_generator.UniquesStatsGenerator(
            schema=opts.schema)
        stats_generators = [common, numeric, strings, top_k, uniques]

        # Custom generators, when supplied, are appended after the built-ins.
        if opts.generators is not None:
            stats_generators.extend(opts.generators)

        dataset = dataset | 'ProfileExamples' >> profile_util.Profile()

        # Fixed-count sampling takes precedence over rate-based sampling.
        sample_count = opts.sample_count
        if sample_count is not None:
            # FixedSizeGlobally emits the sample as a list, so the FlatMap
            # un-nests it back into individual elements. The two steps stay
            # chained into one transform before being applied.
            take_sample = ('SampleExamples(%s)' % sample_count >>
                           beam.combiners.Sample.FixedSizeGlobally(
                               sample_count))
            unnest = 'FlattenExamples' >> beam.FlatMap(lambda lst: lst)
            dataset = dataset | (take_sample | unnest)
        elif opts.sample_rate is not None:
            sample_at_rate = (
                'SampleExamplesAtRate(%s)' % opts.sample_rate >>
                beam.FlatMap(_sample_at_rate, sample_rate=opts.sample_rate))
            dataset = dataset | sample_at_rate

        # The desired batch size is the sample count (None when unset).
        batcher = batch_util.BatchExamples(desired_batch_size=sample_count)
        dataset = dataset | 'BatchExamples' >> batcher

        # A non-empty whitelist restricts every batch to the listed features.
        if opts.feature_whitelist:
            keep_whitelisted = 'RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features, feature_whitelist=opts.feature_whitelist)
            dataset = dataset | keep_whitelisted

        run_generators = ('RunStatsGenerators' >>
                          stats_impl.GenerateStatisticsImpl(stats_generators))
        return dataset | run_generators
示例#5
0
 def test_numeric_stats_generator_invalid_value_type(self):
     """Expects TypeError for a batch built from a float and an int list."""
     stats_gen = numeric_stats_generator.NumericStatsGenerator()
     # Feature 'a' is assembled from one float value list and one int one.
     mixed_values = np.array([np.array([1.34]), np.array([12])])
     input_batches = [{'a': mixed_values}]
     with self.assertRaises(TypeError):
         self.assertCombinerOutputEqual(input_batches, stats_gen, None)
示例#6
0
 def test_numeric_stats_generator_empty_list(self):
     """With no input batches at all, the expected output is an empty dict."""
     stats_gen = numeric_stats_generator.NumericStatsGenerator()
     no_batches = []
     self.assertCombinerOutputEqual(no_batches, stats_gen, {})
示例#7
0
 def test_numeric_stats_generator_empty_batch(self):
     """A single batch whose feature 'a' is an empty array yields no stats."""
     stats_gen = numeric_stats_generator.NumericStatsGenerator()
     empty_feature_batch = {'a': np.array([])}
     self.assertCombinerOutputEqual([empty_feature_batch], stats_gen, {})
示例#8
0
 def test_numeric_stats_generator_with_missing_feature(self):
     """Checks numeric stats when a feature is absent from one batch.

     Feature 'a' appears in both batches while feature 'b' appears only in
     the second one. The generator is configured with 3 standard histogram
     buckets, 4 quantiles histogram buckets and epsilon=0.001, and the
     expected output holds statistics for both features.
     """
     # Input with two batches: first batch has two examples and second batch
     # has a single example. The first batch is missing feature 'b'.
     batches = [{
         'a':
         # The two value lists have different lengths (2 and 3), so the outer
         # array must be built with dtype=object explicitly: NumPy >= 1.24
         # raises ValueError when asked to infer a ragged array, while older
         # versions produce the same object array either way.
         np.array([np.array([1.0, 2.0]),
                   np.array([3.0, 4.0, 5.0])], dtype=object)
     }, {
         'a': np.array([np.array([1.0])]),
         'b': np.array([np.linspace(1, 3000, 3000, dtype=np.int32)])
     }]
     expected_result = {
         'a':
         text_format.Parse(
             """
         name: 'a'
         type: FLOAT
         num_stats {
           mean: 2.66666666
           std_dev: 1.49071198
           num_zeros: 0
           min: 1.0
           max: 5.0
           median: 3.0
           histograms {
             buckets {
               low_value: 1.0
               high_value: 2.3333333
               sample_count: 2.9866667
             }
             buckets {
               low_value: 2.3333333
               high_value: 3.6666667
               sample_count: 1.0066667
             }
             buckets {
               low_value: 3.6666667
               high_value: 5.0
               sample_count: 2.0066667
             }
             type: STANDARD
           }
           histograms {
             buckets {
               low_value: 1.0
               high_value: 1.0
               sample_count: 1.5
             }
             buckets {
               low_value: 1.0
               high_value: 3.0
               sample_count: 1.5
             }
             buckets {
               low_value: 3.0
               high_value: 4.0
               sample_count: 1.5
             }
             buckets {
               low_value: 4.0
               high_value: 5.0
               sample_count: 1.5
             }
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics()),
         'b':
         text_format.Parse(
             """
         name: 'b'
         type: INT
         num_stats {
           mean: 1500.5
           std_dev: 866.025355672
           min: 1.0
           max: 3000.0
           median: 1501.0
           histograms {
             buckets {
               low_value: 1.0
               high_value: 1000.66666667
               sample_count: 999.666666667
             }
             buckets {
               low_value: 1000.66666667
               high_value: 2000.33333333
               sample_count: 999.666666667
             }
             buckets {
               low_value: 2000.33333333
               high_value: 3000.0
               sample_count: 1000.66666667
             }
             type: STANDARD
           }
           histograms {
             buckets {
               low_value: 1.0
               high_value: 751.0
               sample_count: 750.0
             }
             buckets {
               low_value: 751.0
               high_value: 1501.0
               sample_count: 750.0
             }
             buckets {
               low_value: 1501.0
               high_value: 2250.0
               sample_count: 750.0
             }
             buckets {
               low_value: 2250.0
               high_value: 3000.0
               sample_count: 750.0
             }
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = numeric_stats_generator.NumericStatsGenerator(
         num_histogram_buckets=3,
         num_quantiles_histogram_buckets=4,
         epsilon=0.001)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
示例#9
0
 def test_numeric_stats_generator_single_feature(self):
     """Checks numeric stats for one float feature spread over two batches.

     The generator is configured with 3 standard histogram buckets and 4
     quantiles histogram buckets; the expected output holds the statistics
     for feature 'a' only.
     """
     # Input with two batches: first batch has two examples and second batch
     # has a single example.
     batches = [{
         'a':
         # The two value lists have different lengths (2 and 3), so the outer
         # array must be built with dtype=object explicitly: NumPy >= 1.24
         # raises ValueError when asked to infer a ragged array, while older
         # versions produce the same object array either way.
         np.array([np.array([1.0, 2.0]),
                   np.array([3.0, 4.0, 5.0])], dtype=object)
     }, {
         'a': np.array([np.array([1.0])])
     }]
     expected_result = {
         'a':
         text_format.Parse(
             """
         name: 'a'
         type: FLOAT
         num_stats {
           mean: 2.66666666
           std_dev: 1.49071198
           num_zeros: 0
           min: 1.0
           max: 5.0
           median: 3.0
           histograms {
             buckets {
               low_value: 1.0
               high_value: 2.3333333
               sample_count: 2.9866667
             }
             buckets {
               low_value: 2.3333333
               high_value: 3.6666667
               sample_count: 1.0066667
             }
             buckets {
               low_value: 3.6666667
               high_value: 5.0
               sample_count: 2.0066667
             }
             type: STANDARD
           }
           histograms {
             buckets {
               low_value: 1.0
               high_value: 1.0
               sample_count: 1.5
             }
             buckets {
               low_value: 1.0
               high_value: 3.0
               sample_count: 1.5
             }
             buckets {
               low_value: 3.0
               high_value: 4.0
               sample_count: 1.5
             }
             buckets {
               low_value: 4.0
               high_value: 5.0
               sample_count: 1.5
             }
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = numeric_stats_generator.NumericStatsGenerator(
         num_histogram_buckets=3, num_quantiles_histogram_buckets=4)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
示例#10
0
    def expand(self, dataset):
        """Batches `dataset` and runs every stats generator over it.

        Built-in generators are created from `self._options` and any custom
        generators are appended. The input is batched and optionally filtered
        to the whitelisted features, each generator is applied according to
        its kind, and the per-generator outputs are flattened and merged.

        Args:
          dataset: The input the 'BatchExamples' transform is applied to.
            NOTE(review): presumably a PCollection of examples -- confirm
            against the caller.

        Returns:
          The output of the final 'MakeDatasetFeatureStatisticsListProto'
          step.

        Raises:
          TypeError: If a generator is neither a CombinerStatsGenerator nor a
            TransformStatsGenerator.
        """
        opts = self._options

        # Built-in generators, each configured from the stats options.
        common = common_stats_generator.CommonStatsGenerator(
            schema=opts.schema,
            weight_feature=opts.weight_feature,
            num_values_histogram_buckets=opts.num_values_histogram_buckets,
            epsilon=opts.epsilon)
        numeric = numeric_stats_generator.NumericStatsGenerator(
            schema=opts.schema,
            weight_feature=opts.weight_feature,
            num_histogram_buckets=opts.num_histogram_buckets,
            num_quantiles_histogram_buckets=(
                opts.num_quantiles_histogram_buckets),
            epsilon=opts.epsilon)
        strings = string_stats_generator.StringStatsGenerator(
            schema=opts.schema)
        top_k = top_k_stats_generator.TopKStatsGenerator(
            schema=opts.schema,
            weight_feature=opts.weight_feature,
            num_top_values=opts.num_top_values,
            num_rank_histogram_buckets=opts.num_rank_histogram_buckets)
        uniques = uniques_stats_generator.UniquesStatsGenerator(
            schema=opts.schema)
        stats_generators = [common, numeric, strings, top_k, uniques]

        # Custom generators, when supplied, are appended after the built-ins.
        if opts.generators is not None:
            stats_generators.extend(opts.generators)

        # The desired batch size is the sample count (None when unset).
        batcher = batch_util.BatchExamples(
            desired_batch_size=opts.sample_count)
        dataset = dataset | 'BatchExamples' >> batcher

        # A non-empty whitelist restricts every batch to the listed features.
        if opts.feature_whitelist:
            keep_whitelisted = 'RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features, feature_whitelist=opts.feature_whitelist)
            dataset = dataset | keep_whitelisted

        # Pick the transform matching each generator's kind, labelled with
        # the generator's name, then apply it to the batched input:
        #   - CombinerStatsGenerator: wrapped as a CombineFn and combined
        #     globally.
        #   - TransformStatsGenerator: its own ptransform is used.
        result_protos = []
        for current in stats_generators:
            if isinstance(current, stats_generator.CombinerStatsGenerator):
                labelled = current.name >> beam.CombineGlobally(
                    _CombineFnWrapper(current))
            elif isinstance(current,
                            stats_generator.TransformStatsGenerator):
                labelled = current.name >> current.ptransform
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % current.__class__.__name__)
            result_protos.append(dataset | labelled)

        # Flatten the per-generator outputs into one collection, merge them
        # with `_merge_dataset_feature_stats_protos`, and convert the merged
        # value with `_make_dataset_feature_statistics_list_proto`.
        flattened = (result_protos
                     | 'FlattenFeatureStatistics' >> beam.Flatten())
        merged = (flattened
                  | 'MergeDatasetFeatureStatisticsProtos' >>
                  beam.CombineGlobally(_merge_dataset_feature_stats_protos))
        return (merged
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
示例#11
0
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Generates statistics for an in-memory list of examples.

    All examples are merged into a single batch, optionally reduced to the
    whitelisted features, and each generator is run once over that batch.

    Args:
        examples: A list of input examples.
        options: Options for generating data statistics.

    Returns:
        A DatasetFeatureStatisticsList proto.

    Raises:
        TypeError: If a generator in `options.generators` is not a
            CombinerStatsGenerator.
    """
    common = common_stats_generator.CommonStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_values_histogram_buckets=options.num_values_histogram_buckets,
        epsilon=options.epsilon)
    numeric = numeric_stats_generator.NumericStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_histogram_buckets=options.num_histogram_buckets,
        num_quantiles_histogram_buckets=(
            options.num_quantiles_histogram_buckets),
        epsilon=options.epsilon)
    strings = string_stats_generator.StringStatsGenerator(
        schema=options.schema)
    top_k_uniques = (
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
    stats_generators = [common, numeric, strings, top_k_uniques]

    # Only combiner-style custom generators are accepted here.
    custom_generators = options.generators
    if custom_generators is not None:
        for candidate in custom_generators:
            if not isinstance(candidate,
                              stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of type '
                    '%s.' % candidate.__class__.__name__)
            stats_generators.append(candidate)

    merged_batch = batch_util.merge_single_batch(examples)

    # With a non-empty whitelist, keep only the listed features. Each listed
    # name is looked up directly in the merged batch.
    if options.feature_whitelist:
        merged_batch = dict(
            (feature_name, merged_batch[feature_name])
            for feature_name in options.feature_whitelist)

    # Run every generator once: fresh accumulator -> add batch -> extract.
    outputs = []
    for current in stats_generators:
        accumulator = current.add_input(
            current.create_accumulator(), merged_batch)
        outputs.append(current.extract_output(accumulator))

    merged_stats = _merge_dataset_feature_stats_protos(outputs)
    return _make_dataset_feature_statistics_list_proto(merged_stats)