def test_common_stats_generator_invalid_weight_feature(self):
  """Checks that a weight feature missing from the batch raises ValueError."""
  # The batch only carries feature 'a', while the generator is configured to
  # read its weights from feature 'w'.
  batches = [{'a': np.array([np.array([1])])}]
  generator = common_stats_generator.CommonStatsGenerator(
      weight_feature='w')
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
  # was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(ValueError, 'Weight feature.*not present.*'):
    self.assertCombinerOutputEqual(batches, generator, None)
def test_common_stats_generator_with_weight_feature(self):
  """Checks common and weighted common stats when weight feature 'w' is set."""
  # Input with two batches of two examples each. The last example of the
  # second batch has no value list for feature 'a' (None).
  #
  # Arrays whose rows differ in length (or contain None) are created with an
  # explicit dtype=object: NumPy >= 1.24 refuses to build such ragged arrays
  # implicitly. Rectangular arrays are left untouched so their dtype is
  # unchanged.
  batches = [
      {
          'a': np.array(
              [np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])],
              dtype=object),
          'w': np.array([np.array([1.0]), np.array([2.0])]),
      },
      {
          'a': np.array([np.array([1.0]), None], dtype=object),
          'w': np.array([np.array([3.0]), np.array([2.0])]),
      },
  ]
  expected_result = {
      'a': text_format.Parse(
          """
          name: 'a'
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 1
              min_num_values: 1
              max_num_values: 3
              avg_num_values: 2.0
              tot_num_values: 6
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 1.0
                  sample_count: 0.75
                }
                buckets {
                  low_value: 1.0
                  high_value: 2.0
                  sample_count: 0.75
                }
                buckets {
                  low_value: 2.0
                  high_value: 3.0
                  sample_count: 0.75
                }
                buckets {
                  low_value: 3.0
                  high_value: 3.0
                  sample_count: 0.75
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6.0
                num_missing: 2.0
                avg_num_values: 1.83333333
                tot_num_values: 11.0
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  generator = common_stats_generator.CommonStatsGenerator(
      weight_feature='w', num_values_histogram_buckets=4)
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_common_stats_generator_categorical_feature(self):
  """Checks that an INT feature marked categorical reports string_stats."""
  # The first batch holds rows of different lengths, so it needs an explicit
  # dtype=object: NumPy >= 1.24 no longer builds ragged arrays implicitly.
  batches = [
      {
          'c': np.array(
              [np.array([1, 5, 10]), np.array([0])], dtype=object),
      },
      {
          'c': np.array([np.array([1, 1, 1, 5, 15])]),
      },
  ]
  expected_result = {
      'c': text_format.Parse(
          """
          name: 'c'
          type: INT
          string_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 0
              min_num_values: 1
              max_num_values: 5
              avg_num_values: 3.0
              tot_num_values: 9
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 3.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 3.0
                  high_value: 5.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 5.0
                  high_value: 5.0
                  sample_count: 1.0
                }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  # The schema declares 'c' as a categorical integer feature.
  schema = text_format.Parse(
      """
      feature {
        name: "c"
        type: INT
        int_domain {
          is_categorical: true
        }
      }
      """, schema_pb2.Schema())
  generator = common_stats_generator.CommonStatsGenerator(
      schema=schema, num_values_histogram_buckets=3)
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_common_stats_generator_weight_feature_multiple_values(self):
  """Checks that a weight feature with more than one value raises ValueError."""
  # Weight feature 'w' carries two values for the single example.
  batches = [{
      'a': np.array([np.array([1])]),
      'w': np.array([np.array([2, 3])]),
  }]
  generator = common_stats_generator.CommonStatsGenerator(
      weight_feature='w')
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
  # was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(ValueError, 'Weight feature.*single value.*'):
    self.assertCombinerOutputEqual(batches, generator, None)
def test_common_stats_generator_weight_feature_string_type(self):
  """Checks that a string-valued weight feature raises ValueError."""
  # Weight feature 'w' holds the string 'a' instead of a number.
  batches = [{
      'a': np.array([np.array([1])]),
      'w': np.array([np.array(['a'])]),
  }]
  generator = common_stats_generator.CommonStatsGenerator(
      weight_feature='w')
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
  # was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(ValueError, 'Weight feature.*numeric type.*'):
    self.assertCombinerOutputEqual(batches, generator, None)
def expand(self, dataset):
  """Profiles, batches and summarizes the input examples.

  Args:
    dataset: The input examples to compute statistics over.

  Returns:
    The result of applying `stats_impl.GenerateStatisticsImpl` with the
    default and user-supplied generators to the batched (and, when a
    whitelist is configured, feature-filtered) examples.
  """
  opts = self._options

  # Default generators, in the order they are handed to the implementation.
  common_stats = common_stats_generator.CommonStatsGenerator(
      schema=opts.schema,
      num_values_histogram_buckets=opts.num_values_histogram_buckets,
      epsilon=opts.epsilon)
  numeric_stats = numeric_stats_generator.NumericStatsGenerator(
      schema=opts.schema,
      num_histogram_buckets=opts.num_histogram_buckets,
      num_quantiles_histogram_buckets=opts.num_quantiles_histogram_buckets,
      epsilon=opts.epsilon)
  string_stats = string_stats_generator.StringStatsGenerator(
      schema=opts.schema)
  top_k_stats = top_k_stats_generator.TopKStatsGenerator(
      schema=opts.schema,
      num_top_values=opts.num_top_values,
      num_rank_histogram_buckets=opts.num_rank_histogram_buckets)
  uniques_stats = uniques_stats_generator.UniquesStatsGenerator(
      schema=opts.schema)
  stats_generators = [
      common_stats, numeric_stats, string_stats, top_k_stats, uniques_stats
  ]

  # Custom generators, if any, run after the default ones.
  if opts.generators is not None:
    stats_generators.extend(opts.generators)

  # Profile the examples first, then group them into batches.
  examples = (
      dataset
      | 'Profile' >> profile_util.Profile()
      | 'BatchInputs' >> batch_util.BatchExamples())

  # Drop every feature that is not whitelisted, when a whitelist is given.
  if opts.feature_whitelist:
    examples = (
        examples
        | 'RemoveNonWhitelistedFeatures' >> beam.Map(
            _filter_features, feature_whitelist=opts.feature_whitelist))

  return (
      examples
      | 'RunStatsGenerators' >> stats_impl.GenerateStatisticsImpl(
          stats_generators))
def _get_default_generators(options, in_memory=False):
  """Builds the default list of stats generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  schema = options.schema
  weight_feature = options.weight_feature

  # Generators shared by the in-memory and the Beam code paths.
  generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=schema,
          weight_feature=weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=schema,
          weight_feature=weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=schema),
  ]

  if not in_memory:
    # Beam path: separate top-k and uniques generators.
    generators.append(
        top_k_stats_generator.TopKStatsGenerator(
            schema=schema,
            weight_feature=weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
    generators.append(
        uniques_stats_generator.UniquesStatsGenerator(schema=schema))
    return generators

  # In-memory path: a single combiner covers both top-k and uniques.
  combiner_cls = (
      top_k_uniques_combiner_stats_generator.TopKUniquesCombinerStatsGenerator)
  generators.append(
      combiner_cls(
          schema=schema,
          weight_feature=weight_feature,
          num_top_values=options.num_top_values,
          num_rank_histogram_buckets=options.num_rank_histogram_buckets))
  return generators
def test_common_stats_generator_empty_batch(self):
  """Checks the stats reported for a feature whose batch has no examples."""
  # Expected output: feature 'a' typed as STRING with all counts at zero.
  expected_stats_for_a = text_format.Parse(
      """
      name: 'a'
      type: STRING
      string_stats {
        common_stats {
          num_non_missing: 0
          num_missing: 0
          tot_num_values: 0
        }
      }
      """, statistics_pb2.FeatureNameStatistics())
  empty_batch = {'a': np.array([])}
  stats_gen = common_stats_generator.CommonStatsGenerator()
  self.assertCombinerOutputEqual(
      [empty_batch], stats_gen, {'a': expected_stats_for_a})
def expand(self, dataset):
  """Profiles, optionally samples, batches and summarizes the input examples.

  Args:
    dataset: Input PCollection of examples.

  Returns:
    The result of applying `stats_impl.GenerateStatisticsImpl` with the
    default and user-supplied generators to the batched (and, when a
    whitelist is configured, feature-filtered) examples.
  """
  opts = self._options

  # Default generators, in the order they are handed to the implementation.
  common_stats = common_stats_generator.CommonStatsGenerator(
      schema=opts.schema,
      num_values_histogram_buckets=opts.num_values_histogram_buckets,
      epsilon=opts.epsilon)
  numeric_stats = numeric_stats_generator.NumericStatsGenerator(
      schema=opts.schema,
      num_histogram_buckets=opts.num_histogram_buckets,
      num_quantiles_histogram_buckets=opts.num_quantiles_histogram_buckets,
      epsilon=opts.epsilon)
  string_stats = string_stats_generator.StringStatsGenerator(
      schema=opts.schema)
  top_k_stats = top_k_stats_generator.TopKStatsGenerator(
      schema=opts.schema,
      num_top_values=opts.num_top_values,
      num_rank_histogram_buckets=opts.num_rank_histogram_buckets)
  uniques_stats = uniques_stats_generator.UniquesStatsGenerator(
      schema=opts.schema)
  stats_generators = [
      common_stats, numeric_stats, string_stats, top_k_stats, uniques_stats
  ]

  # Custom generators, if any, run after the default ones.
  if opts.generators is not None:
    stats_generators.extend(opts.generators)

  # Profile the input examples.
  examples = dataset | 'ProfileExamples' >> profile_util.Profile()

  # Sampling: a fixed sample count takes precedence over a sample rate.
  if opts.sample_count is not None:
    # beam.combiners.Sample.FixedSizeGlobally yields a
    # PCollection[List[types.Example]]; flattening it gives back a
    # PCollection[types.Example].
    examples = (
        examples
        | ('SampleExamples(%s)' % opts.sample_count) >>
        beam.combiners.Sample.FixedSizeGlobally(opts.sample_count)
        | 'FlattenExamples' >> beam.FlatMap(lambda sampled: sampled))
  elif opts.sample_rate is not None:
    examples = (
        examples
        | ('SampleExamplesAtRate(%s)' % opts.sample_rate) >> beam.FlatMap(
            _sample_at_rate, sample_rate=opts.sample_rate))

  # Batch the examples; the sample count (possibly None) is used as the
  # desired batch size.
  examples = (
      examples
      | 'BatchExamples' >> batch_util.BatchExamples(
          desired_batch_size=opts.sample_count))

  # Drop every feature that is not whitelisted, when a whitelist is given.
  if opts.feature_whitelist:
    examples = (
        examples
        | 'RemoveNonWhitelistedFeatures' >> beam.Map(
            _filter_features, feature_whitelist=opts.feature_whitelist))

  return (
      examples
      | 'RunStatsGenerators' >> stats_impl.GenerateStatisticsImpl(
          stats_generators))
def test_common_stats_generator_invalid_value_numpy_dtype(self):
  """Checks that a complex-valued feature is rejected with TypeError."""
  # Feature 'a' holds a single complex number.
  complex_batch = {'a': np.array([np.array([1+2j])])}
  stats_gen = common_stats_generator.CommonStatsGenerator()
  with self.assertRaises(TypeError):
    self.assertCombinerOutputEqual([complex_batch], stats_gen, None)
def test_common_stats_generator_empty_list(self):
  """Checks that an empty list of batches yields an empty result."""
  no_batches = []
  no_stats = {}
  stats_gen = common_stats_generator.CommonStatsGenerator()
  self.assertCombinerOutputEqual(no_batches, stats_gen, no_stats)
def test_common_stats_generator_with_multiple_features(self):
  """Checks common stats for float, string and int features together."""
  # Input with two batches: first batch has two examples and second batch
  # has a single example.
  #
  # The first batch's arrays have rows of different lengths, so they are
  # created with an explicit dtype=object: NumPy >= 1.24 refuses to build
  # such ragged arrays implicitly. The rectangular arrays of the second
  # batch are left untouched so their dtype is unchanged.
  batches = [
      {
          'a': np.array(
              [np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])],
              dtype=object),
          'b': np.array(
              [np.array(['x', 'y', 'z', 'w']), np.array(['qwe', 'abc'])],
              dtype=object),
          'c': np.array(
              [np.array([1, 5, 10]), np.array([0])], dtype=object),
      },
      {
          'a': np.array([np.array([1.0])]),
          'b': np.array([np.array(['ab'])]),
          'c': np.array([np.array([1, 1, 1, 5, 15])]),
      },
  ]
  expected_result = {
      'a': text_format.Parse(
          """
          name: 'a'
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 0
              min_num_values: 1
              max_num_values: 3
              avg_num_values: 2.0
              tot_num_values: 6
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 2.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 2.0
                  high_value: 3.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 3.0
                  high_value: 3.0
                  sample_count: 1.0
                }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics()),
      'b': text_format.Parse(
          """
          name: 'b'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 0
              min_num_values: 1
              max_num_values: 4
              avg_num_values: 2.33333333
              tot_num_values: 7
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 2.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 2.0
                  high_value: 4.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 4.0
                  high_value: 4.0
                  sample_count: 1.0
                }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics()),
      'c': text_format.Parse(
          """
          name: 'c'
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 0
              min_num_values: 1
              max_num_values: 5
              avg_num_values: 3.0
              tot_num_values: 9
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 3.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 3.0
                  high_value: 5.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 5.0
                  high_value: 5.0
                  sample_count: 1.0
                }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics()),
  }
  generator = common_stats_generator.CommonStatsGenerator(
      num_values_histogram_buckets=3, epsilon=0.001)
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_common_stats_generator_with_entire_feature_value_list_missing(self):
  """Checks common stats when some examples have no value list (None)."""
  # Input with two batches: first batch has three examples and second batch
  # has two examples.
  #
  # dtype=object uses the builtin `object`; the `np.object` alias was
  # deprecated in NumPy 1.20 and removed in NumPy 1.24.
  batches = [
      {
          'a': np.array(
              [np.array([1.0, 2.0]), None, np.array([3.0, 4.0, 5.0])],
              dtype=object),
          'b': np.array(
              [np.array(['x', 'y', 'z', 'w']), None,
               np.array(['qwe', 'abc'])],
              dtype=object),
      },
      {
          'a': np.array([np.array([1.0]), None], dtype=object),
          'b': np.array([None, np.array(['qwe'])], dtype=object),
      },
  ]
  expected_result = {
      'a': text_format.Parse(
          """
          name: 'a'
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 2
              min_num_values: 1
              max_num_values: 3
              avg_num_values: 2.0
              tot_num_values: 6
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 2.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 2.0
                  high_value: 3.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 3.0
                  high_value: 3.0
                  sample_count: 1.0
                }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics()),
      'b': text_format.Parse(
          """
          name: 'b'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 2
              min_num_values: 1
              max_num_values: 4
              avg_num_values: 2.33333333
              tot_num_values: 7
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 2.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 2.0
                  high_value: 4.0
                  sample_count: 1.0
                }
                buckets {
                  low_value: 4.0
                  high_value: 4.0
                  sample_count: 1.0
                }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics()),
  }
  generator = common_stats_generator.CommonStatsGenerator(
      num_values_histogram_buckets=3)
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_tfdv_telemetry(self):
  """Checks the counters emitted when the common stats combiner runs in Beam."""
  # NumPy compatibility notes for the inputs below:
  #  * `np.object` was removed in NumPy 1.24; the builtin `object` is the
  #    same dtype.
  #  * `np.NaN` was removed in NumPy 2.0; `np.nan` is the same value.
  #  * Abstract scalar types as dtypes are deprecated; `np.floating`
  #    resolved to float64 and `np.integer` to `np.int_`, which are used
  #    explicitly here.
  #  * Arrays with rows of different lengths need an explicit dtype=object
  #    on NumPy >= 1.24.
  batches = [
      {
          'a': np.array([
              np.array([1.0, 2.0], dtype=np.float64),
              np.array([3.0, 4.0, np.nan, 5.0], dtype=np.float64),
          ], dtype=object),
          'b': np.array([
              np.array(['a', 'b', 'c', 'e'], dtype=object),
              np.array(['d', 'e', 'f'], dtype=object),
          ], dtype=object),
          'c': np.array([None, None]),
      },
      {
          'a': np.array([None]),
          'b': np.array([np.array(['a', 'b', 'c'], dtype=object)]),
          'c': np.array([np.array([10, 20, 30], dtype=np.int_)]),
      },
      {
          'a': np.array([np.array([5.0], dtype=np.float64)]),
          'b': np.array([np.array(['d', 'e', 'f'], dtype=object)]),
          'c': np.array([np.array([1], dtype=np.int_)]),
      },
  ]

  # Run the combiner over the batches in a Beam pipeline.
  p = beam.Pipeline()
  _ = (
      p
      | 'CreateBatches' >> beam.Create(batches)
      | 'CommonStatsCombiner' >> beam.CombineGlobally(
          stats_impl._CombineFnWrapper(
              common_stats_generator.CommonStatsGenerator())))
  runner = p.run()
  runner.wait_until_finish()
  result_metrics = runner.metrics()

  expected_result = {
      'num_instances': 4,
      'num_missing_feature_values': 3,
      'num_int_feature_values': 2,
      'int_feature_values_min_count': 1,
      'int_feature_values_max_count': 3,
      'int_feature_values_mean_count': 2,
      'num_float_feature_values': 3,
      'float_feature_values_min_count': 1,
      'float_feature_values_max_count': 4,
      'float_feature_values_mean_count': 2,
      'num_string_feature_values': 4,
      'string_feature_values_min_count': 3,
      'string_feature_values_max_count': 4,
      'string_feature_values_mean_count': 3,
  }

  # Check number of counters in the metrics namespace. This single check
  # replaces a duplicated assertion against the hard-coded count 14, which
  # equals len(expected_result).
  actual_metrics = result_metrics.query(
      beam.metrics.metric.MetricsFilter().with_namespace(
          constants.METRICS_NAMESPACE))['counters']
  self.assertLen(actual_metrics, len(expected_result))

  # Check each counter.
  for counter_name, expected_value in expected_result.items():
    actual_counter = result_metrics.query(
        beam.metrics.metric.MetricsFilter().with_name(counter_name)
    )['counters']
    self.assertLen(actual_counter, 1)
    self.assertEqual(actual_counter[0].committed, expected_value)
def expand(self, dataset):
  """Batches the input examples and runs every stats generator over them.

  Args:
    dataset: The input examples to compute statistics over.

  Returns:
    The per-generator outputs flattened, merged with
    `_merge_dataset_feature_stats_protos` and mapped through
    `_make_dataset_feature_statistics_list_proto`.

  Raises:
    TypeError: If a generator is neither a CombinerStatsGenerator nor a
      TransformStatsGenerator.
  """
  opts = self._options

  # Default generators, in the order they are applied.
  common_stats = common_stats_generator.CommonStatsGenerator(
      schema=opts.schema,
      weight_feature=opts.weight_feature,
      num_values_histogram_buckets=opts.num_values_histogram_buckets,
      epsilon=opts.epsilon)
  numeric_stats = numeric_stats_generator.NumericStatsGenerator(
      schema=opts.schema,
      weight_feature=opts.weight_feature,
      num_histogram_buckets=opts.num_histogram_buckets,
      num_quantiles_histogram_buckets=opts.num_quantiles_histogram_buckets,
      epsilon=opts.epsilon)
  string_stats = string_stats_generator.StringStatsGenerator(
      schema=opts.schema)
  top_k_stats = top_k_stats_generator.TopKStatsGenerator(
      schema=opts.schema,
      weight_feature=opts.weight_feature,
      num_top_values=opts.num_top_values,
      num_rank_histogram_buckets=opts.num_rank_histogram_buckets)
  uniques_stats = uniques_stats_generator.UniquesStatsGenerator(
      schema=opts.schema)
  stats_generators = [
      common_stats, numeric_stats, string_stats, top_k_stats, uniques_stats
  ]

  # Custom generators, if any, run after the default ones.
  if opts.generators is not None:
    stats_generators.extend(opts.generators)

  # Batch the examples; the sample count (possibly None) is used as the
  # desired batch size.
  examples = (
      dataset
      | 'BatchExamples' >> batch_util.BatchExamples(
          desired_batch_size=opts.sample_count))

  # Drop every feature that is not whitelisted, when a whitelist is given.
  if opts.feature_whitelist:
    examples = (
        examples
        | 'RemoveNonWhitelistedFeatures' >> beam.Map(
            _filter_features, feature_whitelist=opts.feature_whitelist))

  # Apply each generator to the batched examples: combiner generators are
  # wrapped as a beam.CombineFn, transform generators contribute their own
  # PTransform. The combiner check comes first, as in the dispatch order
  # callers rely on.
  result_protos = []
  for gen in stats_generators:
    if isinstance(gen, stats_generator.CombinerStatsGenerator):
      transform = beam.CombineGlobally(_CombineFnWrapper(gen))
    elif isinstance(gen, stats_generator.TransformStatsGenerator):
      transform = gen.ptransform
    else:
      raise TypeError(
          'Statistics generator must extend one of '
          'CombinerStatsGenerator or TransformStatsGenerator, '
          'found object of type %s' % gen.__class__.__name__)
    result_protos.append(examples | gen.name >> transform)

  # Each generator outputs a PCollection of DatasetFeatureStatistics protos.
  # Flatten them into one PCollection, merge the protos, and wrap the merged
  # proto in a DatasetFeatureStatisticsList.
  return (
      result_protos
      | 'FlattenFeatureStatistics' >> beam.Flatten()
      | 'MergeDatasetFeatureStatisticsProtos' >> beam.CombineGlobally(
          _merge_dataset_feature_stats_protos)
      | 'MakeDatasetFeatureStatisticsListProto' >> beam.Map(
          _make_dataset_feature_statistics_list_proto))
def generate_statistics_in_memory(examples, options=None):
  """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics. When None, a fresh
      `stats_options.StatsOptions()` is used.

  Returns:
    A DatasetFeatureStatisticsList proto.

  Raises:
    TypeError: If a generator in `options.generators` does not extend
      CombinerStatsGenerator.
  """
  # Build the default options per call: a `StatsOptions()` default argument
  # would be constructed once at import time and shared by every call.
  if options is None:
    options = stats_options.StatsOptions()

  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=options.schema),
      top_k_uniques_combiner_stats_generator.TopKUniquesCombinerStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_top_values=options.num_top_values,
          num_rank_histogram_buckets=options.num_rank_histogram_buckets),
  ]

  # Only combiner generators are accepted as custom generators here.
  if options.generators is not None:
    for generator in options.generators:
      if not isinstance(generator, stats_generator.CombinerStatsGenerator):
        raise TypeError(
            'Statistics generator used in '
            'generate_statistics_in_memory must '
            'extend CombinerStatsGenerator, found object of type '
            '%s.' % generator.__class__.__name__)
      stats_generators.append(generator)

  # All examples are merged into a single batch.
  batch = batch_util.merge_single_batch(examples)

  # If whitelist features are provided, keep only those features.
  if options.feature_whitelist:
    batch = {
        feature_name: batch[feature_name]
        for feature_name in options.feature_whitelist
    }

  # Run each generator over the single batch: create an accumulator, add the
  # batch, and extract the output.
  outputs = [
      generator.extract_output(
          generator.add_input(generator.create_accumulator(), batch))
      for generator in stats_generators
  ]

  return _make_dataset_feature_statistics_list_proto(
      _merge_dataset_feature_stats_protos(outputs))