def test_nl_generator_match_ratio_check(self):
  """Tests that the match-ratio threshold gates stats creation."""
  batches = [
      pa.Column.from_array(
          'feature',
          pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH', 'Nope']])),
      pa.Column.from_array('feature',
                           pa.array([['MATCH', 'MATCH', 'MATCH']])),
      pa.Column.from_array('feature', pa.array([['12345', 'No']])),
  ]
  # values_threshold=5 is always satisfied here, so only match_ratio
  # decides whether stats are produced.
  # match_ratio=0.71 is above the actual 0.7 match rate -> no stats.
  above = nlsg.NLStatsGenerator(
      _FakeHeuristic(), match_ratio=0.71, values_threshold=5)
  self.assertCombinerOutputEqual(batches, above,
                                 statistics_pb2.FeatureNameStatistics())
  # match_ratio=0.69 is below the 0.7 match rate -> stats are emitted.
  below = nlsg.NLStatsGenerator(
      _FakeHeuristic(), match_ratio=0.69, values_threshold=5)
  self.assertCombinerOutputEqual(
      batches, below,
      statistics_pb2.FeatureNameStatistics(custom_stats=[
          statistics_pb2.CustomStatistic(
              name='domain_info', str='natural_language_domain {}'),
          statistics_pb2.CustomStatistic(
              name='natural_language_match_rate', num=0.7)
      ]))
def test_nl_generator_example_threshold_check(self):
  """Tests that the examples threshold gates stats creation."""
  # The fake heuristic matches the 6 'MATCH' examples below.
  batches = [
      [
          np.array(['MATCH', 'MATCH', 'MATCH']),
          np.array(['MATCH']),
      ],
      [
          np.array(['MATCH', 'MATCH']),
      ],
      # Nones should be ignored.
      [
          None,
          np.array([None] * 10),
      ],
  ]
  # examples_threshold=7 exceeds the 6 matches -> no stats.
  too_high = nlsg.NLStatsGenerator(_FakeHeuristic(), examples_threshold=7)
  self.assertCombinerOutputEqual(batches, too_high,
                                 statistics_pb2.FeatureNameStatistics())
  # examples_threshold=6 is met exactly -> stats are emitted.
  at_limit = nlsg.NLStatsGenerator(_FakeHeuristic(), examples_threshold=6)
  self.assertCombinerOutputEqual(
      batches, at_limit,
      statistics_pb2.FeatureNameStatistics(custom_stats=[
          statistics_pb2.CustomStatistic(
              name='domain_info', str='natural_language_domain {}'),
          statistics_pb2.CustomStatistic(
              name='natural_language_match_rate', num=1.0)
      ]))
def test_nl_generator_values_threshold_check(self):
  """Tests that the values threshold gates stats creation."""
  # The fake heuristic matches all 6 non-null values below.
  batches = [
      pa.Column.from_array(
          'feature', pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']])),
      pa.Column.from_array('feature', pa.array([['MATCH', 'MATCH']])),
      # Nones should be ignored.
      pa.Column.from_array('feature', pa.array([None, None])),
  ]
  # values_threshold=7 exceeds the 6 available values -> no stats.
  too_high = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=7)
  self.assertCombinerOutputEqual(batches, too_high,
                                 statistics_pb2.FeatureNameStatistics())
  # values_threshold=6 is met exactly -> stats are emitted.
  at_limit = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=6)
  self.assertCombinerOutputEqual(
      batches, at_limit,
      statistics_pb2.FeatureNameStatistics(custom_stats=[
          statistics_pb2.CustomStatistic(
              name='domain_info', str='natural_language_domain {}'),
          statistics_pb2.CustomStatistic(
              name='natural_language_match_rate', num=1.0)
      ]))
def test_nl_generator_bad_initialization(self):
  """Tests that invalid constructor arguments raise ValueError.

  Uses assertRaisesRegex; the assertRaisesRegexp spelling is a
  deprecated alias removed in Python 3.12.
  """
  with self.assertRaisesRegex(
      ValueError, 'NLStatsGenerator expects values_threshold > 0.'):
    nlsg.NLStatsGenerator(values_threshold=0)
  with self.assertRaisesRegex(
      ValueError, r'NLStatsGenerator expects a match_ratio in \[0, 1\].'):
    nlsg.NLStatsGenerator(match_ratio=1.1)
def get_generators(
    options: stats_options.StatsOptions,
    in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
  """Initializes the list of stats generators, including custom generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  generators = _get_default_generators(options, in_memory)
  if options.generators:
    # Add custom stats generators.
    generators.extend(options.generators)
  if options.enable_semantic_domain_stats:
    # The semantic-domain generators are wrapped in their own combiner so
    # that the semantic-domain sample rate applies only to them, leaving the
    # other feature stats generators unaffected.
    semantic_generators = [
        image_stats_generator.ImageStatsGenerator(),
        natural_language_stats_generator.NLStatsGenerator(),
        time_stats_generator.TimeStatsGenerator(),
    ]
    generators.append(
        CombinerFeatureStatsWrapperGenerator(
            semantic_generators,
            weight_feature=options.weight_feature,
            sample_rate=options.semantic_domain_stats_sample_rate))
  if options.schema is not None and _schema_has_sparse_features(
      options.schema):
    generators.append(
        sparse_feature_stats_generator.SparseFeatureStatsGenerator(
            options.schema))
  # Fold every CombinerFeatureStatsGenerator into a single wrapper so the
  # per-feature combiners share one pass over the features.
  per_feature = [
      g for g in generators
      if isinstance(g, stats_generator.CombinerFeatureStatsGenerator)
  ]
  if per_feature:
    generators = [
        g for g in generators
        if not isinstance(g, stats_generator.CombinerFeatureStatsGenerator)
    ]
    generators.append(
        CombinerFeatureStatsWrapperGenerator(
            per_feature, weight_feature=options.weight_feature))
  if in_memory:
    # In-memory execution supports combiner generators only.
    for g in generators:
      if not isinstance(g, stats_generator.CombinerStatsGenerator):
        raise TypeError('Statistics generator used in '
                        'generate_statistics_in_memory must '
                        'extend CombinerStatsGenerator, found object of '
                        'type %s.' % g.__class__.__name__)
  return generators
def test_nl_generator_token_and_sequence_histograms(self):
  """Tests generator calculation of token and sequence histograms."""
  with tempfile.NamedTemporaryFile() as vocab_file:
    vocab_file.write(b'Foo\nBar\nBaz\nBazz\nCar\nRazzz\n')
    vocab_file.flush()
    batches = [pa.array([[0, 1, 2, 4, 4], [3, 3, 3, 5]])]
    generator = nlsg.NLStatsGenerator(
        schema=self._schema,
        vocab_paths={'my_vocab': vocab_file.name},
        num_quantiles_histogram_buckets=2,
        num_rank_histogram_buckets=2,
        num_histogram_buckets=2)
    # Expected reported sequences repeat each decoded input sequence twice.
    sequences = [['Foo', 'Bar', 'Baz', 'Car', 'Car'],
                 ['Bazz', 'Bazz', 'Bazz', 'Razzz']] * 2
    self.assertCombinerOutputEqual(
        batches, generator,
        self._create_expected_feature_name_statistics(
            feature_coverage=0.8571428571428571,
            avg_token_length=(3 + 3 + 4 + 4 + 4 + 5) / 6,
            min_sequence_length=3,
            max_sequence_length=5,
            token_len_quantiles=[(3, 4, 3), (4, 5, 3)],
            sequence_len_quantiles=[(3, 5, 1), (5, 5, 1)],
            sorted_token_names_and_counts=[('Bazz', 3), ('Car', 2)],
            reported_sequences=sequences),
        self._int_nlp_feature_with_vocab_path)
def test_nl_generator_avg_word_heuristic_match(self):
  """Tests generator with avg word length heuristic (matching case)."""
  generator = nlsg.NLStatsGenerator(values_threshold=2)
  batches = [
      pa.Column.from_array(
          'feature',
          pa.array([[
              'This looks correct.', 'This one too, it should be text.'
          ], ['xosuhddsofuhg123fdgosh']])),
      pa.Column.from_array(
          'feature',
          pa.array(
              [['This should be text as well', 'Here is another text']])),
      pa.Column.from_array(
          'feature', pa.array([['This should also be considered good.']])),
  ]
  # 5 of the 6 values pass the heuristic -> match rate 5/6 (0.8333333).
  self.assertCombinerOutputEqual(
      batches, generator,
      statistics_pb2.FeatureNameStatistics(custom_stats=[
          statistics_pb2.CustomStatistic(
              name='domain_info', str='natural_language_domain {}'),
          statistics_pb2.CustomStatistic(
              name='natural_language_match_rate', num=0.8333333)
      ]))
def test_nl_generator_int_feature_no_vocab(self):
  """Tests generator calculation with an int domain having no vocab."""
  generator = nlsg.NLStatsGenerator(schema=self._schema)
  # Without a vocab, none of the int tokens can be covered.
  self.assertCombinerOutputEqual(
      [pa.array([[1], [2], [3]])], generator,
      self._create_expected_feature_name_statistics(feature_coverage=0.0),
      self._int_nlp_feature_no_vocab_path)
def test_nl_generator_string_feature_no_vocab(self):
  """Tests generator calculation with a string domain having no vocab."""
  generator = nlsg.NLStatsGenerator(schema=self._schema)
  # The None row is skipped; the two 3-char tokens give avg length 3.0.
  self.assertCombinerOutputEqual(
      [pa.array([['Foo'], None, ['Baz']])], generator,
      self._create_expected_feature_name_statistics(
          feature_coverage=0.5, avg_token_length=3.0),
      self._string_nlp_feature_no_vocab_path)
def test_nl_generator_avg_word_heuristic_non_match(self):
  """Tests generator with avg word length heuristic (non-matching case)."""
  generator = nlsg.NLStatsGenerator(values_threshold=2)
  batches = [
      pa.array([['abc' * 10, 'xxxxxxxxx'], ['xosuhddsofuhg123fdgosh']]),
      pa.array([['Only one valid text?']]),
  ]
  # Too few values look like natural language, so no NL stats are produced.
  self.assertCombinerOutputEqual(batches, generator,
                                 statistics_pb2.FeatureNameStatistics())
def test_nl_generator_invalidation_check_no_nld(self):
  """Tests generator invalidation with no natural language domain."""
  generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
  generator.setup()
  accumulator = generator.create_accumulator()
  self.assertFalse(accumulator.invalidate)
  # Feeding a feature path without a natural-language domain invalidates
  # the accumulator even though the values themselves are well-formed.
  accumulator = generator.add_input(
      accumulator, self._non_nlp_feature_path, pa.array([['Foo'], ['Bar']]))
  self.assertTrue(accumulator.invalidate)
def test_nl_generator_invalidation_check_empty_nld(self):
  """Tests generator invalidation with an empty natural language domain."""
  generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
  generator.setup()
  accumulator = generator.create_accumulator()
  self.assertFalse(accumulator.invalidate)
  # A feature whose natural-language domain is empty invalidates the
  # accumulator regardless of the input values.
  accumulator = generator.add_input(
      accumulator, self._int_nlp_feature_empty_domain, pa.array([[0], [1]]))
  self.assertTrue(accumulator.invalidate)
def test_nl_generator_int_feature_no_vocab(self):
  """Tests generator calculation with an int domain having no vocab."""
  generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
  # Without a vocab the ints cannot be mapped to tokens (coverage 0.0);
  # the raw int sequence is still expected to be reported twice.
  sequences = [[1, 2, 3]] * 2
  self.assertCombinerOutputEqual(
      [pa.array([[1, 2, 3]])], generator,
      self._create_expected_feature_name_statistics(
          feature_coverage=0.0, reported_sequences=sequences),
      self._int_nlp_feature_no_vocab_path)
def test_nl_generator_int_feature_vocab(self):
  """Tests generator calculation with an int domain and a vocab."""
  with tempfile.NamedTemporaryFile() as vocab_file:
    vocab_file.write(b'Foo\nBar\nBaz\nBazz\n')
    vocab_file.flush()
    generator = nlsg.NLStatsGenerator(
        vocab_paths={'my_vocab': vocab_file.name}, schema=self._schema)
    self.assertCombinerOutputEqual(
        [pa.array([[0], [1], [2], [3], [4]])], generator,
        self._create_expected_feature_name_statistics(
            feature_coverage=float(1) / 3, avg_token_length=4.0),
        self._int_nlp_feature_with_vocab_path)
def test_nl_generator_invalidation_check_float_input(self):
  """Tests generator invalidation with float inputs."""
  generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
  generator.setup()
  accumulator = generator.create_accumulator()
  self.assertFalse(accumulator.invalidate)
  # String values keep the accumulator valid.
  accumulator = generator.add_input(
      accumulator, self._string_nlp_feature_no_vocab_path,
      pa.array([['Foo'], ['Bar']]))
  self.assertFalse(accumulator.invalidate)
  # Float values are not a supported type and invalidate it.
  accumulator = generator.add_input(
      accumulator, self._string_nlp_feature_no_vocab_path,
      pa.array([[1.0], [2.0], [3.0]]))
  self.assertTrue(accumulator.invalidate)
def test_nl_generator_utf8_check(self):
  """Tests generator utf8 check with fake heuristic."""
  batches = [
      pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
      pa.array([['MATCH', 'MATCH']]),
      # Non utf-8 string invalidates accumulator.
      pa.array([[b'\xF0']]),
  ]
  # Even though values_threshold=1 is trivially satisfied by the six
  # matches, the non utf-8 value invalidates the stats entirely.
  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
  self.assertCombinerOutputEqual(batches, generator,
                                 statistics_pb2.FeatureNameStatistics())
def test_nl_generator_invalidation_check(self):
  """Tests generator invalidation with fake heuristic."""
  batches = [
      pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
      pa.array([['MATCH', 'MATCH']]),
      # The integer value has the wrong type and invalidates the stats.
      pa.array([[42]]),
  ]
  # No domain_info is generated: the wrongly-typed 42 invalidated the
  # accumulator even though values_threshold=1 was satisfied.
  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
  self.assertCombinerOutputEqual(batches, generator,
                                 statistics_pb2.FeatureNameStatistics())
def test_nl_generator_string_feature_vocab(self):
  """Tests generator calculation with a string domain having a vocab."""
  with tempfile.NamedTemporaryFile() as vocab_file:
    vocab_file.write(b'Foo\nBar\nBazz\n')
    vocab_file.flush()
    generator = nlsg.NLStatsGenerator(
        vocab_paths={'my_vocab': vocab_file.name}, schema=self._schema)
    # Both non-null tokens appear in the vocab -> full coverage.
    self.assertCombinerOutputEqual(
        [pa.array([['Bar'], None, ['Bazz']])], generator,
        self._create_expected_feature_name_statistics(
            feature_coverage=1.0, avg_token_length=4.0),
        self._string_nlp_feature_with_vocab_path)
def get_generators(options, in_memory=False):
  """Initializes the list of stats generators, including custom generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  generators = _get_default_generators(options, in_memory)
  if options.generators is not None:
    # Add custom stats generators.
    generators.extend(options.generators)
  if options.enable_semantic_domain_stats:
    generators.extend([
        image_stats_generator.ImageStatsGenerator(),
        natural_language_stats_generator.NLStatsGenerator(),
        time_stats_generator.TimeStatsGenerator(),
    ])
  # Fold every CombinerFeatureStatsGenerator into a single wrapper so the
  # per-feature combiners share one pass over the features.
  per_feature = [
      g for g in generators
      if isinstance(g, stats_generator.CombinerFeatureStatsGenerator)
  ]
  if per_feature:
    generators = [
        g for g in generators
        if not isinstance(g, stats_generator.CombinerFeatureStatsGenerator)
    ]
    generators.append(
        CombinerFeatureStatsWrapperGenerator(
            per_feature, weight_feature=options.weight_feature))
  if in_memory:
    # In-memory execution supports combiner generators only.
    for g in generators:
      if not isinstance(g, stats_generator.CombinerStatsGenerator):
        raise TypeError('Statistics generator used in '
                        'generate_statistics_in_memory must '
                        'extend CombinerStatsGenerator, found object of '
                        'type %s.' % g.__class__.__name__)
  return generators
def test_nl_generator_token_stats(self):
  """Tests generator calculation of token statistics."""
  with tempfile.NamedTemporaryFile() as vocab_file:
    vocab_file.write(b'Foo\nBar\n')
    vocab_file.flush()
    batches = [pa.array([[0, 1, 0], [1, 0, 0]])]
    generator = nlsg.NLStatsGenerator(
        schema=self._schema,
        vocab_paths={'my_vocab': vocab_file.name},
        num_quantiles_histogram_buckets=0,
        num_rank_histogram_buckets=0,
        num_histogram_buckets=3)
    sequences = [['Foo', 'Bar', 'Foo'], ['Bar', 'Foo', 'Foo']] * 2
    # Expected position histogram for token id 1 ('Bar' in the vocab):
    # one occurrence in each of the first two thirds.
    histogram_1 = statistics_pb2.Histogram()
    histogram_1.buckets.add(
        low_value=0, high_value=float(1) / 3, sample_count=1)
    histogram_1.buckets.add(
        low_value=float(1) / 3, high_value=float(2) / 3, sample_count=1)
    # Expected position histogram for 'Foo': occurrences in every third,
    # with two in the last.
    histogram_foo = statistics_pb2.Histogram()
    histogram_foo.buckets.add(
        low_value=0, high_value=float(1) / 3, sample_count=1)
    histogram_foo.buckets.add(
        low_value=float(1) / 3, high_value=float(2) / 3, sample_count=1)
    histogram_foo.buckets.add(
        low_value=float(2) / 3, high_value=1, sample_count=2)
    token_stats = {
        1: (2, 1.0, 1, 1, 1, histogram_1),
        'Foo': (4, 1.0, 2, 2, 2, histogram_foo)
    }
    self.assertCombinerOutputEqual(
        batches, generator,
        self._create_expected_feature_name_statistics(
            feature_coverage=1.0,
            avg_token_length=3,
            min_sequence_length=3,
            max_sequence_length=3,
            reported_sequences=sequences,
            token_statistics=token_stats),
        self._int_nlp_feature_with_vocab_and_token_constraints_path)
def test_nl_generator_int_feature_vocab(self):
  """Tests generator calculation with an int domain and a vocab."""
  with tempfile.NamedTemporaryFile() as vocab_file:
    vocab_file.write(b'Foo\nBar\nBaz\nBazz\n')
    vocab_file.flush()
    generator = nlsg.NLStatsGenerator(
        self._schema, {'my_vocab': vocab_file.name}, 0, 0, 0)
    # The out-of-vocab id 4 is expected as-is alongside decoded tokens.
    sequences = [['Foo', 'Bar', 'Baz', 'Bazz', 4]] * 2
    self.assertCombinerOutputEqual(
        [pa.array([[0, 1, 2, 3, 4]])], generator,
        self._create_expected_feature_name_statistics(
            feature_coverage=float(1) / 3,
            avg_token_length=4,
            reported_sequences=sequences),
        self._int_nlp_feature_with_vocab_path)
def test_nl_generator_invalidation_check(self):
  """Tests generator invalidation with fake heuristic."""
  batches = [
      [
          np.array(['MATCH', 'MATCH', 'MATCH']),
          np.array(['MATCH']),
      ],
      [
          np.array(['MATCH', 'MATCH']),
      ],
      [
          # Incorrect type, this would invalidate the stats.
          np.array([42]),
      ],
  ]
  # No domain_info is generated: despite six matches and a trivially met
  # examples_threshold=1, the wrongly-typed 42 invalidated the stats.
  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), examples_threshold=1)
  self.assertCombinerOutputEqual(batches, generator,
                                 statistics_pb2.FeatureNameStatistics())
def test_nl_generator_empty_input(self):
  """Tests that empty input produces empty statistics."""
  self.assertCombinerOutputEqual(
      [], nlsg.NLStatsGenerator(_FakeHeuristic()),
      statistics_pb2.FeatureNameStatistics())
def get_generators(
    options: stats_options.StatsOptions,
    in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
  """Initializes the list of stats generators, including custom generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  generators = [NumExamplesStatsGenerator(options.weight_feature)]
  if options.add_default_generators:
    generators.extend(_get_default_generators(options, in_memory))
  if options.generators:
    # Add custom stats generators.
    generators.extend(options.generators)
  if options.enable_semantic_domain_stats:
    # The semantic-domain generators get their own wrapper so that the
    # semantic-domain sample rate applies only to them, leaving the other
    # feature stats generators unaffected.
    semantic_generators = [
        image_stats_generator.ImageStatsGenerator(),
        natural_language_domain_inferring_stats_generator
        .NLDomainInferringStatsGenerator(),
        time_stats_generator.TimeStatsGenerator(),
    ]
    generators.append(
        CombinerFeatureStatsWrapperGenerator(
            semantic_generators,
            sample_rate=options.semantic_domain_stats_sample_rate))
  if options.schema is not None:
    if _schema_has_sparse_features(options.schema):
      generators.append(
          sparse_feature_stats_generator.SparseFeatureStatsGenerator(
              options.schema))
    if _schema_has_natural_language_domains(options.schema):
      generators.append(
          natural_language_stats_generator.NLStatsGenerator(
              options.schema, options.vocab_paths,
              options.num_histogram_buckets,
              options.num_quantiles_histogram_buckets,
              options.num_rank_histogram_buckets))
    if options.schema.weighted_feature:
      generators.append(
          weighted_feature_stats_generator.WeightedFeatureStatsGenerator(
              options.schema))
  if options.label_feature and not in_memory:
    # The LiftStatsGenerator is not a CombinerStatsGenerator and therefore
    # cannot currently be used for in_memory executions.
    generators.append(
        lift_stats_generator.LiftStatsGenerator(
            y_path=types.FeaturePath([options.label_feature]),
            schema=options.schema,
            example_weight_map=options.example_weight_map,
            output_custom_stats=True))
  # Fold every CombinerFeatureStatsGenerator into a single wrapper so the
  # per-feature combiners share one pass over the features.
  per_feature = [
      g for g in generators
      if isinstance(g, stats_generator.CombinerFeatureStatsGenerator)
  ]
  if per_feature:
    generators = [
        g for g in generators
        if not isinstance(g, stats_generator.CombinerFeatureStatsGenerator)
    ]
    generators.append(CombinerFeatureStatsWrapperGenerator(per_feature))
  if in_memory:
    # In-memory execution supports combiner generators only.
    for g in generators:
      if not isinstance(g, stats_generator.CombinerStatsGenerator):
        raise TypeError('Statistics generator used in '
                        'generate_statistics_in_memory must '
                        'extend CombinerStatsGenerator, found object of '
                        'type %s.' % g.__class__.__name__)
  return generators
def test_nl_generator_empty_input(self):
  """Tests that empty input produces the default expected statistics."""
  self.assertCombinerOutputEqual(
      [], nlsg.NLStatsGenerator(None, None, 0, 0, 0),
      self._create_expected_feature_name_statistics())