def __init__(self,
             schema: schema_pb2.Schema,
             name: Text = 'SparseFeatureStatsGenerator') -> None:
    """Initializes a sparse feature statistics generator.

    The sparse feature components derived from the schema are stored on the
    instance. For each component, one count-missing generator is registered
    for the value path, plus a length-diff generator and a count-missing
    generator for every index path.

    Args:
      schema: A required schema for the dataset.
      name: An optional unique name associated with the statistics generator.
    """
    self._sparse_feature_components = _get_components(
        _get_all_sparse_features(schema))

    constituents = []
    for _, (value_path, index_paths) in (
            self._sparse_feature_components.items()):
        # A single list holding the value path followed by all index paths;
        # the same list object is handed to every generator of this feature.
        all_paths = [value_path] + list(index_paths)

        constituents.append(
            count_missing_generator.CountMissingGenerator(
                value_path, all_paths))

        # Each index path is paired with the value path for the length diff
        # and additionally gets its own count-missing generator.
        for index_path in index_paths:
            constituents.extend((
                length_diff_generator.LengthDiffGenerator(
                    index_path, value_path, all_paths),
                count_missing_generator.CountMissingGenerator(
                    index_path, all_paths),
            ))

    super(SparseFeatureStatsGenerator, self).__init__(
        name, constituents, schema)
def test_length_diff_generator_key(self):
    """Checks get_key() and key() when no required paths are supplied.

    The keys are compared as dictionary keys, so both hashing and equality
    of the produced key take part in the comparison.
    """
    left = types.FeaturePath(['f1'])
    right = types.FeaturePath(['f2'])
    generator = length_diff_generator.LengthDiffGenerator(left, right)
    expected = {('LengthDiffGenerator', left, right): None}

    # Key reported by a constructed generator instance.
    instance_key = generator.get_key()
    self.assertDictEqual(expected, {instance_key: None})

    # Key computed through the class-level key() with the same arguments.
    class_level_key = length_diff_generator.LengthDiffGenerator.key(
        left, right)
    self.assertDictEqual(expected, {class_level_key: None})
def test_length_diff_generator_both_missing(self):
    """Checks the output when neither compared path exists in the batch.

    The batch contains only a 'required' column, so 'f1' and 'f2' are both
    absent. The generator is expected to report (0, 0).
    """
    batch = input_batch.InputBatch(
        pa.Table.from_arrays([pa.array([[1], [1], [1]])], ['required']))
    path1 = types.FeaturePath(['f1'])
    path2 = types.FeaturePath(['f2'])
    # Built from a list of steps, like every other path in these tests, so
    # that it names the 'required' column of the batch.
    # NOTE(review): a bare string here would presumably be consumed as a
    # sequence of one-character steps -- confirm against FeaturePath.
    required_path = types.FeaturePath(['required'])

    generator = length_diff_generator.LengthDiffGenerator(
        path1, path2, required_paths=[required_path])
    accumulator = generator.create_accumulator()
    accumulator = generator.add_input(accumulator, batch)

    self.assertEqual((0, 0), generator.extract_output(accumulator))
def test_length_diff_generator_negative_min_max(self):
    """Checks that negative length differences are reported for min and max.

    The generator is built as (f2, f1), and in the rows where both are
    populated f2 is shorter than f1 (lengths 1 vs 3 and 0 vs 1). The expected
    output is therefore (-2, -1). The middle row is null in every column.
    """
    batch = input_batch.InputBatch(
        pa.Table.from_arrays([
            pa.array([[1, 2, 3], None, [1]]),
            pa.array([[1], None, []]),
            pa.array([[1], None, [1]]),
        ], ['f1', 'f2', 'required']))
    path1 = types.FeaturePath(['f1'])
    path2 = types.FeaturePath(['f2'])
    # Built from a list of steps, like path1/path2, so that it names the
    # 'required' column of the batch.
    # NOTE(review): a bare string here would presumably be consumed as a
    # sequence of one-character steps -- confirm against FeaturePath.
    required_path = types.FeaturePath(['required'])

    # Note the argument order: path2 first, path1 second.
    generator = length_diff_generator.LengthDiffGenerator(
        path2, path1, required_paths=[path1, path2, required_path])
    accumulator = generator.create_accumulator()
    accumulator = generator.add_input(accumulator, batch)

    self.assertEqual((-2, -1), generator.extract_output(accumulator))
def test_length_diff_generator_key_with_required(self):
    """Checks get_key() and key() when required paths are supplied.

    The expected key is the generator name, the two compared paths, and then
    each required path in the order given. Keys are compared as dictionary
    keys, so both hashing and equality take part in the comparison.
    """
    left = types.FeaturePath(['f1'])
    right = types.FeaturePath(['f2'])
    extra = types.FeaturePath(['required'])
    required_paths = [left, right, extra]
    generator = length_diff_generator.LengthDiffGenerator(
        left, right, required_paths)
    expected = {
        ('LengthDiffGenerator', left, right, left, right, extra): None,
    }

    # Key reported by a constructed generator instance.
    instance_key = generator.get_key()
    self.assertDictEqual(expected, {instance_key: None})

    # Key computed through the class-level key() with the same arguments.
    class_level_key = length_diff_generator.LengthDiffGenerator.key(
        left, right, required_paths)
    self.assertDictEqual(expected, {class_level_key: None})
def __init__(self,
             schema: schema_pb2.Schema,
             name: Text = 'WeightedFeatureStatsGenerator') -> None:
    """Initializes a weighted feature statistics generator.

    For every entry in `schema.weighted_feature`, three constituent
    generators are registered: a length-diff generator comparing the weight
    path with the value path, and a count-missing generator for each of the
    two paths. All three receive the weight and value paths as required
    paths.

    Args:
      schema: Schema whose `weighted_feature` entries are processed; it is
        also forwarded to the base class initializer.
      name: Name forwarded to the base class initializer.
    """
    constituents = []
    for spec in schema.weighted_feature:
        weight_path = types.FeaturePath.from_proto(spec.weight_feature)
        value_path = types.FeaturePath.from_proto(spec.feature)
        # The same list object is shared by the three generators below.
        paths = [weight_path, value_path]
        constituents.extend((
            length_diff_generator.LengthDiffGenerator(
                weight_path, value_path, required_paths=paths),
            count_missing_generator.CountMissingGenerator(
                value_path, required_paths=paths),
            count_missing_generator.CountMissingGenerator(
                weight_path, required_paths=paths),
        ))

    super(WeightedFeatureStatsGenerator, self).__init__(
        name, constituents, schema)