def __init__(self,
                 schema: schema_pb2.Schema,
                 name: Text = 'SparseFeatureStatsGenerator') -> None:
        """Sets up the constituent generators for the schema's sparse features.

        Args:
          schema: A required schema for the dataset.
          name: An optional unique name associated with the statistics
            generator.
        """
        components = _get_components(_get_all_sparse_features(schema))
        self._sparse_feature_components = components

        # Each sparse feature contributes one missing-count generator for its
        # value path, plus a length-diff generator and a missing-count
        # generator for every one of its index paths. All of them share the
        # same required paths: the value path followed by the index paths.
        generators = []
        for value_path, index_paths in components.values():
            all_paths = [value_path, *index_paths]
            generators.append(
                count_missing_generator.CountMissingGenerator(
                    value_path, all_paths))
            for index_path in index_paths:
                generators.extend((
                    length_diff_generator.LengthDiffGenerator(
                        index_path, value_path, all_paths),
                    count_missing_generator.CountMissingGenerator(
                        index_path, all_paths),
                ))

        super(SparseFeatureStatsGenerator,
              self).__init__(name, generators, schema)
예제 #2
0
 def test_length_diff_generator_key(self):
   """Checks that `get_key()` and the class-level `key()` give the same key."""
   left = types.FeaturePath(['f1'])
   right = types.FeaturePath(['f2'])
   want = ('LengthDiffGenerator', left, right)

   # Wrapping each key in a one-entry dict makes assertDictEqual compare
   # them as dictionary keys.
   instance_key = length_diff_generator.LengthDiffGenerator(
       left, right).get_key()
   self.assertDictEqual({want: None}, {instance_key: None})

   class_key = length_diff_generator.LengthDiffGenerator.key(left, right)
   self.assertDictEqual({want: None}, {class_key: None})
예제 #3
0
 def test_length_diff_generator_both_missing(self):
     """Neither compared feature is in the batch; the output must be (0, 0).

     The batch carries only the `required` column, which is also the single
     required path handed to the generator.
     """
     batch = input_batch.InputBatch(
         pa.Table.from_arrays([pa.array([[1], [1], [1]])], ['required']))
     path1 = types.FeaturePath(['f1'])
     path2 = types.FeaturePath(['f2'])
     # FeaturePath takes an iterable of steps. A bare string would be
     # iterated character by character and would not refer to the
     # 'required' column, so the step is wrapped in a list like the others.
     required_path = types.FeaturePath(['required'])
     generator = length_diff_generator.LengthDiffGenerator(
         path1, path2, required_paths=[required_path])
     accumulator = generator.create_accumulator()
     accumulator = generator.add_input(accumulator, batch)
     self.assertEqual((0, 0), generator.extract_output(accumulator))
예제 #4
0
 def test_length_diff_generator_negative_min_max(self):
     """Builds the generator as (f2, f1) and expects the output (-2, -1).

     Row 1 is null in every column. For rows 0 and 2, len(f2) - len(f1) is
     1 - 3 = -2 and 0 - 1 = -1, which matches the expected pair.
     """
     batch = input_batch.InputBatch(
         pa.Table.from_arrays([
             pa.array([[1, 2, 3], None, [1]]),
             pa.array([[1], None, []]),
             pa.array([[1], None, [1]])
         ], ['f1', 'f2', 'required']))
     path1 = types.FeaturePath(['f1'])
     path2 = types.FeaturePath(['f2'])
     # FeaturePath takes an iterable of steps. A bare string would be
     # iterated character by character and would not refer to the
     # 'required' column, so the step is wrapped in a list like the others.
     required_path = types.FeaturePath(['required'])
     generator = length_diff_generator.LengthDiffGenerator(
         path2, path1, required_paths=[path1, path2, required_path])
     accumulator = generator.create_accumulator()
     accumulator = generator.add_input(accumulator, batch)
     self.assertEqual((-2, -1), generator.extract_output(accumulator))
예제 #5
0
 def test_length_diff_generator_key_with_required(self):
     """Checks that the key also lists the required paths, in order.

     The expected key is the generator name, the two compared paths, and
     then each required path as passed in.
     """
     left = types.FeaturePath(['f1'])
     right = types.FeaturePath(['f2'])
     extra = types.FeaturePath(['required'])
     must_have = [left, right, extra]
     want = ('LengthDiffGenerator', left, right) + (left, right, extra)

     # Wrapping each key in a one-entry dict makes assertDictEqual compare
     # them as dictionary keys.
     instance_key = length_diff_generator.LengthDiffGenerator(
         left, right, must_have).get_key()
     self.assertDictEqual({want: None}, {instance_key: None})

     class_key = length_diff_generator.LengthDiffGenerator.key(
         left, right, must_have)
     self.assertDictEqual({want: None}, {class_key: None})
 def __init__(self,
              schema: schema_pb2.Schema,
              name: Text = 'WeightedFeatureStatsGenerator') -> None:
     """Builds the constituent generators for each weighted feature.

     Args:
       schema: Schema whose `weighted_feature` entries are read.
       name: Name forwarded to the parent constructor.
     """
     generators = []
     for feature in schema.weighted_feature:
         weight_path = types.FeaturePath.from_proto(feature.weight_feature)
         value_path = types.FeaturePath.from_proto(feature.feature)
         # Both the weight and the value path are required by all three
         # generators created for this weighted feature.
         paths = [weight_path, value_path]
         generators.extend((
             length_diff_generator.LengthDiffGenerator(
                 weight_path, value_path, required_paths=paths),
             count_missing_generator.CountMissingGenerator(
                 value_path, required_paths=paths),
             count_missing_generator.CountMissingGenerator(
                 weight_path, required_paths=paths),
         ))
     super(WeightedFeatureStatsGenerator,
           self).__init__(name, generators, schema)