def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from input tables."""
  slice_key, table = sliced_table

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      table, weight_column=weight_feature, enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    if (feature_path in categorical_features or
        stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array_type) ==
        statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = feature_array.flatten()
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = arrow_util.primitive_array_to_numpy(
            flattened_values)
        parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = arrow_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
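# `_weighted_unique` is referenced above (and in the later `_to_topk_tuples`
# variants) but not defined in this excerpt. Below is a minimal sketch of the
# contract the callers rely on -- yielding (value, unweighted_count,
# total_weight) triples from parallel value/weight arrays. It is an
# illustration built on np.unique, not necessarily the library's actual code.
import numpy as np

def _weighted_unique(values, weights):
  """Yields (unique_value, count, total_weight) triples (illustrative)."""
  # `return_inverse` maps each element to the index of its unique value.
  unique_values, inverse, counts = np.unique(
      values, return_inverse=True, return_counts=True)
  # Sum the weights falling into each unique-value bucket.
  weight_sums = np.bincount(
      inverse, weights=weights, minlength=len(unique_values))
  for value, count, weight in zip(unique_values, counts, weight_sums):
    yield value, int(count), float(weight)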
def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialBasicStats],
    examples_table: pa.Table
) -> Dict[types.FeaturePath, _PartialBasicStats]:
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      examples_table,
      weight_column=self._weight_feature,
      enumerate_leaves_only=False):
    stats_for_feature = accumulator.get(feature_path)
    if stats_for_feature is None:
      stats_for_feature = _PartialBasicStats(self._weight_feature is not None)
      # Store empty summary.
      stats_for_feature.common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.create_accumulator())
      stats_for_feature.numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.create_accumulator())
      accumulator[feature_path] = stats_for_feature

    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    stats_for_feature.common_stats.update(
        feature_path, feature_array, feature_type,
        self._num_values_quantiles_combiner, weights)
    is_categorical_feature = feature_path in self._categorical_features
    if (is_categorical_feature or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      stats_for_feature.string_stats.update(feature_array)
    elif feature_type in (statistics_pb2.FeatureNameStatistics.INT,
                          statistics_pb2.FeatureNameStatistics.FLOAT):
      stats_for_feature.numeric_stats.update(
          feature_array, self._values_quantiles_combiner, weights)
  return accumulator
def testEnumerateArrays(self):
  # itertools.product covers all four (leaves_only, has_weights) combinations;
  # combinations_with_replacement would skip (False, True).
  for leaves_only, has_weights in itertools.product([True, False],
                                                    [True, False]):
    actual_results = {}
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
        _INPUT_TABLE, "w" if has_weights else None, leaves_only):
      actual_results[feature_path] = (feature_array, weights)

    expected_results = {}
    for p in [["f1"], ["w"], ["f2", "sf1"], ["f2", "sf2", "ssf1"]]:
      feature_path = types.FeaturePath(p)
      expected_results[feature_path] = (
          _FEATURES_TO_ARRAYS[feature_path][0],
          _FEATURES_TO_ARRAYS[feature_path][1] if has_weights else None)
    if not leaves_only:
      for p in [["f2"], ["f2", "sf2"]]:
        feature_path = types.FeaturePath(p)
        expected_results[feature_path] = (
            _FEATURES_TO_ARRAYS[feature_path][0],
            _FEATURES_TO_ARRAYS[feature_path][1] if has_weights else None)

    self.assertLen(actual_results, len(expected_results))
    for k, v in six.iteritems(expected_results):
      self.assertIn(k, actual_results)
      actual = actual_results[k]
      self.assertTrue(
          actual[0].equals(v[0]),
          "leaves_only={}; has_weights={}; "
          "feature={}; expected: {}; actual: {}".format(
              leaves_only, has_weights, k, v, actual))
      np.testing.assert_array_equal(actual[1], v[1])
def add_input(self, wrapper_accumulator: WrapperAccumulator,
              input_record_batch: pa.RecordBatch) -> WrapperAccumulator:
  """Returns result of folding a batch of inputs into wrapper_accumulator.

  Args:
    wrapper_accumulator: The current wrapper accumulator.
    input_record_batch: An arrow RecordBatch representing a batch of examples,
      which should be added to the accumulator.

  Returns:
    The wrapper_accumulator after updating the statistics for the batch of
    inputs.
  """
  if self._sample_rate is not None and random.random() <= self._sample_rate:
    return wrapper_accumulator

  for feature_path, feature_array, _ in arrow_util.enumerate_arrays(
      input_record_batch,
      weight_column=self._weight_feature,
      enumerate_leaves_only=True):
    for index, generator in enumerate(self._feature_stats_generators):
      self._perhaps_initialize_for_feature_path(wrapper_accumulator,
                                                feature_path)
      wrapper_accumulator[feature_path][index] = generator.add_input(
          generator.create_accumulator(), feature_path, feature_array)

  return wrapper_accumulator
def testInvalidWeightColumnStringValues(self):
  with self.assertRaisesRegex(
      ValueError, 'Weight feature "w" must be of numeric type.*'):
    for _ in arrow_util.enumerate_arrays(
        pa.Table.from_arrays(
            [pa.array([[1], [2, 3]]), pa.array([["two"], ["two"]])],
            ["v", "w"]),
        weight_column="w",
        enumerate_leaves_only=False):
      pass
def testInvalidWeightColumn(self):
  with self.assertRaisesRegex(
      ValueError,
      "weight feature must have exactly one value in each example"):
    for _ in arrow_util.enumerate_arrays(
        pa.Table.from_arrays(
            [pa.array([[1], [2, 3]]), pa.array([[1], []])], ["v", "w"]),
        weight_column="w",
        enumerate_leaves_only=False):
      pass
def testEnumerateArraysStringWeight(self):
  # The arrow type of a string changes between py2 and py3, so we accept
  # either.
  with self.assertRaisesRegex(
      ValueError,
      r'Weight column "w" must be of numeric type. Found (string|binary).*'):
    for _ in arrow_util.enumerate_arrays(
        pa.RecordBatch.from_arrays(
            [pa.array([[1], [2, 3]]), pa.array([["a"], ["b"]])], ["v", "w"]),
        weight_column="w",
        enumerate_leaves_only=True):
      pass
def add_input(
    self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
    input_record_batch: pa.RecordBatch
) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    if self._should_run(feature_path, feature_type):
      self._update_combined_sketch_for_feature(feature_path, leaf_array,
                                               weights, accumulator)
  return accumulator
def testEnumerateArraysWithColumnSelectFn(self, col_fn, expected_features):
  actual = list(
      arrow_util.enumerate_arrays(
          _INPUT_RECORD_BATCH,
          _EXAMPLE_WEIGHT_MAP,
          True,
          column_select_fn=col_fn))
  expected = list(
      (f, _FEATURES_TO_ARRAYS[f].array, _FEATURES_TO_ARRAYS[f].weights)
      for f in expected_features)
  for ((actual_path, actual_col, actual_w),
       (expected_path, expected_col, expected_w)) in zip(actual, expected):
    self.assertEqual(expected_path, actual_path)
    self.assertEqual(expected_col, actual_col)
    self.assertEqual(pa.array(expected_w), pa.array(actual_w))
def testEnumerateMissingPropagatedInFlattenedStruct(self, batch,
                                                    expected_results):
  actual_results = {}
  for feature_path, feature_array, _ in arrow_util.enumerate_arrays(
      batch, example_weight_map=None, enumerate_leaves_only=False):
    actual_results[feature_path] = feature_array

  self.assertLen(actual_results, len(expected_results))
  for k, v in six.iteritems(expected_results):
    # Fold the diagnostic message into assertIn rather than duplicating the
    # membership check with a bare assert.
    self.assertIn(k, actual_results, (k, list(actual_results.keys())))
    actual = _Normalize(actual_results[k])
    v = _Normalize(v)
    self.assertTrue(
        actual.equals(v),
        "feature={}; expected: {}; actual: {}; diff: {}".format(
            k, v, actual, actual.diff(v)))
def add_input(
    self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
    input_record_batch: pa.RecordBatch
) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    # Only compute top-k and unique stats for categorical and string features.
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in self._categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      self._update_combined_sketch_for_feature(feature_path, leaf_array,
                                               weights, accumulator)
  return accumulator
def add_input(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts],
    input_record_batch: pa.RecordBatch
) -> Dict[types.FeaturePath, _ValueCounts]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      weight_column=self._weight_feature,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    if feature_type is None:
      continue
    # If it's not a categorical feature nor a string feature, we don't bother
    # with top-k stats.
    if (feature_path in self._categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = leaf_array.flatten()
      unweighted_counts = collections.Counter()
      # Compute unweighted counts.
      value_counts = array_util.ValueCounts(flattened_values)
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in six.moves.zip(values, counts):
        unweighted_counts[value] = count

      # Compute weighted counts if a weight feature is specified.
      weighted_counts = _WeightedCounter()
      if weights is not None:
        flattened_values_np = np.asarray(flattened_values)
        parent_indices = array_util.GetFlattenedArrayParentIndices(leaf_array)
        weighted_counts.weighted_update(flattened_values_np,
                                        weights[np.asarray(parent_indices)])

      if feature_path not in accumulator:
        accumulator[feature_path] = _ValueCounts(
            unweighted_counts=unweighted_counts,
            weighted_counts=weighted_counts)
      else:
        accumulator[feature_path].unweighted_counts.update(unweighted_counts)
        accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator
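# `_WeightedCounter` is likewise not defined in this excerpt. Given how it is
# used above -- `weighted_update` over parallel value/weight arrays, plus
# Counter-style `update` for merging two accumulators -- one plausible minimal
# sketch (hypothetical, not the library's actual class) is:
import collections

class _WeightedCounter(collections.Counter):
  """Counter whose values are accumulated float weights (illustrative)."""

  def weighted_update(self, values, weights):
    # Add each value's weight to its running total; merging two
    # _WeightedCounters via Counter.update then sums the totals.
    for value, weight in zip(values, weights):
      self[value] += weight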
def testEnumerateArrays(self):
  for leaves_only, has_weights, wrap_flat_struct_in_list in (
      itertools.product([True, False], [True, False], [True, False])):
    actual_results = {}
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
        _INPUT_RECORD_BATCH, _EXAMPLE_WEIGHT_MAP if has_weights else None,
        leaves_only, wrap_flat_struct_in_list):
      actual_results[feature_path] = (feature_array, weights)

    expected_results = {}
    # Leaf fields.
    for p in [["f1"], ["w"], ["w_override1"], ["w_override2"],
              ["f2", "sf1"], ["f2", "sf2", "ssf1"],
              ["f3", "sf1"], ["f3", "sf2"]]:
      feature_path = types.FeaturePath(p)
      expected_results[feature_path] = (
          _FEATURES_TO_ARRAYS[feature_path].array,
          _FEATURES_TO_ARRAYS[feature_path].weights if has_weights else None)
    if not leaves_only:
      for p in [["f2"], ["f2", "sf2"], ["f3"]]:
        feature_path = types.FeaturePath(p)
        expected_array = _FEATURES_TO_ARRAYS[feature_path][0]
        if wrap_flat_struct_in_list and pa.types.is_struct(
            expected_array.type):
          expected_array = array_util.ToSingletonListArray(expected_array)
        expected_results[feature_path] = (
            expected_array,
            _FEATURES_TO_ARRAYS[feature_path].weights if has_weights else None)

    self.assertLen(actual_results, len(expected_results))
    for k, v in six.iteritems(expected_results):
      self.assertIn(k, actual_results)
      actual = actual_results[k]
      self.assertTrue(
          actual[0].equals(v[0]),
          "leaves_only={}; has_weights={}; "
          "wrap_flat_struct_in_list={} feature={}; expected: {}; actual: {}"
          .format(leaves_only, has_weights, wrap_flat_struct_in_list, k, v,
                  actual))
      np.testing.assert_array_equal(actual[1], v[1])
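# For readers unfamiliar with `enumerate_arrays`, the snippet below shows the
# shape of its output on a tiny batch. It assumes TFDV's arrow_util module is
# importable at the path used in the library's own tests; the keyword
# arguments match the calls above.
import pyarrow as pa
from tensorflow_data_validation.arrow import arrow_util

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2, 3]]), pa.array([[1.0], [2.0]])], ["f1", "w"])
for path, leaf_array, weights in arrow_util.enumerate_arrays(
    batch, example_weight_map=None, enumerate_leaves_only=True):
  # Yields (FeaturePath, leaf array, weights aligned with the array's rows,
  # or None when no weight feature is configured), e.g.
  #   f1 -> [[1], [2, 3]], weights=None
  print(path, leaf_array.to_pylist(), weights)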
def add_input(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts],
    input_record_batch: pa.RecordBatch
) -> Dict[types.FeaturePath, _ValueCounts]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    # If it's not a categorical int feature nor a string feature, we don't
    # bother with top-k stats.
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in self._categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          leaf_array, weights is not None)
      unweighted_counts = collections.Counter()
      # Compute unweighted counts.
      value_counts = flattened_values.value_counts()
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in zip(values, counts):
        unweighted_counts[value] = count

      # Compute weighted counts if a weight feature is specified.
      weighted_counts = _WeightedCounter()
      if weights is not None:
        flattened_values_np = np.asarray(flattened_values)
        weighted_counts.weighted_update(flattened_values_np,
                                        weights[parent_indices])

      if feature_path not in accumulator:
        accumulator[feature_path] = _ValueCounts(
            unweighted_counts=unweighted_counts,
            weighted_counts=weighted_counts)
      else:
        accumulator[feature_path].unweighted_counts.update(unweighted_counts)
        accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch
  has_any_weight = bool(example_weight_map.all_weight_features())

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      example_weight_map=example_weight_map,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    if feature_path in bytes_features:
      continue
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = flattened_values.value_counts()
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        if has_any_weight:
          for value, count in zip(values, counts):
            yield ((slice_key, feature_path.steps(), value), (count, 1))
        else:
          for value, count in zip(values, counts):
            yield ((slice_key, feature_path.steps(), value), count)
def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialBasicStats],
    examples: pa.RecordBatch
) -> Dict[types.FeaturePath, _PartialBasicStats]:
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      examples,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=False):
    stats_for_feature = accumulator.get(feature_path)
    if stats_for_feature is None:
      stats_for_feature = _PartialBasicStats(
          weights is not None, self._make_quantiles_sketch_fn)
      accumulator[feature_path] = stats_for_feature

    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    stats_for_feature.common_stats.update(feature_path, feature_array,
                                          feature_type,
                                          self._make_quantiles_sketch_fn,
                                          weights)
    # The user may make certain claims about a feature's data type
    # (e.g. _bytes_features imply string data type). However we should not
    # trust those claims because TFDV is also responsible for detecting
    # mismatching types. We collect stats according to the actual type, and
    # only when the actual type matches the claim do we collect the
    # type-specific stats (like for categorical int and bytes features).
    if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
      if feature_path in self._bytes_features:
        stats_for_feature.bytes_stats.update(feature_array)
      else:
        stats_for_feature.string_stats.update(feature_array)
    elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
      if feature_path in self._categorical_features:
        stats_for_feature.string_stats.update(feature_array)
      else:
        stats_for_feature.numeric_stats.update(feature_array, weights)
    elif feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      stats_for_feature.numeric_stats.update(feature_array, weights)

  return accumulator
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    # Skip null columns.
    if feature_type is None:
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
def testEnumerate(self):
  input_table = pa.Table.from_arrays([
      pa.array([[1], [2, 3]]),
      pa.array([[{"sf1": [["a", "b"]]}],
                [{"sf2": [{"ssf1": [[3], [4]]}]}]]),
      pa.array([[1.0], [2.0]]),
  ], ["f1", "f2", "w"])
  possible_results = {
      types.FeaturePath(["f1"]): (pa.array([[1], [2, 3]]), [1.0, 2.0]),
      types.FeaturePath(["w"]): (pa.array([[1.0], [2.0]]), [1.0, 2.0]),
      types.FeaturePath(["f2"]): (pa.array([
          [{"sf1": [["a", "b"]]}],
          [{"sf2": [{"ssf1": [[3], [4]]}]}],
      ]), [1.0, 2.0]),
      types.FeaturePath(["f2", "sf1"]): (
          pa.array([[["a", "b"]], None]), [1.0, 2.0]),
      types.FeaturePath(["f2", "sf2"]): (
          pa.array([None, [{"ssf1": [[3], [4]]}]]), [1.0, 2.0]),
      types.FeaturePath(["f2", "sf2", "ssf1"]): (
          pa.array([[[3], [4]]]), [2.0]),
  }
  # itertools.product covers all four (leaves_only, has_weights) combinations;
  # combinations_with_replacement would skip (False, True).
  for leaves_only, has_weights in itertools.product([True, False],
                                                    [True, False]):
    actual_results = {}
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
        input_table, "w" if has_weights else None, leaves_only):
      actual_results[feature_path] = (feature_array, weights)

    expected_results = {}
    for p in [["f1"], ["w"], ["f2", "sf1"], ["f2", "sf2", "ssf1"]]:
      feature_path = types.FeaturePath(p)
      expected_results[feature_path] = (
          possible_results[feature_path][0],
          possible_results[feature_path][1] if has_weights else None)
    if not leaves_only:
      for p in [["f2"], ["f2", "sf2"]]:
        feature_path = types.FeaturePath(p)
        expected_results[feature_path] = (
            possible_results[feature_path][0],
            possible_results[feature_path][1] if has_weights else None)

    self.assertLen(actual_results, len(expected_results))
    for k, v in six.iteritems(expected_results):
      self.assertIn(k, actual_results)
      actual = actual_results[k]
      self.assertTrue(
          actual[0].equals(v[0]),
          "leaves_only={}; has_weights={}; "
          "feature={}; expected: {}; actual: {}".format(
              leaves_only, has_weights, k, v, actual))
      np.testing.assert_array_equal(actual[1], v[1])
def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
    input_table: pa.Table
) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
  """Returns result of folding a batch of inputs into the current accumulator.

  Args:
    accumulator: The current accumulator.
    input_table: An Arrow Table whose columns are features and rows are
      examples.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  feature_value_list_lengths = dict()
  feature_is_missing = dict()
  batch_example_count = input_table.num_rows
  # Do a single pass through the input table to determine the value list
  # lengths and whether the feature is missing, for every feature that is an
  # index or value feature in any sparse feature in the schema.
  for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
      input_table, weight_column=None, enumerate_leaves_only=True):
    if (feature_path in self._all_index_feature_paths or
        feature_path in self._all_value_feature_paths):
      if pa.types.is_null(leaf_array.type):
        # If the column is a NullArray, it is missing from the entire batch
        # (missing features have value list lengths of 0).
        feature_value_list_lengths[feature_path] = np.full(
            batch_example_count, 0)
        feature_is_missing[feature_path] = np.full(batch_example_count, True)
      else:
        feature_value_list_lengths[feature_path] = np.asarray(
            array_util.ListLengthsFromListArray(leaf_array))
        feature_is_missing[feature_path] = np.asarray(
            array_util.GetArrayNullBitmapAsByteArray(leaf_array))

  # Now create a partial sparse feature stats object for each sparse feature
  # using the value list lengths and feature missing information collected
  # above.
  for feature_path in self._sparse_feature_component_paths:
    value_feature_path = self._sparse_feature_component_paths[
        feature_path].value_feature
    index_feature_paths = self._sparse_feature_component_paths[
        feature_path].index_features
    # Create a filter identifying examples in which the entire sparse feature
    # is missing, since those examples should not be included in counting
    # missing counts or length differences.
    component_features_missing = np.array([
        feature_is_missing.get(path, np.full(batch_example_count, True))
        for path in itertools.chain([value_feature_path], index_feature_paths)
    ])
    entire_sparse_feature_missing = np.all(component_features_missing, axis=0)
    num_examples_missing_sparse_feature = np.sum(
        entire_sparse_feature_missing)
    # If all examples in the batch are missing the sparse feature, do not
    # update the accumulator with the partial stats for that sparse feature.
    if num_examples_missing_sparse_feature == batch_example_count:
      continue

    is_missing_value_feature = feature_is_missing.get(value_feature_path)
    # If this batch does not have the value feature at all,
    # missing_value_count is the number of examples in the batch. Also
    # populate the value list lengths for the value feature with all 0s,
    # since a missing feature is considered to have a value list length of 0.
    if is_missing_value_feature is None:
      missing_value_count = batch_example_count
      feature_value_list_lengths[value_feature_path] = np.full(
          batch_example_count, 0)
    else:
      missing_value_count = np.sum(is_missing_value_feature)
    # Do not include examples that are entirely missing the sparse feature in
    # the missing value count.
    missing_value_count -= num_examples_missing_sparse_feature

    missing_index_counts = collections.Counter()
    min_length_diff = dict()
    max_length_diff = dict()
    for index_feature_path in index_feature_paths:
      is_missing_index_feature = feature_is_missing.get(index_feature_path)
      if is_missing_index_feature is None:
        # If this batch does not have this index feature at all,
        # missing_index_count for that index feature is the number of
        # examples in the batch. Populate the value list lengths for the
        # index feature with all 0s, since a missing feature is considered
        # to have a value list length of 0.
        missing_index_count = batch_example_count
        feature_value_list_lengths[index_feature_path] = np.full(
            batch_example_count, 0)
      else:
        missing_index_count = np.sum(is_missing_index_feature)
      # Do not include examples that are entirely missing the sparse feature
      # in the missing index count.
      missing_index_counts[index_feature_path] = (
          missing_index_count - num_examples_missing_sparse_feature)

      length_differences = np.subtract(
          feature_value_list_lengths[index_feature_path],
          feature_value_list_lengths[value_feature_path])
      # Do not include examples that are entirely missing the sparse feature
      # in determining the min and max length differences.
      filtered_length_differences = length_differences[
          ~entire_sparse_feature_missing]
      # This generator should not get to this point if the current sparse
      # feature is missing from all examples in the batch (which would cause
      # filtered_length_differences to be empty).
      assert filtered_length_differences.size != 0
      min_length_diff[index_feature_path] = np.min(
          filtered_length_differences)
      max_length_diff[index_feature_path] = np.max(
          filtered_length_differences)

    stats_for_feature = _PartialSparseFeatureStats(
        missing_value_count, missing_index_counts, min_length_diff,
        max_length_diff)
    existing_stats_for_feature = accumulator.get(feature_path)
    if existing_stats_for_feature is None:
      accumulator[feature_path] = stats_for_feature
    else:
      accumulator[feature_path] += stats_for_feature
  return accumulator
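# To make the masking step above concrete, here is a small self-contained
# NumPy example (hypothetical data) showing how excluding examples that are
# entirely missing the sparse feature affects the min/max length difference:
import numpy as np

# Value-list lengths for one index feature and the value feature across a
# batch of 4 examples; example 3 is entirely missing the sparse feature.
index_lengths = np.array([2, 3, 1, 0])
value_lengths = np.array([2, 2, 1, 0])
entire_sparse_feature_missing = np.array([False, False, False, True])

length_differences = index_lengths - value_lengths  # [0, 1, 0, 0]
filtered = length_differences[~entire_sparse_feature_missing]
print(filtered.min(), filtered.max())  # 0 1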
def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
    input_table: pa.Table
) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
  """Returns result of folding a batch of inputs into the current accumulator.

  Args:
    accumulator: The current accumulator.
    input_table: An Arrow Table whose columns are features and rows are
      examples.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  component_feature_value_list_lengths = dict()
  component_feature_num_missing = dict()
  batch_example_count = input_table.num_rows
  # Do a single pass through the input table to determine the value list
  # lengths and number missing, for every feature that is an index or value
  # feature in any sparse feature in the schema.
  for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
      input_table, weight_column=None, enumerate_leaves_only=True):
    if (feature_path in self._all_index_feature_paths or
        feature_path in self._all_value_feature_paths):
      # If the column is a NullArray, skip it when populating the
      # component_feature_ dicts. Features that are missing from those dicts
      # are treated as entirely missing for the batch.
      if not pa.types.is_null(leaf_array.type):
        component_feature_value_list_lengths[feature_path] = (
            arrow_util.primitive_array_to_numpy(
                arrow_util.ListLengthsFromListArray(leaf_array)))
        component_feature_num_missing[feature_path] = leaf_array.null_count

  # Now create a partial sparse feature stats object for each sparse feature
  # using the value list lengths and number-missing information collected
  # above.
  for feature_path in self._sparse_feature_component_paths:
    value_feature_path = self._sparse_feature_component_paths[
        feature_path].value_feature
    index_feature_paths = self._sparse_feature_component_paths[
        feature_path].index_features
    missing_value_count = component_feature_num_missing.get(
        value_feature_path)
    # If this batch does not have the value feature at all,
    # missing_value_count is the number of examples in the batch. Also
    # populate the value list lengths for the value feature with all 0s,
    # since a missing feature is considered to have a value list length of 0.
    if missing_value_count is None:
      missing_value_count = batch_example_count
      component_feature_value_list_lengths[value_feature_path] = np.full(
          batch_example_count, 0)

    missing_index_counts = collections.Counter()
    min_length_diff = dict()
    max_length_diff = dict()
    for index_feature_path in index_feature_paths:
      missing_index_count = component_feature_num_missing.get(
          index_feature_path)
      # If this batch does not have this index feature at all,
      # missing_index_count for that index feature is the number of examples
      # in the batch. Also populate the value list lengths for the index
      # feature with all 0s, since a missing feature is considered to have a
      # value list length of 0.
      if missing_index_count is None:
        missing_index_counts[index_feature_path] = batch_example_count
        component_feature_value_list_lengths[index_feature_path] = np.full(
            batch_example_count, 0)
      else:
        missing_index_counts[index_feature_path] = missing_index_count
      length_differences = np.subtract(
          component_feature_value_list_lengths[index_feature_path],
          component_feature_value_list_lengths[value_feature_path])
      min_length_diff[index_feature_path] = np.min(length_differences)
      max_length_diff[index_feature_path] = np.max(length_differences)

    stats_for_feature = _PartialSparseFeatureStats(
        missing_value_count, missing_index_counts, min_length_diff,
        max_length_diff)
    existing_stats_for_feature = accumulator.get(feature_path)
    if existing_stats_for_feature is None:
      accumulator[feature_path] = stats_for_feature
    else:
      accumulator[feature_path] += stats_for_feature
  return accumulator