def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialBasicStats],
    examples_table: pa.Table
) -> Dict[types.FeaturePath, _PartialBasicStats]:
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      examples_table,
      weight_column=self._weight_feature,
      enumerate_leaves_only=False):
    stats_for_feature = accumulator.get(feature_path)
    if stats_for_feature is None:
      stats_for_feature = _PartialBasicStats(self._weight_feature is not None)
      # Store empty summary.
      stats_for_feature.common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.create_accumulator())
      stats_for_feature.numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.create_accumulator())
      accumulator[feature_path] = stats_for_feature
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    stats_for_feature.common_stats.update(feature_path, feature_array,
                                          feature_type,
                                          self._num_values_quantiles_combiner,
                                          weights)
    is_categorical_feature = feature_path in self._categorical_features
    if (is_categorical_feature or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      stats_for_feature.string_stats.update(feature_array)
    elif feature_type in (statistics_pb2.FeatureNameStatistics.INT,
                          statistics_pb2.FeatureNameStatistics.FLOAT):
      stats_for_feature.numeric_stats.update(feature_array,
                                             self._values_quantiles_combiner,
                                             weights)
  return accumulator
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from input tables."""
  slice_key, table = sliced_table
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      table, weight_column=weight_feature, enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    if (feature_path in categorical_features or
        stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array_type) ==
        statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = feature_array.flatten()
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = arrow_util.primitive_array_to_numpy(
            flattened_values)
        parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = arrow_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
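The `_weighted_unique` helper is consumed above but not defined in this section. Judging from how its output is unpacked, it yields one (value, count, total_weight) triple per distinct value. A minimal numpy sketch of that assumed contract (an illustration, not the actual TFDV implementation):

import numpy as np


def _weighted_unique_sketch(values, weights):
  """Yields (value, count, total_weight) for each distinct value."""
  unique_values, inverse, counts = np.unique(
      values, return_inverse=True, return_counts=True)
  # Sum the weight of every occurrence into its distinct value's bucket.
  weight_sums = np.zeros(len(unique_values), dtype=weights.dtype)
  np.add.at(weight_sums, inverse, weights)
  for value, count, weight in zip(unique_values, counts, weight_sums):
    yield value, int(count), weight


# Example: values [b'a', b'a', b'b'] with weights [0.5, 1.5, 2.0] yield
# (b'a', 2, 2.0) and (b'b', 1, 2.0).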
def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are not
  supported by sk-learn. All columns of STRUCT type are also dropped.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow table.
  """
  multivalent_features = schema_util.get_multivalent_features(schema)
  unsupported_columns = set()
  for f in multivalent_features:
    unsupported_columns.add(f.steps()[0])
  for column_name, column in zip(examples_table.schema.names,
                                 examples_table.itercolumns()):
    if (stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([column_name]),
        column.type) == statistics_pb2.FeatureNameStatistics.STRUCT):
      unsupported_columns.add(column_name)
  return examples_table.drop(unsupported_columns)
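The same drop-struct-columns idea can be exercised end to end with public pyarrow APIs only; here pa.types.is_struct stands in for the internal stats_util type check, so this is an illustrative sketch rather than the function above:

import pyarrow as pa

table = pa.Table.from_pydict({
    'num': [[1.0], [2.0]],
    'nested': [{'a': 1}, {'a': 2}],
})
struct_columns = [
    name for name, column in zip(table.schema.names, table.itercolumns())
    if pa.types.is_struct(column.type)
]
print(table.drop(struct_columns).column_names)  # ['num']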
def add_input(self, accumulator, input_table):
  weight_ndarrays = []
  if self._weight_feature is not None:
    for a in input_table.column(self._weight_feature).data.iterchunks():
      weight_array = arrow_util.FlattenListArray(a)
      if len(weight_array) != len(a):
        raise ValueError(
            'If weight is specified, then each example must have a weight '
            'feature of length 1.')
      # to_numpy() can only be called against a non-empty arrow array.
      if weight_array:
        weight_ndarrays.append(weight_array.to_numpy())
      else:
        weight_ndarrays.append(
            np.array([], dtype=weight_array.to_pandas_dtype()))
  for column in input_table.columns:
    feature_name = column.name
    if feature_name == self._weight_feature:
      continue
    unweighted_counts = collections.Counter()
    weighted_counts = _WeightedCounter()
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, column.type)
    if not (feature_name in self._categorical_features or
            feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      continue
    for feature_array, weight_ndarray in six.moves.zip_longest(
        column.data.iterchunks(), weight_ndarrays, fillvalue=None):
      flattened_values_array = arrow_util.FlattenListArray(feature_array)
      # to_numpy() cannot be called if the array is empty.
      if not flattened_values_array:
        continue
      if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
        values_ndarray = flattened_values_array.to_pandas()
      else:
        values_ndarray = flattened_values_array.to_numpy()
      value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
          feature_array).to_numpy()
      unweighted_counts.update(values_ndarray)
      if weight_ndarray is not None:
        weight_per_value = weight_ndarray[value_parent_indices]
        weighted_counts.weighted_update(values_ndarray, weight_per_value)
    if feature_name not in accumulator:
      accumulator[feature_name] = _ValueCounts(
          unweighted_counts=unweighted_counts,
          weighted_counts=weighted_counts)
    else:
      accumulator[feature_name].unweighted_counts.update(unweighted_counts)
      accumulator[feature_name].weighted_counts.update(weighted_counts)
  return accumulator
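`_WeightedCounter` is TFDV-internal. Based on its use above, `weighted_update` appears to add, for each value, the weight of each of its occurrences, and `update` merges in another counter. A dict-based sketch of that assumed contract:

import collections
import numpy as np


class WeightedCounterSketch(collections.defaultdict):
  """Maps each value to the running sum of its occurrence weights."""

  def __init__(self):
    super().__init__(float)

  def weighted_update(self, values, weights):
    for value, weight in zip(values, weights):
      self[value] += weight

  def update(self, other):
    for value, weight in other.items():
      self[value] += weight


wc = WeightedCounterSketch()
wc.weighted_update(np.array([b'a', b'a', b'b'], dtype=object),
                   np.array([0.5, 1.5, 2.0]))
print(dict(wc))  # {b'a': 2.0, b'b': 2.0}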
def add_input(self, accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if feature_path not in self._valid_feature_paths:
    accumulator.invalidate = True
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  if feature_type not in self._feature_type_fns:
    accumulator.invalidate = True
    return accumulator
  feature_type_fn = self._feature_type_fns[feature_type]
  vocab = None
  rvocab = None
  if self._nld_vocabularies[feature_path]:
    vocab_name = self._nld_vocabularies[feature_path]
    vocab = self._vocabs[vocab_name]
    rvocab = self._rvocabs[vocab_name]
  excluded_string_tokens = self._nld_excluded_string_tokens[feature_path]
  excluded_int_tokens = self._nld_excluded_int_tokens[feature_path]
  oov_string_tokens = self._nld_oov_string_tokens[feature_path]
  int_tokens = self._nld_specified_int_tokens[feature_path]
  string_tokens = self._nld_specified_str_tokens[feature_path]
  sequence_length_excluded_int_tokens = (
      self._nld_sequence_length_excluded_int_tokens[feature_path])
  sequence_length_excluded_string_tokens = (
      self._nld_sequence_length_excluded_string_tokens[feature_path])

  # TODO(b/175875824): Benchmark and optimize performance.
  for row in feature_array.to_pylist():
    if row is not None:
      feature_type_fn(row, accumulator, excluded_string_tokens,
                      excluded_int_tokens, oov_string_tokens, vocab, rvocab,
                      int_tokens, string_tokens,
                      sequence_length_excluded_int_tokens,
                      sequence_length_excluded_string_tokens,
                      self._num_histogram_buckets)
  return accumulator
def add_input(self, accumulator, input_table):
  weight_column = (input_table.column(self._weight_feature)
                   if self._weight_feature else None)
  weight_array = weight_column.data.chunk(0) if weight_column else []
  if weight_array:
    flattened_weights = arrow_util.FlattenListArray(weight_array).to_numpy()
  for column in input_table.columns:
    feature_name = column.name
    # Skip the weight feature.
    if feature_name == self._weight_feature:
      continue
    feature_path = types.FeaturePath([feature_name])
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, column.type)
    # If it's not a categorical feature nor a string feature, we don't bother
    # with topk stats.
    if not (feature_path in self._categorical_features or
            feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      continue
    value_array = column.data.chunk(0)
    flattened_values = arrow_util.FlattenListArray(value_array)
    unweighted_counts = collections.Counter()
    # Compute unweighted counts.
    value_counts = arrow_util.ValueCounts(flattened_values)
    for value_count in value_counts:
      value_count = value_count.as_py()
      unweighted_counts[value_count['values']] = value_count['counts']
    # Compute weighted counts if a weight feature is specified.
    weighted_counts = _WeightedCounter()
    if weight_array:
      if (pa.types.is_binary(flattened_values.type) or
          pa.types.is_string(flattened_values.type)):
        # No free conversion.
        flattened_values_np = flattened_values.to_pandas()
      else:
        flattened_values_np = flattened_values.to_numpy()
      indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
      weighted_counts.weighted_update(flattened_values_np,
                                      flattened_weights[indices.to_numpy()])
    if feature_path not in accumulator:
      accumulator[feature_path] = _ValueCounts(
          unweighted_counts=unweighted_counts,
          weighted_counts=weighted_counts)
    else:
      accumulator[feature_path].unweighted_counts.update(unweighted_counts)
      accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator
def add_input(self, accumulator: _PartialImageStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialImageStats:
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  # If we see a different type, invalidate.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  # Consider using memoryview to avoid copying after upgrading to
  # arrow 0.12. Note that this would involve modifying the subsequent logic
  # to iterate over the values in a loop.
  values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
  accumulator.total_num_values += values.size
  image_formats = self._image_decoder.get_formats(values)
  valid_mask = ~pd.isnull(image_formats)
  valid_formats = image_formats[valid_mask]
  format_counts = np.unique(valid_formats, return_counts=True)
  for (image_format, count) in zip(*format_counts):
    accumulator.counter_by_format[image_format] += count
  unknown_count = image_formats.size - valid_formats.size
  if unknown_count > 0:
    accumulator.counter_by_format[''] += unknown_count

  if self._enable_size_stats:
    # Get image height and width.
    image_sizes = self._image_decoder.get_sizes(values[valid_mask])
    if image_sizes.any():
      max_sizes = np.max(image_sizes, axis=0)
      # Update the max image height/width with all image values.
      accumulator.max_height = max(accumulator.max_height, max_sizes[0])
      accumulator.max_width = max(accumulator.max_width, max_sizes[1])
  return accumulator
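The per-format tallying above reduces to np.unique(..., return_counts=True) feeding a counter, with unrecognized values bucketed under the empty string. A standalone sketch of just that aggregation step:

import collections
import numpy as np

counter_by_format = collections.Counter()
# Decoder output for five values; None marks an unrecognized format.
image_formats = np.array(['png', 'jpeg', 'png', None, 'gif'], dtype=object)
valid_mask = np.array([f is not None for f in image_formats])
formats, counts = np.unique(image_formats[valid_mask], return_counts=True)
for image_format, count in zip(formats, counts):
  counter_by_format[image_format] += count
# Unrecognized values are tallied under the empty-string key.
counter_by_format[''] += image_formats.size - valid_mask.sum()
print(counter_by_format)  # png: 2, gif: 1, jpeg: 1, '': 1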
def add_input(
    self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
    input_record_batch: pa.RecordBatch
) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    if self._should_run(feature_path, feature_type):
      self._update_combined_sketch_for_feature(feature_path, leaf_array,
                                               weights, accumulator)
  return accumulator
def _remove_unsupported_feature_columns(
    self, examples: pa.RecordBatch,
    schema: schema_pb2.Schema) -> pa.RecordBatch:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are not
  supported by sk-learn. All columns of STRUCT type are also dropped.

  Args:
    examples: Arrow RecordBatch containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow RecordBatch.
  """
  columns = set(examples.schema.names)
  multivalent_features = schema_util.get_multivalent_features(schema)
  unsupported_columns = set()
  for f in multivalent_features:
    # Drop the column if it is present in the examples.
    if f.steps()[0] in columns:
      unsupported_columns.add(f.steps()[0])
  for column_name, column in zip(examples.schema.names, examples.columns):
    # Only support 1-nested non-struct arrays.
    column_type = column.type
    if (arrow_util.get_nest_level(column_type) != 1 or
        stats_util.get_feature_type_from_arrow_type(
            types.FeaturePath([column_name]),
            column_type) == statistics_pb2.FeatureNameStatistics.STRUCT):
      unsupported_columns.add(column_name)
    # Drop columns that were not in the schema.
    if types.FeaturePath([column_name]) not in self._schema_features:
      unsupported_columns.add(column_name)

  supported_columns = []
  supported_column_names = []
  for column_name, column in zip(examples.schema.names, examples.columns):
    if column_name not in unsupported_columns:
      supported_columns.append(column)
      supported_column_names.append(column_name)
  return pa.RecordBatch.from_arrays(supported_columns,
                                    supported_column_names)
def add_input(
    self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
    input_record_batch: pa.RecordBatch
) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    # Only compute top-k and unique stats for categorical and string features.
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in self._categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      self._update_combined_sketch_for_feature(feature_path, leaf_array,
                                               weights, accumulator)
  return accumulator
def add_input(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts],
    input_record_batch: pa.RecordBatch
) -> Dict[types.FeaturePath, _ValueCounts]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      weight_column=self._weight_feature,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    if feature_type is None:
      continue
    # If it's not a categorical feature nor a string feature, we don't bother
    # with topk stats.
    if (feature_path in self._categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = leaf_array.flatten()
      unweighted_counts = collections.Counter()
      # Compute unweighted counts.
      value_counts = array_util.ValueCounts(flattened_values)
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in six.moves.zip(values, counts):
        unweighted_counts[value] = count
      # Compute weighted counts if a weight feature is specified.
      weighted_counts = _WeightedCounter()
      if weights is not None:
        flattened_values_np = np.asarray(flattened_values)
        parent_indices = array_util.GetFlattenedArrayParentIndices(leaf_array)
        weighted_counts.weighted_update(flattened_values_np,
                                        weights[np.asarray(parent_indices)])
      if feature_path not in accumulator:
        accumulator[feature_path] = _ValueCounts(
            unweighted_counts=unweighted_counts,
            weighted_counts=weighted_counts)
      else:
        accumulator[feature_path].unweighted_counts.update(unweighted_counts)
        accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator
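The ValueCounts result consumed above is a StructArray with 'values' and 'counts' fields. The public pyarrow kernel Array.value_counts() produces the same shape (and is what the later RecordBatch variants in this section call directly); a standalone sketch:

import pyarrow as pa

leaf_array = pa.array([['a', 'b', 'a'], ['b'], []])
flattened = leaf_array.flatten()
value_counts = flattened.value_counts()
values = value_counts.field('values').to_pylist()
counts = value_counts.field('counts').to_pylist()
print(dict(zip(values, counts)))  # {'a': 2, 'b': 2}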
def _get_univalent_values_with_parent_indices(
    self, examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
  """Extracts univalent values for each feature along with parent indices."""
  result = {}
  for feature_column in examples_table.itercolumns():
    feature_name = feature_column.name
    if (self._features_needed is not None and
        feature_name not in self._features_needed):
      continue
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, feature_column.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                        statistics_pb2.FeatureNameStatistics.STRUCT):
      continue
    # Assume we have only a single chunk.
    assert feature_column.data.num_chunks == 1
    feat_arr = feature_column.data.chunk(0)
    value_lengths = arrow_util.primitive_array_to_numpy(
        arrow_util.ListLengthsFromListArray(feat_arr))
    univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
    # If there are no univalent values, continue to the next feature.
    if not univalent_parent_indices:
      continue
    non_missing_values = arrow_util.primitive_array_to_numpy(
        feat_arr.flatten())
    value_parent_indices = arrow_util.primitive_array_to_numpy(
        arrow_util.GetFlattenedArrayParentIndices(feat_arr))
    if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      non_nan_mask = ~np.isnan(non_missing_values)
      non_missing_values = non_missing_values[non_nan_mask]
      value_parent_indices = value_parent_indices[non_nan_mask]
    df = pd.DataFrame({
        feature_name: non_missing_values,
        'parent_index': value_parent_indices
    })
    # Only keep the univalent feature values.
    df = df[df['parent_index'].isin(univalent_parent_indices)]
    result[feature_name] = df
  return result
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch
  has_any_weight = bool(example_weight_map.all_weight_features())
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      example_weight_map=example_weight_map,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    if feature_path in bytes_features:
      continue
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = flattened_values.value_counts()
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        if has_any_weight:
          for value, count in zip(values, counts):
            yield ((slice_key, feature_path.steps(), value), (count, 1))
        else:
          for value, count in zip(values, counts):
            yield ((slice_key, feature_path.steps(), value), count)
def add_input(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts],
    input_record_batch: pa.RecordBatch
) -> Dict[types.FeaturePath, _ValueCounts]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    # If it's not a categorical int feature nor a string feature, we don't
    # bother with topk stats.
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in self._categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          leaf_array, weights is not None)
      unweighted_counts = collections.Counter()
      # Compute unweighted counts.
      value_counts = flattened_values.value_counts()
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in zip(values, counts):
        unweighted_counts[value] = count
      # Compute weighted counts if a weight feature is specified.
      weighted_counts = _WeightedCounter()
      if weights is not None:
        flattened_values_np = np.asarray(flattened_values)
        weighted_counts.weighted_update(flattened_values_np,
                                        weights[parent_indices])
      if feature_path not in accumulator:
        accumulator[feature_path] = _ValueCounts(
            unweighted_counts=unweighted_counts,
            weighted_counts=weighted_counts)
      else:
        accumulator[feature_path].unweighted_counts.update(unweighted_counts)
        accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator
def add_input(self, accumulator, examples_table):
  weights = None
  if self._weight_feature:
    weights = arrow_util.FlattenListArray(
        examples_table.column(self._weight_feature).data.chunk(0)).to_numpy()
    if len(weights) != len(examples_table):
      raise ValueError('Expected exactly one weight per example.')
  for feature_column in examples_table.itercolumns():
    feature_name = feature_column.name
    # Skip the weight feature.
    if feature_name == self._weight_feature:
      continue
    feature_path = types.FeaturePath([feature_name])
    is_categorical_feature = feature_path in self._categorical_features
    # If we encounter this feature for the first time, create a
    # new partial basic stats.
    stats_for_feature = accumulator.get(feature_path)
    if stats_for_feature is None:
      stats_for_feature = _PartialBasicStats(self._weight_feature is not None)
      # Store empty summary.
      stats_for_feature.common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.create_accumulator())
      stats_for_feature.numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.create_accumulator())
      accumulator[feature_path] = stats_for_feature
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_column.type)
    stats_for_feature.common_stats.update(feature_column, feature_type,
                                          self._num_values_quantiles_combiner,
                                          weights)
    if (is_categorical_feature or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      stats_for_feature.string_stats.update(feature_column)
    elif feature_type is not None:
      stats_for_feature.numeric_stats.update(feature_column,
                                             self._values_quantiles_combiner,
                                             weights)
  return accumulator
def add_input(self, accumulator: _PartialTimeStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialTimeStats:
  """Returns result of folding a batch of inputs into the current accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidated:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator

  if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

    def _maybe_get_utf8(val):
      return stats_util.maybe_get_utf8(val) if isinstance(val, bytes) else val

    values = arrow_util.primitive_array_to_numpy(feature_array.flatten())
    maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
    if not maybe_utf8.all():
      accumulator.invalidated = True
      return accumulator
    accumulator.update(maybe_utf8, feature_type)
  elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
    values = arrow_util.primitive_array_to_numpy(feature_array.flatten())
    accumulator.update(values, feature_type)
  else:
    accumulator.invalidated = True
  return accumulator
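The UTF-8 validation pattern above (np.vectorize with otypes=[object], with None marking undecodable bytes so that .all() catches them) can be reproduced standalone. Here _maybe_decode_utf8 is a stand-in for stats_util.maybe_get_utf8, which by its use above returns None when decoding fails:

import numpy as np


def _maybe_decode_utf8(val):
  if not isinstance(val, bytes):
    return val
  try:
    return val.decode('utf-8')
  except UnicodeDecodeError:
    return None


values = np.array([b'ok', b'\xf0\x28\x8c\x28'], dtype=object)
maybe_utf8 = np.vectorize(_maybe_decode_utf8, otypes=[object])(values)
print(maybe_utf8.all())  # False: the second value is not valid UTF-8.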
def _get_univalent_values_with_parent_indices(
    self, examples: pa.RecordBatch) -> Dict[types.FeatureName, pd.DataFrame]:
  """Extracts univalent values for each feature along with parent indices."""
  result = {}
  for feature_name, feat_arr in zip(examples.schema.names, examples.columns):
    if (self._features_needed is not None and
        feature_name not in self._features_needed):
      continue
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, feat_arr.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if feature_type in (None, statistics_pb2.FeatureNameStatistics.STRING,
                        statistics_pb2.FeatureNameStatistics.STRUCT):
      continue
    value_lengths = np.asarray(array_util.ListLengthsFromListArray(feat_arr))
    univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
    # If there are no univalent values, continue to the next feature.
    if not univalent_parent_indices:
      continue
    flattened, value_parent_indices = arrow_util.flatten_nested(
        feat_arr, True)
    non_missing_values = np.asarray(flattened)
    if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      non_nan_mask = ~np.isnan(non_missing_values)
      non_missing_values = non_missing_values[non_nan_mask]
      value_parent_indices = value_parent_indices[non_nan_mask]
    df = pd.DataFrame({
        feature_name: non_missing_values,
        'parent_index': value_parent_indices
    })
    # Only keep the univalent feature values.
    df = df[df['parent_index'].isin(univalent_parent_indices)]
    result[feature_name] = df
  return result
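A self-contained sketch of the univalent-value extraction, using public pyarrow compute kernels in place of the internal array_util/arrow_util helpers (pc.list_value_length for list lengths; np.repeat over the lengths derives the parent index of each flattened value):

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc

feat_arr = pa.array([[1.0], [2.0, 3.0], [], [4.0]])
value_lengths = np.asarray(pc.list_value_length(feat_arr))
univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
flattened = np.asarray(feat_arr.flatten())
# Parent index of each flattened value, derived from the list lengths.
value_parent_indices = np.repeat(np.arange(len(feat_arr)), value_lengths)
df = pd.DataFrame({'f': flattened, 'parent_index': value_parent_indices})
df = df[df['parent_index'].isin(univalent_parent_indices)]
print(df)  # Keeps 1.0 (example 0) and 4.0 (example 3) only.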
def add_input(self, accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  # If we see a different type, invalidate.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  def _is_non_utf8(value):
    return (isinstance(value, bytes) and
            stats_util.maybe_get_utf8(value) is None)

  is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
  classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
  values = np.asarray(
      arrow_util.flatten_nested(feature_array)[0].slice(0, _CROP_AT_VALUES))
  if np.any(is_non_utf_vec(values)):
    accumulator.invalidate = True
    return accumulator
  accumulator.considered += values.size
  accumulator.matched += np.sum(classify_vec(values))
  return accumulator
def add_input(self, accumulator, input_column):
  """Returns result of folding a batch of inputs into the current accumulator.

  Args:
    accumulator: The current accumulator.
    input_column: An arrow column representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidated:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      input_column.name, input_column.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator

  if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

    def _maybe_get_utf8(val):
      return stats_util.maybe_get_utf8(val) if isinstance(val, bytes) else val

    for feature_array in input_column.data.iterchunks():
      values = arrow_util.FlattenListArray(feature_array).to_pandas()
      maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
      if not maybe_utf8.all():
        accumulator.invalidated = True
        return accumulator
      accumulator.update(maybe_utf8, feature_type)
  elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
    for feature_array in input_column.data.iterchunks():
      values = arrow_util.FlattenListArray(feature_array).to_pandas()
      accumulator.update(values, feature_type)
  else:
    accumulator.invalidated = True
  return accumulator
def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialBasicStats],
    examples: pa.RecordBatch
) -> Dict[types.FeaturePath, _PartialBasicStats]:
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      examples,
      example_weight_map=self._example_weight_map,
      enumerate_leaves_only=False):
    stats_for_feature = accumulator.get(feature_path)
    if stats_for_feature is None:
      stats_for_feature = _PartialBasicStats(weights is not None,
                                             self._make_quantiles_sketch_fn)
      accumulator[feature_path] = stats_for_feature
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    stats_for_feature.common_stats.update(feature_path, feature_array,
                                          feature_type,
                                          self._make_quantiles_sketch_fn,
                                          weights)
    # The user may make certain claims about a feature's data type
    # (e.g. _bytes_features imply string data type). However we should not
    # trust those claims because TFDV is also responsible for detecting
    # mismatching types. We collect stats according to the actual type, and
    # only when the actual type matches the claim do we collect the
    # type-specific stats (like for categorical int and bytes features).
    if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
      if feature_path in self._bytes_features:
        stats_for_feature.bytes_stats.update(feature_array)
      else:
        stats_for_feature.string_stats.update(feature_array)
    elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
      if feature_path in self._categorical_features:
        stats_for_feature.string_stats.update(feature_array)
      else:
        stats_for_feature.numeric_stats.update(feature_array, weights)
    elif feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      stats_for_feature.numeric_stats.update(feature_array, weights)
  return accumulator
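The dispatch above deliberately keys off the actual Arrow value type rather than schema claims. For the common cases, that type lookup presumably reduces to pyarrow type predicates along these lines (an illustrative sketch with a hypothetical name, not the actual stats_util.get_feature_type_from_arrow_type implementation):

import pyarrow as pa
from tensorflow_metadata.proto.v0 import statistics_pb2


def feature_type_sketch(value_type: pa.DataType):
  """Maps a leaf Arrow value type to a FeatureNameStatistics type."""
  if pa.types.is_integer(value_type):
    return statistics_pb2.FeatureNameStatistics.INT
  if pa.types.is_floating(value_type):
    return statistics_pb2.FeatureNameStatistics.FLOAT
  if pa.types.is_string(value_type) or pa.types.is_binary(value_type):
    return statistics_pb2.FeatureNameStatistics.STRING
  if pa.types.is_struct(value_type):
    return statistics_pb2.FeatureNameStatistics.STRUCT
  return None  # e.g. null arrays carry no usable type information.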
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    # Skip null columns.
    if feature_type is None:
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
def add_input(self, accumulator, input_column):
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    input_column: An arrow column representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      input_column.name, input_column.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  # If we see a different type, invalidate.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  def _is_non_utf8(value):
    return (isinstance(value, bytes) and
            stats_util.maybe_get_utf8(value) is None)

  is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
  classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
  for feature_array in input_column.data.iterchunks():
    values = arrow_util.FlattenListArray(feature_array).to_pandas()
    if np.any(is_non_utf_vec(values)):
      accumulator.invalidate = True
      return accumulator
    accumulator.considered += values.size
    accumulator.matched += np.sum(classify_vec(values))
  return accumulator
def add_input(self, accumulator, examples_table):
  weights_column = None
  if self._weight_feature:
    weights_column = examples_table.column(self._weight_feature)
  for feature_column in examples_table.itercolumns():
    feature_name = feature_column.name
    # Skip the weight feature.
    if feature_name == self._weight_feature:
      continue
    is_categorical_feature = feature_name in self._categorical_features
    # If we encounter this feature for the first time, create a
    # new partial basic stats.
    if feature_name not in accumulator:
      partial_stats = _PartialBasicStats(self._weight_feature is not None)
      # Store empty summary.
      partial_stats.common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.create_accumulator())
      partial_stats.numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.create_accumulator())
      accumulator[feature_name] = partial_stats
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, feature_column.type)
    accumulator[feature_name].common_stats.update(
        feature_column, feature_type, self._num_values_quantiles_combiner,
        weights_column)
    if (is_categorical_feature or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      accumulator[feature_name].string_stats.update(feature_column)
    elif feature_type is not None:
      accumulator[feature_name].numeric_stats.update(
          feature_column, self._values_quantiles_combiner, weights_column)
  return accumulator
def _get_example_value_presence(
    record_batch: pa.RecordBatch, path: types.FeaturePath,
    boundaries: Optional[Sequence[float]],
    weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
  """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch
  with the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be
  pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the
      array values.
    weight_column_name: Optionally, a weight column to return in addition to
      the value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices. The index will be the example indices and the values will be
    stored in a column named 'values'. If weight_column_name is provided, a
    second column named 'weights' will be returned containing the weight of
    the example from which each value came.
  """
  arr, example_indices = arrow_util.get_array(
      record_batch, path, return_example_indices=True)
  if stats_util.get_feature_type_from_arrow_type(path, arr.type) is None:
    return None
  arr_flat, parent_indices = arrow_util.flatten_nested(
      arr, return_parent_indices=True)
  is_binary_like = arrow_util.is_binary_like(arr_flat.type)
  assert boundaries is None or not is_binary_like, (
      'Boundaries can only be applied to numeric columns')
  if is_binary_like:
    # Use dictionary_encode so we can use np.unique on object arrays.
    dict_array = arr_flat.dictionary_encode()
    arr_flat = dict_array.indices
    arr_flat_dict = np.asarray(dict_array.dictionary)
  example_indices_flat = example_indices[parent_indices]
  if boundaries is not None:
    element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
    rows = np.vstack([example_indices_flat[element_indices], bins])
  else:
    rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
  if not rows.size:
    return None
  # Deduplicate values which show up more than once in the same example. This
  # makes P(X=x|Y=y) in the standard lift definition behave as
  # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
  unique_rows = np.unique(rows, axis=1)
  example_indices = unique_rows[0, :]
  values = unique_rows[1, :]
  if is_binary_like:
    # Return binary-like values as a pd.Categorical wrapped in a Series. This
    # makes subsequent operations like pd.merge cheaper.
    values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
  columns = {'example_indices': example_indices, 'values': values}
  if weight_column_name:
    weights = arrow_util.get_weight_feature(record_batch, weight_column_name)
    columns['weights'] = np.asarray(weights)[example_indices]
  df = pd.DataFrame(columns)
  return df.set_index('example_indices')
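The per-example deduplication above hinges on np.unique over column-stacked (example index, value) pairs; a standalone numpy sketch of that step:

import numpy as np

# Example 0 contains value 7 twice and value 8 once; example 1 contains 7.
example_indices_flat = np.array([0, 0, 0, 1])
values = np.array([7, 7, 8, 7])
rows = np.vstack([example_indices_flat, values])
unique_rows = np.unique(rows, axis=1)
print(unique_rows[0, :])  # [0 0 1] -> example indices
print(unique_rows[1, :])  # [7 8 7] -> distinct values per example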