def add_input(self, accumulator, input_table):
  """Folds one batch of examples into the per-feature value counts.

  Only features listed in `self._categorical_features` or whose arrow type maps
  to the STRING feature type are counted; the weight feature itself is skipped.

  Args:
    accumulator: Mapping from feature name to `_ValueCounts`. Updated in place.
    input_table: Arrow table holding the batch. Its columns are iterated chunk
      by chunk.

  Returns:
    The updated accumulator.

  Raises:
    ValueError: If a weight feature is configured and, for some chunk, the
      flattened weight array does not have the same length as the chunk.
  """
  weight_ndarrays = []
  if self._weight_feature is not None:
    for a in input_table.column(self._weight_feature).data.iterchunks():
      weight_array = arrow_util.FlattenListArray(a)
      if len(weight_array) != len(a):
        raise ValueError(
            'If weight is specified, then each example must have a weight '
            'feature of length 1.')
      # to_numpy() can only be called against a non-empty arrow array.
      if weight_array:
        weight_ndarrays.append(weight_array.to_numpy())
      else:
        # to_pandas_dtype() is a method of the arrow DataType, not of the
        # Array, so it has to be reached through `.type`.
        weight_ndarrays.append(
            np.array([], dtype=weight_array.type.to_pandas_dtype()))

  for column in input_table.columns:
    feature_name = column.name
    if feature_name == self._weight_feature:
      continue
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, column.type)
    if not (feature_name in self._categorical_features or
            feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      continue

    # Per-batch counters; they are merged into the accumulator at the end.
    unweighted_counts = collections.Counter()
    weighted_counts = _WeightedCounter()
    # zip_longest pads the weight side with None when no weight feature is
    # configured (weight_ndarrays is then empty).
    for feature_array, weight_ndarray in six.moves.zip_longest(
        column.data.iterchunks(), weight_ndarrays, fillvalue=None):
      flattened_values_array = arrow_util.FlattenListArray(feature_array)
      # to_numpy() cannot be called if the array is empty.
      if not flattened_values_array:
        continue
      if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
        values_ndarray = flattened_values_array.to_pandas()
      else:
        values_ndarray = flattened_values_array.to_numpy()
      unweighted_counts.update(values_ndarray)
      if weight_ndarray is not None:
        # Parent indices are only needed to look up the per-value weight.
        value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
            feature_array).to_numpy()
        weight_per_value = weight_ndarray[value_parent_indices]
        weighted_counts.weighted_update(values_ndarray, weight_per_value)

    if feature_name not in accumulator:
      accumulator[feature_name] = _ValueCounts(
          unweighted_counts=unweighted_counts,
          weighted_counts=weighted_counts)
    else:
      accumulator[feature_name].unweighted_counts.update(unweighted_counts)
      accumulator[feature_name].weighted_counts.update(weighted_counts)
  return accumulator
def test_flatten_list_array(self):
  """Checks FlattenListArray on an empty list array and on nested floats."""
  # An empty list<int64> array flattens to an empty int64 array.
  empty_lists = pa.array([], type=pa.list_(pa.int64()))
  expected_empty = pa.array([], type=pa.int64())
  self.assertTrue(
      arrow_util.FlattenListArray(empty_lists).equals(expected_empty))

  # An empty inner list contributes no values to the flattened result.
  nested_floats = pa.array([[1.], [2.], [], [3.]])
  expected_values = pa.array([1., 2., 3.])
  self.assertTrue(
      arrow_util.FlattenListArray(nested_floats).equals(expected_values))
def update(self,
           feature_column,
           values_quantiles_combiner,
           weight_column=None):
  """Updates the partial numeric statistics with one column of values.

  Args:
    feature_column: Arrow column holding the feature values; processed chunk by
      chunk.
    values_quantiles_combiner: Combiner whose `add_input` folds the non-NaN
      values (and their weights) into the quantiles summaries.
    weight_column: Optional arrow column holding the example weights. Its
      chunks are paired positionally with the chunks of `feature_column`.

  Raises:
    ValueError: If a weight chunk, once flattened, does not have the same
      number of entries as the chunk itself.
  """
  # np.max / np.min below cannot handle empty arrays. And there's nothing
  # we can collect in this case.
  if not feature_column:
    return

  weight_chunks = weight_column.data.iterchunks() if weight_column else []
  for feature_array, weight_array in six.moves.zip_longest(
      feature_column.data.iterchunks(), weight_chunks, fillvalue=None):
    flattened_value_array = arrow_util.FlattenListArray(feature_array)
    # Note: to_numpy will fail if flattened_value_array is empty.
    if not flattened_value_array:
      continue
    values = flattened_value_array.to_numpy()
    nan_mask = np.isnan(values)
    self.num_nan += np.sum(nan_mask)
    non_nan_mask = ~nan_mask
    values_no_nan = values[non_nan_mask]
    # A chunk made only of NaNs leaves nothing to aggregate, and np.min /
    # np.max raise ValueError on an empty array, so skip the rest.
    if values_no_nan.size == 0:
      continue

    # This is to avoid integer overflow when computing sum or sum of squares.
    values_no_nan_as_double = values_no_nan.astype(np.float64)
    self.sum += np.sum(values_no_nan_as_double)
    self.sum_of_squares += np.sum(
        values_no_nan_as_double * values_no_nan_as_double)
    self.min = min(self.min, np.min(values_no_nan))
    self.max = max(self.max, np.max(values_no_nan))
    self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
    # The unweighted summary gives every value a weight of one.
    self.quantiles_summary = values_quantiles_combiner.add_input(
        self.quantiles_summary,
        [values_no_nan, np.ones_like(values_no_nan)])

    if weight_array:
      example_weights = arrow_util.FlattenListArray(
          weight_array).to_numpy().astype(np.float32, copy=False)
      if example_weights.size != len(weight_array):
        raise ValueError('Weight feature must not be missing.')
      # Map every flattened value back to the weight of its parent example.
      value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
          feature_array).to_numpy()
      weights = example_weights[value_parent_indices]
      weights_no_nan = weights[non_nan_mask]
      weighted_values = weights_no_nan * values_no_nan
      self.weighted_sum += np.sum(weighted_values)
      self.weighted_sum_of_squares += np.sum(weighted_values * values_no_nan)
      self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
          self.weighted_quantiles_summary, [values_no_nan, weights_no_nan])
      self.weighted_total_num_values += np.sum(weights_no_nan)
def add_input(self, accumulator, input_table):
  """Folds one batch of examples into the per-feature value counts.

  Only the first chunk of every column is read. Features that are neither in
  `self._categorical_features` nor of STRING feature type are ignored, as is
  the weight feature itself.

  Args:
    accumulator: Mapping from feature path to `_ValueCounts`. Updated in place.
    input_table: Arrow table holding the batch.

  Returns:
    The updated accumulator.
  """
  weight_column = (
      input_table.column(self._weight_feature)
      if self._weight_feature else None)
  weight_array = weight_column.data.chunk(0) if weight_column else []
  if weight_array:
    flattened_weights = arrow_util.FlattenListArray(weight_array).to_numpy()

  for column in input_table.columns:
    feature_name = column.name
    # Skip the weight feature.
    if feature_name == self._weight_feature:
      continue
    feature_path = types.FeaturePath([feature_name])
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, column.type)
    # if it's not a categorical feature nor a string feature, we don't bother
    # with topk stats.
    if not (feature_path in self._categorical_features or
            feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      continue

    value_array = column.data.chunk(0)
    flattened_values = arrow_util.FlattenListArray(value_array)

    # Compute unweighted counts.
    unweighted_counts = collections.Counter()
    value_counts = arrow_util.ValueCounts(flattened_values)
    for value_count in value_counts:
      value_count = value_count.as_py()
      unweighted_counts[value_count['values']] = value_count['counts']

    # Compute weighted counts if a weight feature is specified. An empty
    # flattened array is skipped, as in _to_topk_tuples: it has nothing to
    # add and the numpy / pandas conversion is only safe on non-empty arrays.
    weighted_counts = _WeightedCounter()
    if weight_array and flattened_values:
      if (pa.types.is_binary(flattened_values.type) or
          pa.types.is_string(flattened_values.type)):
        # no free conversion.
        flattened_values_np = flattened_values.to_pandas()
      else:
        flattened_values_np = flattened_values.to_numpy()
      indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
      weighted_counts.weighted_update(
          flattened_values_np, flattened_weights[indices.to_numpy()])

    if feature_path not in accumulator:
      accumulator[feature_path] = _ValueCounts(
          unweighted_counts=unweighted_counts,
          weighted_counts=weighted_counts)
    else:
      accumulator[feature_path].unweighted_counts.update(unweighted_counts)
      accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator
def update(self,
           feature_column,
           feature_type,
           num_values_quantiles_combiner,
           weight_column=None):
  """Updates the partial common statistics with one column of values.

  Args:
    feature_column: Arrow column holding the feature values; processed chunk by
      chunk.
    feature_type: Feature type deduced for the column, or None when it could
      not be deduced.
    num_values_quantiles_combiner: Combiner whose `add_input` folds the
      per-example value counts into `self.num_values_summary`.
    weight_column: Optional arrow column holding the example weights. It must
      have as many chunks as `feature_column`.

  Raises:
    TypeError: If `feature_type` differs from the type recorded earlier.
    ValueError: If the feature and weight columns have a different number of
      chunks, or if the flattened weights of a chunk do not line up with the
      examples of that chunk.
  """
  # All the values in this column is null and we cannot deduce the type of
  # the feature. This is not an error as this feature might have some values
  # in other batches.
  if feature_type is None:
    return
  if self.type is None:
    self.type = feature_type
  elif self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_column.name, self.type, feature_type))

  # np.max / np.min below cannot handle empty arrays. And there's nothing
  # we can collect in this case.
  if not feature_column:
    return

  if weight_column and (feature_column.data.num_chunks !=
                        weight_column.data.num_chunks):
    raise ValueError(
        'Expected the feature column {} and weight column {} to have the '
        'same number of chunks.'.format(feature_column.name,
                                        weight_column.name))

  weight_chunks = weight_column.data.iterchunks() if weight_column else []
  for feature_array, weight_array in six.moves.zip_longest(
      feature_column.data.iterchunks(), weight_chunks, fillvalue=None):
    num_values = arrow_util.ListLengthsFromListArray(feature_array).to_numpy()
    # np.bool_ is used instead of the `np.bool` alias, which NumPy deprecated
    # in 1.20 and removed in 1.24.
    none_mask = arrow_util.GetArrayNullBitmapAsByteArray(
        feature_array).to_numpy().view(np.bool_)
    num_values_not_none = num_values[~none_mask]
    self.num_non_missing += len(feature_array) - feature_array.null_count
    # A chunk with no non-null example has nothing more to aggregate, and
    # np.max / np.min raise ValueError on an empty array.
    # NOTE(review): this also skips the weighted totals for such a chunk;
    # that is a no-op provided null examples report a length of 0 - confirm.
    if num_values_not_none.size == 0:
      continue

    self.max_num_values = max(
        np.max(num_values_not_none), self.max_num_values)
    self.min_num_values = min(
        np.min(num_values_not_none), self.min_num_values)
    self.total_num_values += np.sum(num_values_not_none)
    self.num_values_summary = num_values_quantiles_combiner.add_input(
        self.num_values_summary, [num_values_not_none])

    if weight_array:
      weights = arrow_util.FlattenListArray(weight_array).to_numpy().astype(
          np.float32, copy=False)
      if weights.size != num_values.size:
        raise ValueError('Weight feature must not be missing.')
      self.weighted_total_num_values += np.sum(num_values * weights)
      self.weighted_num_non_missing += np.sum(weights[~none_mask])
def add_input(self, accumulator, examples_table):
  """Adds the batch's example count, and weight total if weighted.

  Args:
    accumulator: Indexable pair; slot 0 receives the number of rows and slot 1
      the sum of the weights. Updated in place.
    examples_table: Arrow table holding the batch.

  Returns:
    The updated accumulator.
  """
  accumulator[0] += examples_table.num_rows
  if not self._weight_feature:
    return accumulator

  weight_chunks = examples_table.column(
      self._weight_feature).data.iterchunks()
  for chunk in weight_chunks:
    chunk_weights = arrow_util.FlattenListArray(chunk).to_numpy()
    accumulator[1] += np.sum(chunk_weights)
  return accumulator
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
  """Yields per-value count tuples used to compute top-k and uniques.

  Only the first chunk of each column is read. For every retained feature:
    * with weights and a non-empty flattened value array, yields
      `((slice_key, feature_steps, value), (count, weight))`;
    * otherwise yields `((slice_key, feature_steps, value), count)`.

  Args:
    sliced_table: A `(slice_key, arrow table)` pair.
    categorical_features: Container of feature paths to treat as categorical.
    weight_feature: Optional name of the column holding the example weights.

  Yields:
    The tuples described above.
  """
  slice_key, table = sliced_table

  weight_column = None
  if weight_feature:
    weight_column = table.column(weight_feature)
  weight_array = weight_column.data.chunk(0) if weight_column else []
  has_weights = bool(weight_array)
  if has_weights:
    flattened_weights = arrow_util.FlattenListArray(weight_array).to_numpy()

  for feature_column in table.columns:
    feature_name = feature_column.name
    # The weight feature itself gets no top-k stats.
    if feature_name == weight_feature:
      continue

    feature_path = types.FeaturePath([feature_name])
    # Keep only categorical features and list<binary> / list<string> columns.
    wanted = (
        feature_path in categorical_features or
        feature_column.type.equals(pa.list_(pa.binary())) or
        feature_column.type.equals(pa.list_(pa.string())))
    if not wanted:
      continue

    value_array = feature_column.data.chunk(0)
    flattened_values = arrow_util.FlattenListArray(value_array)

    if not (has_weights and flattened_values):
      # Unweighted path: let arrow count the distinct values.
      value_counts = arrow_util.ValueCounts(flattened_values)
      distinct_values = value_counts.field('values').to_pylist()
      occurrences = value_counts.field('counts').to_pylist()
      for value, count in six.moves.zip(distinct_values, occurrences):
        yield ((slice_key, feature_path.steps(), value), count)
      continue

    # Weighted path: binary / string arrays have no free numpy conversion.
    value_type = flattened_values.type
    if pa.types.is_binary(value_type) or pa.types.is_string(value_type):
      flattened_values_np = flattened_values.to_pandas()
    else:
      flattened_values_np = flattened_values.to_numpy()
    parent_indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
    weights_ndarray = flattened_weights[parent_indices.to_numpy()]
    for value, count, weight in _weighted_unique(flattened_values_np,
                                                 weights_ndarray):
      yield (slice_key, feature_path.steps(), value), (count, weight)
def add_input(self, accumulator, input_column):
  """Folds a batch of feature values into the image-stats accumulator.

  Args:
    accumulator: The current accumulator. Returned untouched when it is already
      invalidated or when the column type cannot be determined.
    input_column: An arrow column representing a batch of feature values.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator

  feature_type = stats_util.get_feature_type_from_arrow_type(
      input_column.name, input_column.type)
  # A null array carries no type information; nothing to do.
  if feature_type is None:
    return accumulator
  # Anything other than a STRING feature invalidates the accumulator.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  for chunk in input_column.data.iterchunks():
    # Consider using memoryview to avoid copying after upgrading to
    # arrow 0.12. Note that this would involve modifying the subsequent logic
    # to iterate over the values in a loop.
    values = arrow_util.FlattenListArray(chunk).to_pandas()
    accumulator.total_num_values += values.size

    image_formats = self._image_decoder.get_formats(values)
    valid_mask = ~pd.isnull(image_formats)
    valid_formats = image_formats[valid_mask]
    distinct_formats, occurrences = np.unique(
        valid_formats, return_counts=True)
    for image_format, count in zip(distinct_formats, occurrences):
      accumulator.counter_by_format[image_format] += count

    # Values whose format is null are tallied under the empty-string key.
    unknown_count = image_formats.size - valid_formats.size
    if unknown_count > 0:
      accumulator.counter_by_format[''] += unknown_count

    if not self._enable_size_stats:
      continue
    # Get image height and width.
    image_sizes = self._image_decoder.get_sizes(values[valid_mask])
    if image_sizes.any():
      max_sizes = np.max(image_sizes, axis=0)
      # Update the max image height/width with all image values.
      accumulator.max_height = max(accumulator.max_height, max_sizes[0])
      accumulator.max_width = max(accumulator.max_width, max_sizes[1])
  return accumulator
def update(self, feature_column, values_quantiles_combiner, weights=None):
  """Updates the partial numeric statistics with one column of values.

  Only the first chunk of `feature_column` is read.

  Args:
    feature_column: Arrow column holding the feature values.
    values_quantiles_combiner: Combiner whose `add_input` folds the non-NaN
      values (and their weights) into the quantiles summaries.
    weights: Optional array of weights, indexed by the parent (example) index
      of each flattened value.
  """
  # Nothing to collect from an empty column, and np.min / np.max would fail.
  if not feature_column:
    return

  feature_array = feature_column.data.chunk(0)
  flat_values = arrow_util.FlattenListArray(feature_array)
  # to_numpy() fails on an empty arrow array.
  if not flat_values:
    return

  values = flat_values.to_numpy()
  is_nan = np.isnan(values)
  self.num_nan += np.sum(is_nan)
  keep = ~is_nan
  kept_values = values[keep]
  # np.min / np.max fail on an empty array.
  if kept_values.size == 0:
    return

  # Accumulate in float64 so sums and sums of squares do not overflow.
  kept_as_double = kept_values.astype(np.float64)
  self.sum += np.sum(kept_as_double)
  self.sum_of_squares += np.sum(kept_as_double * kept_as_double)
  # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
  # to numpy 1.16
  self.min = min(self.min, np.min(kept_values))
  self.max = max(self.max, np.max(kept_values))
  self.num_zeros += kept_values.size - np.count_nonzero(kept_values)
  unit_weights = np.ones_like(kept_values)
  self.quantiles_summary = values_quantiles_combiner.add_input(
      self.quantiles_summary, [kept_values, unit_weights])

  if weights is None:
    return

  # Look up the weight of each value through the index of its parent example.
  parent_indices = arrow_util.GetFlattenedArrayParentIndices(
      feature_array).to_numpy()
  kept_weights = weights[parent_indices][keep]
  weighted_values = kept_weights * kept_values
  self.weighted_sum += np.sum(weighted_values)
  self.weighted_sum_of_squares += np.sum(weighted_values * kept_values)
  self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
      self.weighted_quantiles_summary, [kept_values, kept_weights])
  self.weighted_total_num_values += np.sum(kept_weights)
def add_input(self, accumulator, examples_table):
  """Folds one batch of examples into the per-feature basic statistics.

  Args:
    accumulator: Mapping from feature path to `_PartialBasicStats`. Updated in
      place; entries are created for features seen for the first time.
    examples_table: Arrow table holding the batch. Only the first chunk of the
      weight column is read.

  Returns:
    The updated accumulator.

  Raises:
    ValueError: If a weight feature is configured and the number of flattened
      weights differs from the length of the table.
  """
  weights = None
  if self._weight_feature:
    weight_chunk = examples_table.column(self._weight_feature).data.chunk(0)
    weights = arrow_util.FlattenListArray(weight_chunk).to_numpy()
    if len(weights) != len(examples_table):
      raise ValueError('Expected exactly one weight per example.')

  for feature_column in examples_table.itercolumns():
    feature_name = feature_column.name
    # The weight feature itself gets no statistics.
    if feature_name == self._weight_feature:
      continue

    feature_path = types.FeaturePath([feature_name])
    partial_stats = accumulator.get(feature_path)
    if partial_stats is None:
      # First time this feature is seen: start from empty summaries.
      partial_stats = _PartialBasicStats(self._weight_feature is not None)
      partial_stats.common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.create_accumulator())
      partial_stats.numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.create_accumulator())
      accumulator[feature_path] = partial_stats

    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_column.type)
    partial_stats.common_stats.update(feature_column, feature_type,
                                      self._num_values_quantiles_combiner,
                                      weights)

    treat_as_string = (
        feature_path in self._categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING)
    if treat_as_string:
      partial_stats.string_stats.update(feature_column)
    elif feature_type is not None:
      partial_stats.numeric_stats.update(
          feature_column, self._values_quantiles_combiner, weights)
  return accumulator
def update(self, feature_column):
  """Adds the byte length of the column's values to `total_bytes_length`.

  Only the first chunk of `feature_column` is read.

  Args:
    feature_column: Arrow column holding the feature values.
  """
  flat_values = arrow_util.FlattenListArray(feature_column.data.chunk(0))
  value_type = flat_values.type

  if pa.types.is_binary(value_type) or pa.types.is_unicode(value_type):
    # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
    # with Python3). To make sure we do cheaper integer arithmetic in
    # Python2, we first convert it to int.
    self.total_bytes_length += int(
        arrow_util.GetBinaryArrayTotalByteSize(flat_values))
    return

  # to_numpy() is only usable on a non-empty array.
  if not flat_values:
    return

  # Other values are measured by the length of their str() rendering.
  # This could be computed faster by taking log10 of the integer.
  rendered_length = np.vectorize(lambda v: len(str(v)), otypes=[np.int32])
  self.total_bytes_length += np.sum(rendered_length(flat_values.to_numpy()))
def feature_value_slicer(table):
  """A function that generates sliced tables.

  The naive approach of doing this would be to iterate each row, identify
  slice keys for the row and keep track of index ranges for each slice key.
  And then generate an arrow table for each slice key based on the index
  ranges. This would be expensive as we are identifying the slice keys for
  each row individually and we would have to loop over the feature values
  including crossing them when we have to slice on multiple features. The
  current approach generates the slice keys for a batch by performing joins
  over indices of individual features. And then groups the joined table by
  slice key to get the row indices corresponding to a slice.

  The features to slice on come from `features`, a mapping from feature name
  to the slice values to keep (or None to keep every value). It is not defined
  in this function; presumably it is captured from the enclosing scope.

  Args:
    table: Arrow table. Only the first chunk of each sliced column is read.

  Yields:
    Sliced table (slice_key, Arrow table) where the table contains the rows
    corresponding to a slice.
  """
  # One dataframe per sliced feature, holding the distinct
  # (feature value, parent row index) pairs.
  per_feature_parent_indices = []
  for feature_name, values in six.iteritems(features):
    column = table.column(feature_name)
    # Assume we have a single chunk.
    feature_array = column.data.chunk(0)
    non_missing_values = arrow_util.FlattenListArray(
        feature_array).to_pandas()
    value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
        feature_array).to_numpy()
    # Create dataframe with feature value and parent index.
    df = pd.DataFrame({
        feature_name: non_missing_values,
        _PARENT_INDEX_COLUMN: value_parent_indices
    })
    # A value repeated within one row must contribute that row only once.
    df.drop_duplicates(inplace=True)
    # Filter based on slice values
    if values is not None:
      df = df.loc[df[feature_name].isin(values)]
    per_feature_parent_indices.append(df)

  # Join dataframes based on parent indices.
  # Note that we want the parent indices per slice key to be sorted in the
  # merged dataframe. The individual dataframes have the parent indices in
  # sorted order. We use "inner" join type to preserve the order of the left
  # keys (also note that same parent index rows would be consecutive). Hence
  # we expect the merged dataframe to have sorted parent indices per
  # slice key.
  # NOTE(review): functools.reduce is called without an initial value, so an
  # empty `features` mapping would raise TypeError here - confirm callers
  # never pass one.
  merged_df = functools.reduce(
      lambda base, update: pd.merge(
          base,
          update,
          how='inner',  # pylint: disable=g-long-lambda
          on=_PARENT_INDEX_COLUMN),
      per_feature_parent_indices)

  # Construct a new column in the merged dataframe with the slice keys.
  # Feature columns are visited in sorted name order; each contributes a
  # "<feature>_<value>" fragment and the fragments are joined with '_'.
  merged_df[_SLICE_KEY_COLUMN] = ''
  index = 0
  for col_name in sorted(merged_df.columns):
    if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
      continue
    slice_key_col = (_to_slice_key(col_name) + '_' +
                     merged_df[col_name].apply(_to_slice_key))
    # `index` only distinguishes the first feature column (which replaces the
    # '' placeholder) from the later ones (which are appended).
    if index == 0:
      merged_df[_SLICE_KEY_COLUMN] = slice_key_col
      index += 1
    else:
      merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

  # Since the parent indices are sorted per slice key, the groupby would
  # preserve the sorted order within each group.
  per_slice_parent_indices = merged_df.groupby(
      _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
  for slice_key, parent_indices in per_slice_parent_indices:
    yield (slice_key,
           merge.SliceTableByRowIndices(table, parent_indices.to_numpy()))