def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    weights: Optional[np.ndarray]
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Yields (path, array, weights) for `array` and its nested struct fields.

    When `array` is a list of structs, every struct field is visited
    recursively under `feature_path.child(field_name)`; the list-of-struct
    array itself is yielded first unless `enumerate_leaves_only` (a name read
    from the enclosing scope, not defined in this block) is truthy. Any other
    array is yielded as-is.

    Args:
      feature_path: Path identifying `array`.
      array: The arrow array to enumerate.
      weights: Optional weights aligned with the elements of `array`. When
        descending into a struct they are re-indexed by the parent index of
        each flattened struct so they stay aligned with the child arrays.

    Yields:
      Tuples of (feature path, arrow array, weights or None).
    """
    array_type = array.type
    is_list_of_structs = (
        pa.types.is_list(array_type) and
        pa.types.is_struct(array_type.value_type))
    if not is_list_of_structs:
        # Nothing to descend into: emit the array itself.
        yield (feature_path, array, weights)
        return

    if not enumerate_leaves_only:
        yield (feature_path, array, weights)

    flat_struct_array = array.flatten()
    flat_weights = None
    if weights is not None:
        # Each flattened struct takes the weight of the list element it
        # came from.
        flat_weights = weights[
            array_util.GetFlattenedArrayParentIndices(array).to_numpy()]
    for field in flat_struct_array.type:
        field_name = field.name
        yield from _recursion_helper(
            feature_path.child(field_name),
            flat_struct_array.field(field_name),
            flat_weights)
def _recursion_helper(
    query_path: types.FeaturePath, array: pa.Array,
    example_indices: Optional[np.ndarray]
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Resolves `query_path` inside `array`, one step per recursion level.

    Args:
      query_path: Remaining path to follow, relative to `array`. A falsy
        path means `array` is the answer.
      array: The arrow array to look into. Must be a list<struct<...>>
        whenever `query_path` is non-empty.
      example_indices: Optional indices aligned with the elements of
        `array`; re-indexed by flattened parent index on each descent.

    Returns:
      A tuple of (array found at the path, matching indices or None).

    Raises:
      KeyError: if `array` is not a list of structs while steps remain, or
        if the next step is not a field of the struct.
    """
    if not query_path:
        return array, example_indices

    arrow_type = array.type
    descendable = (
        pa.types.is_list(arrow_type) and
        pa.types.is_struct(arrow_type.value_type))
    if not descendable:
        raise KeyError(
            'Cannot process query_path "{}" inside an array of type '
            '{}. Expecting a list<struct<...>>.'.format(
                query_path, arrow_type))

    structs = array.flatten()
    # Carry the indices down so they line up with the flattened structs.
    child_indices = (
        None if example_indices is None else example_indices[
            array_util.GetFlattenedArrayParentIndices(array).to_numpy()])

    all_steps = query_path.steps()
    step = all_steps[0]
    try:
        child = structs.field(step)
    except KeyError:
        raise KeyError('query_path step "{}" not in struct.'.format(step))

    remaining_path = types.FeaturePath(all_steps[1:])
    return _recursion_helper(remaining_path, child, child_indices)
def update(self,
           feature_array: pa.Array,
           values_quantiles_combiner: Any,
           weights: Optional[np.ndarray] = None) -> None:
    """Folds one batch of numeric values into these partial statistics.

    NaN values are counted in `num_nan` and excluded from every other
    statistic. If the batch is empty, or holds only NaNs, nothing else is
    updated.

    Args:
      feature_array: Arrow array holding the batch; it is flattened once
        before the values are read.
      values_quantiles_combiner: Object whose `add_input(summary, [values,
        weights])` returns the updated quantiles summary.
      weights: Optional weights, indexed by the parent index of each
        flattened value. When given, the weighted statistics are updated
        as well.
    """
    # An empty batch has nothing to contribute (and np.min / np.max would
    # fail on it).
    if not feature_array:
        return
    flat_array = feature_array.flatten()
    if not flat_array:
        return

    raw = np.asarray(flat_array)
    is_nan = np.isnan(raw)
    self.num_nan += np.sum(is_nan)
    keep = ~is_nan
    kept = raw[keep]
    # Only NaNs in this batch: min / max below need at least one value.
    if kept.size == 0:
        return

    # Accumulate in float64 so integer inputs cannot overflow.
    kept_f64 = kept.astype(np.float64)
    self.sum += np.sum(kept_f64)
    self.sum_of_squares += np.sum(kept_f64 * kept_f64)

    batch_min = np.min(kept)
    batch_max = np.max(kept)
    self.min = min(self.min, batch_min)
    self.max = max(self.max, batch_max)

    # NOTE(review): the finite extrema are only refreshed for batches that
    # contain an infinity; confirm that consumers expect this.
    has_infinity = batch_min == float('-inf') or batch_max == float('inf')
    if has_infinity:
        finite = kept[np.isfinite(kept)]
        if finite.size > 0:
            self.finite_min = min(self.finite_min, np.min(finite))
            self.finite_max = max(self.finite_max, np.max(finite))

    self.num_zeros += kept.size - np.count_nonzero(kept)
    # The unweighted summary gives every value a weight of one.
    self.quantiles_summary = values_quantiles_combiner.add_input(
        self.quantiles_summary, [kept, np.ones_like(kept)])

    if weights is None:
        return

    parent_indices = np.asarray(
        array_util.GetFlattenedArrayParentIndices(feature_array))
    # One weight per flattened value, with the NaN positions dropped.
    kept_weights = weights[parent_indices][keep]
    weighted = kept_weights * kept
    self.weighted_sum += np.sum(weighted)
    self.weighted_sum_of_squares += np.sum(weighted * kept)
    self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
        self.weighted_quantiles_summary, [kept, kept_weights])
    self.weighted_total_num_values += np.sum(kept_weights)
def add_input(self, accumulator: _PartialTimeStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialTimeStats:
    """Returns result of folding a batch of inputs into the current accumulator.

    STRING batches are converted with `stats_util.maybe_get_utf8` (bytes
    values only) and INT batches are passed through unchanged; both are
    handed to `accumulator.update`. Any other feature type invalidates the
    accumulator, as does a STRING batch containing a falsy converted value
    (presumably `None` from a failed UTF-8 decode; an empty string is also
    falsy).

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    if accumulator.invalidated:
        return accumulator
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    # Ignore null array.
    if feature_type is None:
        return accumulator

    if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

        def _maybe_get_utf8(val):
            return (stats_util.maybe_get_utf8(val)
                    if isinstance(val, bytes) else val)

        values = np.asarray(feature_array.flatten())
        # Use the builtin `object`: the `np.object` alias was deprecated in
        # NumPy 1.20 and removed in 1.24. Both name the same dtype.
        maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
        if not maybe_utf8.all():
            accumulator.invalidated = True
            return accumulator
        accumulator.update(maybe_utf8, feature_type)
    elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
        values = np.asarray(feature_array.flatten())
        accumulator.update(values, feature_type)
    else:
        accumulator.invalidated = True
    return accumulator
def add_input(self, accumulator: _PartialImageStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialImageStats:
    """Folds a batch of feature values into the image-stats accumulator.

    An accumulator that is already invalidated, and a batch whose feature
    type resolves to None, are returned untouched. A non-STRING feature type
    invalidates the accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    if accumulator.invalidate:
        return accumulator

    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    # A null array carries no information.
    if feature_type is None:
        return accumulator
    # Only STRING features are considered; anything else invalidates.
    if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
        accumulator.invalidate = True
        return accumulator

    values = arrow_util.primitive_array_to_numpy(feature_array.flatten())
    accumulator.total_num_values += values.size

    detected_formats = self._image_decoder.get_formats(values)
    # Null entries mark values for which no format was reported.
    recognized = ~pd.isnull(detected_formats)
    recognized_formats = detected_formats[recognized]
    distinct_formats, occurrences = np.unique(
        recognized_formats, return_counts=True)
    for fmt, num_seen in zip(distinct_formats, occurrences):
        accumulator.counter_by_format[fmt] += num_seen

    # Values without a format are tallied under the empty-string key.
    num_unrecognized = detected_formats.size - recognized_formats.size
    if num_unrecognized > 0:
        accumulator.counter_by_format[''] += num_unrecognized

    if not self._enable_size_stats:
        return accumulator

    # Sizes are requested only for values with a recognized format; column 0
    # is used as height and column 1 as width.
    sizes = self._image_decoder.get_sizes(values[recognized])
    if sizes.any():
        column_maxima = np.max(sizes, axis=0)
        accumulator.max_height = max(accumulator.max_height,
                                     column_maxima[0])
        accumulator.max_width = max(accumulator.max_width, column_maxima[1])
    return accumulator
def update(self, feature_array: pa.Array) -> None:
    """Folds one batch of values into the partial bytes statistics.

    Null-typed arrays are ignored. For a non-empty batch the per-element
    lengths update the running minimum, maximum and total byte counts.

    Args:
      feature_array: Arrow array holding the batch; it is flattened once
        before the element lengths are computed.

    Raises:
      ValueError: if the flattened values are of floating or integer type.
    """
    if pa.types.is_null(feature_array.type):
        return

    flat_values = feature_array.flatten()
    value_type = flat_values.type
    is_numeric = (
        pa.types.is_floating(value_type) or pa.types.is_integer(value_type))
    if is_numeric:
        raise ValueError(
            'Bytes stats cannot be computed on INT/FLOAT features.')

    # Nothing to aggregate for an empty batch.
    if not flat_values:
        return

    lengths = array_util.GetElementLengths(flat_values).to_numpy()
    self.min_num_bytes = min(self.min_num_bytes, np.min(lengths))
    self.max_num_bytes = max(self.max_num_bytes, np.max(lengths))
    self.total_num_bytes += np.sum(lengths)
def update(self, feature_array: pa.Array) -> None:
    """Folds one batch of values into the partial string statistics.

    Binary / unicode values contribute their total byte size. Values of any
    other type contribute the length of their `str()` representation.

    Args:
      feature_array: Arrow array holding the batch; it is flattened once
        before the lengths are accumulated.
    """
    flat_values = feature_array.flatten()
    value_type = flat_values.type

    if pa.types.is_binary(value_type) or pa.types.is_unicode(value_type):
        # Convert the reported size to a plain Python int before adding it.
        self.total_bytes_length += int(
            arrow_util.GetBinaryArrayTotalByteSize(flat_values))
        return

    # The numpy conversion below is only attempted on a non-empty array.
    if not flat_values:
        return

    # Length of each value once rendered as text. (Taking log10 of the
    # integer could compute this faster.)
    text_length = np.vectorize(lambda v: len(str(v)), otypes=[np.int32])
    as_numpy = arrow_util.primitive_array_to_numpy(flat_values)
    self.total_bytes_length += np.sum(text_length(as_numpy))
def add_input(self, accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
    """Return result of folding a batch of inputs into accumulator.

    Only STRING features are considered. A different non-null feature type,
    or a bytes value for which `stats_util.maybe_get_utf8` returns None,
    invalidates the accumulator. Otherwise every value in the batch is
    counted as considered, and those for which `self._classifier.classify`
    is truthy are counted as matched.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    if accumulator.invalidate:
        return accumulator
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    # Ignore null array.
    if feature_type is None:
        return accumulator
    # If we see a different type, invalidate.
    if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
        accumulator.invalidate = True
        return accumulator

    def _is_non_utf8(value):
        return (isinstance(value, bytes) and
                stats_util.maybe_get_utf8(value) is None)

    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy
    # 1.20 and removed in 1.24. Both name the same dtype.
    is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
    classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
    values = np.asarray(feature_array.flatten())
    # A single undecodable value disqualifies the whole feature.
    if np.any(is_non_utf_vec(values)):
        accumulator.invalidate = True
        return accumulator
    accumulator.considered += values.size
    accumulator.matched += np.sum(classify_vec(values))
    return accumulator