def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    weights: Optional[np.ndarray]
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Recursion helper."""
  array_type = array.type
  if is_list_like(array_type) and pa.types.is_struct(array_type.value_type):
    # `enumerate_leaves_only` is captured from the enclosing function's scope.
    if not enumerate_leaves_only:
      yield (feature_path, array, weights)
    flat_struct_array = array.flatten()
    flat_weights = None
    if weights is not None:
      flat_weights = weights[
          array_util.GetFlattenedArrayParentIndices(array).to_numpy()]
    for field in flat_struct_array.type:
      field_name = field.name
      yield from _recursion_helper(
          feature_path.child(field_name),
          flat_struct_array.field(field_name), flat_weights)
  else:
    yield (feature_path, array, weights)
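# A minimal, self-contained sketch (not from the source) of the
# flatten-then-field pattern the helper above relies on: flattening a
# list<struct<...>> yields a StructArray whose child arrays can be recursed
# into. Plain pyarrow; `types.FeaturePath` and the weights plumbing are
# omitted.
import pyarrow as pa

struct_list = pa.array([[{"f1": 1, "f2": "a"}], [{"f1": 2, "f2": "b"}]])
flat = struct_list.flatten()   # StructArray, one row per list element
print(flat.field("f1"))        # [1, 2]
print(flat.field("f2"))        # ["a", "b"]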
def _ListArrayToTensor(
    self, list_array: pa.Array,
    produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
  """Converts a ListArray to a dense tensor."""
  values = list_array.flatten()
  batch_size = len(list_array)
  expected_num_elements = batch_size * self._unbatched_flat_len
  if len(values) != expected_num_elements:
    raise ValueError(
        "Unable to convert a {} to a tensor of type spec {}: size mismatch. "
        "Expected {} elements but got {}. "
        "If your data type is tf.Example, make sure that the feature "
        "is always present and has the same length in all the examples. "
        "TFX users should make sure there is no data anomaly for the feature."
        .format(
            type(list_array), self.type_spec, expected_num_elements,
            len(values)))
  actual_shape = list(self._shape)
  actual_shape[0] = batch_size
  if self._convert_to_binary_fn is not None:
    values = self._convert_to_binary_fn(values)
  values_np = np.asarray(values).reshape(actual_shape)
  if produce_eager_tensors:
    return tf.convert_to_tensor(values_np)
  return values_np
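# Hedged sketch of the core conversion above, assuming every list has the
# same known length (here 2); the class plumbing (self._shape,
# self._unbatched_flat_len, self._convert_to_binary_fn) is deliberately
# stubbed out.
import numpy as np
import pyarrow as pa

list_array = pa.array([[1, 2], [3, 4], [5, 6]])
values_np = np.asarray(list_array.flatten()).reshape(len(list_array), 2)
print(values_np)   # [[1 2] [3 4] [5 6]]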
def _recursion_helper(
    query_path: types.FeaturePath, array: pa.Array,
    example_indices: Optional[np.ndarray]
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Recursion helper."""
  if not query_path:
    return array, example_indices
  array_type = array.type
  if (not is_list_like(array_type) or
      not pa.types.is_struct(array_type.value_type)):
    raise KeyError('Cannot process query_path "{}" inside an array of type '
                   '{}. Expecting a (large_)list<struct<...>>.'.format(
                       query_path, array_type))
  flat_struct_array = array.flatten()
  flat_indices = None
  if example_indices is not None:
    flat_indices = example_indices[
        array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

  step = query_path.steps()[0]
  try:
    child_array = flat_struct_array.field(step)
  except KeyError:
    raise KeyError('query_path step "{}" not in struct.'.format(step))
  relative_path = types.FeaturePath(query_path.steps()[1:])
  return _recursion_helper(relative_path, child_array, flat_indices)
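# Sketch of how example indices propagate through one flattening step. In
# recent pyarrow, pc.list_parent_indices plays the role of
# array_util.GetFlattenedArrayParentIndices (an assumption about your
# pyarrow version, not part of the source).
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

nested = pa.array([[{"x": 1}], [], [{"x": 2}, {"x": 3}]])
example_indices = np.array([0, 1, 2])
parents = pc.list_parent_indices(nested).to_numpy()   # [0, 2, 2]
flat_indices = example_indices[parents]               # row of origin per struct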
def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Flattens all the list arrays nesting an array.

  If `array` is not list-like, the array itself is returned.

  Args:
    array: pa.Array to flatten.
    return_parent_indices: If True, also returns the parent indices array.

  Returns:
    A tuple. The first term is the flattened array. The second term is None
    if `return_parent_indices` is False; otherwise it's a parent indices
    array parallel to the flattened array: if parent_indices[i] = j, then
    flattened_array[i] belongs to the j-th element of the input array.
  """
  parent_indices = None
  while is_list_like(array.type):
    if return_parent_indices:
      cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
          array).to_numpy()
      if parent_indices is None:
        parent_indices = cur_parent_indices
      else:
        parent_indices = parent_indices[cur_parent_indices]
    array = array.flatten()

  # The array was not nested in the first place.
  if return_parent_indices and parent_indices is None:
    parent_indices = np.arange(len(array))
  return array, parent_indices
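# Hypothetical usage of flatten_nested on a doubly nested array; the
# expected outputs in the comments assume the module-level helpers
# (is_list_like, array_util) are available as in the source.
import pyarrow as pa

nested = pa.array([[[1, 2]], [[3], [4, 5]]])   # list<list<int64>>
flat, parents = flatten_nested(nested, return_parent_indices=True)
# flat    -> [1, 2, 3, 4, 5]
# parents -> [0, 0, 1, 1, 1] (element i came from input row parents[i])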
def update(self,
           feature_array: pa.Array,
           values_quantiles_combiner: Any,
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial numeric statistics using the input value."""
  # np.max / np.min below cannot handle empty arrays. And there's nothing
  # we can collect in this case.
  if not feature_array:
    return

  flattened_value_array = feature_array.flatten()
  # Note: to_numpy will fail if flattened_value_array is empty.
  if not flattened_value_array:
    return
  values = np.asarray(flattened_value_array)
  nan_mask = np.isnan(values)
  self.num_nan += np.sum(nan_mask)
  non_nan_mask = ~nan_mask
  values_no_nan = values[non_nan_mask]

  # We do this check to avoid failing in np.min/max with empty array.
  if values_no_nan.size == 0:
    return
  # This is to avoid integer overflow when computing sum or sum of squares.
  values_no_nan_as_double = values_no_nan.astype(np.float64)
  self.sum += np.sum(values_no_nan_as_double)
  self.sum_of_squares += np.sum(
      values_no_nan_as_double * values_no_nan_as_double)
  # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
  # to numpy 1.16.
  curr_min = np.min(values_no_nan)
  curr_max = np.max(values_no_nan)
  self.min = min(self.min, curr_min)
  self.max = max(self.max, curr_max)
  if curr_min == float('-inf') or curr_max == float('inf'):
    finite_values = values_no_nan[np.isfinite(values_no_nan)]
    if finite_values.size > 0:
      self.finite_min = min(self.finite_min, np.min(finite_values))
      self.finite_max = max(self.finite_max, np.max(finite_values))
  self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
  self.quantiles_summary = values_quantiles_combiner.add_input(
      self.quantiles_summary, [values_no_nan, np.ones_like(values_no_nan)])

  if weights is not None:
    value_parent_indices = np.asarray(
        array_util.GetFlattenedArrayParentIndices(feature_array))
    flat_weights = weights[value_parent_indices]
    flat_weights_no_nan = flat_weights[non_nan_mask]
    weighted_values = flat_weights_no_nan * values_no_nan
    self.weighted_sum += np.sum(weighted_values)
    self.weighted_sum_of_squares += np.sum(weighted_values * values_no_nan)
    self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
        self.weighted_quantiles_summary,
        [values_no_nan, flat_weights_no_nan])
    self.weighted_total_num_values += np.sum(flat_weights_no_nan)
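# Self-contained sketch (not from the source) of the NaN masking and the
# float64 cast used above; the cast is a no-op for floats here but guards
# integer overflow for int arrays, as the source comment notes.
import numpy as np
import pyarrow as pa

feature_array = pa.array([[1.0, np.nan], [float("inf")]])
values = np.asarray(feature_array.flatten())   # [1.0, nan, inf]
nan_mask = np.isnan(values)
values_no_nan = values[~nan_mask]              # [1.0, inf]
total = np.sum(values_no_nan.astype(np.float64))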
def add_input(self, accumulator: _PartialImageStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialImageStats:
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  # If we see a different type, invalidate.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  # Consider using memoryview to avoid copying after upgrading to
  # arrow 0.12. Note that this would involve modifying the subsequent logic
  # to iterate over the values in a loop.
  values = np.asarray(feature_array.flatten())
  accumulator.total_num_values += values.size
  image_formats = self._image_decoder.get_formats(values)
  valid_mask = ~pd.isnull(image_formats)
  valid_formats = image_formats[valid_mask]
  format_counts = np.unique(valid_formats, return_counts=True)
  for (image_format, count) in zip(*format_counts):
    accumulator.counter_by_format[image_format] += count
  unknown_count = image_formats.size - valid_formats.size
  if unknown_count > 0:
    accumulator.counter_by_format[''] += unknown_count

  if self._enable_size_stats:
    # Get image height and width.
    image_sizes = self._image_decoder.get_sizes(values[valid_mask])
    if image_sizes.any():
      max_sizes = np.max(image_sizes, axis=0)
      # Update the max image height/width with all image values.
      accumulator.max_height = max(accumulator.max_height, max_sizes[0])
      accumulator.max_width = max(accumulator.max_width, max_sizes[1])
  return accumulator
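# Sketch of the format-counting step with a hard-coded stand-in for
# self._image_decoder.get_formats (the real decoder is not shown in the
# source); pandas/numpy only.
import numpy as np
import pandas as pd

image_formats = np.array(["png", "jpeg", "png", None], dtype=object)
valid_formats = image_formats[~pd.isnull(image_formats)]
for fmt, count in zip(*np.unique(valid_formats, return_counts=True)):
  print(fmt, count)   # jpeg 1, png 2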
def add_input(self, accumulator: _PartialTimeStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialTimeStats:
  """Returns result of folding a batch of inputs into the current accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidated:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator

  if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

    def _maybe_get_utf8(val):
      return stats_util.maybe_get_utf8(val) if isinstance(val, bytes) else val

    values = np.asarray(feature_array.flatten())
    maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
    if not maybe_utf8.all():
      accumulator.invalidated = True
      return accumulator
    accumulator.update(maybe_utf8, feature_type)
  elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
    values = np.asarray(feature_array.flatten())
    accumulator.update(values, feature_type)
  else:
    accumulator.invalidated = True
  return accumulator
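# Hypothetical stand-in for stats_util.maybe_get_utf8 (an assumption, not
# the real helper): decode bytes as UTF-8, returning None on failure, then
# vectorize and reduce exactly as above.
import numpy as np

def _maybe_utf8(value):
  try:
    return value.decode("utf-8")
  except UnicodeDecodeError:
    return None

values = np.array([b"2021-01-01", b"\xff\xfe"], dtype=object)
decoded = np.vectorize(_maybe_utf8, otypes=[object])(values)
if not decoded.all():   # a None (non-UTF-8) entry invalidates the batch
  print("non-UTF-8 value present")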
def update(self, feature_array: pa.Array) -> None:
  """Update the partial bytes statistics using the input value."""
  if pa.types.is_null(feature_array.type):
    return
  # Iterate through the value array and update the partial stats.
  flattened_values_array = feature_array.flatten()
  if (pa.types.is_floating(flattened_values_array.type) or
      pa.types.is_integer(flattened_values_array.type)):
    raise ValueError('Bytes stats cannot be computed on INT/FLOAT features.')
  if flattened_values_array:
    num_bytes = array_util.GetElementLengths(
        flattened_values_array).to_numpy()
    self.min_num_bytes = min(self.min_num_bytes, np.min(num_bytes))
    self.max_num_bytes = max(self.max_num_bytes, np.max(num_bytes))
    self.total_num_bytes += np.sum(num_bytes)
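# Equivalent length computation in plain pyarrow: pc.binary_length (recent
# pyarrow; an assumption here) stands in for array_util.GetElementLengths.
import pyarrow as pa
import pyarrow.compute as pc

flat = pa.array([b"ab", b"", b"abcd"])
num_bytes = pc.binary_length(flat).to_numpy()
print(num_bytes.min(), num_bytes.max(), num_bytes.sum())   # 0 4 6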
def update(self,
           feature_path: types.FeaturePath,
           feature_array: pa.Array,
           feature_type: types.FeatureNameStatisticsType,
           make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial common statistics using the input value."""
  if self.type is None:
    self.type = feature_type  # pytype: disable=annotation-type-mismatch
  elif feature_type is not None and self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_path, self.type, feature_type))

  nest_level = arrow_util.get_nest_level(feature_array.type)
  if self.presence_and_valency_stats is None:
    self.presence_and_valency_stats = [
        _PresenceAndValencyStats(make_quantiles_sketch_fn)
        for _ in range(nest_level)
    ]
  elif nest_level != len(self.presence_and_valency_stats):
    raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
        feature_path, nest_level, len(self.presence_and_valency_stats)))

  # There's nothing we can collect from an empty array.
  if not feature_array:
    return

  level = 0
  while arrow_util.is_list_like(feature_array.type):
    presence_mask = ~np.asarray(
        array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(bool)
    num_values = np.asarray(
        array_util.ListLengthsFromListArray(feature_array))
    num_values_not_none = num_values[presence_mask]
    self.presence_and_valency_stats[level].update(feature_array,
                                                  presence_mask, num_values,
                                                  num_values_not_none,
                                                  weights)
    flattened = feature_array.flatten()
    if weights is not None:
      parent_indices = array_util.GetFlattenedArrayParentIndices(
          feature_array).to_numpy()
      weights = weights[parent_indices]
    feature_array = flattened
    level += 1
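# Sketch of one level of the presence/valency walk using plain pyarrow
# compute kernels (pc.list_value_length and pc.fill_null are assumptions
# about your pyarrow version, standing in for the array_util helpers).
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([[1, 2], None, [3]])
presence_mask = ~arr.is_null().to_numpy(zero_copy_only=False)       # [T, F, T]
num_values = pc.fill_null(pc.list_value_length(arr), 0).to_numpy()  # [2, 0, 1]
num_values_not_none = num_values[presence_mask]                     # [2, 1]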
def update(self, feature_array: pa.Array) -> None:
  """Update the partial string statistics using the input value."""
  if pa.types.is_null(feature_array.type):
    return
  # Iterate through the value array and update the partial stats.
  flattened_values_array = feature_array.flatten()
  if arrow_util.is_binary_like(flattened_values_array.type):
    # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
    # with Python 3). To make sure we do cheaper integer arithmetic in
    # Python 2, we first convert it to int.
    self.total_bytes_length += int(
        array_util.GetBinaryArrayTotalByteSize(flattened_values_array))
  elif flattened_values_array:
    # We can only do flattened_values_array.to_numpy() when it's not empty.
    # This could be computed faster by taking log10 of the integer.
    def _len_after_conv(s):
      return len(str(s))

    self.total_bytes_length += np.sum(
        np.vectorize(_len_after_conv,
                     otypes=[np.int32])(np.asarray(flattened_values_array)))
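# Sketch of the non-binary branch above: string lengths of numeric values
# after str() conversion, vectorized the same way.
import numpy as np
import pyarrow as pa

vals = np.asarray(pa.array([7, 42, 100]))
total = np.sum(np.vectorize(lambda s: len(str(s)), otypes=[np.int32])(vals))
print(total)   # 1 + 2 + 3 = 6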
def add_input(self, accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  # If we see a different type, invalidate.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  def _is_non_utf8(value):
    return (isinstance(value, bytes) and
            stats_util.maybe_get_utf8(value) is None)

  is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
  classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
  values = np.asarray(feature_array.flatten().slice(0, _CROP_AT_VALUES))
  if np.any(is_non_utf_vec(values)):
    accumulator.invalidate = True
    return accumulator
  accumulator.considered += values.size
  accumulator.matched += np.sum(classify_vec(values))
  return accumulator
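# Toy stand-in for self._classifier.classify (purely hypothetical): the same
# crop-vectorize-count pattern over a flattened value array.
import numpy as np
import pyarrow as pa

arr = pa.array([["hello world", "foo"], ["lorem ipsum"]])
values = np.asarray(arr.flatten().slice(0, 2))   # crop, as with _CROP_AT_VALUES
classify_vec = np.vectorize(lambda v: " " in v, otypes=[bool])
print(np.sum(classify_vec(values)))   # 1 of the 2 considered values matched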
def _ListArrayToTensor(
    self, list_array: pa.Array,
    produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
  """Converts a ListArray to a dense tensor."""
  values = list_array.flatten()
  batch_size = len(list_array)
  expected_num_elements = batch_size * self._unbatched_flat_len
  if len(values) != expected_num_elements:
    raise ValueError(
        "Unable to convert ListArray {} to {}: size mismatch. expected {} "
        "elements but got {}".format(list_array, self.type_spec,
                                     expected_num_elements, len(values)))
  actual_shape = list(self._shape)
  actual_shape[0] = batch_size
  if self._convert_to_binary_fn is not None:
    values = self._convert_to_binary_fn(values)
  values_np = np.asarray(values).reshape(actual_shape)
  if produce_eager_tensors:
    return tf.convert_to_tensor(values_np)
  return values_np
def _ListArrayToTensor(
    self, list_array: pa.Array,
    produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
  """Converts a ListArray to a dense tensor."""
  values = list_array.flatten()
  batch_size = len(list_array)
  expected_num_elements = batch_size * self._unbatched_flat_len
  if len(values) != expected_num_elements:
    raise ValueError(
        "Unable to convert ListArray {} to {}: size mismatch. expected {} "
        "elements but got {}".format(
            list_array, self.type_spec, expected_num_elements, len(values)))
  # TODO(zhuo): Cast StringArrays to BinaryArrays before calling np.asarray()
  # to avoid generating unicode objects which are wasteful to feed to
  # TensorFlow, once pyarrow requirement is bumped to >=0.15.
  actual_shape = list(self._shape)
  actual_shape[0] = batch_size
  values_np = np.asarray(values).reshape(actual_shape)
  if produce_eager_tensors:
    return tf.convert_to_tensor(values_np)
  return values_np
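# Hedged sketch of the cast the TODO above refers to: casting a StringArray
# to binary before np.asarray yields bytes objects instead of Python unicode
# objects.
import numpy as np
import pyarrow as pa

strings = pa.array(["a", "bc"])
values_np = np.asarray(strings.cast(pa.binary()))
print(values_np)   # [b'a' b'bc'], no unicode objects created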