def _get_sparse_feature(
    schema: schema_pb2.Schema,
    feature_path: types.FeaturePath) -> schema_pb2.SparseFeature:
    """Returns a sparse feature from the schema.

    Every step of `feature_path` except the last must name a STRUCT feature;
    the last step is looked up among the sparse features of the innermost
    struct (or of the schema itself when the path has no parent).

    Args:
      schema: A Schema proto.
      feature_path: The path of the sparse feature to look up.

    Returns:
      The SparseFeature found by `_look_up_sparse_feature` for the last step.

    Raises:
      TypeError: If `schema` is not a Schema proto.
      ValueError: If a parent step is not found, a parent step is not a STRUCT
        feature, or the sparse feature itself is not found.
    """
    if not isinstance(schema, schema_pb2.Schema):
        raise TypeError('schema is of type %s, should be a Schema proto.' %
                        type(schema).__name__)

    # Two containers are tracked while descending: parent steps must always be
    # resolved against regular features, while only the final step is resolved
    # against sparse features. Using a single variable for both made every
    # lookup after the first parent step search the sparse features instead.
    feature_container = schema.feature
    sparse_feature_container = schema.sparse_feature
    parent = feature_path.parent()
    if parent:
        # Sparse features do not have a struct_domain and so can be only
        # leaves. Thus, we can assume that all parent steps are features, not
        # sparse features.
        for step in parent.steps():
            f = schema_util.look_up_feature(step, feature_container)
            if f is None:
                raise ValueError(
                    'Feature %s not found in the schema.' % feature_path)
            if f.type != schema_pb2.STRUCT:
                raise ValueError(
                    'Step %s in feature %s does not refer to a valid STRUCT '
                    'feature' % (step, feature_path))
            feature_container = f.struct_domain.feature
            sparse_feature_container = f.struct_domain.sparse_feature

    feature = _look_up_sparse_feature(feature_path.steps()[-1],
                                      sparse_feature_container)
    if feature is None:
        raise ValueError(
            'Sparse Feature %s not found in the schema.' % feature_path)
    return feature
def _recursion_helper(
    query_path: types.FeaturePath, array: pa.Array,
    example_indices: Optional[np.ndarray]
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Follows `query_path` down through nested struct arrays.

    Each step selects one field of the struct found at the innermost nesting
    level of the current array. When example indices are supplied they are
    re-indexed with the parent indices returned by `flatten_nested` at every
    level.

    Args:
      query_path: The remaining path to resolve, relative to `array`.
      array: The array in which the first step of `query_path` is looked up.
      example_indices: Indices aligned with `array`, or None if not tracked.

    Returns:
      A tuple of the array reached at the end of the path and the indices
      aligned with it (None when `example_indices` was None). A flat struct
      result is passed through `array_util.ToSingletonListArray` when the
      enclosing `wrap_flat_struct_in_list` flag is set.

    Raises:
      KeyError: If a step must be resolved inside a non-struct array, or a
        step does not name a field of the struct.
    """
    remaining_path = query_path
    current = array
    indices = example_indices
    while True:
        current_type = current.type
        if not remaining_path:
            # Path fully consumed: `current` is the answer.
            if pa.types.is_struct(current_type) and wrap_flat_struct_in_list:
                current = array_util.ToSingletonListArray(current)
            return current, indices

        if not pa.types.is_struct(get_innermost_nested_type(current_type)):
            raise KeyError('Cannot process query_path "{}" inside an array of type '
                           '{}. Expecting a struct<...> or '
                           '(large_)list...<struct<...>>.'.format(
                               remaining_path, current_type))

        structs, parent_indices = flatten_nested(current, indices is not None)
        if indices is not None:
            indices = indices[parent_indices]

        path_steps = remaining_path.steps()
        step = path_steps[0]
        try:
            current = structs.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining_path = types.FeaturePath(path_steps[1:])
def _recursion_helper(
    query_path: types.FeaturePath, array: pa.Array,
    example_indices: Optional[np.ndarray]
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Follows `query_path` down through list<struct> arrays.

    Args:
      query_path: The remaining path to resolve, relative to `array`.
      array: The array in which the first step of `query_path` is looked up.
      example_indices: Indices aligned with `array`, or None if not tracked.

    Returns:
      A tuple of the array reached at the end of the path and the indices
      aligned with it (None when `example_indices` was None).

    Raises:
      KeyError: If a step must be resolved inside an array that is not a
        list-like of structs, or a step does not name a field of the struct.
    """
    remaining_path = query_path
    current = array
    indices = example_indices
    while remaining_path:
        current_type = current.type
        is_list_of_structs = (
            is_list_like(current_type) and
            pa.types.is_struct(current_type.value_type))
        if not is_list_of_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    remaining_path, current_type))

        structs = current.flatten()
        if indices is not None:
            # Carry each index down to the flattened children of its list.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                current).to_numpy()
            indices = indices[parent_indices]

        path_steps = remaining_path.steps()
        step = path_steps[0]
        try:
            current = structs.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining_path = types.FeaturePath(path_steps[1:])
    return current, indices
def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    all_weights: Dict[types.FeatureName, np.ndarray],
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Yields (path, array, weights) for `array` and its nested struct fields.

    Args:
      feature_path: The path of `array`.
      array: The array to enumerate.
      all_weights: Weight arrays keyed by weight feature name, aligned with
        `array`. The weight for a path is chosen through the enclosing
        `example_weight_map`.

    Yields:
      Tuples of (feature path, array, weights or None). Non-leaf struct arrays
      are skipped when the enclosing `enumerate_leaves_only` flag is set.
    """
    array_type = array.type
    if not pa.types.is_struct(get_innermost_nested_type(array_type)):
        # Leaf: nothing to descend into.
        leaf_weights = all_weights.get(example_weight_map.get(feature_path))
        yield (feature_path, array, leaf_weights)
        return

    if not enumerate_leaves_only:
        struct_weights = all_weights.get(example_weight_map.get(feature_path))
        # A flat struct array is wrapped into a list array of singleton lists
        # so that downstream code can keep assuming the enumerated arrays are
        # list<*>.
        if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
            struct_to_yield = array_util.ToSingletonListArray(array)
        else:
            struct_to_yield = array
        yield (feature_path, struct_to_yield, struct_weights)

    flat_struct_array, parent_indices = flatten_nested(
        array, bool(all_weights))
    # Potential optimization: only flatten the weights that will actually be
    # used further down the recursion.
    flat_all_weights = {}
    for weight_feature_name, weight_array in all_weights.items():
        flat_all_weights[weight_feature_name] = weight_array[parent_indices]

    for field in flat_struct_array.type:
        child_name = field.name
        yield from _recursion_helper(
            feature_path.child(child_name),
            flat_struct_array.field(child_name),
            flat_all_weights)
def _make_feature_stats_proto(
    stats_values: Dict[Text, float],
    feature_path: types.FeaturePath
) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto holding custom stats of a feature.

    Args:
      stats_values: Maps the name of each custom statistic to its numeric
        value for this feature, e.g.
        { 'Mutual Information': 0.5, 'Correlation': 0.1 }
      feature_path: The path of the feature.

    Returns:
      A FeatureNameStatistics proto with `path` set and one `custom_stats`
      entry per item of `stats_values`, ordered by statistic name.
    """
    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.path.CopyFrom(feature_path.to_proto())

    # Iterating in sorted-name order keeps the output deterministic.
    for stat_name in sorted(stats_values):
        feature_stats.custom_stats.add(
            name=stat_name, num=stats_values[stat_name])
    return feature_stats
def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    weights: Optional[np.ndarray]
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Yields (path, array, weights) for `array` and its nested struct fields.

    Args:
      feature_path: The path of `array`.
      array: The array to enumerate.
      weights: Weights aligned with `array`, or None when there are none.

    Yields:
      Tuples of (feature path, array, weights or None). Non-leaf struct arrays
      are skipped when the enclosing `enumerate_leaves_only` flag is set.
    """
    array_type = array.type
    innermost_nested_type = get_innermost_nested_type(array_type)
    if not pa.types.is_struct(innermost_nested_type):
        # Leaf: nothing to descend into.
        yield (feature_path, array, weights)
        return

    if not enumerate_leaves_only:
        # Special handling for a flat struct array -- wrap it in a ListArray
        # whose elements are singleton lists. This way downstream can keep
        # assuming the enumerated arrays are list<*>.
        to_yield = array
        if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
            to_yield = array_util.ToSingletonListArray(array)
        yield (feature_path, to_yield, weights)

    flat_struct_array, parent_indices = flatten_nested(
        array, weights is not None)
    # Re-align the weights with the flattened struct values.
    flat_weights = None if weights is None else weights[parent_indices]
    for field in flat_struct_array.type:
        field_name = field.name
        yield from _recursion_helper(
            feature_path.child(field_name),
            flat_struct_array.field(field_name),
            flat_weights)
def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    weights: Optional[np.ndarray]
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Yields (path, array, weights) for `array` and its nested struct fields.

    Only list-like arrays whose value type is a struct are descended into;
    anything else is yielded as a leaf.

    Args:
      feature_path: The path of `array`.
      array: The array to enumerate.
      weights: Weights aligned with `array`, or None when there are none.

    Yields:
      Tuples of (feature path, array, weights or None). Non-leaf arrays are
      skipped when the enclosing `enumerate_leaves_only` flag is set.
    """
    array_type = array.type
    is_list_of_structs = (
        is_list_like(array_type) and
        pa.types.is_struct(array_type.value_type))
    if not is_list_of_structs:
        # Leaf: nothing to descend into.
        yield (feature_path, array, weights)
        return

    if not enumerate_leaves_only:
        yield (feature_path, array, weights)

    flat_struct_array = array.flatten()
    flat_weights = None
    if weights is not None:
        # Re-align the weights with the flattened struct values.
        flat_weights = weights[
            array_util.GetFlattenedArrayParentIndices(array).to_numpy()]
    for field in flat_struct_array.type:
        field_name = field.name
        yield from _recursion_helper(
            feature_path.child(field_name),
            flat_struct_array.field(field_name),
            flat_weights)
def _recursion_helper(
    parent_path: types.FeaturePath,
    container: Union[schema_pb2.Schema, schema_pb2.StructDomain]
) -> List[Tuple[types.FeaturePath, schema_pb2.SparseFeature]]:
    """Collects the sparse features found in `container` and below it.

    Args:
      parent_path: The path of `container`.
      container: A Schema or StructDomain whose `sparse_feature` and `feature`
        fields are walked.

    Returns:
      A list of (path, SparseFeature) tuples: first the sparse features
      directly in `container`, then those found under each STRUCT feature, in
      field order.
    """
    # Sparse features do not have a struct_domain, so they cannot be parent
    # features: each one is a result by itself.
    found = [(parent_path.child(sparse.name), sparse)
             for sparse in container.sparse_feature]

    for feature in container.feature:
        if feature.type != schema_pb2.STRUCT:
            continue
        found.extend(
            _recursion_helper(
                parent_path.child(feature.name), feature.struct_domain))
    return found
def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath,
    num_uniques: int,
) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto carrying only the unique count.

    Args:
      feature_path: The path of the feature.
      num_uniques: The value to store in `string_stats.unique`.

    Returns:
      A FeatureNameStatistics proto with `path` and `string_stats.unique` set.
    """
    uniques_stats = statistics_pb2.FeatureNameStatistics()
    uniques_stats.string_stats.unique = num_uniques
    uniques_stats.path.CopyFrom(feature_path.to_proto())
    return uniques_stats
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount],
    is_categorical: bool, is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing the top-k stats.

    Args:
      feature_path: The path of the feature.
      top_k_values_pairs: (feature value, count) pairs. The list itself is not
        modified; a sorted copy is used.
      is_categorical: Whether the feature is categorical; selects INT instead
        of STRING as the proto type.
      is_weighted_stats: Whether to write into `weighted_string_stats` rather
        than directly into `string_stats`.
      num_top_values: Maximum number of `top_values` entries to emit.
      frequency_threshold: Processing stops at the first pair whose count is
        below this value.
      num_rank_histogram_buckets: Maximum number of rank histogram buckets to
        emit.

    Returns:
      A FeatureNameStatistics proto containing the top-k stats.
    """
    # Order by count, descending. Ties are broken by treating the 'larger'
    # feature value as the larger entry.
    ordered_pairs = sorted(
        top_k_values_pairs,
        key=lambda pair: (pair.count, pair.feature_value),
        reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.path.CopyFrom(feature_path.to_proto())
    # A categorical feature keeps its original INT type.
    if is_categorical:
        result.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        result.type = statistics_pb2.FeatureNameStatistics.STRING

    string_stats = result.string_stats
    if is_weighted_stats:
        string_stats = result.string_stats.weighted_string_stats

    for rank, (value, count) in enumerate(ordered_pairs):
        if count < frequency_threshold:
            break

        # Normalise the value to text. Bytes that are not valid UTF-8 are
        # replaced by a placeholder and reported.
        if isinstance(value, six.binary_type):
            decoded_value = stats_util.maybe_get_utf8(value)
            if decoded_value is not None:
                value = decoded_value
            else:
                logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                                'decoded as a UTF-8 string.', feature_path, value)
                value = constants.NON_UTF8_PLACEHOLDER
        elif not isinstance(value, six.text_type):
            value = str(value)

        if rank < num_top_values:
            top_value = string_stats.top_values.add()
            top_value.value = value
            top_value.frequency = count
        if rank < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = rank
            bucket.high_rank = rank
            bucket.sample_count = count
            bucket.label = value
    return result
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries],
    y_path: types.FeaturePath, y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

    Args:
      lifts: The result of two successive group bys of lift values. The
        innermost grouping collects all the lift values for a given (slice,
        x_path and y_value) tuple (corresponding to a single LiftSeries
        message). The outermost grouping collects all the lift values for the
        same (slice, x_path) tuple (corresponding to the set of the LiftSeries
        which share the same value of y_path). The full structure of lifts is
        described by:
          (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
      y_path: The path used as Y in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y).
      y_boundaries: Optionally, a set of bin boundaries used for binning
        y_path values.

    Returns:
      A tuple of the slice key and the populated DatasetFeatureStatistics
      proto.
    """
    key, lift_series_list = lifts
    stats = statistics_pb2.DatasetFeatureStatistics()
    cross_stats = stats.cross_features.add(
        path_x=key.x_path.to_proto(), path_y=y_path.to_proto())

    for series in sorted(lift_series_list):
        series_proto = (
            cross_stats.categorical_cross_stats.lift.lift_series.add(
                y_count=series.y_count))

        # Record y as a bucket, a string or an int, in that order of
        # precedence.
        y = series.y
        if y_boundaries is not None:
            low_value, high_value = bin_util.get_boundaries(y, y_boundaries)
            series_proto.y_bucket.low_value = low_value
            series_proto.y_bucket.high_value = high_value
        elif isinstance(y, six.string_types):
            series_proto.y_string = y
        else:
            series_proto.y_int = y

        # The top_k and bottom_k x values may overlap; keep one entry per x
        # (the last one seen wins).
        unique_by_x = {}
        for candidate in series.lift_values:
            unique_by_x[candidate.x] = candidate

        # Emit by lift DESC, then x ASC.
        ordered_values = sorted(
            unique_by_x.values(), key=lambda v: (-v.lift, v.x))
        for lift_value in ordered_values:
            value_proto = series_proto.lift_values.add(
                lift=lift_value.lift,
                x_count=lift_value.x_count,
                x_and_y_count=lift_value.xy_count)
            x = lift_value.x
            if isinstance(x, six.string_types):
                value_proto.x_string = x
            else:
                value_proto.x_int = x
    return key.slice_key, stats
def _PartitionTransform(pcol, row_partitions: int, column_partitions: int,
                        label_feature: types.FeaturePath, seed: int):
    """Applies `_PartitionFn` to `pcol` via a Beam ParDo.

    Args:
      pcol: The input collection the ParDo is applied to.
      row_partitions: Forwarded to `_PartitionFn`.
      column_partitions: Forwarded to `_PartitionFn`.
      label_feature: Path of the label; only its first step is used, as the
        column name associated with the label.
      seed: Forwarded to `_PartitionFn`.

    Returns:
      The result of applying the "PartitionRowsCols" ParDo to `pcol`.

    Raises:
      ValueError: If `label_feature` has no steps.
    """
    label_steps = label_feature.steps()
    if not label_steps:
        raise ValueError("Empty label feature")

    # The column name associated with the label path is its first step.
    label_column = label_steps[0]
    partition_fn = _PartitionFn(
        row_partitions, column_partitions, label_column, seed)
    return pcol | "PartitionRowsCols" >> beam.ParDo(partition_fn)
def _recursion_helper(
    parent_path: types.FeaturePath,
    feature_container: Iterable[schema_pb2.Feature],
    result: List[Tuple[types.FeaturePath, schema_pb2.Feature]]):
    """Appends every non-STRUCT feature under `feature_container` to `result`.

    Args:
      parent_path: The path of the parent of the features in
        `feature_container`.
      feature_container: The features to walk.
      result: Output list, extended in place with (path, Feature) tuples for
        each non-STRUCT feature, in depth-first field order.
    """
    for feature in feature_container:
        child_path = parent_path.child(feature.name)
        if feature.type == schema_pb2.STRUCT:
            # Struct features contribute only their descendants.
            _recursion_helper(
                child_path, feature.struct_domain.feature, result)
        else:
            result.append((child_path, feature))
def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath, num_uniques: int,
    is_categorical: bool) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto carrying the unique count.

    Args:
      feature_path: The path of the feature.
      num_uniques: The value to store in `string_stats.unique`.
      is_categorical: Whether the feature is categorical; selects INT instead
        of STRING as the proto type.

    Returns:
      A FeatureNameStatistics proto with `path`, `type` and
      `string_stats.unique` set.
    """
    uniques_stats = statistics_pb2.FeatureNameStatistics()
    uniques_stats.path.CopyFrom(feature_path.to_proto())
    # A categorical feature keeps its original INT type.
    if is_categorical:
        uniques_stats.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        uniques_stats.type = statistics_pb2.FeatureNameStatistics.STRING
    uniques_stats.string_stats.unique = num_uniques
    return uniques_stats
def test_access_attributes(self):
    """Checks that ValidationOptions exposes its constructor arguments."""
    reasons = [
        validation_options.ReasonFeatureNeeded(comment='reason1'),
        validation_options.ReasonFeatureNeeded(comment='reason2'),
    ]
    features_needed = {FeaturePath(['a', 'b']): reasons}
    new_features_are_warnings = True

    options = validation_options.ValidationOptions(
        features_needed, new_features_are_warnings)

    # Both values must be readable back through the getters.
    self.assertEqual(features_needed, options.features_needed)
    self.assertEqual(new_features_are_warnings,
                     options.new_features_are_warnings)
def get_array(
    table: pa.Table, query_path: types.FeaturePath,
    return_example_indices: bool) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieve a nested array (and optionally example indices) from a table.

    Only chunk 0 of the looked-up column is read, so all columns in `table`
    are assumed to have a single chunk. `table` is assumed to contain only
    arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types (recursion
        intended).

    If the path refers to a leaf, the returned array is a list array with a
    primitive element type; otherwise it is a list array with a struct element
    type.

    Args:
      table: The Table whose arrays to be visited.
      query_path: The FeaturePath to look up in the table.
      return_example_indices: Whether to also return the example indices of
        the elements in the array corresponding to `query_path`.

    Returns:
      A tuple. The first term is the feature array and the second term is the
      example indices array for it (i.e. array[i] came from the example at row
      example_indices[i] in the table), or None when
      `return_example_indices` is False.

    Raises:
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_steps = query_path.steps()
    column_name = top_level_steps[0]
    try:
        array = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))

    remaining_path = types.FeaturePath(top_level_steps[1:])
    example_indices = None
    if return_example_indices:
        example_indices = np.arange(table.num_rows)

    # Descend one struct field per remaining step.
    while remaining_path:
        array_type = array.type
        is_list_of_structs = (
            is_list_like(array_type) and
            pa.types.is_struct(array_type.value_type))
        if not is_list_of_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    remaining_path, array_type))

        flat_struct_array = array.flatten()
        if example_indices is not None:
            # Carry each example index down to the flattened children.
            example_indices = example_indices[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

        path_steps = remaining_path.steps()
        step = path_steps[0]
        try:
            array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining_path = types.FeaturePath(path_steps[1:])

    return array, example_indices
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    broadcast_column_name: Optional[Text] = None
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieve a nested array (and optionally a broadcast column) from a table.

    Only chunk 0 of the looked-up column is read, so all columns in `table`
    are assumed to have a single chunk. `table` is assumed to contain only
    arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types (recursion
        intended).

    If the path refers to a leaf, the returned array is a list array with a
    primitive element type; otherwise it is a list array with a struct element
    type.

    Args:
      table: The Table whose arrays to be visited.
      query_path: The FeaturePath to look up in the table.
      broadcast_column_name: The name of a column to broadcast, or None. Each
        list in that column should contain exactly one value.

    Returns:
      A tuple. The first term is the feature array and the second term is the
      broadcast column array for it (i.e. broadcast_column[i] is the
      corresponding value for array[i]), or None when no column name was
      given.

    Raises:
      ValueError: Presumably raised by `get_broadcastable_column` when the
        broadcast column is not a list array of 1-element lists -- confirm
        against that helper.
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_steps = query_path.steps()
    column_name = top_level_steps[0]
    try:
        array = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))

    remaining_path = types.FeaturePath(top_level_steps[1:])
    weights = None
    if broadcast_column_name is not None:
        weights = np.asarray(
            get_broadcastable_column(table, broadcast_column_name))

    # Descend one struct field per remaining step.
    while remaining_path:
        array_type = array.type
        is_list_of_structs = (
            pa.types.is_list(array_type) and
            pa.types.is_struct(array_type.value_type))
        if not is_list_of_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining_path, array_type))

        flat_struct_array = array.flatten()
        if weights is not None:
            # Carry each broadcast value down to the flattened children.
            weights = weights[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

        path_steps = remaining_path.steps()
        step = path_steps[0]
        try:
            array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining_path = types.FeaturePath(path_steps[1:])

    return array, weights
def make_feature_stats_proto_topk_uniques_custom_stats(
    feature_path: types.FeaturePath,
    is_categorical: bool,
    num_top_values: int,
    num_rank_histogram_buckets: int,
    num_unique: int,
    value_count_list: List[FeatureValueCount],
    weighted_value_count_list: Optional[List[FeatureValueCount]] = None,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: Optional[float] = None
) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing top-k and uniques stats.

    The top-k rank histograms and the unique count are stored as
    `custom_stats` entries rather than in `string_stats`.

    Args:
      feature_path: The path of the feature.
      is_categorical: Whether the feature is categorical.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.
      num_unique: The number of unique values in the feature.
      value_count_list: A list of FeatureValueCount tuples.
      weighted_value_count_list: An optional list of FeatureValueCount tuples
        for weighted features.
      frequency_threshold: The minimum number of examples the most frequent
        values must be present in.
      weighted_frequency_threshold: The minimum weighted number of examples
        the most frequent weighted values must be present in. Must not be None
        when `weighted_value_count_list` is non-empty.

    Returns:
      A FeatureNameStatistics proto containing the top-k and uniques stats.
    """
    stats_cls = statistics_pb2.FeatureNameStatistics
    result = stats_cls()
    result.path.CopyFrom(feature_path.to_proto())
    # A categorical feature keeps its original INT type.
    if is_categorical:
        result.type = stats_cls.INT
    else:
        result.type = stats_cls.STRING

    # Unweighted top-k: build a throwaway proto, then lift its rank histogram
    # into a custom stat.
    unweighted_stats = _make_feature_stats_proto_topk(
        feature_path, value_count_list, is_categorical, False,
        num_top_values, frequency_threshold, num_rank_histogram_buckets)
    unweighted_custom_stat = result.custom_stats.add(
        name=_TOPK_SKETCH_CUSTOM_STATS_NAME)
    unweighted_custom_stat.rank_histogram.CopyFrom(
        unweighted_stats.string_stats.rank_histogram)

    # Weighted top-k, only when weighted counts were provided.
    if weighted_value_count_list:
        assert weighted_frequency_threshold is not None
        weighted_stats = _make_feature_stats_proto_topk(
            feature_path, weighted_value_count_list, is_categorical, True,
            num_top_values, weighted_frequency_threshold,
            num_rank_histogram_buckets)
        weighted_histogram = (
            weighted_stats.string_stats.weighted_string_stats.rank_histogram)
        weighted_custom_stat = result.custom_stats.add(
            name=_WEIGHTED_TOPK_SKETCH_CUSTOM_STATS_NAME)
        weighted_custom_stat.rank_histogram.CopyFrom(weighted_histogram)

    # Finally, the number of uniques.
    result.custom_stats.add(
        name=_UNIQUES_SKETCH_CUSTOM_STATS_NAME, num=num_unique)
    return result
def make_feature_stats_proto_with_topk_stats(
    feature_path: types.FeaturePath,
    top_k_value_count_list: List[FeatureValueCount], is_categorical: bool,
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int
) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing the top-k stats.

    Args:
      feature_path: The path of the feature.
      top_k_value_count_list: A list of FeatureValueCount tuples. The list is
        not modified; a sorted copy is used.
      is_categorical: Whether the feature is categorical.
      is_weighted_stats: Whether top_k_value_count_list incorporates weights.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      frequency_threshold: The minimum number of examples (possibly weighted)
        the most frequent values must be present in.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.

    Returns:
      A FeatureNameStatistics proto containing the top-k stats.
    """
    # Sort a copy in descending order by count, so the caller's list is left
    # untouched. Where multiple feature values have the same count, consider
    # the feature with the 'larger' feature value to be larger for purposes of
    # breaking the tie.
    sorted_value_counts = sorted(
        top_k_value_count_list,
        key=lambda counts: (counts[1], counts[0]),
        reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.path.CopyFrom(feature_path.to_proto())
    # If we have a categorical feature, we preserve the type to be the
    # original INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT
                   if is_categorical else
                   statistics_pb2.FeatureNameStatistics.STRING)

    if is_weighted_stats:
        string_stats = result.string_stats.weighted_string_stats
    else:
        string_stats = result.string_stats

    for i, (value, count) in enumerate(sorted_value_counts):
        if count < frequency_threshold:
            break
        # Check if we have a valid utf-8 string. If not, assign a default
        # invalid string value.
        if isinstance(value, six.binary_type):
            # Decode into a separate name so the warning can still report the
            # original bytes when decoding fails.
            decoded_value = stats_util.maybe_get_utf8(value)
            if decoded_value is None:
                logging.warning(
                    'Feature "%s" has bytes value "%s" which cannot be '
                    'decoded as a UTF-8 string.', feature_path, value)
                value = _INVALID_STRING
            else:
                value = decoded_value
        elif not isinstance(value, six.text_type):
            value = str(value)

        if i < num_top_values:
            freq_and_value = string_stats.top_values.add()
            freq_and_value.value = value
            freq_and_value.frequency = count
        if i < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = i
            bucket.high_rank = i
            bucket.sample_count = count
            bucket.label = value
    return result
def _prepend_slice_path(slice_name: str,
                        path: types.FeaturePath) -> types.FeaturePath:
    """Returns `path` with its first step prefixed by 'slice(<slice_name>)::'.

    Args:
      slice_name: The slice name to embed in the prefix.
      path: The path to prefix; its first step is read, so it must have at
        least one step.

    Returns:
      A new FeaturePath whose first step carries the slice prefix and whose
      remaining steps are unchanged.
    """
    steps = path.steps()
    first_step, other_steps = steps[0], steps[1:]
    prefixed_first_step = 'slice(%s)::' % slice_name + first_step
    return types.FeaturePath((prefixed_first_step,) + other_steps)
def get_array(
    record_batch: pa.RecordBatch,
    query_path: types.FeaturePath,
    return_example_indices: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieve a nested array (and optionally example indices) from RecordBatch.

    This function has the same assumption over `record_batch` as
    `enumerate_arrays()` does.

    If the provided path refers to a leaf in the `record_batch`, then a
    "nested_list" will be returned. If the provided path does not refer to a
    leaf, a "struct" will be returned. See `enumerate_arrays()` for the
    definition of "nested_list" and "struct".

    Args:
      record_batch: The RecordBatch whose arrays to be visited.
      query_path: The FeaturePath to look up in the record_batch.
      return_example_indices: Whether to also return the example indices of
        the elements in the array corresponding to `query_path`.
      wrap_flat_struct_in_list: If True, and if `query_path` leads to a
        struct<[Ts]> array, it will be wrapped in a list array where each
        sub-list contains one element. Callers can use this option to assume
        this function always returns a list<inner_type>.

    Returns:
      A tuple. The first term is the feature array and the second term is the
      example indices array for it (i.e. array[i] came from the example at row
      example_indices[i] in the record_batch), or None when
      `return_example_indices` is False.

    Raises:
      KeyError: When `query_path` is empty, or cannot be found in the
        record_batch and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_steps = query_path.steps()
    column_name = top_level_steps[0]
    field_index = record_batch.schema.get_field_index(column_name)
    if field_index < 0:
        raise KeyError('query_path step 0 "{}" not in record batch.'
                       .format(column_name))
    array = record_batch.column(field_index)

    remaining_path = types.FeaturePath(top_level_steps[1:])
    example_indices = None
    if return_example_indices:
        example_indices = np.arange(record_batch.num_rows)

    # Descend one struct field per remaining step.
    while remaining_path:
        array_type = array.type
        if not pa.types.is_struct(get_innermost_nested_type(array_type)):
            raise KeyError('Cannot process query_path "{}" inside an array of type '
                           '{}. Expecting a struct<...> or '
                           '(large_)list...<struct<...>>.'.format(
                               remaining_path, array_type))

        flat_struct_array, parent_indices = flatten_nested(
            array, example_indices is not None)
        if example_indices is not None:
            # Carry each example index down to the flattened children.
            example_indices = example_indices[parent_indices]

        path_steps = remaining_path.steps()
        step = path_steps[0]
        try:
            array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining_path = types.FeaturePath(path_steps[1:])

    # Path fully consumed: optionally present a flat struct as list<struct>.
    if pa.types.is_struct(array.type) and wrap_flat_struct_in_list:
        array = array_util.ToSingletonListArray(array)
    return array, example_indices
def _make_feature_stats_proto(
    feature_path: types.FeaturePath,
    basic_stats: _PartialBasicStats,
    parent_basic_stats: Optional[_PartialBasicStats],
    num_values_q_combiner: quantiles_util.QuantilesCombiner,
    values_q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int, num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int, is_categorical: bool,
    has_weights: bool) -> statistics_pb2.FeatureNameStatistics:
    """Convert the partial basic stats into a FeatureNameStatistics proto.

    Args:
      feature_path: The path of the feature.
      basic_stats: The partial basic stats associated with the feature.
      parent_basic_stats: The partial basic stats of the parent of the
        feature, or None.
      num_values_q_combiner: The quantiles combiner used to construct the
        quantiles histogram for the number of values in the feature.
      values_q_combiner: The quantiles combiner used to construct the
        histogram for the values in the feature.
      num_values_histogram_buckets: Number of buckets in the quantiles
        histogram for the number of values per feature.
      num_histogram_buckets: Number of buckets in a standard
        NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: Number of buckets in a quantiles
        NumericStatistics.histogram.
      is_categorical: A boolean indicating whether the feature is categorical.
      has_weights: A boolean indicating whether a weight feature is specified.

    Returns:
      A statistics_pb2.FeatureNameStatistics proto.
    """
    stats_cls = statistics_pb2.FeatureNameStatistics
    result = stats_cls()
    result.path.CopyFrom(feature_path.to_proto())

    # Pick the feature type. A categorical feature keeps its original INT
    # type; a feature whose type could not be inferred (e.g. completely
    # missing) is assumed to be STRING.
    if is_categorical:
        result.type = stats_cls.INT
    elif basic_stats.common_stats.type is None:
        result.type = stats_cls.STRING
    else:
        result.type = basic_stats.common_stats.type

    # Common statistics are shared by every kind of type-specific stats.
    parent_common_stats = None
    if parent_basic_stats is not None:
        parent_common_stats = parent_basic_stats.common_stats
    common_stats_proto = _make_common_stats_proto(
        basic_stats.common_stats, parent_common_stats,
        num_values_q_combiner, num_values_histogram_buckets, has_weights)

    # Attach the common stats to the stats message matching the type.
    feature_type = result.type
    if is_categorical or feature_type == stats_cls.STRING:
        string_stats_proto = _make_string_stats_proto(
            basic_stats.string_stats,
            basic_stats.common_stats.total_num_values)
        string_stats_proto.common_stats.CopyFrom(common_stats_proto)
        result.string_stats.CopyFrom(string_stats_proto)
    elif feature_type == stats_cls.STRUCT:
        result.struct_stats.common_stats.CopyFrom(common_stats_proto)
    elif feature_type in (stats_cls.INT, stats_cls.FLOAT):
        numeric_stats_proto = _make_numeric_stats_proto(
            basic_stats.numeric_stats,
            basic_stats.common_stats.total_num_values,
            values_q_combiner, num_histogram_buckets,
            num_quantiles_histogram_buckets, has_weights)
        numeric_stats_proto.common_stats.CopyFrom(common_stats_proto)
        result.num_stats.CopyFrom(numeric_stats_proto)
    return result
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, Iterable[_LiftSeries]],
    y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray],
    weighted_examples: bool,
    output_custom_stats: bool
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
  """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

  Args:
    lifts: The result of two successive group bys of lift values. The innermost
      grouping collects all the lift values for a given (slice, x_path and
      y_value) tuple (corresponding to a single LiftSeries message). The
      outermost grouping collects all the lift values for the same (slice,
      x_path) tuple (corresponding to the set of the LiftSeries which share the
      same value of y_path). The full structure of lifts is described by:
        (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
    y_path: The path used as Y in the lift expression:
      lift = P(Y=y|X=x) / P(Y=y).
    y_boundaries: Optionally, a set of bin boundaries used for binning y_path
      values.
    weighted_examples: Whether lift is computed over weighted examples, in
      which case the proto will output weighted counts (as floats) rather than
      simple counts (as ints).
    output_custom_stats: Whether to output custom stats for use with Facets.

  Returns:
    A (slice key, populated DatasetFeatureStatistics proto) tuple.
  """
  sliced_key, series_iterable = lifts
  x_path = sliced_key.x_path

  dataset_stats = statistics_pb2.DatasetFeatureStatistics()
  cross_feature = dataset_stats.cross_features.add(
      path_x=x_path.to_proto(), path_y=y_path.to_proto())
  if output_custom_stats:
    # Custom stats are attached to a per-feature entry for x_path.
    x_feature_stats = dataset_stats.features.add(path=x_path.to_proto())

  def _lift_desc_then_x_asc(value):
    # Sort key: highest lift first, ties broken by ascending x.
    return (-value.lift, value.x)

  for series in sorted(series_iterable):
    series_proto = cross_feature.categorical_cross_stats.lift.lift_series.add()

    # Weighted runs populate the weighted count field instead of the plain one.
    if weighted_examples:
      series_proto.weighted_y_count = series.y_count
    else:
      series_proto.y_count = series.y_count

    # Record the y value in the proto field matching its kind, and build the
    # label used for the custom-stats histogram name.
    y_value = series.y
    if y_boundaries is not None and isinstance(y_value, int):
      # With boundaries, an int y is looked up as a bucket.
      bucket_low, bucket_high = bin_util.get_boundaries(y_value, y_boundaries)
      series_proto.y_bucket.low_value = bucket_low
      series_proto.y_bucket.high_value = bucket_high
      # A bucket whose upper bound is infinity is displayed as closed.
      if bucket_high == float('inf'):
        y_label = '[{},{}]'.format(bucket_low, bucket_high)
      else:
        y_label = '[{},{})'.format(bucket_low, bucket_high)
    elif isinstance(y_value, six.text_type):
      series_proto.y_string = y_value
      y_label = y_value
    elif isinstance(y_value, six.binary_type):
      y_label = _get_unicode_value(y_value, y_path)
      series_proto.y_string = y_label
    else:
      series_proto.y_int = y_value
      y_label = str(y_value)

    if output_custom_stats:
      custom_stat = x_feature_stats.custom_stats.add(
          name='Lift (Y={})'.format(y_label))
      hist = custom_stat.rank_histogram

    # Dedupe possibly overlapping top_k and bottom_k x values; the last
    # occurrence of a given x wins.
    unique_by_x = {}
    for candidate in series.lift_values:
      unique_by_x[candidate.x] = candidate
    ordered_values = list(unique_by_x.values())
    ordered_values.sort(key=_lift_desc_then_x_asc)

    for entry in ordered_values:
      entry_proto = series_proto.lift_values.add(lift=entry.lift)
      if weighted_examples:
        entry_proto.weighted_x_count = entry.x_count
        entry_proto.weighted_x_and_y_count = entry.xy_count
      else:
        entry_proto.x_count = entry.x_count
        entry_proto.x_and_y_count = entry.xy_count

      # Same kind-based dispatch as for y, minus the bucket case.
      x_value = entry.x
      if isinstance(x_value, six.text_type):
        entry_proto.x_string = x_value
        x_label = x_value
      elif isinstance(x_value, six.binary_type):
        x_label = _get_unicode_value(x_value, x_path)
        entry_proto.x_string = x_label
      else:
        entry_proto.x_int = x_value
        x_label = str(x_value)

      if output_custom_stats:
        hist.buckets.add(label=x_label, sample_count=entry.lift)

  return sliced_key.slice_key, dataset_stats
def _make_feature_stats_proto(
    feature_path: types.FeaturePath,
    basic_stats: _PartialBasicStats,
    parent_basic_stats: Optional[_PartialBasicStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    is_bytes: bool,
    is_categorical: bool,
    has_weights: bool
) -> statistics_pb2.FeatureNameStatistics:
  """Builds the FeatureNameStatistics proto for one feature.

  The feature type is resolved from the inferred type and the caller's
  `is_bytes` / `is_categorical` claims, the common statistics are built once,
  and they are then attached to the bytes / string / struct / numeric
  statistics matching the resolved type. Custom stats about the number of
  values are appended regardless of type.

  Args:
    feature_path: The path of the feature.
    basic_stats: The partial basic stats associated with the feature.
    parent_basic_stats: The partial basic stats of the parent of the feature,
      or None if there is no parent.
    make_quantiles_sketch_fn: A callable to create a quantiles sketch.
    num_values_histogram_buckets: Number of buckets in the quantiles histogram
      for the number of values per feature.
    num_histogram_buckets: Number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Number of buckets in a quantiles
      NumericStatistics.histogram.
    is_bytes: Whether the feature is claimed to be bytes.
    is_categorical: Whether the feature is categorical.
    has_weights: Whether a weight feature is specified.

  Returns:
    A statistics_pb2.FeatureNameStatistics proto.
  """
  feature_type = statistics_pb2.FeatureNameStatistics
  partial_common = basic_stats.common_stats

  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())

  # Resolve the feature type.
  observed_type = partial_common.type
  if observed_type is None:
    # No value was seen for this feature, so trust the caller's claims. There
    # is no "unknown" type, so STRING is the last resort.
    if is_bytes:
      resolved_type = feature_type.BYTES
    elif is_categorical:
      resolved_type = feature_type.INT
    else:
      resolved_type = feature_type.STRING
  elif is_bytes and observed_type == feature_type.STRING:
    # BYTES is never inferred; the caller's claim is honoured only when the
    # data was actually seen as strings/bytes.
    resolved_type = feature_type.BYTES
  else:
    resolved_type = observed_type
  result.type = resolved_type

  # The common statistics are shared by whichever typed stats get populated.
  parent_common = None
  if parent_basic_stats is not None:
    parent_common = parent_basic_stats.common_stats
  common_proto = _make_common_stats_proto(
      partial_common, parent_common, make_quantiles_sketch_fn,
      num_values_histogram_buckets, has_weights)

  # Total number of values at the leaf (innermost) level.
  valency_stats = partial_common.presence_and_valency_stats
  if valency_stats is None:
    leaf_num_values = 0
  else:
    leaf_num_values = valency_stats[-1].total_num_values

  # Attach the common stats to the typed statistics matching the final type.
  final_type = result.type
  if final_type == feature_type.BYTES:
    # NOTE(review): this branch passes the proto's tot_num_values while the
    # string/numeric branches pass the leaf-level count -- confirm the
    # difference is intended.
    bytes_proto = _make_bytes_stats_proto(
        basic_stats.bytes_stats, common_proto.tot_num_values)
    bytes_proto.common_stats.CopyFrom(common_proto)
    result.bytes_stats.CopyFrom(bytes_proto)
  elif (final_type == feature_type.STRING or
        (is_categorical and final_type == feature_type.INT)):
    # Categorical INT features are summarised with string statistics.
    string_proto = _make_string_stats_proto(
        basic_stats.string_stats, leaf_num_values)
    string_proto.common_stats.CopyFrom(common_proto)
    result.string_stats.CopyFrom(string_proto)
  elif final_type == feature_type.STRUCT:
    result.struct_stats.common_stats.CopyFrom(common_proto)
  elif final_type in (feature_type.INT, feature_type.FLOAT):
    numeric_proto = _make_numeric_stats_proto(
        basic_stats.numeric_stats, leaf_num_values, num_histogram_buckets,
        num_quantiles_histogram_buckets, has_weights)
    numeric_proto.common_stats.CopyFrom(common_proto)
    result.num_stats.CopyFrom(numeric_proto)

  num_values_custom_stats = _make_num_values_custom_stats_proto(
      partial_common, num_values_histogram_buckets)
  result.custom_stats.extend(num_values_custom_stats)
  return result