def _make_feature_stats_proto(
    stats_values: Dict[Text, float],
    feature_path: types.FeaturePath) -> statistics_pb2.FeatureNameStatistics:
  """Builds the FeatureNameStatistics proto for one feature.

  Args:
    stats_values: Maps the name of each custom statistic to its numeric value
      for this feature. Ex. { 'Mutual Information': 0.5, 'Correlation': 0.1 }
    feature_path: The path of the feature.

  Returns:
    A FeatureNameStatistics proto containing the custom statistics for the
    feature.
  """
  proto = statistics_pb2.FeatureNameStatistics()
  proto.path.CopyFrom(feature_path.to_proto())
  # Emit the statistics alphabetically by name so the ordering of the
  # repeated custom_stats field is deterministic.
  for stat_name in sorted(stats_values):
    proto.custom_stats.add(name=stat_name, num=stats_values[stat_name])
  return proto
def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath,
    num_uniques: int,
) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto carrying the unique-count stat."""
  proto = statistics_pb2.FeatureNameStatistics()
  proto.path.CopyFrom(feature_path.to_proto())
  # The unique count lives on the string stats message.
  proto.string_stats.unique = num_uniques
  return proto
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount],
    is_categorical: bool,
    is_weighted_stats: bool,
    num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto holding the top-k statistics."""
  # Order a copy of the pairs by descending count. Where multiple feature
  # values share a count, the pair with the 'larger' feature value is treated
  # as larger for purposes of breaking the tie.
  ordered_pairs = sorted(
      top_k_values_pairs,
      key=lambda pair: (pair.count, pair.feature_value),
      reverse=True)
  proto = statistics_pb2.FeatureNameStatistics()
  proto.path.CopyFrom(feature_path.to_proto())
  # A categorical feature keeps its original INT type; otherwise report
  # STRING.
  if is_categorical:
    proto.type = statistics_pb2.FeatureNameStatistics.INT
  else:
    proto.type = statistics_pb2.FeatureNameStatistics.STRING
  # Weighted results go under weighted_string_stats; unweighted ones go
  # directly under string_stats.
  if is_weighted_stats:
    string_stats = proto.string_stats.weighted_string_stats
  else:
    string_stats = proto.string_stats
  for rank, (value, count) in enumerate(ordered_pairs):
    # Pairs are sorted by count, so once one falls below the threshold all
    # remaining pairs do too.
    if count < frequency_threshold:
      break
    # Replace bytes values that are not valid UTF-8 with a placeholder value.
    if isinstance(value, six.binary_type):
      decoded = stats_util.maybe_get_utf8(value)
      if decoded is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
      else:
        value = decoded
    elif not isinstance(value, six.text_type):
      value = str(value)
    if rank < num_top_values:
      top_value = string_stats.top_values.add()
      top_value.value = value
      top_value.frequency = count
    if rank < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = rank
      bucket.high_rank = rank
      bucket.sample_count = count
      bucket.label = value
  return proto
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries],
    y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
  """Builds the DatasetFeatureStatistics proto for one x_path, y_path pair.

  Args:
    lifts: The result of two successive group bys of lift values. The
      innermost grouping collects all the lift values for a given (slice,
      x_path and y_value) tuple (corresponding to a single LiftSeries
      message). The outermost grouping collects all the lift values for the
      same (slice, x_path) tuple (corresponding to the set of the LiftSeries
      which share the same value of y_path). The full structure of lifts is
      described by: (slice, x_path), [(y, y_count, [(x, lift, xy_count,
      x_count)])]
    y_path: The path used as Y in the lift expression:
      lift = P(Y=y|X=x) / P(Y=y).
    y_boundaries: Optionally, a set of bin boundaries used for binning y_path
      values.

  Returns:
    The populated DatasetFeatureStatistics proto.
  """
  key, lift_series_list = lifts
  stats = statistics_pb2.DatasetFeatureStatistics()
  cross_stats = stats.cross_features.add(
      path_x=key.x_path.to_proto(), path_y=y_path.to_proto())
  # Emit the LiftSeries in sorted order for deterministic output.
  for series in sorted(lift_series_list):
    series_proto = cross_stats.categorical_cross_stats.lift.lift_series.add(
        y_count=series.y_count)
    y = series.y
    if y_boundaries is not None:
      # When boundaries are provided, y identifies a bin; record the bin's
      # low/high boundary values instead of the raw y.
      low_value, high_value = bin_util.get_boundaries(y, y_boundaries)
      series_proto.y_bucket.low_value = low_value
      series_proto.y_bucket.high_value = high_value
    elif isinstance(y, six.string_types):
      series_proto.y_string = y
    else:
      series_proto.y_int = y
    # Drop duplicates where the top_k and bottom_k x values overlap, then
    # order by lift descending with ascending x as the tie breaker.
    unique_lift_values = {v.x: v for v in series.lift_values}
    for lift_value in sorted(
        unique_lift_values.values(), key=lambda v: (-v.lift, v.x)):
      value_proto = series_proto.lift_values.add(
          lift=lift_value.lift,
          x_count=lift_value.x_count,
          x_and_y_count=lift_value.xy_count)
      x = lift_value.x
      if isinstance(x, six.string_types):
        value_proto.x_string = x
      else:
        value_proto.x_int = x
  return key.slice_key, stats
def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath, num_uniques: int,
    is_categorical: bool) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto carrying the unique-count stat."""
  proto = statistics_pb2.FeatureNameStatistics()
  proto.path.CopyFrom(feature_path.to_proto())
  # A categorical feature keeps its original INT type; otherwise report
  # STRING.
  if is_categorical:
    proto.type = statistics_pb2.FeatureNameStatistics.INT
  else:
    proto.type = statistics_pb2.FeatureNameStatistics.STRING
  # The unique count lives on the string stats message regardless of type.
  proto.string_stats.unique = num_uniques
  return proto
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, Iterable[_LiftSeries]],
    y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray],
    weighted_examples: bool,
    output_custom_stats: bool
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
  """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

  Args:
    lifts: The result of two successive group bys of lift values. The innermost
      grouping collects all the lift values for a given (slice, x_path and
      y_value) tuple (corresponding to a single LiftSeries message). The
      outermost grouping collects all the lift values for the same (slice,
      x_path) tuple (corresponding to the set of the LiftSeries which share the
      same value of y_path). The full structure of lifts is described by:
        (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
    y_path: The path used as Y in the lift expression: lift = P(Y=y|X=x) /
      P(Y=y).
    y_boundaries: Optionally, a set of bin boundaries used for binning y_path
      values.
    weighted_examples: Whether lift is computed over weighted examples, in
      which case the proto will output weighted counts (as floats) rather than
      simple counts (as ints).
    output_custom_stats: Whether to output custom stats for use with Facets.

  Returns:
    The populated DatasetFeatureStatistics proto.
  """
  key, lift_series_list = lifts
  stats = statistics_pb2.DatasetFeatureStatistics()
  cross_stats = stats.cross_features.add(
      path_x=key.x_path.to_proto(), path_y=y_path.to_proto())
  if output_custom_stats:
    # Custom stats (rank histograms per y value) are attached to a features
    # entry for x_path, alongside the cross_features entry.
    feature_stats = stats.features.add(path=key.x_path.to_proto())
  # Iterate in sorted order so the repeated lift_series field is
  # deterministic.
  for lift_series in sorted(lift_series_list):
    lift_series_proto = (
        cross_stats.categorical_cross_stats.lift.lift_series.add())
    # y_count is written to the weighted or unweighted proto field depending
    # on whether lift was computed over weighted examples.
    if weighted_examples:
      lift_series_proto.weighted_y_count = lift_series.y_count
    else:
      lift_series_proto.y_count = lift_series.y_count
    y = lift_series.y
    if y_boundaries is not None and isinstance(y, int):
      # y identifies a bin; report the bin boundaries rather than the raw
      # index. y_display_val is the human-readable label used in the custom
      # stat name below.
      low_value, high_value = bin_util.get_boundaries(y, y_boundaries)
      lift_series_proto.y_bucket.low_value = low_value
      lift_series_proto.y_bucket.high_value = high_value
      y_display_fmt = '[{},{}]' if high_value == float('inf') else '[{},{})'
      y_display_val = y_display_fmt.format(low_value, high_value)
    elif isinstance(y, six.text_type):
      lift_series_proto.y_string = y
      y_display_val = y
    elif isinstance(y, six.binary_type):
      # Bytes values are converted to unicode (with substitution for
      # undecodable values inside _get_unicode_value).
      y_string = _get_unicode_value(y, y_path)
      lift_series_proto.y_string = y_string
      y_display_val = y_string
    else:
      lift_series_proto.y_int = y
      y_display_val = str(y)

    if output_custom_stats:
      # One rank histogram per y value; its buckets are filled in while
      # iterating the lift values below.
      hist = feature_stats.custom_stats.add(
          name='Lift (Y={})'.format(y_display_val)).rank_histogram

    # dedupe possibly overlapping top_k and bottom_k x values.
    lift_values_deduped = {v.x: v for v in lift_series.lift_values}
    # sort by lift DESC, x ASC
    lift_values_sorted = sorted(
        lift_values_deduped.values(), key=lambda v: (-v.lift, v.x))
    for lift_value in lift_values_sorted:
      lift_value_proto = lift_series_proto.lift_values.add(
          lift=lift_value.lift)
      # As with y_count, counts go to weighted or unweighted fields.
      if weighted_examples:
        lift_value_proto.weighted_x_count = lift_value.x_count
        lift_value_proto.weighted_x_and_y_count = lift_value.xy_count
      else:
        lift_value_proto.x_count = lift_value.x_count
        lift_value_proto.x_and_y_count = lift_value.xy_count
      x = lift_value.x
      if isinstance(x, six.text_type):
        lift_value_proto.x_string = x
        x_display_val = x
      elif isinstance(x, six.binary_type):
        x_string = _get_unicode_value(x, key.x_path)
        lift_value_proto.x_string = x_string
        x_display_val = x_string
      else:
        lift_value_proto.x_int = x
        x_display_val = str(x)

      if output_custom_stats:
        # Note: the histogram bucket's sample_count holds the lift value
        # itself, not a count.
        hist.buckets.add(label=x_display_val, sample_count=lift_value.lift)
  return key.slice_key, stats
def _make_feature_stats_proto(
    feature_path: types.FeaturePath,
    basic_stats: _PartialBasicStats,
    parent_basic_stats: Optional[_PartialBasicStats],
    num_values_q_combiner: quantiles_util.QuantilesCombiner,
    values_q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    is_categorical: bool,
    has_weights: bool) -> statistics_pb2.FeatureNameStatistics:
  """Convert the partial basic stats into a FeatureNameStatistics proto.

  Args:
    feature_path: The path of the feature.
    basic_stats: The partial basic stats associated with the feature.
    parent_basic_stats: The partial basic stats of the parent of the feature.
    num_values_q_combiner: The quantiles combiner used to construct the
      quantiles histogram for the number of values in the feature.
    values_q_combiner: The quantiles combiner used to construct the histogram
      for the values in the feature.
    num_values_histogram_buckets: Number of buckets in the quantiles histogram
      for the number of values per feature.
    num_histogram_buckets: Number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Number of buckets in a quantiles
      NumericStatistics.histogram.
    is_categorical: A boolean indicating whether the feature is categorical.
    has_weights: A boolean indicating whether a weight feature is specified.

  Returns:
    A statistics_pb2.FeatureNameStatistics proto.
  """
  # Create a new FeatureNameStatistics proto.
  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())

  # Set the feature type.
  # If we have a categorical feature, we preserve the type to be the original
  # INT type. Currently we don't set the type if we cannot infer it, which
  # happens when all the values are missing. We need to add an UNKNOWN type
  # to the stats proto to handle this case.
  if is_categorical:
    result.type = statistics_pb2.FeatureNameStatistics.INT
  elif basic_stats.common_stats.type is None:
    # If a feature is completely missing, we assume the type to be STRING.
    result.type = statistics_pb2.FeatureNameStatistics.STRING
  else:
    result.type = basic_stats.common_stats.type

  # Construct common statistics proto.
  common_stats_proto = _make_common_stats_proto(
      basic_stats.common_stats,
      parent_basic_stats.common_stats
      if parent_basic_stats is not None else None,
      num_values_q_combiner,
      num_values_histogram_buckets,
      has_weights)

  # Copy the common stats into appropriate numeric/string stats.
  # If the type is not set, we currently wrap the common stats
  # within numeric stats.
  if (is_categorical or
      result.type == statistics_pb2.FeatureNameStatistics.STRING):
    # Construct string statistics proto.
    string_stats_proto = _make_string_stats_proto(
        basic_stats.string_stats, basic_stats.common_stats.total_num_values)
    # Add the common stats into string stats.
    string_stats_proto.common_stats.CopyFrom(common_stats_proto)
    result.string_stats.CopyFrom(string_stats_proto)
  elif result.type == statistics_pb2.FeatureNameStatistics.STRUCT:
    # STRUCT features only carry common stats.
    result.struct_stats.common_stats.CopyFrom(common_stats_proto)
  elif result.type in (statistics_pb2.FeatureNameStatistics.INT,
                       statistics_pb2.FeatureNameStatistics.FLOAT):
    # Construct numeric statistics proto.
    numeric_stats_proto = _make_numeric_stats_proto(
        basic_stats.numeric_stats, basic_stats.common_stats.total_num_values,
        values_q_combiner, num_histogram_buckets,
        num_quantiles_histogram_buckets, has_weights)
    # Add the common stats into numeric stats.
    numeric_stats_proto.common_stats.CopyFrom(common_stats_proto)
    result.num_stats.CopyFrom(numeric_stats_proto)
  return result
def make_feature_stats_proto_with_topk_stats(
    feature_path: types.FeaturePath,
    top_k_value_count_list: List[FeatureValueCount],
    is_categorical: bool,
    is_weighted_stats: bool,
    num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int
) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the top-k stats.

  Args:
    feature_path: The path of the feature.
    top_k_value_count_list: A list of FeatureValueCount tuples.
    is_categorical: Whether the feature is categorical.
    is_weighted_stats: Whether top_k_value_count_list incorporates weights.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    frequency_threshold: The minimum number of examples (possibly weighted)
      the most frequent values must be present in.
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.

  Returns:
    A FeatureNameStatistics proto containing the top-k stats.
  """
  # Sort (a copy of) top_k_value_count_list in descending order by count.
  # Where multiple feature values have the same count, consider the feature
  # with the 'larger' feature value to be larger for purposes of breaking the
  # tie. Sorting a copy avoids mutating the caller's list as a side effect
  # (consistent with _make_feature_stats_proto_topk).
  top_k_value_count_list = sorted(
      top_k_value_count_list,
      key=lambda counts: (counts[1], counts[0]),
      reverse=True)
  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical
                 else statistics_pb2.FeatureNameStatistics.STRING)
  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats
  for i, (value, count) in enumerate(top_k_value_count_list):
    # The list is sorted by count, so all remaining entries are also below
    # the threshold.
    if count < frequency_threshold:
      break
    # Check if we have a valid utf-8 string. If not, assign a default invalid
    # string value.
    if isinstance(value, six.binary_type):
      decoded_value = stats_util.maybe_get_utf8(value)
      if decoded_value is None:
        # Bug fix: log the original undecodable bytes. Previously `value` was
        # overwritten with None before being logged, so the warning always
        # printed "None" instead of the offending bytes.
        logging.warning(
            'Feature "%s" has bytes value "%s" which cannot be '
            'decoded as a UTF-8 string.', feature_path, value)
        value = _INVALID_STRING
      else:
        value = decoded_value
    elif not isinstance(value, six.text_type):
      value = str(value)
    if i < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if i < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = i
      bucket.high_rank = i
      bucket.sample_count = count
      bucket.label = value
  return result
def _make_feature_stats_proto( feature_path: types.FeaturePath, basic_stats: _PartialBasicStats, parent_basic_stats: Optional[_PartialBasicStats], make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch], num_values_histogram_buckets: int, num_histogram_buckets: int, num_quantiles_histogram_buckets: int, is_bytes: bool, is_categorical: bool, has_weights: bool ) -> statistics_pb2.FeatureNameStatistics: """Convert the partial basic stats into a FeatureNameStatistics proto. Args: feature_path: The path of the feature. basic_stats: The partial basic stats associated with the feature. parent_basic_stats: The partial basic stats of the parent of the feature. make_quantiles_sketch_fn: A callable to create a quantiles sketch. num_values_histogram_buckets: Number of buckets in the quantiles histogram for the number of values per feature. num_histogram_buckets: Number of buckets in a standard NumericStatistics.histogram with equal-width buckets. num_quantiles_histogram_buckets: Number of buckets in a quantiles NumericStatistics.histogram. is_bytes: A boolean indicating whether the feature is bytes. is_categorical: A boolean indicating whether the feature is categorical. has_weights: A boolean indicating whether a weight feature is specified. Returns: A statistics_pb2.FeatureNameStatistics proto. """ # Create a new FeatureNameStatistics proto. result = statistics_pb2.FeatureNameStatistics() result.path.CopyFrom(feature_path.to_proto()) # Set the feature type. inferred_type = basic_stats.common_stats.type if inferred_type is not None: # The user claims the feature to be BYTES. Only trust them if the inferred # type is STRING (which means the actual data is in strings/bytes). We # never infer BYTES. if (is_bytes and inferred_type == statistics_pb2.FeatureNameStatistics.STRING): result.type = statistics_pb2.FeatureNameStatistics.BYTES else: result.type = basic_stats.common_stats.type # The inferred type being None means we don't see any value for this feature. 
# We trust user's claim. elif is_bytes: result.type = statistics_pb2.FeatureNameStatistics.BYTES elif is_categorical: result.type = statistics_pb2.FeatureNameStatistics.INT else: # We don't have an "unknown" type so use STRING here. result.type = statistics_pb2.FeatureNameStatistics.STRING # Construct common statistics proto. common_stats_proto = _make_common_stats_proto( basic_stats.common_stats, parent_basic_stats.common_stats if parent_basic_stats is not None else None, make_quantiles_sketch_fn, num_values_histogram_buckets, has_weights) # this is the total number of values at the leaf level. total_num_values = ( 0 if basic_stats.common_stats.presence_and_valency_stats is None else basic_stats.common_stats.presence_and_valency_stats[-1].total_num_values) # Copy the common stats into appropriate numeric/string stats. # If the type is not set, we currently wrap the common stats # within numeric stats. if result.type == statistics_pb2.FeatureNameStatistics.BYTES: # Construct bytes statistics proto. bytes_stats_proto = _make_bytes_stats_proto( basic_stats.bytes_stats, common_stats_proto.tot_num_values) # Add the common stats into bytes stats. bytes_stats_proto.common_stats.CopyFrom(common_stats_proto) result.bytes_stats.CopyFrom(bytes_stats_proto) if (result.type == statistics_pb2.FeatureNameStatistics.STRING or (is_categorical and result.type == statistics_pb2.FeatureNameStatistics.INT)): # Construct string statistics proto. string_stats_proto = _make_string_stats_proto(basic_stats.string_stats, total_num_values) # Add the common stats into string stats. string_stats_proto.common_stats.CopyFrom(common_stats_proto) result.string_stats.CopyFrom(string_stats_proto) elif result.type == statistics_pb2.FeatureNameStatistics.STRUCT: result.struct_stats.common_stats.CopyFrom(common_stats_proto) elif result.type in (statistics_pb2.FeatureNameStatistics.INT, statistics_pb2.FeatureNameStatistics.FLOAT): # Construct numeric statistics proto. 
numeric_stats_proto = _make_numeric_stats_proto( basic_stats.numeric_stats, total_num_values, num_histogram_buckets, num_quantiles_histogram_buckets, has_weights) # Add the common stats into numeric stats. numeric_stats_proto.common_stats.CopyFrom(common_stats_proto) result.num_stats.CopyFrom(numeric_stats_proto) result.custom_stats.extend(_make_num_values_custom_stats_proto( basic_stats.common_stats, num_values_histogram_buckets)) return result
def make_feature_stats_proto_topk_uniques_custom_stats(
    feature_path: types.FeaturePath,
    is_categorical: bool,
    num_top_values: int,
    num_rank_histogram_buckets: int,
    num_unique: int,
    value_count_list: List[FeatureValueCount],
    weighted_value_count_list: Optional[List[FeatureValueCount]] = None,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: Optional[float] = None
) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto with top-k and uniques custom stats.

  Args:
    feature_path: The path of the feature.
    is_categorical: Whether the feature is categorical.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.
    num_unique: The number of unique values in the feature.
    value_count_list: A list of FeatureValueCount tuples.
    weighted_value_count_list: An optional list of FeatureValueCount tuples
      for weighted features.
    frequency_threshold: The minimum number of examples the most frequent
      values must be present in.
    weighted_frequency_threshold: The minimum weighted number of examples the
      most frequent weighted values must be present in. Optional.

  Returns:
    A FeatureNameStatistics proto containing the top-k and uniques stats.
  """
  proto = statistics_pb2.FeatureNameStatistics()
  proto.path.CopyFrom(feature_path.to_proto())
  # A categorical feature keeps its original INT type; otherwise report
  # STRING.
  if is_categorical:
    proto.type = statistics_pb2.FeatureNameStatistics.INT
  else:
    proto.type = statistics_pb2.FeatureNameStatistics.STRING
  # Build the unweighted top-k stats, then surface their rank histogram as a
  # custom stat.
  unweighted_topk = _make_feature_stats_proto_topk(
      feature_path, value_count_list, is_categorical, False, num_top_values,
      frequency_threshold, num_rank_histogram_buckets)
  proto.custom_stats.add(
      name=_TOPK_SKETCH_CUSTOM_STATS_NAME).rank_histogram.CopyFrom(
          unweighted_topk.string_stats.rank_histogram)
  # When weighted counts were provided, also surface the weighted rank
  # histogram as a custom stat.
  if weighted_value_count_list:
    assert weighted_frequency_threshold is not None
    weighted_topk = _make_feature_stats_proto_topk(
        feature_path, weighted_value_count_list, is_categorical, True,
        num_top_values, weighted_frequency_threshold,
        num_rank_histogram_buckets)
    proto.custom_stats.add(
        name=_WEIGHTED_TOPK_SKETCH_CUSTOM_STATS_NAME).rank_histogram.CopyFrom(
            weighted_topk.string_stats.weighted_string_stats.rank_histogram)
  # Finally, record the number of unique values.
  proto.custom_stats.add(
      name=_UNIQUES_SKETCH_CUSTOM_STATS_NAME, num=num_unique)
  return proto