def _make_feature_stats_proto(numeric_stats, feature_name, quantiles_combiner,
                              num_histogram_buckets,
                              num_quantiles_histogram_buckets):
    """Build a FeatureNameStatistics proto from partial numeric statistics.

    Args:
        numeric_stats: Partial numeric statistics of a single feature. The
            attributes read here are total_num_values, sum, sum_of_squares,
            num_zeros, num_nan, min, max, quantiles_summary and type.
        feature_name: Name stored on the returned proto.
        quantiles_combiner: Object whose extract_output() converts
            numeric_stats.quantiles_summary into the quantiles used for the
            median and for both histograms.
        num_histogram_buckets: Bucket count forwarded to
            quantiles_util.generate_equi_width_histogram.
        num_quantiles_histogram_buckets: Bucket count forwarded to
            quantiles_util.generate_quantiles_histogram.

    Returns:
        A statistics_pb2.FeatureNameStatistics carrying the feature name, the
        feature type and the numeric statistics. The numeric statistics are
        left at their defaults when the feature has no values.
    """
    num_stats = statistics_pb2.NumericStatistics()
    value_count = numeric_stats.total_num_values

    # Every statistic below divides by, or is derived from, the value count,
    # so nothing is filled in for a feature without values.
    if value_count > 0:
        mean = numeric_stats.sum / value_count
        mean_of_squares = numeric_stats.sum_of_squares / value_count
        # Clamp at zero before the square root is taken.
        variance = max(0, mean_of_squares - mean * mean)

        num_stats.mean = float(mean)
        num_stats.std_dev = math.sqrt(variance)
        num_stats.num_zeros = numeric_stats.num_zeros
        num_stats.min = float(numeric_stats.min)
        num_stats.max = float(numeric_stats.max)

        # The quantiles feed the median and both histograms.
        quantiles = quantiles_combiner.extract_output(
            numeric_stats.quantiles_summary)
        num_stats.median = float(quantiles_util.find_median(quantiles))

        # First histogram: equi-width buckets.
        equi_width = quantiles_util.generate_equi_width_histogram(
            quantiles, numeric_stats.min, numeric_stats.max,
            value_count, num_histogram_buckets)
        equi_width.num_nan = numeric_stats.num_nan
        num_stats.histograms.add().CopyFrom(equi_width)

        # Second histogram: quantile buckets.
        by_quantile = quantiles_util.generate_quantiles_histogram(
            quantiles, numeric_stats.min, numeric_stats.max,
            value_count, num_quantiles_histogram_buckets)
        by_quantile.num_nan = numeric_stats.num_nan
        num_stats.histograms.add().CopyFrom(by_quantile)

    # Wrap the numeric statistics into the per-feature proto.
    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    result.type = numeric_stats.type
    result.num_stats.CopyFrom(num_stats)
    return result
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split.

    Iterates once over every example of the split and records, per feature,
    the number of examples it appears in, its shape, its dtype and (for
    numeric features) the smallest and largest value seen.

    Args:
        builder: Object whose as_dataset(split=...) returns an iterable of
            examples. Each example must be a dict (asserted) whose values
            expose .shape, .dtype and .numpy().
        split: Split identifier forwarded to builder.as_dataset.

    Returns:
        A (statistics, schema) tuple holding a
        statistics_pb2.DatasetFeatureStatistics and a schema_pb2.Schema.

    Raises:
        AssertionError: If an example is not a dict, or a feature's shape or
            dtype differs between examples.
    """
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Make this to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_shape = {}
    feature_to_dtype = {}
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    for example in dataset:
        statistics.num_examples += 1

        assert isinstance(example, dict)

        for feature_name in example:
            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_tensor = example[feature_name]
            feature_shape = feature_tensor.shape
            feature_dtype = feature_tensor.dtype
            feature_np = feature_tensor.numpy()

            is_numeric = (
                feature_dtype.is_floating or feature_dtype.is_integer or
                feature_dtype.is_bool)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount?
            # Add logic for that.

            # Set the shape, or assert shapes match.
            if feature_name not in feature_to_shape:
                feature_to_shape[feature_name] = feature_shape
            else:
                assert feature_to_shape[feature_name] == feature_shape

            # Set the dtype, or assert dtypes match.
            if feature_name not in feature_to_dtype:
                feature_to_dtype[feature_name] = feature_dtype
            else:
                assert feature_to_dtype[feature_name] == feature_dtype

            # np.min / np.max raise ValueError on zero-size input, so an
            # empty value contributes nothing to the running min / max.
            if not is_numeric or np.size(feature_np) == 0:
                continue

            feature_min = np.min(feature_np)
            feature_max = np.max(feature_np)

            # Set or update the min, max.
            if ((feature_name not in feature_to_min) or
                    (feature_to_min[feature_name] > feature_min)):
                feature_to_min[feature_name] = feature_min

            if ((feature_name not in feature_to_max) or
                    (feature_to_max[feature_name] < feature_max)):
                feature_to_max[feature_name] = feature_max

    # Start here, we've processed all examples.

    # Assert that the keys match up.
    assert feature_to_shape.keys() == feature_to_dtype.keys()
    assert feature_to_shape.keys() == feature_to_num_examples.keys()

    for feature_name in feature_to_shape:
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): What do we do for non fixed size shapes?
        # What to do for scalars?
        for dim in feature_to_shape[feature_name].as_list():
            feature.shape.dim.add().size = dim

        feature_type = feature_to_dtype[feature_name]
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = (
            feature_to_num_examples[feature_name])
        common_statistics.num_missing = (
            statistics.num_examples - common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added
        # to the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
            numeric_statistics = statistics_pb2.NumericStatistics()
            # min / max are only recorded when at least one example held a
            # non-empty numeric value; otherwise the proto defaults remain.
            if feature_name in feature_to_min:
                numeric_statistics.min = float(feature_to_min[feature_name])
                numeric_statistics.max = float(feature_to_max[feature_name])
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
def _make_numeric_stats_proto(
        numeric_stats, total_num_values, quantiles_combiner,
        num_histogram_buckets, num_quantiles_histogram_buckets, has_weights):
    """Convert the partial numeric statistics into NumericStatistics proto.

    Args:
        numeric_stats: Partial numeric statistics of a single feature. The
            attributes read here are num_nan, sum, sum_of_squares, num_zeros,
            min, max and quantiles_summary, plus weighted_total_num_values,
            weighted_sum, weighted_sum_of_squares and
            weighted_quantiles_summary when has_weights is true.
        total_num_values: Number of values of the feature; the NaN count is
            subtracted from it before any statistic is computed.
        quantiles_combiner: Object whose extract_output() converts a
            quantiles summary into the quantiles used for the median and the
            histograms.
        num_histogram_buckets: Bucket count forwarded to
            quantiles_util.generate_equi_width_histogram.
        num_quantiles_histogram_buckets: Bucket count forwarded to
            quantiles_util.generate_quantiles_histogram.
        has_weights: Whether weighted statistics are added to the result.

    Returns:
        A statistics_pb2.NumericStatistics. When no non-NaN value remains the
        proto only carries the NaN count (if any) on one STANDARD and one
        QUANTILES histogram.
    """
    result = statistics_pb2.NumericStatistics()

    if numeric_stats.num_nan > 0:
        total_num_values -= numeric_stats.num_nan

    # Set the stats in the proto only if we have at least one value for the
    # feature.
    if total_num_values == 0:
        # If we only have nan values, we only set num_nan so that the count
        # is not silently dropped from the output.
        if numeric_stats.num_nan > 0:
            for histogram_type in (statistics_pb2.Histogram.STANDARD,
                                   statistics_pb2.Histogram.QUANTILES):
                result.histograms.add(type=histogram_type).num_nan = (
                    numeric_stats.num_nan)
        return result

    mean = numeric_stats.sum / total_num_values
    # Clamp at zero before the square root is taken.
    variance = max(
        0, (numeric_stats.sum_of_squares / total_num_values) - mean * mean)
    result.mean = float(mean)
    result.std_dev = math.sqrt(variance)
    result.num_zeros = numeric_stats.num_zeros
    result.min = float(numeric_stats.min)
    result.max = float(numeric_stats.max)

    # Extract the quantiles from the summary.
    quantiles = quantiles_combiner.extract_output(
        numeric_stats.quantiles_summary)

    # Find the median from the quantiles and update the numeric stats proto.
    result.median = float(quantiles_util.find_median(quantiles))

    # Construct the equi-width histogram from the quantiles and add it to the
    # numeric stats proto.
    std_histogram = quantiles_util.generate_equi_width_histogram(
        quantiles, numeric_stats.min, numeric_stats.max,
        total_num_values, num_histogram_buckets)
    std_histogram.num_nan = numeric_stats.num_nan
    new_std_histogram = result.histograms.add()
    new_std_histogram.CopyFrom(std_histogram)

    # Construct the quantiles histogram from the quantiles and add it to the
    # numeric stats proto.
    q_histogram = quantiles_util.generate_quantiles_histogram(
        quantiles, numeric_stats.min, numeric_stats.max,
        total_num_values, num_quantiles_histogram_buckets)
    q_histogram.num_nan = numeric_stats.num_nan
    new_q_histogram = result.histograms.add()
    new_q_histogram.CopyFrom(q_histogram)

    # Add weighted numeric stats to the proto.
    if has_weights:
        weighted_numeric_stats_proto = (
            statistics_pb2.WeightedNumericStatistics())

        # Avoid dividing by a zero total weight.
        if numeric_stats.weighted_total_num_values == 0:
            weighted_mean = 0
            weighted_variance = 0
        else:
            weighted_mean = (numeric_stats.weighted_sum /
                             numeric_stats.weighted_total_num_values)
            weighted_variance = max(
                0, (numeric_stats.weighted_sum_of_squares /
                    numeric_stats.weighted_total_num_values)
                - weighted_mean**2)
        weighted_numeric_stats_proto.mean = weighted_mean
        weighted_numeric_stats_proto.std_dev = math.sqrt(weighted_variance)

        # Extract the weighted quantiles from the summary.
        weighted_quantiles = quantiles_combiner.extract_output(
            numeric_stats.weighted_quantiles_summary)

        # Find the weighted median from the quantiles and update the proto.
        weighted_numeric_stats_proto.median = float(
            quantiles_util.find_median(weighted_quantiles))

        # Construct the weighted equi-width histogram from the quantiles and
        # add it to the numeric stats proto.
        weighted_std_histogram = quantiles_util.generate_equi_width_histogram(
            weighted_quantiles, numeric_stats.min, numeric_stats.max,
            numeric_stats.weighted_total_num_values, num_histogram_buckets)
        weighted_std_histogram.num_nan = numeric_stats.num_nan
        weighted_numeric_stats_proto.histograms.extend(
            [weighted_std_histogram])

        # Construct the weighted quantiles histogram from the quantiles and
        # add it to the numeric stats proto.
        weighted_q_histogram = quantiles_util.generate_quantiles_histogram(
            weighted_quantiles, numeric_stats.min, numeric_stats.max,
            numeric_stats.weighted_total_num_values,
            num_quantiles_histogram_buckets)
        weighted_q_histogram.num_nan = numeric_stats.num_nan
        weighted_numeric_stats_proto.histograms.extend([weighted_q_histogram])

        result.weighted_numeric_stats.CopyFrom(weighted_numeric_stats_proto)
    return result
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split.

    Iterates once over every example of the split (as numpy values) and
    records, per feature, the number of examples it appears in and, for
    numeric features, the smallest and largest value seen. Shapes and types
    for the schema are taken from the dataset's output_shapes / output_types.

    Args:
        builder: Object whose as_dataset(split=...) returns the dataset to
            scan. The dataset must expose output_shapes and output_types and
            yield dict examples (asserted) once passed through
            dataset_utils.dataset_as_numpy.
        split: Split identifier forwarded to builder.as_dataset.

    Returns:
        A (statistics, schema) tuple holding a
        statistics_pb2.DatasetFeatureStatistics and a schema_pb2.Schema.
        Features whose output shape is not a tf.TensorShape (nested
        structures) get a name-only schema entry and no statistics.

    Raises:
        AssertionError: If an example is not a dict.
    """
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Make this to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.dataset_as_numpy(dataset)
    for example in tqdm.tqdm(np_dataset, unit=" examples"):
        statistics.num_examples += 1

        assert isinstance(example, dict)

        for feature_name in sorted(example):
            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility in graph and eager mode, we can get PODs here
            # and everything may not be neatly wrapped up in numpy's ndarray.
            feature_dtype = type(feature_np)
            if isinstance(feature_np, np.ndarray):
                feature_dtype = feature_np.dtype.type

            is_numeric = (np.issubdtype(feature_dtype, np.number) or
                          feature_dtype == np.bool_)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount?
            # Add logic for that.

            # np.min / np.max raise ValueError on zero-size input, so an
            # empty value contributes nothing to the running min / max.
            if not is_numeric or np.size(feature_np) == 0:
                continue

            feature_min = np.min(feature_np)
            feature_max = np.max(feature_np)

            # Set or update the min, max.
            if ((feature_name not in feature_to_min) or
                    (feature_to_min[feature_name] > feature_min)):
                feature_to_min[feature_name] = feature_min

            if ((feature_name not in feature_to_max) or
                    (feature_to_max[feature_name] < feature_max)):
                feature_to_max[feature_name] = feature_max

    # Start here, we've processed all examples.

    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the
        # Schema proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            # NOTE: the name-only schema entry added above is kept, as before.
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto. Compare against
            # None explicitly so a genuine zero-size dimension stays 0.
            feature.shape.dim.add().size = dim if dim is not None else -1

        feature_type = output_types_dict[feature_name]
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = (
            feature_to_num_examples[feature_name])
        common_statistics.num_missing = (
            statistics.num_examples - common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added
        # to the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
            numeric_statistics = statistics_pb2.NumericStatistics()
            # min / max are only recorded when at least one example held a
            # non-empty numeric value; otherwise the proto defaults remain.
            if feature_name in feature_to_min:
                numeric_statistics.min = float(feature_to_min[feature_name])
                numeric_statistics.max = float(feature_to_max[feature_name])
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
def _make_feature_stats_proto(common_stats, feature_name, q_combiner,
                              num_values_histogram_buckets, is_categorical,
                              has_weights):
    """Convert the partial common stats into a FeatureNameStatistics proto.

    Args:
        common_stats: The partial common stats associated with a feature.
        feature_name: The name of the feature.
        q_combiner: The quantiles combiner used to construct the quantiles
            histogram for the number of values in the feature.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto.
    """
    feature_stats_pb2 = statistics_pb2.FeatureNameStatistics
    present = common_stats.num_non_missing

    common_proto = statistics_pb2.CommonStatistics()
    common_proto.num_non_missing = present
    common_proto.num_missing = common_stats.num_missing
    common_proto.tot_num_values = common_stats.total_num_values

    # Per-example value counts only exist when the feature is present in at
    # least one example.
    if present > 0:
        common_proto.min_num_values = common_stats.min_num_values
        common_proto.max_num_values = common_stats.max_num_values
        common_proto.avg_num_values = common_stats.total_num_values / present

        # Quantiles histogram over the number of values per example.
        quantiles = q_combiner.extract_output(common_stats.num_values_summary)
        num_values_histogram = quantiles_util.generate_quantiles_histogram(
            quantiles, common_stats.min_num_values,
            common_stats.max_num_values, present,
            num_values_histogram_buckets)
        common_proto.num_values_histogram.CopyFrom(num_values_histogram)

    # Weighted counterpart of the common stats.
    if has_weights:
        weighted_present = common_stats.weighted_num_non_missing
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_present,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=common_stats.weighted_total_num_values)
        if weighted_present > 0:
            weighted_proto.avg_num_values = (
                common_stats.weighted_total_num_values / weighted_present)
        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    # Decide the feature type: categorical features are reported as INT, a
    # feature without an inferred type (common_stats.type is None) is
    # reported as STRING, anything else keeps its recorded type.
    if is_categorical:
        feature_type = feature_stats_pb2.INT
    elif common_stats.type is None:
        feature_type = feature_stats_pb2.STRING
    else:
        feature_type = common_stats.type

    result = feature_stats_pb2()
    result.name = feature_name
    result.type = feature_type

    # STRING and categorical features carry the common stats inside string
    # stats; every other feature carries them inside numeric stats.
    wrap_in_string_stats = (
        is_categorical or result.type == feature_stats_pb2.STRING)
    if wrap_in_string_stats:
        string_proto = statistics_pb2.StringStatistics()
        string_proto.common_stats.CopyFrom(common_proto)
        result.string_stats.CopyFrom(string_proto)
    else:
        numeric_proto = statistics_pb2.NumericStatistics()
        numeric_proto.common_stats.CopyFrom(common_proto)
        result.num_stats.CopyFrom(numeric_proto)

    return result
def _make_numeric_stats_proto(
    numeric_stats: _PartialNumericStats,
    total_num_values: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.NumericStatistics:
    """Convert the partial numeric statistics into NumericStatistics proto.

    Args:
        numeric_stats: Partial numeric statistics of a single feature.
        total_num_values: Number of values of the feature; the NaN count is
            subtracted from it before any statistic is computed.
        num_histogram_buckets: Bucket count of the equi-width histogram.
        num_quantiles_histogram_buckets: Bucket count of the quantiles
            histogram.
        has_weights: Whether weighted statistics are added to the result.

    Returns:
        The populated NumericStatistics proto. When no non-NaN value remains
        it only carries the NaN count (if any) on one STANDARD and one
        QUANTILES histogram.
    """
    result = statistics_pb2.NumericStatistics()
    nan_count = numeric_stats.num_nan

    if nan_count > 0:
        total_num_values -= nan_count

    if total_num_values == 0:
        # Only NaN values (or nothing at all): record just the NaN count.
        if nan_count > 0:
            for histogram_type in (statistics_pb2.Histogram.STANDARD,
                                   statistics_pb2.Histogram.QUANTILES):
                result.histograms.add(type=histogram_type).num_nan = nan_count
        return result

    mean = numeric_stats.sum / total_num_values
    mean_of_squares = numeric_stats.sum_of_squares / total_num_values
    # Clamp at zero before the square root is taken.
    variance = max(0, mean_of_squares - mean * mean)

    result.mean = float(mean)
    result.std_dev = math.sqrt(variance)
    result.num_zeros = numeric_stats.num_zeros
    result.min = float(numeric_stats.min)
    result.max = float(numeric_stats.max)

    # Pull the quantiles out of the summary; the same number of quantiles is
    # requested for the unweighted and the weighted summary.
    assert numeric_stats.quantiles_summary is not None
    num_quantiles = max(
        num_quantiles_histogram_buckets,
        _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
    quantiles = (
        numeric_stats.quantiles_summary.GetQuantiles(num_quantiles)
        .flatten().to_pylist())

    result.median = float(quantiles_util.find_median(quantiles))

    # Equi-width histogram, bounded by the finite min / max.
    equi_width = quantiles_util.generate_equi_width_histogram(
        quantiles, numeric_stats.finite_min, numeric_stats.finite_max,
        total_num_values, num_histogram_buckets)
    equi_width.num_nan = nan_count
    result.histograms.add().CopyFrom(equi_width)

    # Quantiles histogram.
    by_quantile = quantiles_util.generate_quantiles_histogram(
        quantiles, total_num_values, num_quantiles_histogram_buckets)
    by_quantile.num_nan = nan_count
    result.histograms.add().CopyFrom(by_quantile)

    if not has_weights:
        return result

    # Weighted numeric stats.
    weighted_proto = statistics_pb2.WeightedNumericStatistics()
    total_weight = numeric_stats.weighted_total_num_values

    # Avoid dividing by a zero total weight.
    if total_weight == 0:
        weighted_mean = 0
        weighted_variance = 0
    else:
        weighted_mean = numeric_stats.weighted_sum / total_weight
        weighted_variance = max(
            0,
            (numeric_stats.weighted_sum_of_squares / total_weight)
            - weighted_mean**2)
    weighted_proto.mean = weighted_mean
    weighted_proto.std_dev = math.sqrt(weighted_variance)

    # Weighted quantiles and median.
    assert numeric_stats.weighted_quantiles_summary is not None
    weighted_quantiles = (
        numeric_stats.weighted_quantiles_summary.GetQuantiles(num_quantiles)
        .flatten().to_pylist())
    weighted_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Weighted equi-width histogram.
    weighted_equi_width = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles, numeric_stats.finite_min,
        numeric_stats.finite_max, total_weight, num_histogram_buckets)
    weighted_equi_width.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_equi_width])

    # Weighted quantiles histogram.
    weighted_by_quantile = quantiles_util.generate_quantiles_histogram(
        weighted_quantiles, total_weight, num_quantiles_histogram_buckets)
    weighted_by_quantile.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_by_quantile])

    result.weighted_numeric_stats.CopyFrom(weighted_proto)
    return result