def _run_quantiles_combiner_test(
    test: absltest.TestCase,
    q_combiner: quantiles_util.QuantilesCombiner,
    batches: List[List[np.ndarray]],
    expected_result: np.ndarray):
    """Runs a quantiles combiner over `batches` and checks its output.

    Each batch is added to its own freshly created accumulator, all
    accumulators are merged, and the extracted output is compared against
    `expected_result`.

    Args:
      test: Object providing `assertEqual` and `assertAlmostEqual`.
      q_combiner: Combiner under test; must provide `create_accumulator`,
        `add_input`, `merge_accumulators` and `extract_output`.
      batches: Inputs; one accumulator is built per element.
      expected_result: Array the extracted output must match in dtype, size
        and (approximately) element values.
    """
    # One accumulator per batch, so that merge_accumulators is exercised too.
    accumulators = []
    for batch in batches:
        fresh_accumulator = q_combiner.create_accumulator()
        accumulators.append(q_combiner.add_input(fresh_accumulator, batch))

    merged = q_combiner.merge_accumulators(accumulators)
    actual = q_combiner.extract_output(merged)

    test.assertEqual(actual.dtype, expected_result.dtype)
    test.assertEqual(actual.size, expected_result.size)
    # Element-wise approximate comparison, indexed by position.
    for index in range(expected_result.size):
        test.assertAlmostEqual(actual[index], expected_result[index])
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
    """Convert the partial common stats into a CommonStatistics proto.

    Args:
      common_stats: Partial common statistics to convert.
      parent_common_stats: Partial common statistics of the parent, or None.
        When provided, the missing counts are computed as the parent's total
        number of values minus this feature's number of non-missing values.
      q_combiner: Combiner used to extract quantiles from
        `common_stats.num_values_summary`.
      num_values_histogram_buckets: Number of buckets requested for the
        num-values quantiles histogram.
      has_weights: If true, `weighted_common_stats` is populated as well.

    Returns:
      The populated CommonStatistics proto.
    """
    result = statistics_pb2.CommonStatistics()
    result.num_non_missing = common_stats.num_non_missing
    if parent_common_stats is not None:
        result.num_missing = (
            parent_common_stats.total_num_values - common_stats.num_non_missing)
    result.tot_num_values = common_stats.total_num_values

    # TODO(b/79685042): Need to decide on what is the expected values for
    # statistics like min_num_values, max_num_values, avg_num_values, when
    # all the values for the feature are missing.
    if common_stats.num_non_missing > 0:
        result.min_num_values = common_stats.min_num_values
        result.max_num_values = common_stats.max_num_values
        result.avg_num_values = (
            common_stats.total_num_values / common_stats.num_non_missing)

        # Add num_values_histogram to the common stats proto. The histogram is
        # only built when there is at least one non-missing value.
        num_values_quantiles = q_combiner.extract_output(
            common_stats.num_values_summary)
        # Pass the min/max bounds so the call uses the same
        # (quantiles, min, max, total_count, num_buckets) argument layout as
        # the other generate_quantiles_histogram call sites in this file.
        histogram = quantiles_util.generate_quantiles_histogram(
            num_values_quantiles,
            common_stats.min_num_values,
            common_stats.max_num_values,
            common_stats.num_non_missing,
            num_values_histogram_buckets)
        result.num_values_histogram.CopyFrom(histogram)

    # Add weighted common stats to the proto.
    if has_weights:
        weighted_common_stats_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=common_stats.weighted_num_non_missing,
            tot_num_values=common_stats.weighted_total_num_values)
        if parent_common_stats is not None:
            weighted_common_stats_proto.num_missing = (
                parent_common_stats.weighted_total_num_values -
                common_stats.weighted_num_non_missing)

        # Guard against division by zero when no weighted value is present.
        if common_stats.weighted_num_non_missing > 0:
            weighted_common_stats_proto.avg_num_values = (
                common_stats.weighted_total_num_values /
                common_stats.weighted_num_non_missing)

        result.weighted_common_stats.CopyFrom(weighted_common_stats_proto)
    return result
def _make_numeric_stats_proto(
    numeric_stats: _PartialNumericStats,
    total_num_values: int,
    quantiles_combiner: quantiles_util.QuantilesCombiner,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    has_weights: bool) -> statistics_pb2.NumericStatistics:
    """Build a NumericStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: Partial numeric statistics to convert.
      total_num_values: Total number of values; the NaN count is subtracted
        from it before any statistic is computed.
      quantiles_combiner: Combiner used to extract quantiles from the
        (weighted) quantiles summaries.
      num_histogram_buckets: Number of buckets for the equi-width histograms.
      num_quantiles_histogram_buckets: Number of buckets for the quantiles
        histograms.
      has_weights: If true, `weighted_numeric_stats` is populated as well.

    Returns:
      The populated NumericStatistics proto. When no non-NaN value exists,
      the proto only carries `num_nan` on two placeholder histograms (and
      is empty if there are no NaN values either).
    """
    stats_proto = statistics_pb2.NumericStatistics()
    nan_count = numeric_stats.num_nan

    # NaN values do not take part in the mean, variance or histogram counts.
    if nan_count > 0:
        total_num_values -= nan_count

    if total_num_values == 0:
        # If we only have nan values, we only set num_nan.
        if nan_count > 0:
            for histogram_type in (statistics_pb2.Histogram.STANDARD,
                                   statistics_pb2.Histogram.QUANTILES):
                stats_proto.histograms.add(
                    type=histogram_type).num_nan = nan_count
        return stats_proto

    min_value = numeric_stats.min
    max_value = numeric_stats.max

    mean = numeric_stats.sum / total_num_values
    mean_of_squares = numeric_stats.sum_of_squares / total_num_values
    # Clamp at zero so rounding error cannot yield a negative variance.
    variance = max(0, mean_of_squares - mean * mean)

    stats_proto.mean = float(mean)
    stats_proto.std_dev = math.sqrt(variance)
    stats_proto.num_zeros = numeric_stats.num_zeros
    stats_proto.min = float(min_value)
    stats_proto.max = float(max_value)

    # Quantiles drive the median and both histograms.
    quantiles = quantiles_combiner.extract_output(
        numeric_stats.quantiles_summary)
    stats_proto.median = float(quantiles_util.find_median(quantiles))

    # Equi-width histogram first, then the quantiles histogram.
    equi_width_histogram = quantiles_util.generate_equi_width_histogram(
        quantiles, min_value, max_value, total_num_values,
        num_histogram_buckets)
    equi_width_histogram.num_nan = nan_count
    stats_proto.histograms.add().CopyFrom(equi_width_histogram)

    quantiles_histogram = quantiles_util.generate_quantiles_histogram(
        quantiles, min_value, max_value, total_num_values,
        num_quantiles_histogram_buckets)
    quantiles_histogram.num_nan = nan_count
    stats_proto.histograms.add().CopyFrom(quantiles_histogram)

    if not has_weights:
        return stats_proto

    # Weighted numeric statistics.
    weighted_proto = statistics_pb2.WeightedNumericStatistics()
    weighted_total = numeric_stats.weighted_total_num_values

    if weighted_total == 0:
        weighted_mean = 0
        weighted_variance = 0
    else:
        weighted_mean = numeric_stats.weighted_sum / weighted_total
        weighted_variance = max(
            0,
            (numeric_stats.weighted_sum_of_squares / weighted_total)
            - weighted_mean**2)
    weighted_proto.mean = weighted_mean
    weighted_proto.std_dev = math.sqrt(weighted_variance)

    # Weighted quantiles drive the weighted median and weighted histograms.
    weighted_quantiles = quantiles_combiner.extract_output(
        numeric_stats.weighted_quantiles_summary)
    weighted_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    weighted_equi_width_histogram = (
        quantiles_util.generate_equi_width_histogram(
            weighted_quantiles, min_value, max_value, weighted_total,
            num_histogram_buckets))
    weighted_equi_width_histogram.num_nan = nan_count

    weighted_quantiles_histogram = (
        quantiles_util.generate_quantiles_histogram(
            weighted_quantiles, min_value, max_value, weighted_total,
            num_quantiles_histogram_buckets))
    weighted_quantiles_histogram.num_nan = nan_count

    # Same ordering as the unweighted histograms: equi-width, then quantiles.
    weighted_proto.histograms.extend(
        [weighted_equi_width_histogram, weighted_quantiles_histogram])

    stats_proto.weighted_numeric_stats.CopyFrom(weighted_proto)
    return stats_proto