def _make_feature_stats_proto(numeric_stats, feature_name, quantiles_combiner,
                              num_histogram_buckets,
                              num_quantiles_histogram_buckets):
    """Build a FeatureNameStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: Partial numeric statistics of one feature. The attributes
        read here are `total_num_values`, `sum`, `sum_of_squares`,
        `num_zeros`, `min`, `max`, `num_nan`, `quantiles_summary` and `type`.
      feature_name: Value stored in the `name` field of the result.
      quantiles_combiner: Object whose `extract_output` method turns
        `numeric_stats.quantiles_summary` into the quantiles.
      num_histogram_buckets: Bucket count passed to
        `quantiles_util.generate_equi_width_histogram`.
      num_quantiles_histogram_buckets: Bucket count passed to
        `quantiles_util.generate_quantiles_histogram`.

    Returns:
      A FeatureNameStatistics proto. Its `num_stats` message is filled in only
      when `numeric_stats.total_num_values` is positive; otherwise it is a
      copy of an empty NumericStatistics message.
    """
    num_stats = statistics_pb2.NumericStatistics()
    value_count = numeric_stats.total_num_values

    # A feature without any values keeps an empty NumericStatistics message.
    if value_count > 0:
        mean = numeric_stats.sum / value_count
        mean_of_squares = numeric_stats.sum_of_squares / value_count
        # Clamping at zero keeps the argument of math.sqrt non-negative.
        variance = max(0, mean_of_squares - mean * mean)

        num_stats.mean = float(mean)
        num_stats.std_dev = math.sqrt(variance)
        num_stats.num_zeros = numeric_stats.num_zeros
        num_stats.min = float(numeric_stats.min)
        num_stats.max = float(numeric_stats.max)

        # The quantiles feed the median and both histograms below.
        quantiles = quantiles_combiner.extract_output(
            numeric_stats.quantiles_summary)
        num_stats.median = float(quantiles_util.find_median(quantiles))

        # The equi-width histogram is appended first, the quantiles histogram
        # second; both carry the NaN count.
        histogram_specs = (
            (quantiles_util.generate_equi_width_histogram,
             num_histogram_buckets),
            (quantiles_util.generate_quantiles_histogram,
             num_quantiles_histogram_buckets),
        )
        for build_histogram, num_buckets in histogram_specs:
            histogram = build_histogram(
                quantiles, numeric_stats.min, numeric_stats.max,
                value_count, num_buckets)
            histogram.num_nan = numeric_stats.num_nan
            num_stats.histograms.add().CopyFrom(histogram)

    # Wrap the numeric stats in the per-feature statistics message.
    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.name = feature_name
    feature_stats.type = numeric_stats.type
    feature_stats.num_stats.CopyFrom(num_stats)
    return feature_stats
def _make_numeric_stats_proto(
    numeric_stats,
    total_num_values,
    quantiles_combiner,
    num_histogram_buckets,
    num_quantiles_histogram_buckets,
    has_weights
    ):
  """Convert the partial numeric statistics into NumericStatistics proto.

  Args:
    numeric_stats: Partial numeric statistics of one feature. The attributes
      read here are `num_nan`, `sum`, `sum_of_squares`, `num_zeros`, `min`,
      `max` and `quantiles_summary`, plus, when `has_weights` is true,
      `weighted_total_num_values`, `weighted_sum`, `weighted_sum_of_squares`
      and `weighted_quantiles_summary`.
    total_num_values: Number of values of the feature, NaNs included;
      `numeric_stats.num_nan` is subtracted from it before use.
    quantiles_combiner: Object whose `extract_output` method turns a quantiles
      summary into the quantiles.
    num_histogram_buckets: Bucket count passed to
      `quantiles_util.generate_equi_width_histogram`.
    num_quantiles_histogram_buckets: Bucket count passed to
      `quantiles_util.generate_quantiles_histogram`.
    has_weights: Whether to also populate `weighted_numeric_stats`.

  Returns:
    A NumericStatistics proto. If no values remain after discounting NaNs, the
    proto only carries `num_nan` (on one STANDARD and one QUANTILES
    histogram) when NaNs were seen, and is empty otherwise.
  """
  result = statistics_pb2.NumericStatistics()

  # The statistics below are computed over the count without NaNs.
  if numeric_stats.num_nan > 0:
    total_num_values -= numeric_stats.num_nan

  # Set the stats in the proto only if we have at least one value for the
  # feature.
  if total_num_values == 0:
    # If we only have NaN values, still report how many there were instead of
    # silently dropping the count. num_nan lives on the histograms, so emit
    # one bucket-less histogram of each type.
    if numeric_stats.num_nan > 0:
      result.histograms.add(type=statistics_pb2.Histogram.STANDARD).num_nan = (
          numeric_stats.num_nan)
      result.histograms.add(type=statistics_pb2.Histogram.QUANTILES).num_nan = (
          numeric_stats.num_nan)
    return result

  mean = numeric_stats.sum / total_num_values
  # Clamping at zero keeps the argument of math.sqrt non-negative.
  variance = max(
      0, (numeric_stats.sum_of_squares / total_num_values) -
      mean * mean)
  result.mean = float(mean)
  result.std_dev = math.sqrt(variance)
  result.num_zeros = numeric_stats.num_zeros
  result.min = float(numeric_stats.min)
  result.max = float(numeric_stats.max)

  # Extract the quantiles from the summary.
  quantiles = quantiles_combiner.extract_output(
      numeric_stats.quantiles_summary)

  # Find the median from the quantiles and update the numeric stats proto.
  result.median = float(quantiles_util.find_median(quantiles))

  # Construct the equi-width histogram from the quantiles and add it to the
  # numeric stats proto.
  std_histogram = quantiles_util.generate_equi_width_histogram(
      quantiles, numeric_stats.min, numeric_stats.max,
      total_num_values, num_histogram_buckets)
  std_histogram.num_nan = numeric_stats.num_nan
  new_std_histogram = result.histograms.add()
  new_std_histogram.CopyFrom(std_histogram)

  # Construct the quantiles histogram from the quantiles and add it to the
  # numeric stats proto.
  q_histogram = quantiles_util.generate_quantiles_histogram(
      quantiles, numeric_stats.min, numeric_stats.max,
      total_num_values, num_quantiles_histogram_buckets)
  q_histogram.num_nan = numeric_stats.num_nan
  new_q_histogram = result.histograms.add()
  new_q_histogram.CopyFrom(q_histogram)

  # Add weighted numeric stats to the proto.
  if has_weights:
    weighted_numeric_stats_proto = statistics_pb2.WeightedNumericStatistics()

    # A zero total weight would divide by zero; report zero mean and
    # variance in that case.
    if numeric_stats.weighted_total_num_values == 0:
      weighted_mean = 0
      weighted_variance = 0
    else:
      weighted_mean = (numeric_stats.weighted_sum /
                       numeric_stats.weighted_total_num_values)
      weighted_variance = max(0, (numeric_stats.weighted_sum_of_squares /
                                  numeric_stats.weighted_total_num_values)
                              - weighted_mean**2)
    weighted_numeric_stats_proto.mean = weighted_mean
    weighted_numeric_stats_proto.std_dev = math.sqrt(weighted_variance)

    # Extract the weighted quantiles from the summary.
    weighted_quantiles = quantiles_combiner.extract_output(
        numeric_stats.weighted_quantiles_summary)

    # Find the weighted median from the quantiles and update the proto.
    weighted_numeric_stats_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Construct the weighted equi-width histogram from the quantiles and
    # add it to the numeric stats proto.
    weighted_std_histogram = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.weighted_total_num_values, num_histogram_buckets)
    weighted_std_histogram.num_nan = numeric_stats.num_nan
    weighted_numeric_stats_proto.histograms.extend([weighted_std_histogram])

    # Construct the weighted quantiles histogram from the quantiles and
    # add it to the numeric stats proto.
    weighted_q_histogram = quantiles_util.generate_quantiles_histogram(
        weighted_quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.weighted_total_num_values,
        num_quantiles_histogram_buckets)
    weighted_q_histogram.num_nan = numeric_stats.num_nan
    weighted_numeric_stats_proto.histograms.extend([weighted_q_histogram])

    result.weighted_numeric_stats.CopyFrom(
        weighted_numeric_stats_proto)
  return result
示例#3
0
 def test_find_median(self):
     """Check find_median on inputs of length one to four."""
     # Each entry pairs an input list with the median expected for it.
     cases = (
         ([5.0], 5.0),
         ([3.0, 5.0], 4.0),
         ([3.0, 4.0, 5.0], 4.0),
         ([3.0, 4.0, 5.0, 6.0], 4.5),
     )
     for values, expected_median in cases:
         self.assertEqual(quantiles_util.find_median(values), expected_median)
def _make_numeric_stats_proto(
    numeric_stats: _PartialNumericStats,
    total_num_values: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    has_weights: bool
    ) -> statistics_pb2.NumericStatistics:
  """Build a NumericStatistics proto from partial numeric statistics.

  Args:
    numeric_stats: Partial numeric statistics of one feature.
    total_num_values: Number of values of the feature, NaNs included;
      `numeric_stats.num_nan` is subtracted from it before use.
    num_histogram_buckets: Bucket count of the equi-width histograms.
    num_quantiles_histogram_buckets: Bucket count of the quantiles histograms.
    has_weights: Whether to also populate `weighted_numeric_stats`.

  Returns:
    The populated proto. If no values remain after discounting NaNs, the proto
    only carries `num_nan` (on one STANDARD and one QUANTILES histogram) when
    NaNs were seen, and is empty otherwise.
  """
  result = statistics_pb2.NumericStatistics()
  nan_count = numeric_stats.num_nan

  # The statistics below are computed over the count without NaNs.
  if nan_count > 0:
    total_num_values -= nan_count

  if total_num_values == 0:
    # Nothing but NaNs (or nothing at all): num_nan is the only thing to set.
    if nan_count > 0:
      for histogram_type in (statistics_pb2.Histogram.STANDARD,
                             statistics_pb2.Histogram.QUANTILES):
        result.histograms.add(type=histogram_type).num_nan = nan_count
    return result

  mean = numeric_stats.sum / total_num_values
  mean_of_squares = numeric_stats.sum_of_squares / total_num_values
  # Clamping at zero keeps the argument of math.sqrt non-negative.
  variance = max(0, mean_of_squares - mean * mean)
  result.mean = float(mean)
  result.std_dev = math.sqrt(variance)
  result.num_zeros = numeric_stats.num_zeros
  result.min = float(numeric_stats.min)
  result.max = float(numeric_stats.max)

  # Ask the summary for enough quantiles to serve both histogram kinds.
  assert numeric_stats.quantiles_summary is not None
  num_quantiles = max(
      num_quantiles_histogram_buckets,
      _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
  quantiles = numeric_stats.quantiles_summary.GetQuantiles(
      num_quantiles).flatten().to_pylist()

  # The median is derived from the quantiles.
  result.median = float(quantiles_util.find_median(quantiles))

  # Equi-width histogram; note it is bounded by the finite min/max rather
  # than by the min/max reported above.
  equi_width_histogram = quantiles_util.generate_equi_width_histogram(
      quantiles, numeric_stats.finite_min, numeric_stats.finite_max,
      total_num_values, num_histogram_buckets)
  equi_width_histogram.num_nan = nan_count
  result.histograms.add().CopyFrom(equi_width_histogram)

  # Quantiles histogram.
  quantiles_histogram = quantiles_util.generate_quantiles_histogram(
      quantiles, total_num_values, num_quantiles_histogram_buckets)
  quantiles_histogram.num_nan = nan_count
  result.histograms.add().CopyFrom(quantiles_histogram)

  if not has_weights:
    return result

  # Weighted counterpart of the statistics above.
  weighted_stats = statistics_pb2.WeightedNumericStatistics()
  weighted_count = numeric_stats.weighted_total_num_values

  # A zero total weight would divide by zero; mean and variance stay at zero
  # in that case.
  weighted_mean = 0
  weighted_variance = 0
  if weighted_count != 0:
    weighted_mean = numeric_stats.weighted_sum / weighted_count
    weighted_variance = max(
        0,
        (numeric_stats.weighted_sum_of_squares / weighted_count)
        - weighted_mean**2)
  weighted_stats.mean = weighted_mean
  weighted_stats.std_dev = math.sqrt(weighted_variance)

  # Weighted quantiles, requested with the same quantile count as above.
  assert numeric_stats.weighted_quantiles_summary is not None
  weighted_quantiles = numeric_stats.weighted_quantiles_summary.GetQuantiles(
      num_quantiles).flatten().to_pylist()

  weighted_stats.median = float(
      quantiles_util.find_median(weighted_quantiles))

  # Weighted equi-width histogram, again bounded by the finite min/max.
  weighted_equi_width_histogram = (
      quantiles_util.generate_equi_width_histogram(
          weighted_quantiles, numeric_stats.finite_min,
          numeric_stats.finite_max, weighted_count, num_histogram_buckets))
  weighted_equi_width_histogram.num_nan = nan_count

  # Weighted quantiles histogram.
  weighted_quantiles_histogram = quantiles_util.generate_quantiles_histogram(
      weighted_quantiles, weighted_count, num_quantiles_histogram_buckets)
  weighted_quantiles_histogram.num_nan = nan_count

  # Equi-width first, quantiles second, mirroring the unweighted histograms.
  weighted_stats.histograms.extend(
      [weighted_equi_width_histogram, weighted_quantiles_histogram])

  result.weighted_numeric_stats.CopyFrom(weighted_stats)
  return result