def _make_weighted_presence_and_valency_stats_protos(
    parent_presence_and_valency: Optional[_PresenceAndValencyStats],
    presence_and_valency: List[_PresenceAndValencyStats]
    ) -> List[statistics_pb2.WeightedCommonStatistics]:
  """Builds one WeightedCommonStatistics proto per entry of the input list.

  Args:
    parent_presence_and_valency: Stats treated as the predecessor of the first
      entry, or None. When None, `num_missing` is not set on the first proto.
    presence_and_valency: Stats to convert, in order.

  Returns:
    A list with one proto per entry of `presence_and_valency`, in the same
    order.
  """
  protos = []
  # Weighted num_missing of an entry is derived from its predecessor:
  #   predecessor.weighted_total_num_values - entry.weighted_num_non_missing.
  # The first entry's predecessor is the parent. When there is no parent,
  # num_missing is left unset here; per the original note it is computed
  # outside BasicStatsGenerator as
  # weighted_num_examples - top_level.weighted_num_non_missing, because the
  # number of examples cannot be computed at this point.
  predecessor = parent_presence_and_valency
  for current in presence_and_valency:
    weighted_proto = statistics_pb2.WeightedCommonStatistics()
    if predecessor is not None:
      weighted_proto.num_missing = (
          predecessor.weighted_total_num_values -
          current.weighted_num_non_missing)
    weighted_proto.num_non_missing = current.weighted_num_non_missing
    weighted_proto.tot_num_values = current.weighted_total_num_values
    # avg_num_values is only set when the denominator is positive.
    if current.weighted_num_non_missing > 0:
      weighted_proto.avg_num_values = (
          current.weighted_total_num_values /
          current.weighted_num_non_missing)
    protos.append(weighted_proto)
    # The entry just processed is the predecessor of the next one.
    predecessor = current
  return protos
Exemplo n.º 2
0
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
  """Builds a CommonStatistics proto from partial common stats.

  Args:
    common_stats: The partial common stats to convert.
    parent_common_stats: Partial common stats of the parent, or None. When
      None, `num_missing` is not set (neither unweighted nor weighted).
    q_combiner: Combiner whose `extract_output` turns
      `common_stats.num_values_summary` into the quantiles passed to the
      histogram generator.
    num_values_histogram_buckets: Bucket count passed to the histogram
      generator for the num-values histogram.
    has_weights: Whether to also populate `weighted_common_stats`.

  Returns:
    The populated statistics_pb2.CommonStatistics proto.
  """
  has_parent = parent_common_stats is not None
  non_missing = common_stats.num_non_missing
  total_values = common_stats.total_num_values

  proto = statistics_pb2.CommonStatistics()
  proto.num_non_missing = non_missing
  if has_parent:
    proto.num_missing = parent_common_stats.total_num_values - non_missing
  proto.tot_num_values = total_values

  # TODO(b/79685042): Need to decide on what is the expected values for
  # statistics like min_num_values, max_num_values, avg_num_values, when
  # all the values for the feature are missing. For now they (and the
  # num-values histogram) are only filled in when something is present.
  if non_missing > 0:
    proto.min_num_values = common_stats.min_num_values
    proto.max_num_values = common_stats.max_num_values
    proto.avg_num_values = total_values / non_missing
    proto.num_values_histogram.CopyFrom(
        quantiles_util.generate_quantiles_histogram(
            q_combiner.extract_output(common_stats.num_values_summary),
            non_missing,
            num_values_histogram_buckets))

  if not has_weights:
    return proto

  # Weighted counterpart of the fields above.
  weighted_non_missing = common_stats.weighted_num_non_missing
  weighted_total = common_stats.weighted_total_num_values
  weighted_proto = statistics_pb2.WeightedCommonStatistics(
      num_non_missing=weighted_non_missing,
      tot_num_values=weighted_total)
  if has_parent:
    weighted_proto.num_missing = (
        parent_common_stats.weighted_total_num_values - weighted_non_missing)
  if weighted_non_missing > 0:
    weighted_proto.avg_num_values = weighted_total / weighted_non_missing

  proto.weighted_common_stats.CopyFrom(weighted_proto)
  return proto
Exemplo n.º 3
0
def _make_feature_stats_proto(common_stats, feature_name, q_combiner,
                              num_values_histogram_buckets, is_categorical,
                              has_weights):
    """Assemble a FeatureNameStatistics proto from partial common stats.

    Args:
        common_stats: The partial common stats associated with a feature.
        feature_name: The name of the feature.
        q_combiner: The quantiles combiner used to construct the quantiles
            histogram for the number of values in the feature.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto.
    """
    non_missing = common_stats.num_non_missing

    common_proto = statistics_pb2.CommonStatistics()
    common_proto.num_non_missing = non_missing
    common_proto.num_missing = common_stats.num_missing
    common_proto.tot_num_values = common_stats.total_num_values

    # Min/max/avg and the num-values histogram are only filled in when at
    # least one value list is present.
    if non_missing > 0:
        fewest = common_stats.min_num_values
        most = common_stats.max_num_values
        common_proto.min_num_values = fewest
        common_proto.max_num_values = most
        common_proto.avg_num_values = (
            common_stats.total_num_values / non_missing)

        quantiles = q_combiner.extract_output(common_stats.num_values_summary)
        common_proto.num_values_histogram.CopyFrom(
            quantiles_util.generate_quantiles_histogram(
                quantiles, fewest, most, non_missing,
                num_values_histogram_buckets))

    # Weighted counterpart of the common stats, only when weights are used.
    if has_weights:
        weighted_non_missing = common_stats.weighted_num_non_missing
        weighted_total = common_stats.weighted_total_num_values
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_non_missing,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=weighted_total)
        if weighted_non_missing > 0:
            weighted_proto.avg_num_values = (
                weighted_total / weighted_non_missing)
        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    # Pick the feature type:
    #   * categorical features are reported with the INT type;
    #   * a feature whose type is None is reported as STRING;
    #   * otherwise the recorded type is used as-is.
    if is_categorical:
        feature_type = statistics_pb2.FeatureNameStatistics.INT
    else:
        feature_type = common_stats.type
        if feature_type is None:
            feature_type = statistics_pb2.FeatureNameStatistics.STRING

    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.name = feature_name
    feature_stats.type = feature_type

    # Categorical and STRING features carry the common stats inside string
    # stats; every other type carries them inside numeric stats.
    use_string_stats = (
        is_categorical
        or feature_stats.type == statistics_pb2.FeatureNameStatistics.STRING)
    if use_string_stats:
        wrapper = statistics_pb2.StringStatistics()
        wrapper.common_stats.CopyFrom(common_proto)
        feature_stats.string_stats.CopyFrom(wrapper)
    else:
        wrapper = statistics_pb2.NumericStatistics()
        wrapper.common_stats.CopyFrom(common_proto)
        feature_stats.num_stats.CopyFrom(wrapper)

    return feature_stats
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
  """Builds a CommonStatistics proto from partial common stats.

  Args:
    common_stats: The partial common stats to convert. Its
      `presence_and_valency_stats` is either None or a list of per-level
      stats; the first entry populates the top-level fields of the proto.
    parent_common_stats: Partial common stats of the parent, or None. When
      None, `num_missing` is not set on the top-level (weighted) stats.
    make_quantiles_sketch_fn: Passed to `_PresenceAndValencyStats` whenever a
      fresh instance is needed in place of a `presence_and_valency_stats`
      that is None.
    num_values_histogram_buckets: Number of quantiles requested from the
      num-values summary and bucket count passed to the histogram generator.
    has_weights: Whether to also populate the weighted fields.

  Returns:
    The populated statistics_pb2.CommonStatistics proto.
  """
  proto = statistics_pb2.CommonStatistics()

  # The parent's contribution is its last level; a parent that has no
  # presence/valency stats is represented by a freshly constructed instance.
  if parent_common_stats is None:
    parent_level = None
  elif parent_common_stats.presence_and_valency_stats is None:
    parent_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
  else:
    parent_level = parent_common_stats.presence_and_valency_stats[-1]

  levels = common_stats.presence_and_valency_stats
  # The CommonStatistics already contains the presence and valency for a
  # 1-nested feature, so the repeated per-level fields are only emitted when
  # there is more than one level.
  if levels is not None and len(levels) > 1:
    proto.presence_and_valency_stats.extend(
        _make_presence_and_valency_stats_protos(parent_level, levels))
    if has_weights:
      proto.weighted_presence_and_valency_stats.extend(
          _make_weighted_presence_and_valency_stats_protos(
              parent_level, levels))

  if levels is None:
    top_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
  else:
    top_level = levels[0]

  non_missing = top_level.num_non_missing
  proto.num_non_missing = non_missing
  if parent_level is not None:
    proto.num_missing = parent_level.total_num_values - non_missing
  proto.tot_num_values = top_level.total_num_values

  # TODO(b/79685042): Need to decide on what is the expected values for
  # statistics like min_num_values, max_num_values, avg_num_values, when
  # all the values for the feature are missing.
  if non_missing > 0:
    proto.min_num_values = top_level.min_num_values
    proto.max_num_values = top_level.max_num_values
    proto.avg_num_values = top_level.total_num_values / non_missing

    # The num-values histogram needs a summary to draw quantiles from.
    summary = top_level.num_values_summary
    if summary is not None:
      quantiles = summary.GetQuantiles(num_values_histogram_buckets)
      proto.num_values_histogram.CopyFrom(
          quantiles_util.generate_quantiles_histogram(
              quantiles.flatten().to_pylist(),
              non_missing,
              num_values_histogram_buckets))

  if not has_weights:
    return proto

  # Weighted counterpart of the top-level fields.
  weighted_non_missing = top_level.weighted_num_non_missing
  weighted_total = top_level.weighted_total_num_values
  weighted_proto = statistics_pb2.WeightedCommonStatistics(
      num_non_missing=weighted_non_missing,
      tot_num_values=weighted_total)
  if parent_level is not None:
    weighted_proto.num_missing = (
        parent_level.weighted_total_num_values - weighted_non_missing)
  if weighted_non_missing > 0:
    weighted_proto.avg_num_values = weighted_total / weighted_non_missing

  proto.weighted_common_stats.CopyFrom(weighted_proto)
  return proto