示例#1
0
def _make_feature_stats_proto(
        stats_values: Dict[Text, float], feature_path: types.FeaturePath
) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto holding custom numeric statistics.

    Args:
        stats_values: Maps the name of each custom statistic to its numeric
            value for this feature, e.g.
            {'Mutual Information': 0.5, 'Correlation': 0.1}.
        feature_path: The path of the feature the statistics belong to.

    Returns:
        A FeatureNameStatistics proto whose path is taken from `feature_path`
        and which has one custom_stats entry per item of `stats_values`,
        ordered by statistic name.
    """
    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.path.CopyFrom(feature_path.to_proto())

    # Emit the entries in name order so the resulting proto does not depend on
    # the iteration order of the input dict.
    for name, value in sorted(stats_values.items(), key=lambda item: item[0]):
        feature_stats.custom_stats.add(name=name, num=value)
    return feature_stats
示例#2
0
def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath, num_uniques: int,
) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto carrying only the uniques count.

  Args:
    feature_path: The path of the feature.
    num_uniques: The value to store in string_stats.unique.

  Returns:
    A FeatureNameStatistics proto with its path copied from `feature_path` and
    string_stats.unique set to `num_uniques`.
  """
  uniques_stats = statistics_pb2.FeatureNameStatistics()
  path_proto = feature_path.to_proto()
  uniques_stats.path.CopyFrom(path_proto)
  uniques_stats.string_stats.unique = num_uniques
  return uniques_stats
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount], is_categorical: bool,
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto holding the top-k value stats.

  Args:
    feature_path: The path of the feature.
    top_k_values_pairs: (feature value, count) entries to rank. The list that
      is passed in is not modified; a sorted copy is used.
    is_categorical: If true the proto type is INT, otherwise STRING.
    is_weighted_stats: If true the stats are written to
      string_stats.weighted_string_stats instead of string_stats.
    num_top_values: Maximum number of entries written to top_values.
    frequency_threshold: Ranking stops at the first entry whose count is below
      this value.
    num_rank_histogram_buckets: Maximum number of rank histogram buckets
      written.

  Returns:
    The populated FeatureNameStatistics proto.
  """
  # Rank by count, highest first. Ties are broken by treating the 'larger'
  # feature value as the larger entry.
  ranked_pairs = sorted(
      top_k_values_pairs,
      key=lambda pair: (pair.count, pair.feature_value),
      reverse=True)

  stats_proto = statistics_pb2.FeatureNameStatistics()
  stats_proto.path.CopyFrom(feature_path.to_proto())
  # Categorical features keep their original INT type.
  if is_categorical:
    stats_proto.type = statistics_pb2.FeatureNameStatistics.INT
  else:
    stats_proto.type = statistics_pb2.FeatureNameStatistics.STRING

  target_stats = stats_proto.string_stats
  if is_weighted_stats:
    target_stats = target_stats.weighted_string_stats

  for rank, (value, count) in enumerate(ranked_pairs):
    if count < frequency_threshold:
      # Entries are sorted by count, so every later entry is below the
      # threshold as well.
      break

    # Every value is emitted as text: bytes are decoded as UTF-8 (falling back
    # to a placeholder when that fails) and other non-text values go through
    # str().
    if isinstance(value, six.binary_type):
      utf8_value = stats_util.maybe_get_utf8(value)
      if utf8_value is not None:
        value = utf8_value
      else:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
    elif not isinstance(value, six.text_type):
      value = str(value)

    if rank < num_top_values:
      target_stats.top_values.add(value=value, frequency=count)
    if rank < num_rank_histogram_buckets:
      target_stats.rank_histogram.buckets.add(
          low_rank=rank, high_rank=rank, sample_count=count, label=value)
  return stats_proto
示例#4
0
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries], y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

    Args:
        lifts: The result of two successive group bys of lift values. The
            innermost grouping collects all the lift values for a given
            (slice, x_path and y_value) tuple (corresponding to a single
            LiftSeries message). The outermost grouping collects all the lift
            values for the same (slice, x_path) tuple (corresponding to the
            set of the LiftSeries which share the same value of y_path). The
            full structure of lifts is described by:
              (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
        y_path: The path used as Y in the lift expression:
            lift = P(Y=y|X=x) / P(Y=y).
        y_boundaries: Optionally, a set of bin boundaries used for binning
            y_path values. When given, every y is reported as a bucket.

    Returns:
        A (slice key, DatasetFeatureStatistics) tuple, where the slice key is
        taken from the key in `lifts` and the proto holds a single cross
        feature entry for (x_path, y_path).
    """
    sliced_key, series_group = lifts
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = dataset_stats.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())

    for series in sorted(series_group):
        series_proto = (
            cross_feature.categorical_cross_stats.lift.lift_series.add(
                y_count=series.y_count))

        # Record y as a bucket, a string or an int.
        y_value = series.y
        if y_boundaries is not None:
            bucket_low, bucket_high = bin_util.get_boundaries(
                y_value, y_boundaries)
            series_proto.y_bucket.low_value = bucket_low
            series_proto.y_bucket.high_value = bucket_high
        elif isinstance(y_value, six.string_types):
            series_proto.y_string = y_value
        else:
            series_proto.y_int = y_value

        # The top_k and bottom_k x values may overlap; keep one entry per x
        # (the last one seen wins).
        unique_by_x = {}
        for entry in series.lift_values:
            unique_by_x[entry.x] = entry

        # Order by lift descending, then x ascending.
        ordered_entries = sorted(unique_by_x.values(),
                                 key=lambda entry: (-entry.lift, entry.x))
        for entry in ordered_entries:
            value_proto = series_proto.lift_values.add(
                lift=entry.lift,
                x_count=entry.x_count,
                x_and_y_count=entry.xy_count)
            if isinstance(entry.x, six.string_types):
                value_proto.x_string = entry.x
            else:
                value_proto.x_int = entry.x
    return sliced_key.slice_key, dataset_stats
def _make_feature_stats_proto_uniques(
        feature_path: types.FeaturePath, num_uniques: int,
        is_categorical: bool) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto carrying the uniques count.

    Args:
        feature_path: The path of the feature.
        num_uniques: The value to store in string_stats.unique.
        is_categorical: If true the proto type is INT, otherwise STRING.

    Returns:
        A FeatureNameStatistics proto with path, type and string_stats.unique
        populated.
    """
    uniques_stats = statistics_pb2.FeatureNameStatistics()
    uniques_stats.path.CopyFrom(feature_path.to_proto())
    # Categorical features keep their original INT type.
    if is_categorical:
        uniques_stats.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        uniques_stats.type = statistics_pb2.FeatureNameStatistics.STRING
    uniques_stats.string_stats.unique = num_uniques
    return uniques_stats
示例#6
0
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, Iterable[_LiftSeries]],
    y_path: types.FeaturePath, y_boundaries: Optional[np.ndarray],
    weighted_examples: bool, output_custom_stats: bool
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

    Args:
        lifts: The result of two successive group bys of lift values. The
            innermost grouping collects all the lift values for a given
            (slice, x_path and y_value) tuple (corresponding to a single
            LiftSeries message). The outermost grouping collects all the lift
            values for the same (slice, x_path) tuple (corresponding to the
            set of the LiftSeries which share the same value of y_path). The
            full structure of lifts is described by:
              (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
        y_path: The path used as Y in the lift expression:
            lift = P(Y=y|X=x) / P(Y=y).
        y_boundaries: Optionally, a set of bin boundaries used for binning
            y_path values. Only applied to y values that are `int` instances.
        weighted_examples: Whether lift is computed over weighted examples, in
            which case the weighted count fields of the proto are populated
            instead of the plain count fields.
        output_custom_stats: Whether to additionally output the lift values as
            custom stats (one rank histogram per y value) for use with Facets.

    Returns:
        A (slice key, DatasetFeatureStatistics) tuple, where the slice key is
        taken from the key in `lifts`.
    """
    sliced_key, series_group = lifts
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = dataset_stats.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())
    if output_custom_stats:
        custom_feature = dataset_stats.features.add(
            path=sliced_key.x_path.to_proto())

    for series in sorted(series_group):
        series_proto = (
            cross_feature.categorical_cross_stats.lift.lift_series.add())
        if weighted_examples:
            series_proto.weighted_y_count = series.y_count
        else:
            series_proto.y_count = series.y_count

        # Record y as a bucket, a string or an int, and remember a printable
        # form of it for the custom stat name.
        y_value = series.y
        if y_boundaries is not None and isinstance(y_value, int):
            bucket_low, bucket_high = bin_util.get_boundaries(
                y_value, y_boundaries)
            series_proto.y_bucket.low_value = bucket_low
            series_proto.y_bucket.high_value = bucket_high
            # A bucket reaching +inf is displayed as closed on the right.
            if bucket_high == float('inf'):
                y_label = '[{},{}]'.format(bucket_low, bucket_high)
            else:
                y_label = '[{},{})'.format(bucket_low, bucket_high)
        elif isinstance(y_value, six.text_type):
            series_proto.y_string = y_value
            y_label = y_value
        elif isinstance(y_value, six.binary_type):
            y_label = _get_unicode_value(y_value, y_path)
            series_proto.y_string = y_label
        else:
            series_proto.y_int = y_value
            y_label = str(y_value)

        if output_custom_stats:
            lift_histogram = custom_feature.custom_stats.add(
                name='Lift (Y={})'.format(y_label)).rank_histogram

        # The top_k and bottom_k x values may overlap; keep one entry per x
        # (the last one seen wins).
        unique_by_x = {}
        for entry in series.lift_values:
            unique_by_x[entry.x] = entry

        # Order by lift descending, then x ascending.
        ordered_entries = sorted(unique_by_x.values(),
                                 key=lambda entry: (-entry.lift, entry.x))
        for entry in ordered_entries:
            value_proto = series_proto.lift_values.add(lift=entry.lift)
            if weighted_examples:
                value_proto.weighted_x_count = entry.x_count
                value_proto.weighted_x_and_y_count = entry.xy_count
            else:
                value_proto.x_count = entry.x_count
                value_proto.x_and_y_count = entry.xy_count

            x_value = entry.x
            if isinstance(x_value, six.text_type):
                value_proto.x_string = x_value
                x_label = x_value
            elif isinstance(x_value, six.binary_type):
                x_label = _get_unicode_value(x_value, sliced_key.x_path)
                value_proto.x_string = x_label
            else:
                value_proto.x_int = x_value
                x_label = str(x_value)

            if output_custom_stats:
                lift_histogram.buckets.add(label=x_label,
                                           sample_count=entry.lift)

    return sliced_key.slice_key, dataset_stats
def _make_feature_stats_proto(
        feature_path: types.FeaturePath, basic_stats: _PartialBasicStats,
        parent_basic_stats: Optional[_PartialBasicStats],
        num_values_q_combiner: quantiles_util.QuantilesCombiner,
        values_q_combiner: quantiles_util.QuantilesCombiner,
        num_values_histogram_buckets: int, num_histogram_buckets: int,
        num_quantiles_histogram_buckets: int, is_categorical: bool,
        has_weights: bool) -> statistics_pb2.FeatureNameStatistics:
    """Convert the partial basic stats into a FeatureNameStatistics proto.

    Args:
        feature_path: The path of the feature.
        basic_stats: The partial basic stats associated with the feature.
        parent_basic_stats: The partial basic stats of the parent of the
            feature, or None.
        num_values_q_combiner: The quantiles combiner used to construct the
            quantiles histogram for the number of values in the feature.
        values_q_combiner: The quantiles combiner used to construct the
            histogram for the values in the feature.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        num_histogram_buckets: Number of buckets in a standard
            NumericStatistics.histogram with equal-width buckets.
        num_quantiles_histogram_buckets: Number of buckets in a
            quantiles NumericStatistics.histogram.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto.
    """
    feature_types = statistics_pb2.FeatureNameStatistics
    stats_proto = feature_types()
    stats_proto.path.CopyFrom(feature_path.to_proto())

    # Decide the feature type. Categorical features keep their original INT
    # type. There is no UNKNOWN type in the stats proto, so a feature whose
    # type could not be inferred (all values missing) is reported as STRING.
    if is_categorical:
        stats_proto.type = feature_types.INT
    else:
        inferred_type = basic_stats.common_stats.type
        stats_proto.type = (feature_types.STRING
                            if inferred_type is None else inferred_type)

    # Build the common stats, which are shared by every type-specific stats
    # message below.
    parent_common_stats = (None if parent_basic_stats is None
                           else parent_basic_stats.common_stats)
    common_stats_proto = _make_common_stats_proto(
        basic_stats.common_stats, parent_common_stats, num_values_q_combiner,
        num_values_histogram_buckets, has_weights)

    # Wrap the common stats in the stats message matching the feature type.
    if is_categorical or stats_proto.type == feature_types.STRING:
        string_stats_proto = _make_string_stats_proto(
            basic_stats.string_stats,
            basic_stats.common_stats.total_num_values)
        string_stats_proto.common_stats.CopyFrom(common_stats_proto)
        stats_proto.string_stats.CopyFrom(string_stats_proto)
        return stats_proto

    if stats_proto.type == feature_types.STRUCT:
        stats_proto.struct_stats.common_stats.CopyFrom(common_stats_proto)
        return stats_proto

    if stats_proto.type in (feature_types.INT, feature_types.FLOAT):
        numeric_stats_proto = _make_numeric_stats_proto(
            basic_stats.numeric_stats,
            basic_stats.common_stats.total_num_values, values_q_combiner,
            num_histogram_buckets, num_quantiles_histogram_buckets,
            has_weights)
        numeric_stats_proto.common_stats.CopyFrom(common_stats_proto)
        stats_proto.num_stats.CopyFrom(numeric_stats_proto)

    return stats_proto
示例#8
0
def make_feature_stats_proto_with_topk_stats(
        feature_path: types.FeaturePath,
        top_k_value_count_list: List[FeatureValueCount], is_categorical: bool,
        is_weighted_stats: bool, num_top_values: int,
        frequency_threshold: Union[float, int], num_rank_histogram_buckets: int
) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing the top-k stats.

    Args:
        feature_path: The path of the feature.
        top_k_value_count_list: A list of FeatureValueCount tuples. The list
            is not modified; a sorted copy is used for ranking.
        is_categorical: Whether the feature is categorical.
        is_weighted_stats: Whether top_k_value_count_list incorporates
            weights.
        num_top_values: The number of most frequent feature values to keep
            for string features.
        frequency_threshold: The minimum number of examples (possibly
            weighted) the most frequent values must be present in.
        num_rank_histogram_buckets: The number of buckets in the rank
            histogram for string features.

    Returns:
        A FeatureNameStatistics proto containing the top-k stats.
    """
    # Rank (a copy of) the list in descending order by count. Where multiple
    # feature values have the same count, consider the feature with the
    # 'larger' feature value to be larger for purposes of breaking the tie.
    # Sorting a copy avoids reordering the caller's list as a side effect.
    sorted_value_counts = sorted(
        top_k_value_count_list,
        key=lambda counts: (counts[1], counts[0]),
        reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.path.CopyFrom(feature_path.to_proto())
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)

    if is_weighted_stats:
        string_stats = result.string_stats.weighted_string_stats
    else:
        string_stats = result.string_stats

    for i, (value, count) in enumerate(sorted_value_counts):
        if count < frequency_threshold:
            # Entries are sorted by count, so all remaining ones are below the
            # threshold too.
            break
        # Check if we have a valid utf-8 string. If not, assign a default
        # invalid string value.
        if isinstance(value, six.binary_type):
            # Keep the raw bytes in `value` until the decode result is known,
            # so the warning reports the offending bytes rather than None.
            decoded_value = stats_util.maybe_get_utf8(value)
            if decoded_value is None:
                logging.warning(
                    'Feature "%s" has bytes value "%s" which cannot be '
                    'decoded as a UTF-8 string.', feature_path, value)
                value = _INVALID_STRING
            else:
                value = decoded_value
        elif not isinstance(value, six.text_type):
            value = str(value)

        if i < num_top_values:
            freq_and_value = string_stats.top_values.add()
            freq_and_value.value = value
            freq_and_value.frequency = count
        if i < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = i
            bucket.high_rank = i
            bucket.sample_count = count
            bucket.label = value
    return result
def _make_feature_stats_proto(
    feature_path: types.FeaturePath,
    basic_stats: _PartialBasicStats,
    parent_basic_stats: Optional[_PartialBasicStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    is_bytes: bool, is_categorical: bool, has_weights: bool
) -> statistics_pb2.FeatureNameStatistics:
  """Convert the partial basic stats into a FeatureNameStatistics proto.

  Args:
    feature_path: The path of the feature.
    basic_stats: The partial basic stats associated with the feature.
    parent_basic_stats: The partial basic stats of the parent of the feature,
        or None.
    make_quantiles_sketch_fn: A callable to create a quantiles sketch.
    num_values_histogram_buckets: Number of buckets in the quantiles
        histogram for the number of values per feature.
    num_histogram_buckets: Number of buckets in a standard
        NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Number of buckets in a
        quantiles NumericStatistics.histogram.
    is_bytes: A boolean indicating whether the feature is bytes.
    is_categorical: A boolean indicating whether the feature is categorical.
    has_weights: A boolean indicating whether a weight feature is specified.

  Returns:
    A statistics_pb2.FeatureNameStatistics proto.
  """
  feature_types = statistics_pb2.FeatureNameStatistics
  stats_proto = feature_types()
  stats_proto.path.CopyFrom(feature_path.to_proto())

  # Decide the feature type.
  inferred_type = basic_stats.common_stats.type
  if inferred_type is None:
    # No type was inferred, which means no value was seen for this feature,
    # so the user's claims are trusted.
    if is_bytes:
      stats_proto.type = feature_types.BYTES
    elif is_categorical:
      stats_proto.type = feature_types.INT
    else:
      # There is no "unknown" type, so STRING is used.
      stats_proto.type = feature_types.STRING
  elif is_bytes and inferred_type == feature_types.STRING:
    # BYTES is never inferred. The user's BYTES claim is only honoured when
    # the inferred type is STRING, i.e. the data really is strings/bytes.
    stats_proto.type = feature_types.BYTES
  else:
    stats_proto.type = inferred_type

  # Build the common stats, which are shared by every type-specific stats
  # message below.
  parent_common_stats = (None if parent_basic_stats is None
                         else parent_basic_stats.common_stats)
  common_stats_proto = _make_common_stats_proto(
      basic_stats.common_stats,
      parent_common_stats,
      make_quantiles_sketch_fn,
      num_values_histogram_buckets, has_weights)

  # Total number of values at the leaf level.
  presence_and_valency = basic_stats.common_stats.presence_and_valency_stats
  if presence_and_valency is None:
    total_num_values = 0
  else:
    total_num_values = presence_and_valency[-1].total_num_values

  # Wrap the common stats in the stats message matching the feature type.
  # The type is fixed at this point, so at most one branch applies.
  resolved_type = stats_proto.type
  if resolved_type == feature_types.BYTES:
    bytes_stats_proto = _make_bytes_stats_proto(
        basic_stats.bytes_stats, common_stats_proto.tot_num_values)
    bytes_stats_proto.common_stats.CopyFrom(common_stats_proto)
    stats_proto.bytes_stats.CopyFrom(bytes_stats_proto)
  elif (resolved_type == feature_types.STRING or
        (is_categorical and resolved_type == feature_types.INT)):
    string_stats_proto = _make_string_stats_proto(basic_stats.string_stats,
                                                  total_num_values)
    string_stats_proto.common_stats.CopyFrom(common_stats_proto)
    stats_proto.string_stats.CopyFrom(string_stats_proto)
  elif resolved_type == feature_types.STRUCT:
    stats_proto.struct_stats.common_stats.CopyFrom(common_stats_proto)
  elif resolved_type in (feature_types.INT, feature_types.FLOAT):
    numeric_stats_proto = _make_numeric_stats_proto(
        basic_stats.numeric_stats, total_num_values,
        num_histogram_buckets, num_quantiles_histogram_buckets, has_weights)
    numeric_stats_proto.common_stats.CopyFrom(common_stats_proto)
    stats_proto.num_stats.CopyFrom(numeric_stats_proto)

  # The num-values custom stats are appended for every feature type.
  num_values_custom_stats = _make_num_values_custom_stats_proto(
      basic_stats.common_stats,
      num_values_histogram_buckets)
  stats_proto.custom_stats.extend(num_values_custom_stats)
  return stats_proto
示例#10
0
def make_feature_stats_proto_topk_uniques_custom_stats(
    feature_path: types.FeaturePath,
    is_categorical: bool,
    num_top_values: int,
    num_rank_histogram_buckets: int,
    num_unique: int,
    value_count_list: List[FeatureValueCount],
    weighted_value_count_list: Optional[List[FeatureValueCount]] = None,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: Optional[float] = None
) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing top-k and uniques stats.

    The top-k rank histograms and the uniques count are stored as custom
    stats of the returned proto.

    Args:
        feature_path: The path of the feature.
        is_categorical: Whether the feature is categorical.
        num_top_values: The number of most frequent feature values to keep
            for string features.
        num_rank_histogram_buckets: The number of buckets in the rank
            histogram for string features.
        num_unique: The number of unique values in the feature.
        value_count_list: A list of FeatureValueCount tuples.
        weighted_value_count_list: An optional list of FeatureValueCount
            tuples for weighted features. When it is None or empty, no
            weighted custom stat is added.
        frequency_threshold: The minimum number of examples the most frequent
            values must be present in.
        weighted_frequency_threshold: The minimum weighted number of examples
            the most frequent weighted values must be present in. Must not be
            None when weighted_value_count_list is non-empty.

    Returns:
        A FeatureNameStatistics proto containing the top-k and uniques stats.
    """
    custom_stats_proto = statistics_pb2.FeatureNameStatistics()
    custom_stats_proto.path.CopyFrom(feature_path.to_proto())
    # Categorical features keep their original INT type.
    if is_categorical:
        custom_stats_proto.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        custom_stats_proto.type = statistics_pb2.FeatureNameStatistics.STRING

    # Compute the unweighted top-k stats in a scratch proto and copy only its
    # rank histogram into a custom stat.
    unweighted_topk = _make_feature_stats_proto_topk(
        feature_path, value_count_list, is_categorical, False,
        num_top_values, frequency_threshold, num_rank_histogram_buckets)
    unweighted_histogram = unweighted_topk.string_stats.rank_histogram
    custom_stats_proto.custom_stats.add(
        name=_TOPK_SKETCH_CUSTOM_STATS_NAME).rank_histogram.CopyFrom(
            unweighted_histogram)

    # Same again for the weighted top-k stats, if weighted counts were given.
    if weighted_value_count_list:
        assert weighted_frequency_threshold is not None
        weighted_topk = _make_feature_stats_proto_topk(
            feature_path, weighted_value_count_list, is_categorical, True,
            num_top_values, weighted_frequency_threshold,
            num_rank_histogram_buckets)
        weighted_histogram = (
            weighted_topk.string_stats.weighted_string_stats.rank_histogram)
        custom_stats_proto.custom_stats.add(
            name=_WEIGHTED_TOPK_SKETCH_CUSTOM_STATS_NAME
        ).rank_histogram.CopyFrom(weighted_histogram)

    # Finally record the number of unique values.
    custom_stats_proto.custom_stats.add(
        name=_UNIQUES_SKETCH_CUSTOM_STATS_NAME, num=num_unique)
    return custom_stats_proto