def _make_feature_stats_proto(numeric_stats, feature_name, quantiles_combiner,
                              num_histogram_buckets,
                              num_quantiles_histogram_buckets):
    """Build a FeatureNameStatistics proto from partial numeric statistics.

    Args:
        numeric_stats: Partial numeric statistics of one feature. The fields
            read here are `total_num_values`, `sum`, `sum_of_squares`,
            `num_zeros`, `min`, `max`, `num_nan`, `quantiles_summary` and
            `type`.
        feature_name: Name stored in the resulting proto.
        quantiles_combiner: Object whose `extract_output` converts the
            quantiles summary into the quantiles handed to `quantiles_util`.
        num_histogram_buckets: Bucket count passed to the equi-width
            histogram generator.
        num_quantiles_histogram_buckets: Bucket count passed to the quantiles
            histogram generator.

    Returns:
        A statistics_pb2.FeatureNameStatistics. Its `num_stats` is copied from
        an unpopulated NumericStatistics when the feature has no values.
    """
    stats_proto = statistics_pb2.NumericStatistics()
    value_count = numeric_stats.total_num_values

    # With no value at all there is nothing to summarise (and the mean below
    # would divide by zero), so the numeric stats are left unpopulated.
    if value_count > 0:
        mean = numeric_stats.sum / value_count
        mean_of_squares = numeric_stats.sum_of_squares / value_count
        # Clamped at zero so that math.sqrt never sees a negative argument.
        variance = max(0, mean_of_squares - mean * mean)

        stats_proto.mean = float(mean)
        stats_proto.std_dev = math.sqrt(variance)
        stats_proto.num_zeros = numeric_stats.num_zeros
        stats_proto.min = float(numeric_stats.min)
        stats_proto.max = float(numeric_stats.max)

        # The same quantiles feed the median and both histograms.
        quantiles = quantiles_combiner.extract_output(
            numeric_stats.quantiles_summary)
        stats_proto.median = float(quantiles_util.find_median(quantiles))

        # Equi-width histogram first, quantiles histogram second; each one
        # also carries the NaN count.
        histogram_builders = (
            (quantiles_util.generate_equi_width_histogram,
             num_histogram_buckets),
            (quantiles_util.generate_quantiles_histogram,
             num_quantiles_histogram_buckets),
        )
        for build_histogram, bucket_count in histogram_builders:
            histogram = build_histogram(
                quantiles, numeric_stats.min, numeric_stats.max,
                value_count, bucket_count)
            histogram.num_nan = numeric_stats.num_nan
            stats_proto.histograms.add().CopyFrom(histogram)

    # Wrap the numeric stats into the per-feature proto.
    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.name = feature_name
    feature_stats.type = numeric_stats.type
    feature_stats.num_stats.CopyFrom(stats_proto)
    return feature_stats
Exemplo n.º 2
0
def get_dataset_feature_statistics(builder, split):
  """Calculate statistics for the specified split.

  Walks every example of the split once and records, per feature, the number
  of examples it appears in, its shape and dtype and -- for numeric features --
  the smallest and largest value seen.

  Args:
    builder: Object whose `as_dataset(split=...)` returns an iterable of
      examples. Each example must be a dict mapping feature names to values
      exposing `.shape`, `.dtype` and `.numpy()`.
    split: Split forwarded to `builder.as_dataset`.

  Returns:
    A `(statistics, schema)` tuple: a `statistics_pb2.DatasetFeatureStatistics`
    and the `schema_pb2.Schema` filled in along the way.

  Raises:
    AssertionError: If an example is not a dict, or if the shape or dtype of
      a feature differs between examples.
  """
  statistics = statistics_pb2.DatasetFeatureStatistics()

  # Make this to the best of our abilities.
  schema = schema_pb2.Schema()

  dataset = builder.as_dataset(split=split)

  # Just computing the number of examples for now.
  statistics.num_examples = 0

  # Feature dictionaries.
  feature_to_shape = {}
  feature_to_dtype = {}
  feature_to_num_examples = collections.defaultdict(int)
  feature_to_min = {}
  feature_to_max = {}

  for example in dataset:
    statistics.num_examples += 1

    assert isinstance(example, dict)

    for feature_name, feature_value in example.items():

      # Update the number of examples this feature appears in.
      feature_to_num_examples[feature_name] += 1

      feature_shape = feature_value.shape
      feature_dtype = feature_value.dtype
      feature_np = feature_value.numpy()

      is_numeric = (
          feature_dtype.is_floating or feature_dtype.is_integer or
          feature_dtype.is_bool)

      # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
      # logic for that.

      # Set the shape, or assert shapes match.
      if feature_name not in feature_to_shape:
        feature_to_shape[feature_name] = feature_shape
      else:
        assert feature_to_shape[feature_name] == feature_shape

      # Set the dtype, or assert dtypes match.
      if feature_name not in feature_to_dtype:
        feature_to_dtype[feature_name] = feature_dtype
      else:
        assert feature_to_dtype[feature_name] == feature_dtype

      # Set or update the min, max. An empty value has no min/max and
      # np.min/np.max would raise ValueError on it, so it is skipped.
      if is_numeric and np.size(feature_np) > 0:
        feature_min = np.min(feature_np)
        feature_max = np.max(feature_np)

        if ((feature_name not in feature_to_min) or
            (feature_to_min[feature_name] > feature_min)):
          feature_to_min[feature_name] = feature_min

        if ((feature_name not in feature_to_max) or
            (feature_to_max[feature_name] < feature_max)):
          feature_to_max[feature_name] = feature_max

  # Start here, we've processed all examples.

  # Assert that the keys match up.
  assert feature_to_shape.keys() == feature_to_dtype.keys()
  assert feature_to_shape.keys() == feature_to_num_examples.keys()

  for feature_name in feature_to_shape:
    # Try to fill in the schema.
    feature = schema.feature.add()
    feature.name = feature_name

    # TODO(afrozm): What do we do for non fixed size shapes?
    # What to do for scalars?
    for dim in feature_to_shape[feature_name].as_list():
      feature.shape.dim.add().size = dim
    feature_type = feature_to_dtype[feature_name]
    # Dtypes without an entry in the map fall back to BYTES.
    feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

    common_statistics = statistics_pb2.CommonStatistics()
    common_statistics.num_non_missing = feature_to_num_examples[feature_name]
    common_statistics.num_missing = (
        statistics.num_examples - common_statistics.num_non_missing)

    feature_name_statistics = statistics.features.add()
    feature_name_statistics.name = feature_name

    # TODO(afrozm): This can be skipped, since type information was added to
    # the Schema.
    feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
        feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

    if feature.type in (schema_pb2.INT, schema_pb2.FLOAT):
      numeric_statistics = statistics_pb2.NumericStatistics()
      # min and max are always recorded together. They are absent when the
      # feature never held a value (e.g. only empty arrays); the proto fields
      # are then left unset instead of raising KeyError.
      if feature_name in feature_to_min:
        numeric_statistics.min = feature_to_min[feature_name]
        numeric_statistics.max = feature_to_max[feature_name]
      numeric_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
    else:
      # Let's shove it into BytesStatistics for now.
      bytes_statistics = statistics_pb2.BytesStatistics()
      bytes_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

  return statistics, schema
def _make_numeric_stats_proto(
    numeric_stats,
    total_num_values,
    quantiles_combiner,
    num_histogram_buckets,
    num_quantiles_histogram_buckets,
    has_weights
    ):
  """Convert the partial numeric statistics into NumericStatistics proto.

  Args:
    numeric_stats: Partial numeric statistics of one feature. The fields read
      here are `num_nan`, `sum`, `sum_of_squares`, `num_zeros`, `min`, `max`,
      `quantiles_summary` and, when `has_weights` is set, the `weighted_*`
      counterparts.
    total_num_values: Number of values of the feature. The NaN count
      (`numeric_stats.num_nan`) is subtracted from it before it is used.
    quantiles_combiner: Object whose `extract_output` converts a quantiles
      summary into the quantiles handed to `quantiles_util`.
    num_histogram_buckets: Bucket count passed to the equi-width histogram
      generator.
    num_quantiles_histogram_buckets: Bucket count passed to the quantiles
      histogram generator.
    has_weights: Whether weighted numeric statistics are added as well.

  Returns:
    A statistics_pb2.NumericStatistics. It is empty when the feature has no
    values at all. When the feature only has NaN values it holds just two
    histograms (STANDARD and QUANTILES) carrying `num_nan`.
  """
  result = statistics_pb2.NumericStatistics()

  if numeric_stats.num_nan > 0:
    total_num_values -= numeric_stats.num_nan

  # Set the stats in the proto only if we have at least one non-NaN value for
  # the feature.
  if total_num_values == 0:
    # If we only have nan values, we only set num_nan so that the NaN count
    # is not silently dropped from the output.
    if numeric_stats.num_nan > 0:
      result.histograms.add(type=statistics_pb2.Histogram.STANDARD).num_nan = (
          numeric_stats.num_nan)
      result.histograms.add(type=statistics_pb2.Histogram.QUANTILES).num_nan = (
          numeric_stats.num_nan)
    return result

  mean = numeric_stats.sum / total_num_values
  # Clamped at zero so that math.sqrt never sees a negative argument.
  variance = max(
      0, (numeric_stats.sum_of_squares / total_num_values) -
      mean * mean)
  result.mean = float(mean)
  result.std_dev = math.sqrt(variance)
  result.num_zeros = numeric_stats.num_zeros
  result.min = float(numeric_stats.min)
  result.max = float(numeric_stats.max)

  # Extract the quantiles from the summary.
  quantiles = quantiles_combiner.extract_output(
      numeric_stats.quantiles_summary)

  # Find the median from the quantiles and update the numeric stats proto.
  result.median = float(quantiles_util.find_median(quantiles))

  # Construct the equi-width histogram from the quantiles and add it to the
  # numeric stats proto.
  std_histogram = quantiles_util.generate_equi_width_histogram(
      quantiles, numeric_stats.min, numeric_stats.max,
      total_num_values, num_histogram_buckets)
  std_histogram.num_nan = numeric_stats.num_nan
  new_std_histogram = result.histograms.add()
  new_std_histogram.CopyFrom(std_histogram)

  # Construct the quantiles histogram from the quantiles and add it to the
  # numeric stats proto.
  q_histogram = quantiles_util.generate_quantiles_histogram(
      quantiles, numeric_stats.min, numeric_stats.max,
      total_num_values, num_quantiles_histogram_buckets)
  q_histogram.num_nan = numeric_stats.num_nan
  new_q_histogram = result.histograms.add()
  new_q_histogram.CopyFrom(q_histogram)

  # Add weighted numeric stats to the proto.
  if has_weights:
    weighted_numeric_stats_proto = statistics_pb2.WeightedNumericStatistics()

    # A zero total weight would divide by zero; report 0 for both instead.
    if numeric_stats.weighted_total_num_values == 0:
      weighted_mean = 0
      weighted_variance = 0
    else:
      weighted_mean = (numeric_stats.weighted_sum /
                       numeric_stats.weighted_total_num_values)
      weighted_variance = max(0, (numeric_stats.weighted_sum_of_squares /
                                  numeric_stats.weighted_total_num_values)
                              - weighted_mean**2)
    weighted_numeric_stats_proto.mean = weighted_mean
    weighted_numeric_stats_proto.std_dev = math.sqrt(weighted_variance)

    # Extract the weighted quantiles from the summary.
    weighted_quantiles = quantiles_combiner.extract_output(
        numeric_stats.weighted_quantiles_summary)

    # Find the weighted median from the quantiles and update the proto.
    weighted_numeric_stats_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Construct the weighted equi-width histogram from the quantiles and
    # add it to the numeric stats proto.
    weighted_std_histogram = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.weighted_total_num_values, num_histogram_buckets)
    weighted_std_histogram.num_nan = numeric_stats.num_nan
    weighted_numeric_stats_proto.histograms.extend([weighted_std_histogram])

    # Construct the weighted quantiles histogram from the quantiles and
    # add it to the numeric stats proto.
    weighted_q_histogram = quantiles_util.generate_quantiles_histogram(
        weighted_quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.weighted_total_num_values,
        num_quantiles_histogram_buckets)
    weighted_q_histogram.num_nan = numeric_stats.num_nan
    weighted_numeric_stats_proto.histograms.extend([weighted_q_histogram])

    result.weighted_numeric_stats.CopyFrom(
        weighted_numeric_stats_proto)
  return result
Exemplo n.º 4
0
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split.

    Walks every example of the split once (as numpy values) and records, per
    feature, the number of examples it appears in and -- for numeric
    features -- the smallest and largest value seen. Shapes and types are
    taken from the dataset's `output_shapes` / `output_types`.

    Args:
        builder: Object whose `as_dataset(split=...)` returns the dataset to
            analyse.
        split: Split forwarded to `builder.as_dataset`.

    Returns:
        A `(statistics, schema)` tuple: a
        `statistics_pb2.DatasetFeatureStatistics` and the `schema_pb2.Schema`
        filled in along the way. Features whose shape is not a
        `tf.TensorShape` (nested structures) get a schema entry with only a
        name and no statistics entry.

    Raises:
        AssertionError: If an example is not a dict.
    """
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Make this to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.dataset_as_numpy(dataset)
    for example in tqdm.tqdm(np_dataset, unit=" examples"):
        statistics.num_examples += 1

        assert isinstance(example, dict)

        for feature_name in sorted(example):

            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility in graph and eager mode, we can get PODs here
            # and everything may not be neatly wrapped up in numpy's ndarray.
            if isinstance(feature_np, np.ndarray):
                feature_dtype = feature_np.dtype.type
            else:
                feature_dtype = type(feature_np)

            is_numeric = (np.issubdtype(feature_dtype, np.number)
                          or feature_dtype == np.bool_)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount?
            # Add logic for that.

            # Only numeric features contribute to min/max. An empty array has
            # no min/max and np.min/np.max would raise ValueError on it.
            if not is_numeric or np.size(feature_np) == 0:
                continue

            # Set or update the min, max.
            feature_min = np.min(feature_np)
            feature_max = np.max(feature_np)

            if ((feature_name not in feature_to_min)
                    or (feature_to_min[feature_name] > feature_min)):
                feature_to_min[feature_name] = feature_min

            if ((feature_name not in feature_to_max)
                    or (feature_to_max[feature_name] < feature_max)):
                feature_to_max[feature_name] = feature_max

    # Start here, we've processed all examples.

    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the
        # Schema proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto. Compare against
            # None explicitly so that a real dimension of size 0 is kept.
            feature.shape.dim.add().size = -1 if dim is None else dim
        feature_type = output_types_dict[feature_name]
        # Types without an entry in the map fall back to BYTES.
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = feature_to_num_examples[
            feature_name]
        common_statistics.num_missing = (statistics.num_examples -
                                         common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added
        # to the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type in (schema_pb2.INT, schema_pb2.FLOAT):
            numeric_statistics = statistics_pb2.NumericStatistics()
            # min and max are always recorded together. They are absent when
            # the feature never held a value (e.g. only empty arrays); the
            # proto fields are then left unset instead of raising KeyError.
            if feature_name in feature_to_min:
                numeric_statistics.min = feature_to_min[feature_name]
                numeric_statistics.max = feature_to_max[feature_name]
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
Exemplo n.º 5
0
def _make_feature_stats_proto(common_stats, feature_name, q_combiner,
                              num_values_histogram_buckets, is_categorical,
                              has_weights):
    """Build a FeatureNameStatistics proto from partial common statistics.

    Args:
        common_stats: The partial common stats associated with a feature.
        feature_name: The name of the feature.
        q_combiner: The quantiles combiner used to construct the quantiles
            histogram for the number of values in the feature.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto whose common stats are
        stored under `string_stats` for STRING and categorical features and
        under `num_stats` otherwise.
    """
    common_proto = statistics_pb2.CommonStatistics()
    common_proto.num_non_missing = common_stats.num_non_missing
    common_proto.num_missing = common_stats.num_missing
    common_proto.tot_num_values = common_stats.total_num_values

    # Per-example value counts only exist when at least one example has the
    # feature (the average below would otherwise divide by zero).
    if common_stats.num_non_missing > 0:
        common_proto.min_num_values = common_stats.min_num_values
        common_proto.max_num_values = common_stats.max_num_values
        common_proto.avg_num_values = (
            common_stats.total_num_values / common_stats.num_non_missing)

        # Quantiles histogram over the number of values per example.
        num_values_histogram = quantiles_util.generate_quantiles_histogram(
            q_combiner.extract_output(common_stats.num_values_summary),
            common_stats.min_num_values,
            common_stats.max_num_values,
            common_stats.num_non_missing,
            num_values_histogram_buckets)
        common_proto.num_values_histogram.CopyFrom(num_values_histogram)

    # Weighted counterpart of the common stats.
    if has_weights:
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=common_stats.weighted_num_non_missing,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=common_stats.weighted_total_num_values)
        if common_stats.weighted_num_non_missing > 0:
            weighted_proto.avg_num_values = (
                common_stats.weighted_total_num_values /
                common_stats.weighted_num_non_missing)
        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.name = feature_name

    # Resolve the feature type: categorical features are reported as INT, a
    # feature without an inferred type is reported as STRING, and anything
    # else keeps the type recorded in the partial stats.
    if is_categorical:
        feature_stats.type = statistics_pb2.FeatureNameStatistics.INT
    elif common_stats.type is not None:
        feature_stats.type = common_stats.type
    else:
        feature_stats.type = statistics_pb2.FeatureNameStatistics.STRING

    # STRING and categorical features carry their common stats inside string
    # stats; every other feature carries them inside numeric stats.
    if (is_categorical or
            feature_stats.type == statistics_pb2.FeatureNameStatistics.STRING):
        string_proto = statistics_pb2.StringStatistics()
        string_proto.common_stats.CopyFrom(common_proto)
        feature_stats.string_stats.CopyFrom(string_proto)
    else:
        numeric_proto = statistics_pb2.NumericStatistics()
        numeric_proto.common_stats.CopyFrom(common_proto)
        feature_stats.num_stats.CopyFrom(numeric_proto)

    return feature_stats
def _make_numeric_stats_proto(
    numeric_stats: _PartialNumericStats,
    total_num_values: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    has_weights: bool
    ) -> statistics_pb2.NumericStatistics:
  """Build a NumericStatistics proto from partial numeric statistics.

  Args:
    numeric_stats: Partial numeric statistics of one feature.
    total_num_values: Number of values of the feature. The NaN count
      (`numeric_stats.num_nan`) is subtracted from it before it is used.
    num_histogram_buckets: Bucket count of the equi-width histogram.
    num_quantiles_histogram_buckets: Bucket count of the quantiles histogram.
    has_weights: Whether weighted numeric statistics are added as well.

  Returns:
    A statistics_pb2.NumericStatistics. It is empty when there are no values
    at all. When there are only NaN values it holds just a STANDARD and a
    QUANTILES histogram carrying `num_nan`.
  """
  stats_proto = statistics_pb2.NumericStatistics()
  nan_count = numeric_stats.num_nan
  non_nan_count = (
      total_num_values - nan_count if nan_count > 0 else total_num_values)

  if non_nan_count == 0:
    # Only NaNs (or nothing at all): the NaN count is the sole thing reported.
    if nan_count > 0:
      for histogram_type in (statistics_pb2.Histogram.STANDARD,
                             statistics_pb2.Histogram.QUANTILES):
        stats_proto.histograms.add(type=histogram_type).num_nan = nan_count
    return stats_proto

  mean = numeric_stats.sum / non_nan_count
  mean_of_squares = numeric_stats.sum_of_squares / non_nan_count
  # Clamped at zero so that math.sqrt never sees a negative argument.
  variance = max(0, mean_of_squares - mean * mean)
  stats_proto.mean = float(mean)
  stats_proto.std_dev = math.sqrt(variance)
  stats_proto.num_zeros = numeric_stats.num_zeros
  stats_proto.min = float(numeric_stats.min)
  stats_proto.max = float(numeric_stats.max)

  # One set of quantiles serves the median and both histograms, so request
  # enough of them for whichever histogram needs more.
  assert numeric_stats.quantiles_summary is not None
  num_quantiles = max(
      num_quantiles_histogram_buckets,
      _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
  quantiles_array = numeric_stats.quantiles_summary.GetQuantiles(num_quantiles)
  quantiles = quantiles_array.flatten().to_pylist()

  stats_proto.median = float(quantiles_util.find_median(quantiles))

  # Equi-width histogram, bounded by the finite min/max.
  equi_width_histogram = quantiles_util.generate_equi_width_histogram(
      quantiles, numeric_stats.finite_min, numeric_stats.finite_max,
      non_nan_count, num_histogram_buckets)
  equi_width_histogram.num_nan = nan_count
  stats_proto.histograms.add().CopyFrom(equi_width_histogram)

  # Quantiles histogram.
  quantiles_histogram = quantiles_util.generate_quantiles_histogram(
      quantiles, non_nan_count, num_quantiles_histogram_buckets)
  quantiles_histogram.num_nan = nan_count
  stats_proto.histograms.add().CopyFrom(quantiles_histogram)

  if not has_weights:
    return stats_proto

  # Weighted numeric stats mirror the unweighted ones above.
  weighted_proto = statistics_pb2.WeightedNumericStatistics()
  weighted_count = numeric_stats.weighted_total_num_values

  if weighted_count == 0:
    # A zero total weight would divide by zero; report 0 for both instead.
    weighted_mean = 0
    weighted_variance = 0
  else:
    weighted_mean = numeric_stats.weighted_sum / weighted_count
    weighted_variance = max(
        0,
        (numeric_stats.weighted_sum_of_squares / weighted_count)
        - weighted_mean**2)
  weighted_proto.mean = weighted_mean
  weighted_proto.std_dev = math.sqrt(weighted_variance)

  assert numeric_stats.weighted_quantiles_summary is not None
  weighted_quantiles_array = (
      numeric_stats.weighted_quantiles_summary.GetQuantiles(num_quantiles))
  weighted_quantiles = weighted_quantiles_array.flatten().to_pylist()

  weighted_proto.median = float(
      quantiles_util.find_median(weighted_quantiles))

  # Weighted equi-width histogram.
  weighted_equi_width_histogram = (
      quantiles_util.generate_equi_width_histogram(
          weighted_quantiles, numeric_stats.finite_min,
          numeric_stats.finite_max, weighted_count, num_histogram_buckets))
  weighted_equi_width_histogram.num_nan = nan_count
  weighted_proto.histograms.extend([weighted_equi_width_histogram])

  # Weighted quantiles histogram.
  weighted_quantiles_histogram = quantiles_util.generate_quantiles_histogram(
      weighted_quantiles, weighted_count, num_quantiles_histogram_buckets)
  weighted_quantiles_histogram.num_nan = nan_count
  weighted_proto.histograms.extend([weighted_quantiles_histogram])

  stats_proto.weighted_numeric_stats.CopyFrom(weighted_proto)
  return stats_proto