Example #1
def _make_dataset_feature_stats_proto_with_uniques_for_single_feature(
        feature_path_to_value_count, categorical_features):
    """Makes a DatasetFeatureStatistics proto with uniques stats for a feature."""
    (slice_key, feature_path_tuple), count = feature_path_to_value_count
    feature_path = types.FeaturePath(feature_path_tuple)
    result = statistics_pb2.DatasetFeatureStatistics()
    result.features.add().CopyFrom(
        _make_feature_stats_proto_with_uniques_stats(
            feature_path, count, feature_path in categorical_features))
    return slice_key, result.SerializeToString()
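The function returns a (slice_key, bytes) pair rather than the proto itself. A minimal sketch of consuming such output, using only the standard protobuf API; the literal pair below is illustrative, not taken from the function above:

from tensorflow_metadata.proto.v0 import statistics_pb2

# Hypothetical output pair: a slice key plus a serialized stats proto.
slice_key, serialized = (
    'slice1',
    statistics_pb2.DatasetFeatureStatistics(num_examples=3).SerializeToString())
# FromString is the standard protobuf deserialization entry point.
stats = statistics_pb2.DatasetFeatureStatistics.FromString(serialized)
assert stats.num_examples == 3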
Example #2
def make_dataset_feature_stats_proto_unique_single(
    feature_path_tuple: types.FeaturePathTuple,
    num_uniques: int,
) -> statistics_pb2.DatasetFeatureStatistics:
  """Makes a DatasetFeatureStatistics proto with uniques stats for a feature."""
  feature_path = types.FeaturePath(feature_path_tuple)
  result = statistics_pb2.DatasetFeatureStatistics()
  result.features.add().CopyFrom(
      _make_feature_stats_proto_uniques(feature_path, num_uniques))
  return result
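A usage sketch for the function above, assuming it is in scope (presumably the same top_k_uniques_stats_util module exercised in a later test here); the assertions mirror the uniques protos shown in other examples on this page:

# Hypothetical call: report 5 unique values for the single-step path ('fa',).
stats = make_dataset_feature_stats_proto_unique_single(
    feature_path_tuple=('fa',), num_uniques=5)
assert list(stats.features[0].path.step) == ['fa']
assert stats.features[0].string_stats.unique == 5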
Example #3
def _add_slice_key(
    stats_proto_per_slice,
    is_slicing_enabled
):
  """Add slice key to stats proto."""
  result = statistics_pb2.DatasetFeatureStatistics()
  result.CopyFrom(stats_proto_per_slice[1])
  if is_slicing_enabled:
    result.name = stats_proto_per_slice[0]
  return result
Example #4
def _add_slice_key(
        stats_proto_per_slice: Tuple[types.SliceKey,
                                     statistics_pb2.DatasetFeatureStatistics],
        is_slicing_enabled: bool) -> statistics_pb2.DatasetFeatureStatistics:
    """Add slice key to stats proto."""
    result = statistics_pb2.DatasetFeatureStatistics()
    result.CopyFrom(stats_proto_per_slice[1])
    if is_slicing_enabled:
        result.name = stats_proto_per_slice[0]
    return result
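Both variants behave identically; a short usage sketch, assuming the function above is in scope:

from tensorflow_metadata.proto.v0 import statistics_pb2

per_slice = ('male', statistics_pb2.DatasetFeatureStatistics())
# With slicing enabled, the slice key is recorded as the dataset name.
assert _add_slice_key(per_slice, is_slicing_enabled=True).name == 'male'
# Without slicing, the name field keeps its default empty value.
assert _add_slice_key(per_slice, is_slicing_enabled=False).name == ''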
Example #5
  def test_topk_with_single_string_feature(self):
    # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
    examples = [{'fa': np.array(['a', 'b', 'c', 'e'])},
                {'fa': np.array(['a', 'c', 'd', 'a'])},
                {'fa': np.array(['a', 'b', 'c', 'd'])}]

    # Note that if two feature values have the same frequency, the one with the
    # lexicographically larger feature value will be higher in the order.
    expected_result = text_format.Parse(
        """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatistics())
    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=4, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(examples, generator, [expected_result])
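The tie-breaking rule from the comment above (equal frequencies rank the lexicographically larger value first) can be reproduced with two stable sorts; a standalone sketch on the same counts:

import collections

counts = collections.Counter(
    ['a', 'b', 'c', 'e', 'a', 'c', 'd', 'a', 'a', 'b', 'c', 'd'])
# Stable sorts: order by value descending first, then by frequency descending,
# so ties on frequency keep the lexicographically larger value first.
by_value_desc = sorted(counts.items(), key=lambda kv: kv[0], reverse=True)
ranked = sorted(by_value_desc, key=lambda kv: -kv[1])
assert [value for value, _ in ranked[:4]] == ['a', 'c', 'd', 'b']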
Example #6
    def test_make_dataset_feature_stats_proto_topk_single(self):
        expected_result = text_format.Parse(
            """
        features {
          string_stats {
            top_values {
              value: "e"
              frequency: 20.0
            }
            top_values {
              value: "d"
              frequency: 20.0
            }
            top_values {
              value: "a"
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                label: "e"
                sample_count: 20.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "d"
                sample_count: 20.0
              }
            }
          }
          path {
            step: "fa"
          }
    }""", statistics_pb2.DatasetFeatureStatistics())

        value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10), ('b', 5)]
        value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in value_counts
        ]
        result = (top_k_uniques_stats_util.
                  make_dataset_feature_stats_proto_topk_single(
                      types.FeaturePath(['fa']).steps(),
                      value_count_list=value_count_list,
                      categorical_features=frozenset([
                          types.FeaturePath(['fa']),
                          types.FeaturePath(['fb'])
                      ]),
                      is_weighted_stats=False,
                      num_top_values=3,
                      frequency_threshold=1,
                      num_rank_histogram_buckets=2))
        test_util.assert_dataset_feature_stats_proto_equal(
            self, result, expected_result)
Example #7
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries], y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

  Args:
    lifts: The result of two successive group bys of lift values. The innermost
      grouping collects all the lift values for a given (slice, x_path and
      y_value) tuple (corresponding to a single LiftSeries message). The
      outermost grouping collects all the lift values for the same (slice,
      x_path) tuple (corresponding to the set of the LiftSeries which share the
      same value of y_path). The full structure of lifts is described by:
        (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
    y_path: The path used as Y in the lift expression: lift = P(Y=y|X=x) /
      P(Y=y).
    y_boundaries: Optionally, a set of bin boundaries used for binning y_path
      values.

  Returns:
    The populated DatasetFeatureStatistics proto.
  """
    key, lift_series_list = lifts
    stats = statistics_pb2.DatasetFeatureStatistics()
    cross_stats = stats.cross_features.add(path_x=key.x_path.to_proto(),
                                           path_y=y_path.to_proto())
    for lift_series in sorted(lift_series_list):
        lift_series_proto = cross_stats.categorical_cross_stats.lift_series.add(
            y_count=lift_series.y_count)
        y = lift_series.y
        if y_boundaries is not None:
            low_value, high_value = bin_util.get_boundaries(y, y_boundaries)
            lift_series_proto.y_bucket.low_value = low_value
            lift_series_proto.y_bucket.high_value = high_value
        elif isinstance(y, six.string_types):
            lift_series_proto.y_string = y
        else:
            lift_series_proto.y_int = y

        # dedupe possibly overlapping top_k and bottom_k x values.
        lift_values_deduped = {v.x: v for v in lift_series.lift_values}
        # sort by lift DESC, x ASC
        lift_values_sorted = sorted(lift_values_deduped.values(),
                                    key=lambda v: (-v.lift, v.x))
        for lift_value in lift_values_sorted:
            lift_value_proto = lift_series_proto.lift_values.add(
                lift=lift_value.lift,
                x_count=lift_value.x_count,
                x_and_y_count=lift_value.xy_count)
            x = lift_value.x
            if isinstance(x, six.string_types):
                lift_value_proto.x_string = x
            else:
                lift_value_proto.x_int = x
    return key.slice_key, stats
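The dedupe-and-sort step near the end is self-contained and worth seeing in isolation; a sketch using a hypothetical LiftValue namedtuple in place of the generator's internal type:

import collections

LiftValue = collections.namedtuple('LiftValue',
                                   ['x', 'lift', 'xy_count', 'x_count'])

# 'a' can appear in both the top_k and bottom_k lists; the dict keeps one copy.
values = [LiftValue('a', 2.0, 4, 8),
          LiftValue('b', 0.5, 1, 8),
          LiftValue('a', 2.0, 4, 8)]
deduped = {v.x: v for v in values}
# Sort by lift descending, then x ascending, as in the function above.
ordered = sorted(deduped.values(), key=lambda v: (-v.lift, v.x))
assert [v.x for v in ordered] == ['a', 'b']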
Example #8
 def assert_on_two_protos_with_same_features_in_different_order(self):
   expected = text_format.Parse(
       """
   features {
     path {
       step: 'fb'
     }
     type: STRING
     string_stats {
       unique: 5
     }
   }
   features {
     path {
       step: 'fa'
     }
     type: STRING
     string_stats {
       unique: 4
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
   actual = text_format.Parse(
       """
   features {
     path {
       step: 'fa'
     }
     type: STRING
     string_stats {
       unique: 4
     }
   }
   features {
     path {
       step: 'fb'
     }
     type: STRING
     string_stats {
       unique: 5
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
   test_util.assert_dataset_feature_stats_proto_equal(self, actual, expected)
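The assertion passes because the comparison ignores feature order. One way such a check can be written is to key features by path before comparing; the helper below is a hypothetical sketch, not TFDV's actual implementation:

def features_by_path(stats):
  # Key each FeatureNameStatistics by its path steps (or name in older protos).
  return {tuple(f.path.step) or (f.name,): f for f in stats.features}

def stats_equal_ignoring_feature_order(actual, expected):
  return (actual.num_examples == expected.num_examples
          and features_by_path(actual) == features_by_path(expected))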
Example #9
def _merge_dataset_feature_stats_protos(
    stats_protos: Iterable[statistics_pb2.DatasetFeatureStatistics]
) -> statistics_pb2.DatasetFeatureStatistics:
    """Merges together a list of DatasetFeatureStatistics protos.

  Args:
    stats_protos: A list of DatasetFeatureStatistics protos to merge.

  Returns:
    The merged DatasetFeatureStatistics proto.
  """
    stats_per_feature = {}
    # Iterate over each DatasetFeatureStatistics proto and merge the
    # FeatureNameStatistics protos per feature.
    for stats_proto in stats_protos:
        for feature_stats_proto in stats_proto.features:
            feature_path = types.FeaturePath.from_proto(
                feature_stats_proto.path)
            if feature_path not in stats_per_feature:
                stats_per_feature[feature_path] = feature_stats_proto
            else:
                stats_for_feature = stats_per_feature[feature_path]
                # MergeFrom would concatenate repeated fields which is not what we want
                # for path.step.
                del stats_for_feature.path.step[:]
                stats_per_feature[feature_path].MergeFrom(feature_stats_proto)

    # Create a new DatasetFeatureStatistics proto.
    result = statistics_pb2.DatasetFeatureStatistics()
    num_examples = None
    for feature_stats_proto in six.itervalues(stats_per_feature):
        # Add the merged FeatureNameStatistics proto for the feature
        # into the DatasetFeatureStatistics proto.
        new_feature_stats_proto = result.features.add()
        new_feature_stats_proto.CopyFrom(feature_stats_proto)

        # Get the number of examples from one of the features that
        # has common stats.
        if num_examples is None:
            stats_type = feature_stats_proto.WhichOneof('stats')
            stats_proto = None
            if stats_type == 'num_stats':
                stats_proto = feature_stats_proto.num_stats
            else:
                stats_proto = feature_stats_proto.string_stats

            if stats_proto.HasField('common_stats'):
                num_examples = (stats_proto.common_stats.num_non_missing +
                                stats_proto.common_stats.num_missing)

    # Set the num_examples field.
    if num_examples is not None:
        result.num_examples = num_examples
    return result
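A usage sketch for the merge, assuming the function above is in scope; the two inputs carry complementary stats for the same feature path:

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import statistics_pb2

topk = text_format.Parse("""
    features {
      path { step: 'fa' }
      type: STRING
      string_stats { top_values { value: 'a' frequency: 4 } }
    }""", statistics_pb2.DatasetFeatureStatistics())
uniques = text_format.Parse("""
    features {
      path { step: 'fa' }
      type: STRING
      string_stats { unique: 5 }
    }""", statistics_pb2.DatasetFeatureStatistics())
merged = _merge_dataset_feature_stats_protos([topk, uniques])
# Both partial views end up on a single FeatureNameStatistics.
assert merged.features[0].string_stats.unique == 5
assert merged.features[0].string_stats.top_values[0].value == 'a'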
Example #10
 def extract_output(self, accumulator: List[float]
                   ) -> statistics_pb2.DatasetFeatureStatistics:
   result = statistics_pb2.DatasetFeatureStatistics()
   dummy_feature = result.features.add()
   dummy_feature.path.CopyFrom(_DUMMY_FEATURE_PATH.to_proto())
   dummy_feature.custom_stats.add(name=_NUM_EXAMPLES_KEY, num=accumulator[0])
   dummy_feature.custom_stats.add(name=_WEIGHTED_NUM_EXAMPLES_KEY,
                                  num=accumulator[1])
   beam.metrics.Metrics.counter(constants.METRICS_NAMESPACE, 'num_instances'
                               ).inc(accumulator[0])
   return result
Example #11
 def test_topk_with_single_unicode_feature(self):
   # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
   batches = [{'fa': np.array([np.array(['a', 'b', 'c', 'e']),
                               np.array(['a', 'c', 'd', 'a'])],
                              dtype=np.unicode_)},
              {'fa': np.array([np.array(['a', 'b', 'c', 'd'])],
                              dtype=np.unicode_)}]
   expected_result = text_format.Parse(
       """
     features {
       name: 'fa'
       type: STRING
       string_stats {
         top_values {
           value: 'a'
           frequency: 4
         }
         top_values {
           value: 'c'
           frequency: 3
         }
         top_values {
           value: 'd'
           frequency: 2
         }
         top_values {
           value: 'b'
           frequency: 2
         }
         rank_histogram {
           buckets {
             low_rank: 0
             high_rank: 0
             label: "a"
             sample_count: 4.0
           }
           buckets {
             low_rank: 1
             high_rank: 1
             label: "c"
             sample_count: 3.0
           }
           buckets {
             low_rank: 2
             high_rank: 2
             label: "d"
             sample_count: 2.0
           }
         }
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   generator = top_k_stats_generator.TopKStatsGenerator(
       num_top_values=4, num_rank_histogram_buckets=3)
   self.assertTransformOutputEqual(batches, generator, [expected_result])
Example #12
 def extract_output(self, accumulator):
     result = statistics_pb2.DatasetFeatureStatistics()
     dummy_feature = result.features.add()
     dummy_feature.name = _DUMMY_FEATURE_NAME
     dummy_feature.custom_stats.add(name=_NUM_EXAMPLES_KEY,
                                    num=accumulator[0])
     dummy_feature.custom_stats.add(name=_WEIGHTED_NUM_EXAMPLES_KEY,
                                    num=accumulator[1])
     beam.metrics.Metrics.counter(constants.METRICS_NAMESPACE,
                                  'num_instances').inc(accumulator[0])
     return result
Example #13
 def test_topk_with_categorical_feature(self):
   batches = [{'fa': np.array([np.array([12, 23, 34, 12]),
                               np.array([45, 23])])},
              {'fa': np.array([np.array([12, 12, 34, 45])])}]
   expected_result_fa = text_format.Parse(
       """
     features {
       name: 'fa'
       type: INT
       string_stats {
         top_values {
           value: '12'
           frequency: 4
         }
         top_values {
           value: '45'
           frequency: 2
         }
         rank_histogram {
           buckets {
             low_rank: 0
             high_rank: 0
             label: "12"
             sample_count: 4.0
           }
           buckets {
             low_rank: 1
             high_rank: 1
             label: "45"
             sample_count: 2.0
           }
           buckets {
             low_rank: 2
             high_rank: 2
             label: "34"
             sample_count: 2.0
           }
         }
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   schema = text_format.Parse(
       """
       feature {
         name: "fa"
         type: INT
         int_domain {
           is_categorical: true
         }
       }
       """, schema_pb2.Schema())
   generator = top_k_stats_generator.TopKStatsGenerator(
       schema=schema,
       num_top_values=2, num_rank_histogram_buckets=3)
   self.assertTransformOutputEqual(batches, generator, [expected_result_fa])
Example #14
 def extract_output(
         self, accumulator: List[float]
 ) -> statistics_pb2.DatasetFeatureStatistics:
     result = statistics_pb2.DatasetFeatureStatistics()
     dummy_feature = result.features.add()
     dummy_feature.path.CopyFrom(_DUMMY_FEATURE_PATH.to_proto())
     dummy_feature.custom_stats.add(name=_NUM_EXAMPLES_KEY,
                                    num=accumulator[0])
     dummy_feature.custom_stats.add(name=_WEIGHTED_NUM_EXAMPLES_KEY,
                                    num=accumulator[1])
     return result
Example #15
    def test_mi_with_imputed_categorical_feature(self):
        label_array = pa.array([[0], [2], [0], [1], [2], [1], [1]])
        # A categorical feature with missing values.
        feat_array = pa.array([["Red"], ["Blue"], None, None, ["Blue"],
                               ["Green"], ["Green"]])
        batch = pa.RecordBatch.from_arrays([label_array, feat_array],
                                           ["label_key", "fa"])

        schema = text_format.Parse(
            """
        feature {
          name: "label_key"
          type: INT
          shape {
            dim {
              size: 1
            }
          }
          int_domain {
            is_categorical: true
          }
        }
        feature {
          name: "fa"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

        expected = text_format.Parse(
            """
        features {
          path {
            step: "fa"
          }
          custom_stats {
            name: 'sklearn_adjusted_mutual_information'
            num: 0.3960841
          }
          custom_stats {
            name: 'sklearn_mutual_information'
            num: 0.8809502
          }
          custom_stats {
            name: "sklearn_normalized_adjusted_mutual_information"
            num: 0.4568877
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
        self._assert_mi_output_equal(batch, expected, schema,
                                     types.FeaturePath(["label_key"]))
Example #16
def _make_dataset_feature_stats_proto_with_single_feature(
        feature_name_to_value_count_list, categorical_features, num_top_values,
        num_rank_histogram_buckets):
    """Makes a DatasetFeatureStatistics containing one single feature."""
    result = statistics_pb2.DatasetFeatureStatistics()
    result.features.add().CopyFrom(
        _make_feature_stats_proto(
            feature_name_to_value_count_list[0],
            feature_name_to_value_count_list[1],
            feature_name_to_value_count_list[0] in categorical_features,
            num_top_values, num_rank_histogram_buckets))
    return result
Example #17
    def test_mi_with_imputed_numerical_label(self):
        label_array = pa.array([[0.1], [0.2], [0.8], [0.7], [0.2], [np.NaN],
                                None, [0.1], [0.2], [0.8], [0.7], [0.2], [0.2],
                                [0.3]])
        feat_array = pa.array([[0.1], [0.2], [0.8], [0.7], [0.2], [0.2], [0.3],
                               [0.1], [0.2], [0.8], [0.7], [0.2], [0.2],
                               [0.3]])
        batch = pa.RecordBatch.from_arrays([label_array, feat_array],
                                           ["label_key", "fa"])

        schema = text_format.Parse(
            """
        feature {
          name: "fa"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "label_key"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

        expected = text_format.Parse(
            """
        features {
          path {
            step: "fa"
          }
          custom_stats {
            name: "sklearn_adjusted_mutual_information"
            num: 0.2640041
          }
          custom_stats {
            name: "sklearn_mutual_information"
            num: 0.3825569
          }
          custom_stats {
            name: "sklearn_normalized_adjusted_mutual_information"
            num: 0.244306
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
        self._assert_mi_output_equal(batch, expected, schema,
                                     types.FeaturePath(["label_key"]))
Example #18
def _make_dataset_feature_stats_proto_with_single_feature(
    feature_name_to_value_count,
    categorical_features
):
  """Generates a DatasetFeatureStatistics proto containing a single feature."""
  result = statistics_pb2.DatasetFeatureStatistics()
  result.features.add().CopyFrom(
      _make_feature_stats_proto(
          feature_name_to_value_count[0],
          feature_name_to_value_count[1],
          feature_name_to_value_count[0] in categorical_features))
  return result
Example #19
 def test_all_string_features(self):
     # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
     # fb: 2 'a', 1 'b', 5 'c'
     examples = [{
         'fa': np.array(['a', 'b', 'c', 'e']),
         'fb': np.array(['a', 'c', 'c'])
     }, {
         'fa': None,
         'fb': np.array(['a', 'c', 'c'])
     }, {
         'fa': np.array(['a', 'c', 'd']),
         'fb': None
     }, {
         'fa': np.array(['a', 'a', 'b', 'c', 'd']),
         'fb': None
     }, {
         'fa': None,
         'fb': np.array(['b', 'c'])
     }]
     expected_result_fa = text_format.Parse(
         """
   features {
     name: 'fa'
     type: STRING
     string_stats {
       unique: 5
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     expected_result_fb = text_format.Parse(
         """
   features {
     name: 'fb'
     type: STRING
     string_stats {
       unique: 3
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     generator = uniques_stats_generator.UniquesStatsGenerator()
     self.assertTransformOutputEqual(
         examples, generator, [expected_result_fa, expected_result_fb])
Example #20
  def test_mi_classif_with_int_label_and_categorical_feature(self):
    label_array = pa.array([
        [0], [2], [0], [1], [2], [1], [1], [0], [2], [1], [0]])
    # A categorical feature that maps directly on to the label.
    perfect_feat_array = pa.array([
        ["Red"], ["Blue"], ["Red"], ["Green"], ["Blue"], ["Green"], ["Green"],
        ["Red"], ["Blue"], ["Green"], ["Red"]])
    batch = pa.RecordBatch.from_arrays([label_array, perfect_feat_array],
                                       ["label_key", "perfect_feature"])

    schema = text_format.Parse(
        """
        feature {
          name: "label_key"
          type: INT
          int_domain {
            is_categorical: true
          }
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "perfect_feature"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

    expected = text_format.Parse(
        """
        features {
          path {
            step: "perfect_feature"
          }
          custom_stats {
            name: 'sklearn_adjusted_mutual_information'
            num: 0.9297553
          }
          custom_stats {
            name: 'sklearn_mutual_information'
            num: 1.0900597
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
    self._assert_mi_output_equal(batch, expected, schema,
                                 types.FeaturePath(["label_key"]))
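The custom stats above are named after sklearn's (adjusted) mutual information. The underlying primitive can be exercised directly on the same data; this computes plain AMI between the raw columns, not TFDV's estimator, so the value differs from the proto above:

from sklearn.metrics import adjusted_mutual_info_score

labels = [0, 2, 0, 1, 2, 1, 1, 0, 2, 1, 0]
colors = ['Red', 'Blue', 'Red', 'Green', 'Blue', 'Green', 'Green',
          'Red', 'Blue', 'Green', 'Red']
# A feature that maps one-to-one onto the label carries maximal information.
assert adjusted_mutual_info_score(labels, colors) > 0.999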
Example #21
def _make_dataset_feature_stats_proto_with_single_feature(
    feature_name_to_value_count_list,
    categorical_features, is_weighted_stats,
    num_top_values, num_rank_histogram_buckets
):
  """Makes a DatasetFeatureStatistics containing one single feature."""
  (slice_key, feature_name), value_count_list = feature_name_to_value_count_list
  result = statistics_pb2.DatasetFeatureStatistics()
  result.features.add().CopyFrom(
      make_feature_stats_proto_with_topk_stats(
          feature_name, value_count_list, feature_name in categorical_features,
          is_weighted_stats, num_top_values, num_rank_histogram_buckets))
  return slice_key, result.SerializeToString()
Example #22
def _make_dataset_feature_stats_proto_with_uniques_for_single_feature(
    feature_path_to_value_count: Tuple[Tuple[types.SliceKey,
                                             FeaturePathTuple], int],
    categorical_features: Set[types.FeaturePath]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
  """Makes a DatasetFeatureStatistics proto with uniques stats for a feature."""
  (slice_key, feature_path_tuple), count = feature_path_to_value_count
  feature_path = types.FeaturePath(feature_path_tuple)
  result = statistics_pb2.DatasetFeatureStatistics()
  result.features.add().CopyFrom(
      _make_feature_stats_proto_with_uniques_stats(
          feature_path, count, feature_path in categorical_features))
  return slice_key, result
Example #23
 def test_mi_classif_categorical_label_small_sample(self):
     label_array = pa.array([[0]])
     feat_array = pa.array([["Red"]])
     batch = pa.RecordBatch.from_arrays([label_array, feat_array],
                                        ["label_key", "feature"])
     schema = text_format.Parse(
         """
     feature {
       name: "label_key"
       type: INT
       int_domain {
         is_categorical: true
       }
       shape {
         dim {
           size: 1
         }
       }
     }
     feature {
       name: "feature"
       type: BYTES
       shape {
         dim {
           size: 1
         }
       }
     }
     """, schema_pb2.Schema())
     expected = text_format.Parse(
         """
     features {
       path {
         step: "feature"
       }
       custom_stats {
         name: 'sklearn_adjusted_mutual_information'
         num: 0
       }
       custom_stats {
         name: 'sklearn_mutual_information'
         num: 0
       }
       custom_stats {
         name: "sklearn_normalized_adjusted_mutual_information"
         num: 0
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
     self._assert_mi_output_equal(batch, expected, schema,
                                  types.FeaturePath(["label_key"]))
Example #24
  def test_topk_with_numeric_feature(self):
    # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
    examples = [{'fa': np.array(['a', 'b', 'c', 'e']),
                 'fb': np.array([1.0, 2.0, 3.0])},
                {'fa': None,
                 'fb': np.array([4.0, 5.0])},
                {'fa': np.array(['a', 'c', 'd']),
                 'fb': None},
                {'fa': np.array(['a', 'a', 'b', 'c', 'd']),
                 'fb': None}]

    expected_result_fa = text_format.Parse(
        """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatistics())
    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=2, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(examples, generator, [expected_result_fa])
Example #25
  def test_merge_dataset_feature_stats_protos_single_proto(self):
    proto1 = text_format.Parse(
        """
        num_examples: 7
        features: {
          name: 'feature1'
          type: STRING
          string_stats: {
            common_stats: {
              num_missing: 3
              num_non_missing: 4
              min_num_values: 1
              max_num_values: 1
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatistics())

    expected = text_format.Parse(
        """
        num_examples: 7
        features: {
          name: 'feature1'
          type: STRING
          string_stats: {
            common_stats: {
              num_missing: 3
              num_non_missing: 4
              min_num_values: 1
              max_num_values: 1
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatistics())

    actual = stats_impl._merge_dataset_feature_stats_protos([proto1])
    self.assertEqual(actual, expected)
Example #26
def _merge_dataset_feature_stats_protos(
    stats_protos
):
  """Merges together a list of DatasetFeatureStatistics protos.

  Args:
    stats_protos: A list of DatasetFeatureStatistics protos to merge.

  Returns:
    The merged DatasetFeatureStatistics proto.
  """
  stats_per_feature = {}
  # Iterate over each DatasetFeatureStatistics proto and merge the
  # FeatureNameStatistics protos per feature.
  for stats_proto in stats_protos:
    for feature_stats_proto in stats_proto.features:
      if feature_stats_proto.name not in stats_per_feature:
        stats_per_feature[feature_stats_proto.name] = feature_stats_proto
      else:
        stats_per_feature[feature_stats_proto.name].MergeFrom(
            feature_stats_proto)

  # Create a new DatasetFeatureStatistics proto.
  result = statistics_pb2.DatasetFeatureStatistics()
  num_examples = None
  for feature_stats_proto in stats_per_feature.values():
    # Add the merged FeatureNameStatistics proto for the feature
    # into the DatasetFeatureStatistics proto.
    new_feature_stats_proto = result.features.add()
    new_feature_stats_proto.CopyFrom(feature_stats_proto)

    # Get the number of examples from one of the features that
    # has common stats.
    if num_examples is None:
      stats_type = feature_stats_proto.WhichOneof('stats')
      stats_proto = None
      if stats_type == 'num_stats':
        stats_proto = feature_stats_proto.num_stats
      else:
        stats_proto = feature_stats_proto.string_stats

      if stats_proto.HasField('common_stats'):
        num_examples = (stats_proto.common_stats.num_non_missing +
                        stats_proto.common_stats.num_missing)

  # Set the num_examples field.
  if num_examples is not None:
    result.num_examples = num_examples
  return result
Example #27
def make_dataset_feature_stats_proto_topk_single(
    feature_path_tuple: types.FeaturePathTuple,
    value_count_list: List[FeatureValueCount],
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[int, float],
    num_rank_histogram_buckets: int) -> statistics_pb2.DatasetFeatureStatistics:
  """Makes a DatasetFeatureStatistics proto with top-k stats for a feature."""
  feature_path = types.FeaturePath(feature_path_tuple)
  result = statistics_pb2.DatasetFeatureStatistics()
  result.features.add().CopyFrom(
      _make_feature_stats_proto_topk(feature_path, value_count_list,
                                     is_weighted_stats, num_top_values,
                                     frequency_threshold,
                                     num_rank_histogram_buckets))
  return result
Example #28
def _make_dataset_feature_stats_proto_with_topk_for_single_feature(
        feature_path_to_value_count_list, categorical_features,
        is_weighted_stats, num_top_values, frequency_threshold,
        num_rank_histogram_buckets):
    """Makes a DatasetFeatureStatistics proto with top-k stats for a feature."""
    (slice_key,
     feature_path_tuple), value_count_list = (feature_path_to_value_count_list)
    feature_path = types.FeaturePath(feature_path_tuple)
    result = statistics_pb2.DatasetFeatureStatistics()
    result.features.add().CopyFrom(
        make_feature_stats_proto_with_topk_stats(
            feature_path, value_count_list, feature_path
            in categorical_features, is_weighted_stats, num_top_values,
            frequency_threshold, num_rank_histogram_buckets))
    return slice_key, result.SerializeToString()
Example #29
   def assert_on_two_protos_with_different_num_examples(self):
       expected = text_format.Parse(
           """
 num_examples: 1
 features {
   name: 'fa'
   type: STRING
   string_stats {
     unique: 4
   }
 }
 """, statistics_pb2.DatasetFeatureStatistics())
       actual = text_format.Parse(
           """
 num_examples: 2
 features {
   name: 'fa'
   type: STRING
   string_stats {
     unique: 4
   }
 }""", statistics_pb2.DatasetFeatureStatistics())
       test_util.assert_dataset_feature_stats_proto_equal(
           self, actual, expected)
Example #30
    def extract_output(self, accumulator):
        # Create a new DatasetFeatureStatistics proto.
        result = statistics_pb2.DatasetFeatureStatistics()

        for feature_name, common_stats in accumulator.items():
            # Construct the FeatureNameStatistics proto from the partial
            # common stats.
            feature_stats_proto = _make_feature_stats_proto(
                common_stats, feature_name, self._quantiles_combiner,
                feature_name in self._categorical_features)
            # Copy the constructed FeatureNameStatistics proto into the
            # DatasetFeatureStatistics proto.
            new_feature_stats_proto = result.features.add()
            new_feature_stats_proto.CopyFrom(feature_stats_proto)
        return result