예제 #1
0
    def test_accuracy_after_compact(self, max_num_elements, eps,
                                    num_quantiles):
        """Checks quantile accuracy after a split / compact / merge cycle.

        The weighted input is split across three sketches. Each sketch is
        compacted, the sketches are merged pairwise (compacting again after
        every merge), and the final quantiles are compared to the exact CDF.

        Args:
          max_num_elements: Number of values generated; also passed as the
            `max_num_elements` argument of every sketch.
          eps: Error tolerance handed to the sketches and to the accuracy
            assertion.
          num_quantiles: `GetQuantiles` is called with `num_quantiles - 1`.
        """
        # Values run max_num_elements - 1 .. 0 while weights run
        # 0 .. max_num_elements - 1, so value v carries weight
        # max_num_elements - 1 - v.
        values = pa.array(reversed(range(max_num_elements)))
        weights = pa.array(range(max_num_elements))
        total_weight = (max_num_elements - 1) * max_num_elements / 2

        def cdf(x):
            # Closed-form sum of the weights of all values <= x, normalized.
            return ((2 * (max_num_elements - 1) - x) * (x + 1) / 2
                    / total_weight)

        head, middle, tail = (
            sketches.QuantilesSketch(eps, max_num_elements, 1)
            for _ in range(3))

        # Split points: first tenth, up to the first third, and the rest.
        first_cut = max_num_elements // 10
        second_cut = max_num_elements // 3
        _add_values(head, values[:first_cut], weights[:first_cut])
        _add_values(middle, values[first_cut:second_cut],
                    weights[first_cut:second_cut])
        _add_values(tail, values[second_cut:], weights[second_cut:])

        # Fold tail into middle, then middle into head, compacting before
        # and after each merge.
        middle.Compact()
        tail.Compact()
        middle.Merge(tail)
        middle.Compact()
        head.Compact()
        head.Merge(middle)
        head.Compact()

        merged_quantiles = head.GetQuantiles(num_quantiles - 1).to_pylist()
        self.assert_quantiles_accuracy(merged_quantiles[0], cdf, eps)
예제 #2
0
 def __init__(self,
              invalidate=False,
              num_in_vocab_tokens: int = 0,
              total_num_tokens: int = 0,
              sum_in_vocab_token_lengths: int = 0,
              num_examples: int = 0) -> None:
     """Sets up the counters and sketches tracked for a feature.

     Args:
       invalidate: True only if this feature should never be considered,
         e.g. some value_lists have inconsistent types or the feature
         doesn't have an NL domain.
       num_in_vocab_tokens: Initial value of `num_in_vocab_tokens`.
       total_num_tokens: Initial value of `total_num_tokens`.
       sum_in_vocab_token_lengths: Initial value of
         `sum_in_vocab_token_lengths`.
       num_examples: Initial value of `num_examples`.
     """

     def _new_quantiles_sketch():
         # Both quantiles sketches share the same module-level settings.
         return sketches.QuantilesSketch(_QUANTILES_SKETCH_ERROR,
                                         _QUANTILES_SKETCH_NUM_ELEMENTS,
                                         _QUANTILES_SKETCH_NUM_STREAMS)

     # Plain counters seeded from the constructor arguments.
     self.invalidate = invalidate
     self.num_in_vocab_tokens = num_in_vocab_tokens
     self.total_num_tokens = total_num_tokens
     self.sum_in_vocab_token_lengths = sum_in_vocab_token_lengths
     self.num_examples = num_examples

     # Token-length and sequence-length distributions; the sequence-length
     # bounds are unset until assigned elsewhere.
     self.vocab_token_length_quantiles = _new_quantiles_sketch()
     self.min_sequence_length = self.max_sequence_length = None
     self.sequence_length_quantiles = _new_quantiles_sketch()

     # Token occurrence counts and per-token statistics.
     self.token_occurrence_counts = sketches.MisraGriesSketch(
         _NUM_MISRAGRIES_SKETCH_BUCKETS)
     self.token_statistics = collections.defaultdict(_TokenStats)

     # Two independent, initially empty result lists.
     (self.reported_sequences_coverage,
      self.reported_sequences_avg_token_length) = [], []
예제 #3
0
  def test_quantiles_sketch_init(self):
    """Verifies that QuantilesSketch validates its constructor arguments."""
    # Each case pairs positional constructor arguments with the regex the
    # resulting RuntimeError message must match.
    invalid_cases = (
        ((0, 1 << 32, 1), "eps must be positive"),
        ((0.0001, 0, 1), "max_num_elements must be >= 1."),
        ((0.0001, 1 << 32, 0), "num_streams must be >= 1."),
    )
    for ctor_args, error_regex in invalid_cases:
      with self.assertRaisesRegex(RuntimeError, error_regex):
        sketches.QuantilesSketch(*ctor_args)

    # A fully valid configuration must construct without raising.
    sketches.QuantilesSketch(0.0001, 1 << 32, 1)
예제 #4
0
    def test_merge(self, values, expected, num_streams, weights=None):
        """Checks quantiles after merging two pickled-and-restored sketches.

        The first half of the input goes into one sketch and the second half
        into another. Both are passed through `_pickle_roundtrip`, merged,
        and the merged quantiles are compared with `expected`.

        Args:
          values: Sequence of inputs, each added with one `_add_values` call.
          expected: Expected `GetQuantiles(...).to_pylist()` result; its
            first entry determines how many quantiles are requested.
          num_streams: `num_streams` argument for both sketches.
          weights: Optional sequence of weights paired with `values`; when
            omitted, every input is added with a weight of None.
        """
        if weights is None:
            weights = [None] * len(values)

        # Values and weights are each split at their own midpoint.
        value_split = len(values) // 2
        weight_split = len(weights) // 2
        halves = (
            (values[:value_split], weights[:weight_split]),
            (values[value_split:], weights[weight_split:]),
        )

        partial_sketches = []
        for half_values, half_weights in halves:
            sketch = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
            for value, weight in zip(half_values, half_weights):
                _add_values(sketch, value, weight)
            partial_sketches.append(sketch)

        # Round-trip only after both sketches are fully populated.
        first, second = (_pickle_roundtrip(s) for s in partial_sketches)
        first.Merge(second)

        num_quantiles = len(expected[0]) - 1
        merged_quantiles = first.GetQuantiles(num_quantiles).to_pylist()
        np.testing.assert_almost_equal(expected, merged_quantiles)
예제 #5
0
    def test_quantiles(self, values, expected, num_streams, weights=None):
        """Checks the quantiles of a sketch populated with `values`.

        Args:
          values: Sequence of inputs, each added with one `_add_values` call.
          expected: Expected `GetQuantiles(...).to_pylist()` result; its
            first entry determines how many quantiles are requested.
          num_streams: `num_streams` argument for the sketch.
          weights: Optional sequence of weights paired with `values`; when
            omitted, every input is added with a weight of None.
        """
        sketch = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
        paired_weights = (
            [None] * len(values) if weights is None else weights)
        for batch, batch_weight in zip(values, paired_weights):
            _add_values(sketch, batch, batch_weight)

        num_quantiles = len(expected[0]) - 1
        actual = sketch.GetQuantiles(num_quantiles).to_pylist()
        np.testing.assert_almost_equal(expected, actual)
예제 #6
0
 def test_pickle(self, values, expected, num_streams, weights=None):
     """Checks that a populated sketch survives pickling.

     The sketch must pickle to `bytes`, unpickle to a `QuantilesSketch`,
     and the restored sketch must yield the expected quantiles.

     Args:
       values: Sequence of inputs, each added with one `_add_values` call.
       expected: Expected `GetQuantiles(...).to_pylist()` result; its first
         entry determines how many quantiles are requested.
       num_streams: `num_streams` argument for the sketch.
       weights: Optional sequence of weights paired with `values`; when
         omitted, every input is added with a weight of None.
     """
     original = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
     paired_weights = [None] * len(values) if weights is None else weights
     for batch, batch_weight in zip(values, paired_weights):
         _add_values(original, batch, batch_weight)

     serialized = pickle.dumps(original)
     self.assertIsInstance(serialized, bytes)

     restored = pickle.loads(serialized)
     self.assertIsInstance(restored, sketches.QuantilesSketch)

     num_quantiles = len(expected[0]) - 1
     actual = restored.GetQuantiles(num_quantiles).to_pylist()
     np.testing.assert_almost_equal(expected, actual)
예제 #7
0
    def test_accuracy(self, max_num_elements, eps, num_quantiles):
        """Checks quantile accuracy of one sketch against the exact CDF.

        Args:
          max_num_elements: Number of values generated; also passed as the
            sketch's `max_num_elements` argument.
          eps: Error tolerance handed to the sketch and to the accuracy
            assertion.
          num_quantiles: `GetQuantiles` is called with `num_quantiles - 1`.
        """
        largest = max_num_elements - 1
        # Values run largest .. 0 while weights run 0 .. largest, so value v
        # carries weight largest - v.
        values = pa.array(reversed(range(max_num_elements)))
        weights = pa.array(range(max_num_elements))
        total_weight = largest * max_num_elements / 2

        def cdf(x):
            # Closed-form sum of the weights of all values <= x, normalized.
            return (2 * largest - x) * (x + 1) / 2 / total_weight

        sketch = sketches.QuantilesSketch(eps, max_num_elements, 1)
        _add_values(sketch, values, weights)

        all_quantiles = sketch.GetQuantiles(num_quantiles - 1).to_pylist()
        self.assert_quantiles_accuracy(all_quantiles[0], cdf, eps)
예제 #8
0
    def test_compact(self, values, expected, num_streams, weights=None):
        """Checks quantiles when the sketch is compacted between additions.

        Half of the input is added, the sketch is compacted, the remaining
        half is added, and the sketch is compacted again before the
        quantiles are compared with `expected`.

        Args:
          values: Sequence of inputs, each added with one `_add_values` call.
          expected: Expected `GetQuantiles(...).to_pylist()` result; its
            first entry determines how many quantiles are requested.
          num_streams: `num_streams` argument for the sketch.
          weights: Optional sequence of weights paired with `values`; when
            omitted, every input is added with a weight of None.
        """
        num_values = len(values)
        if weights is None:
            weights = [None] * num_values
        sketch = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)

        # Both values and weights are split at the midpoint of `values`.
        midpoint = num_values // 2
        for segment in (slice(None, midpoint), slice(midpoint, None)):
            for value, weight in zip(values[segment], weights[segment]):
                _add_values(sketch, value, weight)
            # Compact once after each half has been added.
            sketch.Compact()

        num_quantiles = len(expected[0]) - 1
        actual = sketch.GetQuantiles(num_quantiles).to_pylist()
        np.testing.assert_almost_equal(expected, actual)
  def __init__(
      self,  # pylint: disable=useless-super-delegation
      name: Text = 'BasicStatsGenerator',
      schema: Optional[schema_pb2.Schema] = None,
      example_weight_map: ExampleWeightMap = ExampleWeightMap(),
      num_values_histogram_buckets: Optional[int] = 10,
      num_histogram_buckets: Optional[int] = 10,
      num_quantiles_histogram_buckets: Optional[int] = 10,
      epsilon: Optional[float] = 0.01) -> None:
    """Initializes basic statistics generator.

    Args:
      name: An optional unique name associated with the statistics generator.
      schema: An optional schema for the dataset. When omitted, the sets of
          bytes features and categorical numeric features are empty.
      example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
          corresponding weight column.
      num_values_histogram_buckets: An optional number of buckets in a quantiles
          histogram for the number of values per Feature, which is stored in
          CommonStatistics.num_values_histogram.
      num_histogram_buckets: An optional number of buckets in a standard
          NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: An optional number of buckets in a
          quantiles NumericStatistics.histogram.
      epsilon: An optional error tolerance for the computation of quantiles,
          typically a small fraction close to zero (e.g. 0.01). It is passed
          as `eps` to every quantiles sketch this generator creates. Higher
          values of epsilon increase the quantile approximation error, and
          hence result in more unequal buckets, but could improve performance
          and resource consumption.
    """
    super(BasicStatsGenerator, self).__init__(name, schema)

    # Feature sets derived from the schema; both are empty without a schema.
    self._bytes_features = set(
        schema_util.get_bytes_features(schema) if schema else [])
    self._categorical_features = set(
        schema_util.get_categorical_numeric_features(schema) if schema else [])
    self._example_weight_map = example_weight_map
    self._num_values_histogram_buckets = num_values_histogram_buckets
    self._num_histogram_buckets = num_histogram_buckets
    self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

    # Zero-argument factory: each call builds a new single-stream quantiles
    # sketch with the configured epsilon and max_num_elements fixed at 1 << 32.
    self._make_quantiles_sketch_fn = lambda: sketches.QuantilesSketch(  # pylint: disable=g-long-lambda
        eps=epsilon,
        max_num_elements=1 << 32,
        num_streams=1)