# Exemplo n.º 1
# 0
 def test_generate_equi_width_histogram(self):
     """Checks the equi-width histogram built from precomputed quantiles."""
     expected = text_format.Parse(
         """
     buckets {
       low_value: 0
       high_value: 8.0
       sample_count: 7.8
     }
     buckets {
       low_value: 8.0
       high_value: 16.0
       sample_count: 4.8
     }
     buckets {
       low_value: 16.0
       high_value: 24.0
       sample_count: 5.4
     }
     type: STANDARD
     """, statistics_pb2.Histogram())
     actual = quantiles_util.generate_equi_width_histogram(
         quantiles=np.array([1, 5, 10, 15, 20], dtype=np.float32),
         min_val=0,
         max_val=24.0,
         total_count=18,
         num_buckets=3)
     self.assertEqual(actual, expected)
def generate_quantiles_histogram(quantiles: np.ndarray, total_count: float,
                                 num_buckets: int) -> statistics_pb2.Histogram:
    """Generates a quantiles histogram from the quantile boundaries.

  Args:
    quantiles: A numpy array containing the quantile boundaries.
    total_count: The total number of values over which the quantiles
        are computed.
    num_buckets: The required number of buckets in the quantiles histogram.

  Returns:
    A statistics_pb2.Histogram proto of type QUANTILES.
  """
    result = statistics_pb2.Histogram()
    result.type = statistics_pb2.Histogram.QUANTILES

    quantiles = list(quantiles)
    # We assume that the number of quantiles is at least the required number of
    # buckets in the quantiles histogram.
    assert len(quantiles) - 1 >= num_buckets

    # Sample count carried by each quantile interval.
    current_sample_count = float(total_count / (len(quantiles) - 1))
    # Sample count each output bucket must contain.
    required_sample_count = float(total_count / num_buckets)

    # Start of the current bucket.
    bucket_start = quantiles[0]
    # Sample count accumulated into the current bucket so far.
    running_sample_count = 0
    # Iterate to create the first num_buckets - 1 buckets.
    # NOTE: builtin range used instead of six.moves.range for consistency with
    # the other helpers in this module (Python 3 only).
    for i in range(len(quantiles) - 1):
        if running_sample_count + current_sample_count >= required_sample_count:
            # Sample count needed to finish the current bucket.
            needed_sample_count = required_sample_count - running_sample_count
            # Compute width of the current bucket based on the needed sample
            # count.  We assume the samples are uniformly distributed within
            # an interval.
            width = ((quantiles[i + 1] - quantiles[i]) * needed_sample_count /
                     current_sample_count)

            result.buckets.add(low_value=bucket_start,
                               high_value=quantiles[i] + width,
                               sample_count=required_sample_count)

            # Add any carried over sample count for the next bucket.
            running_sample_count = current_sample_count - needed_sample_count
            # Fix the start of the next bucket.
            bucket_start = quantiles[i] + width

            if len(result.buckets) == num_buckets - 1:
                break
        else:
            running_sample_count += current_sample_count

    # Add the last bucket, which always extends to the final boundary.
    result.buckets.add(low_value=bucket_start,
                       high_value=quantiles[-1],
                       sample_count=required_sample_count)

    return result
# Exemplo n.º 3
# 0
 def test_generate_quantiles_histogram_diff_num_buckets_non_multiple(self):
     """Quantiles histogram when the quantiles do not divide evenly."""
     expected = text_format.Parse(
         """
     buckets {
       low_value: 1.0
       high_value: 76.0
       sample_count: 75.0
     }
     buckets {
       low_value: 76.0
       high_value: 151.0
       sample_count: 75.0
     }
     buckets {
       low_value: 151.0
       high_value: 226.0
       sample_count: 75.0
     }
     buckets {
       low_value: 226.0
       high_value: 300.0
       sample_count: 75.0
     }
     type: QUANTILES
     """, statistics_pb2.Histogram())
     actual = quantiles_util.generate_quantiles_histogram(
         quantiles=np.array([61.0, 121.0, 181.0, 241.0], dtype=np.float32),
         min_val=1.0,
         max_val=300.0,
         total_count=300.0,
         num_buckets=4)
     self.assertEqual(actual, expected)
def generate_equi_width_histogram(
        quantiles: np.ndarray, total_count: float,
        num_buckets: int) -> statistics_pb2.Histogram:
    """Generates an equi-width histogram from the quantile boundaries.

  The equi-width histogram is derived from the quantiles: a large number of
  quantiles is computed, and the density of each fixed-width bucket is
  obtained by aggregating the densities of the quantile intervals that fall
  within it.  This assumes the number of quantiles is much higher than the
  required number of buckets in the equi-width histogram.

  Args:
    quantiles: A numpy array containing the quantile boundaries.
    total_count: The total number of values over which the quantiles
        are computed.
    num_buckets: The required number of buckets in the equi-width histogram.

  Returns:
    A statistics_pb2.Histogram proto of type STANDARD.
  """
    histogram = statistics_pb2.Histogram()
    histogram.type = statistics_pb2.Histogram.STANDARD
    # Delegate the bucket computation, then copy each bucket into the proto.
    for info in generate_equi_width_buckets(list(quantiles), total_count,
                                            num_buckets):
        histogram.buckets.add(low_value=info.low_value,
                              high_value=info.high_value,
                              sample_count=info.sample_count)

    return histogram
# Exemplo n.º 5
# 0
 def test_generate_quantiles_histogram(self):
   """Quantiles histogram when the quantiles are an exact multiple."""
   expected = text_format.Parse(
       """
        buckets {
          low_value: 1.0
          high_value: 61.0
          sample_count: 60.0
        }
        buckets {
          low_value: 61.0
          high_value: 121.0
          sample_count: 60.0
        }
        buckets {
          low_value: 121.0
          high_value: 181.0
          sample_count: 60.0
        }
        buckets {
          low_value: 181.0
          high_value: 241.0
          sample_count: 60.0
        }
        buckets {
          low_value: 241.0
          high_value: 300.0
          sample_count: 60.0
        }
        type: QUANTILES
        """, statistics_pb2.Histogram())
   actual = quantiles_util.generate_quantiles_histogram(
       quantiles=np.array(
           [1.0, 61.0, 121.0, 181.0, 241.0, 300.0], dtype=np.float32),
       total_count=300.0, num_buckets=5)
   self.assertEqual(actual, expected)
# Exemplo n.º 6
# 0
def generate_quantiles_histogram(quantiles: np.ndarray, total_count: float,
                                 num_buckets: int) -> statistics_pb2.Histogram:
    """Generates a quantiles histogram from the quantile boundaries.

  Args:
    quantiles: A numpy array containing the quantile boundaries.
    total_count: The total number of values over which the quantiles
        are computed.
    num_buckets: The required number of buckets in the quantiles histogram.

  Returns:
    A statistics_pb2.Histogram proto of type QUANTILES.
  """
    histogram = statistics_pb2.Histogram()
    histogram.type = statistics_pb2.Histogram.QUANTILES

    boundaries = list(quantiles)
    num_intervals = len(boundaries) - 1
    # The number of quantile intervals must be a multiple of the requested
    # bucket count so that whole intervals can be merged into each bucket.
    assert num_intervals % num_buckets == 0

    # Number of consecutive quantile intervals merged into one bucket.
    step = int(num_intervals / num_buckets)
    # Sample count carried by each merged bucket: per-interval count * step.
    merged_count = float(total_count / num_intervals) * step

    # Emit one bucket per group of `step` consecutive intervals.
    for start in range(0, num_intervals, step):
        histogram.buckets.add(low_value=boundaries[start],
                              high_value=boundaries[start + step],
                              sample_count=merged_count)

    return histogram
# Exemplo n.º 7
# 0
 def test_nl_generator_token_stats(self):
     """Tests generator calculation of token statistics."""
     with tempfile.NamedTemporaryFile() as vocab_file:
         vocab_file.write(b'Foo\nBar\n')
         vocab_file.flush()
         input_batches = [pa.array([[0, 1, 0], [1, 0, 0]])]
         generator = nlsg.NLStatsGenerator(
             schema=self._schema,
             vocab_paths={'my_vocab': vocab_file.name},
             num_quantiles_histogram_buckets=0,
             num_rank_histogram_buckets=0,
             num_histogram_buckets=3)
         expected_reported_sequences = [['Foo', 'Bar', 'Foo'],
                                        ['Bar', 'Foo', 'Foo']] * 2
         # Expected position histogram for the integer token 1.
         token_1_histogram = statistics_pb2.Histogram()
         for low, high, count in [(0, float(1) / 3, 1),
                                  (float(1) / 3, float(2) / 3, 1)]:
             token_1_histogram.buckets.add(low_value=low,
                                           high_value=high,
                                           sample_count=count)
         # Expected position histogram for the string token 'Foo'.
         token_foo_histogram = statistics_pb2.Histogram()
         for low, high, count in [(0, float(1) / 3, 1),
                                  (float(1) / 3, float(2) / 3, 1),
                                  (float(2) / 3, 1, 2)]:
             token_foo_histogram.buckets.add(low_value=low,
                                             high_value=high,
                                             sample_count=count)
         expected_token_stats = {
             1: (2, 1.0, 1, 1, 1, token_1_histogram),
             'Foo': (4, 1.0, 2, 2, 2, token_foo_histogram)
         }
         self.assertCombinerOutputEqual(
             input_batches, generator,
             self._create_expected_feature_name_statistics(
                 feature_coverage=1.0,
                 avg_token_length=3,
                 min_sequence_length=3,
                 max_sequence_length=3,
                 reported_sequences=expected_reported_sequences,
                 token_statistics=expected_token_stats),
             self._int_nlp_feature_with_vocab_and_token_constraints_path)
# Exemplo n.º 8
# 0
def generate_quantiles_histogram(quantiles, min_val, max_val, total_count):
    """Generates a quantiles histogram from the quantile boundaries.

  Args:
    quantiles: A numpy array containing the quantile boundaries.
    min_val: The minimum value among all values over which the quantiles
        are computed.
    max_val: The maximum value among all values over which the quantiles
        are computed.
    total_count: The total number of values over which the quantiles
        are computed.

  Returns:
    A statistics_pb2.Histogram proto of type QUANTILES.
  """
    histogram = statistics_pb2.Histogram()
    histogram.type = statistics_pb2.Histogram.QUANTILES
    # The quantiles combiner returns only internal boundaries, so the final
    # histogram has quantiles.size + 1 buckets once the leading and trailing
    # buckets are added; every bucket carries an equal share of the count.
    per_bucket_count = total_count / (quantiles.size + 1)

    # Leading bucket: (min_val, first internal boundary).
    histogram.buckets.add(low_value=min_val,
                          high_value=quantiles[0],
                          sample_count=per_bucket_count)

    # Interior buckets between consecutive internal boundaries.
    for low, high in zip(quantiles[:-1], quantiles[1:]):
        histogram.buckets.add(low_value=low,
                              high_value=high,
                              sample_count=per_bucket_count)

    # Trailing bucket: (last internal boundary, max_val).
    histogram.buckets.add(low_value=quantiles[-1],
                          high_value=max_val,
                          sample_count=per_bucket_count)

    return histogram
# Exemplo n.º 9
# 0
def generate_quantiles_histogram(quantiles, min_val, max_val, total_count,
                                 num_buckets):
    """Generates a quantiles histogram from the quantile boundaries.

  Args:
    quantiles: A numpy array containing the quantile boundaries.
    min_val: The minimum value among all values over which the quantiles
        are computed.
    max_val: The maximum value among all values over which the quantiles
        are computed.
    total_count: The total number of values over which the quantiles
        are computed.
    num_buckets: The required number of buckets in the quantiles histogram.

  Returns:
    A statistics_pb2.Histogram proto of type QUANTILES.
  """
    histogram = statistics_pb2.Histogram()
    histogram.type = statistics_pb2.Histogram.QUANTILES

    # The quantiles combiner returns only internal boundaries, so bracket
    # them with the overall min and max.
    boundaries = list(quantiles)
    boundaries.insert(0, min_val)  # Prepend min_val.
    boundaries.append(max_val)  # Append max_val.

    # There must be at least as many quantile intervals as requested buckets.
    assert len(boundaries) - 1 >= num_buckets

    # Sample count carried by each quantile interval.
    interval_count = float(total_count / (len(boundaries) - 1))
    # Sample count each output bucket must contain.
    target_count = float(total_count / num_buckets)

    # Left edge of the bucket currently being filled.
    bucket_low = min_val
    # Sample count accumulated into the current bucket so far.
    accumulated = 0
    # Emit the first num_buckets - 1 buckets.  A bucket closes as soon as it
    # has absorbed target_count samples; the interval that overflows it is
    # split proportionally (samples are assumed uniform within an interval).
    for idx in range(len(boundaries) - 1):
        if accumulated + interval_count >= target_count:
            # Portion of this interval needed to finish the current bucket.
            deficit = target_count - accumulated
            # Width of the current bucket's share of the interval.
            split = ((boundaries[idx + 1] - boundaries[idx]) * deficit /
                     interval_count)

            histogram.buckets.add(low_value=bucket_low,
                                  high_value=boundaries[idx] + split,
                                  sample_count=target_count)

            # Whatever is left of the interval seeds the next bucket.
            accumulated = interval_count - deficit
            # The next bucket starts where this one ended.
            bucket_low = boundaries[idx] + split

            if len(histogram.buckets) == num_buckets - 1:
                break
        else:
            accumulated += interval_count

    # The final bucket always extends to max_val.
    histogram.buckets.add(low_value=bucket_low,
                          high_value=max_val,
                          sample_count=target_count)

    return histogram