예제 #1
0
def generate_partial_statistics_in_memory(
    examples,
    options,
    stats_generators
):
  """Computes partial statistics over a list of examples held in memory.

  The examples are merged into one batch. If `options.feature_whitelist` is
  set, the batch is narrowed to those features. Every generator then receives
  the batch through a freshly created accumulator.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.
    stats_generators: A list of statistics generators.

  Returns:
    A list of accumulators containing partial statistics, one per generator
    and in the same order as `stats_generators`.
  """
  merged_batch = batch_util.merge_single_batch(examples)

  # Restrict the batch to the whitelisted features when a whitelist is given.
  whitelist = options.feature_whitelist
  if whitelist:
    merged_batch = {name: merged_batch[name] for name in whitelist}

  partial_stats = []
  for generator in stats_generators:
    partial_stats.append(
        generator.add_input(  # pytype: disable=attribute-error
            generator.create_accumulator(), merged_batch))
  return partial_stats
예제 #2
0
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Computes statistics for a list of examples held in memory.

    The generators returned by `_get_generators` are each run over a single
    batch merged from `examples`. Their outputs are merged into one proto and
    wrapped in a statistics list.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics.

    Returns:
      A DatasetFeatureStatisticsList proto.
    """
    stats_generators = _get_generators(options, in_memory=True)
    merged_batch = batch_util.merge_single_batch(examples)

    # Restrict the batch to the whitelisted features when a whitelist is given.
    whitelist = options.feature_whitelist
    if whitelist:
        merged_batch = {name: merged_batch[name] for name in whitelist}

    # The type checker raises a false positive below because the type hint for
    # the return value of _get_generators is StatsGenerator, but add_input,
    # create_accumulator, and extract_output can be called only on
    # CombinerStatsGenerators.
    generator_outputs = []
    for generator in stats_generators:
        generator_outputs.append(
            generator.extract_output(  # pytype: disable=attribute-error
                generator.add_input(  # pytype: disable=attribute-error
                    generator.create_accumulator(), merged_batch)))

    return _make_dataset_feature_statistics_list_proto(
        [_merge_dataset_feature_stats_protos(generator_outputs)])
예제 #3
0
def generate_partial_statistics_in_memory(
    examples,
    options,
    stats_generators
):
  """Computes partial statistics over a list of examples held in memory.

  The merged batch is built lazily, on the first generator, so nothing is
  merged when `stats_generators` is empty. If `options.feature_whitelist` is
  set, the batch is narrowed to those features.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.
    stats_generators: A list of statistics generators.

  Returns:
    A list of accumulators containing partial statistics, one per generator
    and in the same order as `stats_generators`.

  Raises:
    TypeError: If a generator is not a stats_generator.CombinerStatsGenerator.
  """
  partial_stats = []
  merged_batch = None
  for generator in stats_generators:
    # Reject unsupported generators before doing any work for them.
    if not isinstance(generator, stats_generator.CombinerStatsGenerator):
      raise TypeError('Only stats_generator.CombinerStatsGenerator is '
                      'expected for now')

    if merged_batch is None:
      merged_batch = batch_util.merge_single_batch(examples)
      # Restrict the batch to the whitelisted features when one is given.
      whitelist = options.feature_whitelist
      if whitelist:
        merged_batch = {name: merged_batch[name] for name in whitelist}

    partial_stats.append(
        generator.add_input(generator.create_accumulator(), merged_batch))
  return partial_stats
예제 #4
0
  def _maybe_do_batch(
      self,
      accumulator,
      force=False):
    """Flushes the buffered examples into the partial accumulators if due.

    A flush happens when the accumulator holds at least
    `self._desired_batch_size` examples, or when `force` is set and at least
    one example is buffered. On a flush the partial accumulators are replaced
    with the updated ones and the example buffer is emptied in place.

    Args:
      accumulator: Accumulator. Will be updated in place.
      force: Force computation of stats even if accumulator has less examples
        than the batch size.
    """
    num_buffered = len(accumulator.input_examples)
    flush_forced = force and num_buffered > 0
    if not (flush_forced or num_buffered >= self._desired_batch_size):
      return

    self._combine_add_input_batch_size.update(num_buffered)

    # The merged batch is only built when a generator consumes example batches.
    if self._has_example_batch_combiner_generator:
      merged_batch = batch_util.merge_single_batch(accumulator.input_examples)
    else:
      merged_batch = None

    def _feed_batch(gen, gen_accumulator):
      if not isinstance(gen, stats_generator.CombinerStatsGenerator):
        raise TypeError('Only stats_generator.CombinerStatsGenerator is '
                        'expected for now')
      return gen.add_input(gen_accumulator, merged_batch)

    accumulator.partial_accumulators = self._for_each_generator(
        _feed_batch, accumulator.partial_accumulators)
    # Empty the buffer in place so the same list object keeps being reused.
    del accumulator.input_examples[:]
예제 #5
0
def generate_statistics_in_memory(
    examples,
    options=stats_options.StatsOptions()
):
  """Computes statistics for a list of examples held in memory.

  The default generators are extended with any custom generators found in
  `options.generators`. Each generator is then run over a single batch merged
  from `examples`, and the outputs are merged into one result.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.

  Raises:
    TypeError: If a generator in `options.generators` is not a
      CombinerStatsGenerator.
  """
  stats_generators = _get_default_generators(options, in_memory=True)

  custom_generators = options.generators
  if custom_generators is not None:
    for candidate in custom_generators:
      # Only combiner generators can be run in memory.
      if not isinstance(candidate, stats_generator.CombinerStatsGenerator):
        raise TypeError('Statistics generator used in '
                        'generate_statistics_in_memory must '
                        'extend CombinerStatsGenerator, found object of type '
                        '%s.' %
                        candidate.__class__.__name__)
      stats_generators.append(candidate)

  merged_batch = batch_util.merge_single_batch(examples)

  # Restrict the batch to the whitelisted features when a whitelist is given.
  whitelist = options.feature_whitelist
  if whitelist:
    merged_batch = {name: merged_batch[name] for name in whitelist}

  # The type checker raises a false positive below because the type hint for
  # the return value of _get_default_generators is StatsGenerator, but
  # add_input, create_accumulator, and extract_output can be called only on
  # CombinerStatsGenerators.
  generator_outputs = []
  for generator in stats_generators:
    generator_outputs.append(
        generator.extract_output(  # pytype: disable=attribute-error
            generator.add_input(  # pytype: disable=attribute-error
                generator.create_accumulator(), merged_batch)))

  return _make_dataset_feature_statistics_list_proto(
      _merge_dataset_feature_stats_protos(generator_outputs))
예제 #6
0
 def test_merge_single_batch(self):
   """Checks that merge_single_batch pivots examples into per-feature lists.

   Each input example is a dict mapping a feature name to a numpy array. The
   expected merged batch has one entry per feature seen in any example, with
   one slot per example; the slot is None when the example lacks the feature.
   """
   # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
   examples = [
       {
           'a': np.array([1.0, 2.0]),
           'b': np.array(['a', 'b', 'c', 'e'])
       },
       {
           'a': np.array([3.0, 4.0, np.nan, 5.0]),
       },
       {
           'b': np.array(['d', 'e', 'f']),
           'd': np.array([10, 20, 30]),
       },
       {
           'b': np.array(['a', 'b', 'c'])
       },
       {
           'c': np.array(['d', 'e', 'f'])
       }
   ]
   expected_batch = {
       'a': [np.array([1.0, 2.0]), np.array([3.0, 4.0, np.nan, 5.0]),
             None, None, None],
       'b': [np.array(['a', 'b', 'c', 'e']), None, np.array(['d', 'e', 'f']),
             np.array(['a', 'b', 'c']), None],
       'c': [None, None, None, None, np.array(['d', 'e', 'f'])],
       'd': [None, None, np.array([10, 20, 30]), None, None]
   }
   actual_batch = batch_util.merge_single_batch(examples)
   # check number of features.
   self.assertLen(actual_batch, len(expected_batch))
   for feature_name, expected_values in expected_batch.items():
     # Fail with a clear message, not a KeyError, when a feature is missing.
     self.assertIn(feature_name, actual_batch)
     actual_values = actual_batch[feature_name]
     # check batch size.
     self.assertLen(actual_values, len(expected_values))
     for actual_value, expected_value in zip(actual_values, expected_values):
       if expected_value is None:
         # Identity check: comparing an array to None with == is elementwise.
         self.assertIsNone(actual_value)
       else:
         # check dtype.
         self.assertEqual(actual_value.dtype, expected_value.dtype)
         # check numpy array (assert_array_equal treats NaNs as equal).
         np.testing.assert_array_equal(actual_value, expected_value)
예제 #7
0
    def _maybe_do_batch(self, accumulator, force=False):
        """Flushes the buffered examples into the partial accumulator if due.

        A flush happens when the accumulator holds at least
        `self._desired_batch_size` examples, or when `force` is set and at
        least one example is buffered. On a flush the buffered examples are
        merged into one batch and added to the partial accumulator, and the
        example buffer is emptied in place.

        Args:
          accumulator: Accumulator. Will be updated in place.
          force: Force computation of stats even if accumulator has less
            examples than the batch size.
        """
        num_buffered = len(accumulator.input_examples)
        flush_forced = force and num_buffered > 0
        if not (flush_forced or num_buffered >= self._desired_batch_size):
            return

        self._combine_add_input_batch_size.update(num_buffered)
        accumulator.partial_accumulator = self._generator.add_input(
            accumulator.partial_accumulator,
            batch_util.merge_single_batch(accumulator.input_examples))
        # Clear processed examples in place so the same list is reused.
        del accumulator.input_examples[:]
예제 #8
0
def _process_partition(partition, stats_fn):
    """Computes statistics for the examples of a single partition.

    Args:
      partition: A pair of (key, examples), where key is itself a pair whose
        first element is the slice key; the second element is ignored.
      stats_fn: An object whose `compute` method is called with the batch
        merged from the partition's examples.

    Returns:
      A (slice_key, result) tuple, where result is what `stats_fn.compute`
      returned.
    """
    partition_key, examples = partition
    slice_key, _ = partition_key
    partition_stats = stats_fn.compute(
        batch_util.merge_single_batch(examples))
    return slice_key, partition_stats
예제 #9
0
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Computes statistics for a list of examples held in memory.

    Four built-in combiner generators are configured from `options`: common,
    numeric, string and top-k/uniques. Any custom generators found in
    `options.generators` are added after them. Each generator is then run
    over a single batch merged from `examples`, and the outputs are merged
    into one result.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics.

    Returns:
      A DatasetFeatureStatisticsList proto.

    Raises:
      TypeError: If a generator in `options.generators` is not a
        CombinerStatsGenerator.
    """
    common_stats = common_stats_generator.CommonStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_values_histogram_buckets=options.num_values_histogram_buckets,
        epsilon=options.epsilon)
    numeric_stats = numeric_stats_generator.NumericStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_histogram_buckets=options.num_histogram_buckets,
        num_quantiles_histogram_buckets=(
            options.num_quantiles_histogram_buckets),
        epsilon=options.epsilon)
    string_stats = string_stats_generator.StringStatsGenerator(
        schema=options.schema)
    top_k_uniques_stats = (
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
    stats_generators = [
        common_stats,
        numeric_stats,
        string_stats,
        top_k_uniques_stats,
    ]

    custom_generators = options.generators
    if custom_generators is not None:
        for candidate in custom_generators:
            # Only combiner generators can be run in memory.
            if not isinstance(candidate,
                              stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of type '
                    '%s.' % candidate.__class__.__name__)
            stats_generators.append(candidate)

    merged_batch = batch_util.merge_single_batch(examples)

    # Restrict the batch to the whitelisted features when one is given.
    whitelist = options.feature_whitelist
    if whitelist:
        merged_batch = {name: merged_batch[name] for name in whitelist}

    generator_outputs = []
    for generator in stats_generators:
        generator_outputs.append(
            generator.extract_output(
                generator.add_input(
                    generator.create_accumulator(), merged_batch)))

    return _make_dataset_feature_statistics_list_proto(
        _merge_dataset_feature_stats_protos(generator_outputs))