Example #1
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile and then batch input examples.
        batched_dataset = (dataset
                           | 'Profile' >> profile_util.Profile()
                           | 'BatchInputs' >> batch_util.BatchExamples())

        # If a set of whitelist features is provided, keep only those features.
        filtered_dataset = batched_dataset
        if self._options.feature_whitelist:
            filtered_dataset = (
                batched_dataset | 'RemoveNonWhitelistedFeatures' >> beam.Map(
                    _filter_features,
                    feature_whitelist=self._options.feature_whitelist))

        return (filtered_dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
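
The expand() above presumably hangs off a beam.PTransform subclass (in TensorFlow Data Validation this is the GenerateStatistics transform). A minimal usage sketch under that assumption, feeding the dict-of-NumPy-array examples this generation of TFDV consumed; tfdv.GenerateStatistics and tfdv.StatsOptions are the public names, though exact signatures vary by version:

import apache_beam as beam
import numpy as np
import tensorflow_data_validation as tfdv

# Sketch: apply the statistics transform to an in-memory PCollection of
# decoded examples (Dict[str, np.ndarray]) and print the single
# DatasetFeatureStatisticsList proto it emits.
with beam.Pipeline() as p:
    _ = (p
         | 'CreateExamples' >> beam.Create([
             {'a': np.array([1.0, 2.0]),
              'b': np.array(['x', 'y'], dtype=object)}])
         | 'GenerateStatistics' >> tfdv.GenerateStatistics(
             tfdv.StatsOptions())
         | 'PrintStats' >> beam.Map(print))
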
Example #2
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = _get_default_generators(self._options)

        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features is provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        result_protos = []
        # Iterate over the stats generators. For each generator,
        #   a) if it is a CombinerStatsGenerator, wrap it as a beam.CombineFn
        #      and run it.
        #   b) if it is a TransformStatsGenerator, wrap it as a beam.PTransform
        #      and run it.
        for generator in stats_generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> beam.CombineGlobally(
                                         _CombineFnWrapper(generator)))
            elif isinstance(generator,
                            stats_generator.TransformStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> generator.ptransform)
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % generator.__class__.__name__)

        # Each stats generator will output a PCollection of DatasetFeatureStatistics
        # protos. We now flatten the list of PCollections into a single PCollection,
        # then merge the DatasetFeatureStatistics protos in the PCollection into a
        # single DatasetFeatureStatisticsList proto.
        return (result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
                | 'MergeDatasetFeatureStatisticsProtos' >>
                beam.CombineGlobally(_merge_dataset_feature_stats_protos)
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
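
The _CombineFnWrapper used above is not shown on this page. A plausible sketch, assuming it simply forwards each beam.CombineFn method to the wrapped CombinerStatsGenerator (whose documented interface has the same four methods):

import apache_beam as beam

class _CombineFnWrapper(beam.CombineFn):
    """Wraps a CombinerStatsGenerator as a beam.CombineFn (sketch)."""

    def __init__(self, generator):
        self._generator = generator

    def create_accumulator(self):
        return self._generator.create_accumulator()

    def add_input(self, accumulator, input_batch):
        return self._generator.add_input(accumulator, input_batch)

    def merge_accumulators(self, accumulators):
        return self._generator.merge_accumulators(accumulators)

    def extract_output(self, accumulator):
        return self._generator.extract_output(accumulator)
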
Example #3
  def test_batch_examples(self):
    examples = [{
        'a': np.array([1.0, 2.0], dtype=np.floating),
        'b': np.array(['a', 'b', 'c', 'e'], dtype=np.object)
    }, {
        'a': np.array([3.0, 4.0, np.NaN, 5.0], dtype=np.floating),
    }, {
        'b': np.array(['d', 'e', 'f'], dtype=np.object),
        'd': np.array([10, 20, 30], dtype=np.integer),
    }, {
        'b': np.array(['a', 'b', 'c'], dtype=np.object)
    }, {
        'c': np.array(['d', 'e', 'f'], dtype=np.object)
    }]

    expected_batched_examples = [{
        'a': np.array([np.array([1.0, 2.0]), np.array([3.0, 4.0, np.NaN, 5.0]),
                       None], dtype=np.object),
        'b': np.array([np.array(['a', 'b', 'c', 'e']), None,
                       np.array(['d', 'e', 'f'])], dtype=np.object),
        'd': np.array([np.NaN, np.NaN, np.array([10, 20, 30])], dtype=np.object)
    }, {
        'b': np.array([np.array(['a', 'b', 'c']), None], dtype=np.object),
        'c': np.array([None, np.array(['d', 'e', 'f'])], dtype=np.object)
    }]

    def _batched_example_equal_fn(expected_batched_examples):
      """Makes a matcher function for comparing batched examples."""
      # TODO(pachristopher): Find out the right way to compare the outcome with
      # the expected output.
      def _matcher(actual_batched_examples):
        self.assertEqual(
            len(actual_batched_examples), len(expected_batched_examples))
        for idx, batched_example in enumerate(actual_batched_examples):
          self.assertCountEqual(batched_example, expected_batched_examples[idx])

      return _matcher

    with beam.Pipeline() as p:
      result = (p
                | beam.Create(examples)
                | batch_util.BatchExamples(desired_batch_size=3))
      util.assert_that(
          result, _batched_example_equal_fn(expected_batched_examples))
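
Note that assertCountEqual over two dicts iterates their keys, so this matcher checks only that each batch has the expected feature names, not that the per-feature arrays match; that limitation is what the TODO refers to, and Example #4 below is a variant of the same test.
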
Example #4
  def test_batch_examples(self):
    examples = [{
        'a': np.array([1.0, 2.0], dtype=np.floating),
        'b': np.array(['a', 'b', 'c', 'e'], dtype=np.object)
    }, {
        'a': np.array([3.0, 4.0, np.NaN, 5.0], dtype=np.floating),
    }, {
        'b': np.array(['d', 'e', 'f'], dtype=np.object),
        'd': np.array([10, 20, 30], dtype=np.integer),
    }, {
        'b': np.array(['a', 'b', 'c'], dtype=np.object)
    }, {
        'c': np.array(['d', 'e', 'f'], dtype=np.object)
    }]

    expected_batched_examples = [{
        'a': np.array([np.array([1.0, 2.0]), np.array([3.0, 4.0, np.NaN, 5.0]),
                       None], dtype=np.object),
        'b': np.array([np.array(['a', 'b', 'c', 'e']), None,
                       np.array(['d', 'e', 'f'])], dtype=np.object),
        'd': np.array([np.NaN, np.NaN, np.array([10, 20, 30])], dtype=np.object)
    }, {
        'b': np.array([np.array(['a', 'b', 'c']), None], dtype=np.object),
        'c': np.array([None, np.array(['d', 'e', 'f'])], dtype=np.object)
    }]

    def _batched_example_equal_fn(expected_batched_examples):
      """Makes a matcher function for comparing batched examples."""
      def _matcher(actual_batched_examples):
        sorted_result = sorted(actual_batched_examples)
        sorted_expected_result = sorted(expected_batched_examples)
        self.assertEqual(len(sorted_result), len(sorted_expected_result))
        for idx, batched_example in enumerate(sorted_result):
          self.assertEqual(sorted(batched_example),
                           sorted(sorted_expected_result[idx]))
      return _matcher

    with beam.Pipeline() as p:
      result = (p
                | beam.Create(examples)
                | batch_util.BatchExamples(desired_batch_size=3))
      util.assert_that(
          result, _batched_example_equal_fn(expected_batched_examples))
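
A caveat with this variant: sorted() over a list of dicts relies on Python 2 comparison semantics and raises TypeError under Python 3, where a key function (for instance key=lambda d: sorted(d)) would be needed; and as in Example #3, the inner sorted(batched_example) calls compare only feature names, not array contents.
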
Example #5
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile the input examples.
        dataset |= 'ProfileExamples' >> profile_util.Profile()

        # Sample the input data if the sample_count option is provided.
        if self._options.sample_count is not None:
            # beam.combiners.Sample.FixedSizeGlobally returns a
            # PCollection[List[types.Example]], which we then flatten to get a
            # PCollection[types.Example].
            dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
                        beam.combiners.Sample.FixedSizeGlobally(
                            self._options.sample_count)
                        | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
        elif self._options.sample_rate is not None:
            dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate
                        >> beam.FlatMap(_sample_at_rate,
                                        sample_rate=self._options.sample_rate))

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features is provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        return (dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
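
The _sample_at_rate helper used in the sample_rate branch is not shown. A minimal sketch of the likely behavior, emitting each example independently with probability sample_rate (the name and contract are taken from the call site above):

import random

def _sample_at_rate(example, sample_rate):
    # Yield the example with probability sample_rate; used with
    # beam.FlatMap, so dropped examples simply yield nothing.
    if random.random() <= sample_rate:
        yield example
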
Example #6
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features is provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        result_protos = []
        # Iterate over the stats generators. For each generator,
        #   a) if it is a CombinerStatsGenerator, wrap it as a beam.CombineFn
        #      and run it.
        #   b) if it is a TransformStatsGenerator, wrap it as a beam.PTransform
        #      and run it.
        for generator in stats_generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> beam.CombineGlobally(
                                         _CombineFnWrapper(generator)))
            elif isinstance(generator,
                            stats_generator.TransformStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> generator.ptransform)
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % generator.__class__.__name__)

        # Each stats generator will output a PCollection of DatasetFeatureStatistics
        # protos. We now flatten the list of PCollections into a single PCollection,
        # then merge the DatasetFeatureStatistics protos in the PCollection into a
        # single DatasetFeatureStatisticsList proto.
        return (result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
                | 'MergeDatasetFeatureStatisticsProtos' >>
                beam.CombineGlobally(_merge_dataset_feature_stats_protos)
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
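
The two helpers in the final step are likewise not shown. A hedged sketch of what they plausibly do: merge the per-generator DatasetFeatureStatistics protos feature by feature, then wrap the merged proto in a DatasetFeatureStatisticsList (the proto message names come from tensorflow_metadata; the real implementations live in stats_impl.py):

from tensorflow_metadata.proto.v0 import statistics_pb2

def _merge_dataset_feature_stats_protos(stats_protos):
    # Combine stats emitted by different generators for the same feature
    # into one FeatureNameStatistics per feature name (sketch; dataset-level
    # scalar fields such as num_examples are ignored here).
    stats_per_feature = {}
    for stats_proto in stats_protos:
        for feature_stats in stats_proto.features:
            if feature_stats.name not in stats_per_feature:
                stats_per_feature[feature_stats.name] = (
                    statistics_pb2.FeatureNameStatistics())
            stats_per_feature[feature_stats.name].MergeFrom(feature_stats)
    result = statistics_pb2.DatasetFeatureStatistics()
    for feature_stats in stats_per_feature.values():
        result.features.add().CopyFrom(feature_stats)
    return result

def _make_dataset_feature_statistics_list_proto(stats_proto):
    # Wrap the merged dataset-level proto in a one-element list proto (sketch).
    result = statistics_pb2.DatasetFeatureStatisticsList()
    result.datasets.add().CopyFrom(stats_proto)
    return result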