Example #1
    def test_cache_merge(self):
        base_test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        dataset_key_0 = 'dataset_key_0'
        dataset_key_1 = 'dataset_key_1'
        dataset_keys = (dataset_key_0, dataset_key_1)
        cache_keys = list('abcd')

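        # Helper that reads back the manifest file written under each dataset's
        # cache directory.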
        def read_manifests():
            return [
                analyzer_cache._ManifestFile(
                    analyzer_cache._get_dataset_cache_path(base_test_dir,
                                                           key)).read()
                for key in dataset_keys
            ]

        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    'a': p | 'CreateA' >> beam.Create([b'a']),
                    'b': p | 'CreateB' >> beam.Create([b'b']),
                },
                dataset_key_1: {
                    'c': p | 'CreateC' >> beam.Create([b'c']),
                    'd': p | 'CreateD' >> beam.Create([b'd']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        first_manifests = read_manifests()

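        # Write the same cache values again, swapped between the two dataset
        # keys; the new entries should be merged into the existing manifests.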
        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    'c': p | 'CreateC' >> beam.Create([b'c']),
                    'd': p | 'CreateD' >> beam.Create([b'd']),
                },
                dataset_key_1: {
                    'a': p | 'CreateA' >> beam.Create([b'a']),
                    'b': p | 'CreateB' >> beam.Create([b'b']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        second_manifests = read_manifests()
        self.assertEqual(len(first_manifests), len(second_manifests))
        for manifest_a, manifest_b in zip(first_manifests, second_manifests):
            for key_value_pair in manifest_a.items():
                self.assertIn(key_value_pair, manifest_b.items())

            self.assertEqual(2, len(manifest_a))
            self.assertCountEqual(range(len(manifest_a)), manifest_a.values())

            self.assertEqual(4, len(manifest_b))
            self.assertCountEqual(range(len(manifest_b)), manifest_b.values())
            self.assertCountEqual(cache_keys, manifest_b.keys())
Example #2
  def test_cache_write_empty(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    with beam.Pipeline() as p:
      _ = {} | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, (analyzer_cache.DatasetKey('dataset_key_0'),))
    self.assertFalse(os.path.isdir(base_test_dir))
Example #3
  def test_cache_helpers_with_alternative_io(self):

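    # A minimal sink PTransform standing in for the default cache sink: it
    # writes each element to a file named 'cache' under the given path.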
    class LocalSink(beam.PTransform):

      def __init__(self, path):
        self._path = path

      def expand(self, pcoll):

        def write_to_file(value):
          tf.io.gfile.makedirs(self._path)
          with open(os.path.join(self._path, 'cache'), 'wb') as f:
            f.write(value)

        return pcoll | beam.Map(write_to_file)

    test_cache_dict = {
        analyzer_cache.DatasetKey('a'): {
            'b': [bytes([17, 19, 27, 31])]
        }
    }

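    # A minimal source PTransform standing in for the default cache source: it
    # ignores the path and emits the expected test value directly.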
    class LocalSource(beam.PTransform):

      def __init__(self, path):
        del path

      def expand(self, pbegin):
        return pbegin | beam.Create([test_cache_dict['a']['b']])

    dataset_keys = list(test_cache_dict.keys())
    cache_dir = self.get_temp_dir()
    with beam.Pipeline() as p:
      _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, cache_dir, dataset_keys, sink=LocalSink)

      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          cache_dir, dataset_keys, source=LocalSource)

      self.assertItemsEqual(read_cache.keys(), ['a'])
      self.assertItemsEqual(read_cache['a'].keys(), ['b'])

      beam_test_util.assert_that(
          read_cache['a']['b'],
          beam_test_util.equal_to([test_cache_dict['a']['b']]))
Example #4
  def test_cache_helpers_round_trip(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          'dataset_key_0': {
              'a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
              'b': p | 'CreateB' >> beam.Create([b'[5]']),
          },
          'dataset_key_1': {
              'c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
          },
      }
      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          base_test_dir)

    with beam.Pipeline() as p:
      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          base_test_dir, list(cache_pcoll_dict.keys()))

      def assert_equal_matcher(expected_encoded):

        def _assert_equal(encoded_cache_list):
          (encode_cache,) = encoded_cache_list
          self.assertEqual(expected_encoded, encode_cache)

        return _assert_equal

      beam_test_util.assert_that(
          read_cache['dataset_key_0']['a'],
          beam_test_util.equal_to([b'[1, 2, 3]']),
          label='AssertA')
      beam_test_util.assert_that(
          read_cache['dataset_key_0']['b'],
          assert_equal_matcher(b'[5]'),
          label='AssertB')
      beam_test_util.assert_that(
          read_cache['dataset_key_1']['c'],
          assert_equal_matcher(b'[9, 5, 2, 1]'),
          label='AssertC')
Example #5
  def test_cache_helpers_round_trip(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
    dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
    dataset_keys = (dataset_key_0, dataset_key_1)

    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          dataset_key_0: {
              b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
              b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
              b'\x8b1': p | 'CreateB1' >> beam.Create([b'[6]']),
          },
          dataset_key_1: {
              b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
          },
      }

      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, dataset_keys)

    with beam.Pipeline() as p:
      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          base_test_dir, list(cache_pcoll_dict.keys()),
          [b'\x8a', b'\x8b', b'\x8c'])

      beam_test_util.assert_that(
          read_cache[dataset_key_0][b'\x8a'],
          beam_test_util.equal_to([b'[1, 2, 3]']),
          label='AssertA')
      beam_test_util.assert_that(
          read_cache[dataset_key_0][b'\x8b'],
          beam_test_util.equal_to([b'[5]']),
          label='AssertB')
      beam_test_util.assert_that(
          read_cache[dataset_key_1][b'\x8c'],
          beam_test_util.equal_to([b'[9, 5, 2, 1]']),
          label='AssertC')
Example #6
    def test_caching_vocab_for_integer_categorical(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):
            return {
                'x_vocab':
                tft.compute_and_apply_vocabulary(inputs['x'],
                                                 frequency_threshold=2)
            }

        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.int64),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
            }, {
                'x': -4,
            }, {
                'x': -1,
            }, {
                'x': 4,
            }],
            span_1_key: [{
                'x': -2,
            }, {
                'x': -1,
            }, {
                'x': 6,
            }, {
                'x': 7,
            }],
        }
        expected_transformed_data = [{
            'x_vocab': 0,
        }, {
            'x_vocab': 1,
        }, {
            'x_vocab': -1,
        }, {
            'x_vocab': -1,
        }]
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
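                # Seed the analysis with a pre-computed vocabulary accumulator
                # for span 0; span 1 has no cache entries.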
                cache_dict = {
                    span_0_key: {
                        '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                        p | 'CreateB' >> beam.Create(
                            [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                transformed_data, _ = transformed_dataset

                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')
Example #7
    def test_single_phase_run_twice(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # wrap each value in input_data_dict as a pcoll.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                transform_fn_1, cache_output = (
                    (flat_data, input_data_pcoll_dict, {}, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = (cache_output | 'WriteCache' >>
                     analyzer_cache.WriteAnalysisCacheToFS(self._cache_dir))

                transformed_dataset = (
                    ((input_data_pcoll_dict[span_1_key], input_metadata),
                     transform_fn_1)
                    | 'Transform' >> beam_impl.TransformDataset())

                del input_data_pcoll_dict
                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed_data = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn_1')
                _ = transform_fn_1 | tft_beam.WriteTransformFn(
                    transform_fn_dir)

                for key in input_data_dict:
                    self.assertIn(key, cache_output)
                    self.assertEqual(6, len(cache_output[key]))

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # wrap each value in input_data_dict as a pcoll.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                    self._cache_dir, list(input_data_dict.keys()))

                transform_fn_2, second_output_cache = (
                    (flat_data, input_data_pcoll_dict, input_cache,
                     input_metadata)
                    | 'AnalyzeAgain' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn_2)
                    | 'TransformAgain' >> beam_impl.TransformDataset())
                transformed_data, unused_transformed_metadata = transformed_dataset
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='second')

        self.assertFalse(second_output_cache)
Example #8
    def test_single_phase_mixed_analyzer_run_once(self):
        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key:
            input_data,
        }

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
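                # Pre-computed combiner accumulators for span 0; span 1 has no
                # cache entries.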
                cache_dict = {
                    span_0_key: {
                        '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                        p
                        | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
                        '__v0__CacheableCombineAccumulate--x-x--':
                        p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                        '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                        p |
                        'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
                        '__v0__CacheableCombineAccumulate--y-y--':
                        p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed))

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
Example #9
  def _RunBeamImpl(self, inputs: Mapping[Text, Any],
                   outputs: Mapping[Text, Any], preprocessing_fn: Any,
                   input_dataset_metadata: dataset_metadata.DatasetMetadata,
                   raw_examples_data_format: Text, transform_output_path: Text,
                   compute_statistics: bool,
                   materialize_output_paths: Sequence[Text]) -> _Status:
    """Perform data preprocessing with FlumeC++ runner.

    Args:
      inputs: A dictionary of labelled input values.
      outputs: A dictionary of labelled output values.
      preprocessing_fn: The tf.Transform preprocessing_fn.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      raw_examples_data_format: A string describing the raw data format.
      transform_output_path: An absolute path to write the output to.
      compute_statistics: A bool indicating whether or not to compute statistics.
      materialize_output_paths: Paths to materialized outputs.

    Raises:
      RuntimeError: If reset() is not invoked between two run() calls.
      ValueError: If the schema is empty.

    Returns:
      Status of the execution.
    """
    raw_examples_file_format = common.GetSoleValue(
        inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
    analyze_and_transform_data_paths = common.GetValues(
        inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
    transform_only_data_paths = common.GetValues(
        inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
    stats_use_tfdv = common.GetSoleValue(inputs,
                                         labels.TFT_STATISTICS_USE_TFDV_LABEL)
    per_set_stats_output_paths = common.GetValues(
        outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
    temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

    input_cache_dir = common.GetSoleValue(
        inputs, labels.CACHE_INPUT_PATH_LABEL, strict=False)
    output_cache_dir = common.GetSoleValue(
        outputs, labels.CACHE_OUTPUT_PATH_LABEL, strict=False)

    tf.logging.info('Analyze and transform data patterns: %s',
                    list(enumerate(analyze_and_transform_data_paths)))
    tf.logging.info('Transform data patterns: %s',
                    list(enumerate(transform_only_data_paths)))
    tf.logging.info('Transform materialization output paths: %s',
                    list(enumerate(materialize_output_paths)))
    tf.logging.info('Transform output path: %s', transform_output_path)

    feature_spec = schema_utils.schema_as_feature_spec(
        _GetSchemaProto(input_dataset_metadata)).feature_spec
    try:
      analyze_input_columns = tft.get_analyze_input_columns(
          preprocessing_fn, feature_spec)
      transform_input_columns = (
          tft.get_transform_input_columns(preprocessing_fn, feature_spec))
    except AttributeError:
      # If using TFT 1.12, fall back to assuming all features are used.
      analyze_input_columns = feature_spec.keys()
      transform_input_columns = feature_spec.keys()
    # Use the same dataset (same columns) for AnalyzeDataset and computing
    # pre-transform stats so that the data will only be read once for these
    # two operations.
    if compute_statistics:
      analyze_input_columns = list(
          set(list(analyze_input_columns) + list(transform_input_columns)))
    if input_dataset_metadata.schema is _RAW_EXAMPLE_SCHEMA:
      analyze_input_dataset_metadata = input_dataset_metadata
      transform_input_dataset_metadata = input_dataset_metadata
    else:
      analyze_input_dataset_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(
              {feature: feature_spec[feature]
               for feature in analyze_input_columns}))
      transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(
              {feature: feature_spec[feature]
               for feature in transform_input_columns}))

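    # Per-dataset outputs (stats, materialization, or cache) prevent processing
    # all datasets jointly.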
    can_process_jointly = not bool(per_set_stats_output_paths or
                                   materialize_output_paths or output_cache_dir)
    analyze_data_list = self._MakeDatasetList(
        analyze_and_transform_data_paths, raw_examples_file_format,
        raw_examples_data_format, analyze_input_dataset_metadata,
        can_process_jointly)
    transform_data_list = self._MakeDatasetList(
        list(analyze_and_transform_data_paths) +
        list(transform_only_data_paths), raw_examples_file_format,
        raw_examples_data_format, transform_input_dataset_metadata,
        can_process_jointly)

    desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format)

    with self._CreatePipeline(outputs) as p:
      with tft_beam.Context(
          temp_dir=temp_path,
          desired_batch_size=desired_batch_size,
          passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
          use_deep_copy_optimization=True):
        # pylint: disable=expression-not-assigned
        # pylint: disable=no-value-for-parameter

        _ = (
            p | self._IncrementColumnUsageCounter(
                len(feature_spec.keys()), len(analyze_input_columns),
                len(transform_input_columns)))

        (new_analyze_data_dict, input_cache, flat_data_required) = (
            p | self._OptimizeRun(input_cache_dir, output_cache_dir,
                                  analyze_data_list, feature_spec,
                                  preprocessing_fn, self._GetCacheSource()))
        # Remove datasets that are not needed for materialization; this also
        # excludes them from the statistics computation and profiling.
        if not materialize_output_paths:
          analyze_data_list = [
              d for d in new_analyze_data_dict.values() if d is not None
          ]

        analyze_decode_fn = (
            self._GetDecodeFunction(raw_examples_data_format,
                                    analyze_input_dataset_metadata.schema))

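        # Read and decode each analysis dataset.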
        for (idx, dataset) in enumerate(analyze_data_list):
          dataset.encoded = (
              p | 'ReadAnalysisDataset[{}]'.format(idx) >>
              self._ReadExamples(dataset))
          dataset.decoded = (
              dataset.encoded
              | 'DecodeAnalysisDataset[{}]'.format(idx) >>
              self._DecodeInputs(analyze_decode_fn))

        input_analysis_data = {}
        for key, dataset in six.iteritems(new_analyze_data_dict):
          if dataset is None:
            input_analysis_data[key] = None
          else:
            input_analysis_data[key] = dataset.decoded

        if flat_data_required:
          flat_input_analysis_data = (
              [dataset.decoded for dataset in analyze_data_list]
              | 'FlattenAnalysisDatasets' >> beam.Flatten(pipeline=p))
        else:
          flat_input_analysis_data = None
        if input_cache:
          tf.logging.info('Analyzing data with cache.')
        transform_fn, cache_output = (
            (flat_input_analysis_data, input_analysis_data, input_cache,
             input_dataset_metadata)
            | 'AnalyzeDataset' >> tft_beam.AnalyzeDatasetWithCache(
                preprocessing_fn, pipeline=p))

        # Write the raw/input metadata.
        (input_dataset_metadata
         | 'WriteMetadata' >> tft_beam.WriteMetadata(
             os.path.join(transform_output_path,
                          tft.TFTransformOutput.RAW_METADATA_DIR), p))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        (transform_fn |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_output_path))

        if output_cache_dir is not None and cache_output is not None:
          # TODO(b/37788560): Possibly make this part of the beam graph.
          tf.io.gfile.makedirs(output_cache_dir)
          tf.logging.info('Using existing cache in: %s', input_cache_dir)
          if input_cache_dir is not None:
            # Only copy cache that is relevant to this iteration. This assumes
            # that the pipeline operates on rolling ranges, so those cache
            # entries may also be relevant for future iterations.
            for span_cache_dir in input_analysis_data:
              full_span_cache_dir = os.path.join(input_cache_dir,
                                                 span_cache_dir)
              if tf.io.gfile.isdir(full_span_cache_dir):
                self._CopyCache(full_span_cache_dir,
                                os.path.join(output_cache_dir, span_cache_dir))

          (cache_output
           | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
               p, output_cache_dir, sink=self._GetCacheSink()))

        if compute_statistics or materialize_output_paths:
          # Do not compute pre-transform stats if the input format is raw proto,
          # as StatsGen would treat any input as tf.Example.
          if (compute_statistics and
              not self._IsDataFormatProto(raw_examples_data_format)):
            # Aggregated feature stats before transformation.
            pre_transform_feature_stats_path = os.path.join(
                transform_output_path,
                tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)

            schema_proto = _GetSchemaProto(analyze_input_dataset_metadata)
            ([
                dataset.decoded if stats_use_tfdv else dataset.encoded
                for dataset in analyze_data_list
            ]
             | 'FlattenPreTransformAnalysisDatasets' >> beam.Flatten(pipeline=p)
             | 'GenerateAggregatePreTransformAnalysisStats' >>
             self._GenerateStats(
                 pre_transform_feature_stats_path,
                 schema_proto,
                 use_deep_copy_optimization=True,
                 use_tfdv=stats_use_tfdv))

          transform_decode_fn = (
              self._GetDecodeFunction(raw_examples_data_format,
                                      transform_input_dataset_metadata.schema))
          # transform_data_list is a superset of analyze_data_list; we pay the
          # cost of reading the same dataset (analyze_data_list) again here to
          # prevent certain Beam runners from doing large temp materializations.
          for (idx, dataset) in enumerate(transform_data_list):
            dataset.encoded = (
                p
                | 'ReadTransformDataset[{}]'.format(idx) >>
                self._ReadExamples(dataset))
            dataset.decoded = (
                dataset.encoded
                | 'DecodeTransformDataset[{}]'.format(idx) >>
                self._DecodeInputs(transform_decode_fn))
            (dataset.transformed,
             metadata) = (((dataset.decoded, transform_input_dataset_metadata),
                           transform_fn)
                          | 'TransformDataset[{}]'.format(idx) >>
                          tft_beam.TransformDataset())

            if materialize_output_paths or not stats_use_tfdv:
              dataset.transformed_and_encoded = (
                  dataset.transformed
                  | 'EncodeTransformedDataset[{}]'.format(idx) >> beam.ParDo(
                      self._EncodeAsExamples(), metadata))

          if compute_statistics:
            # Aggregated feature stats after transformation.
            _, metadata = transform_fn
            post_transform_feature_stats_path = os.path.join(
                transform_output_path,
                tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)

            # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
            # schema. Currently input dataset schema only contains dtypes,
            # and other metadata is dropped due to roundtrip to tensors.
            transformed_schema_proto = _GetSchemaProto(metadata)

            ([(dataset.transformed
               if stats_use_tfdv else dataset.transformed_and_encoded)
              for dataset in transform_data_list]
             | 'FlattenPostTransformAnalysisDatasets' >> beam.Flatten()
             | 'GenerateAggregatePostTransformAnalysisStats' >>
             self._GenerateStats(
                 post_transform_feature_stats_path,
                 transformed_schema_proto,
                 use_tfdv=stats_use_tfdv))

            if per_set_stats_output_paths:
              assert len(transform_data_list) == len(per_set_stats_output_paths)
              # TODO(b/67632871): Remove duplicate stats gen compute that is
              # done both on a flattened view of the data, and on each span
              # below.
              bundles = zip(transform_data_list, per_set_stats_output_paths)
              for (idx, (dataset, output_path)) in enumerate(bundles):
                if stats_use_tfdv:
                  data = dataset.transformed
                else:
                  data = dataset.transformed_and_encoded
                (data
                 | 'GeneratePostTransformStats[{}]'.format(idx) >>
                 self._GenerateStats(
                     output_path,
                     transformed_schema_proto,
                     use_tfdv=stats_use_tfdv))

          if materialize_output_paths:
            assert len(transform_data_list) == len(materialize_output_paths)
            bundles = zip(transform_data_list, materialize_output_paths)
            for (idx, (dataset, output_path)) in enumerate(bundles):
              (dataset.transformed_and_encoded
               | 'Materialize[{}]'.format(idx) >> self._WriteExamples(
                   raw_examples_file_format, output_path))

    return _Status.OK()
Example #10
  def test_caching_vocab_for_integer_categorical(self):

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
      return {
          'x_vocab':
              tft.compute_and_apply_vocabulary(
                  inputs['x'], frequency_threshold=2)
      }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
        }, {
            'x': -4,
        }, {
            'x': -1,
        }, {
            'x': 4,
        }],
        span_1_key: [{
            'x': -2,
        }, {
            'x': -1,
        }, {
            'x': 6,
        }, {
            'x': 7,
        }],
    }
    expected_transformed_data = [{
        'x_vocab': 0,
    }, {
        'x_vocab': 1,
    }, {
        'x_vocab': -1,
    }, {
        'x_vocab': -1,
    }]
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

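      # Seed the analysis with a pre-computed vocabulary accumulator for
      # span 0; span 1 has no cache entries.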
      cache_dict = {
          span_0_key: {
              b'__v0__VocabularyAccumulate[compute_and_apply_vocabulary/vocabulary]-\x05e\xfe4\x03H.P\xb5\xcb\xd22\xe3\x16\x15\xf8\xf5\xe38\xd9':
                  p | 'CreateB' >> beam.Create(
                      [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      self.assertNotIn(span_0_key, cache_output)

      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, _ = transformed_dataset

      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

    # 4 from analysis since 1 span was completely cached, and 4 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 1)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 1)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)
Example #11
  def test_single_phase_run_twice(self):

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          's_integerized':
              tft.compute_and_apply_vocabulary(
                  inputs['s'],
                  labels=inputs['label'],
                  use_adjusted_mutual_info=True),
      }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
            'label': 0,
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
            'label': 1,
        }, {
            'x': 5,
            'y': 11,
            's': 'a',
            'label': 1,
        }, {
            'x': 1,
            'y': -4,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 1,
        }],
        span_1_key: [{
            'x': 12,
            'y': 1,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 0
        }, {
            'x': 10,
            'y': 1,
            's': 'c',
            'label': 1
        }],
    }
    expected_vocabulary_contents = np.array(
        [b'a', u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'), b'c'],
        dtype=object)
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      transform_fn_1, cache_output = (
          (flat_data, input_data_pcoll_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = (
          cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
              self._cache_dir))

      transformed_dataset = ((
          (input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
                             | 'Transform' >> beam_impl.TransformDataset())

      del input_data_pcoll_dict
      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 0,
          },
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 2,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
      _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(7, len(cache_output[key]))

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # 6 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          self._cache_dir, list(input_data_dict.keys()))

      transform_fn_2, second_output_cache = (
          (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
          | 'AnalyzeAgain' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                       ]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn_2)
                             | 'TransformAgain' >> beam_impl.TransformDataset())
      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
      _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    self.assertFalse(second_output_cache)

    # Only 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

    # The root CreateSavedModel is optimized away because the data doesn't get
    # processed at all (only cache).
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)
Example #12
  def test_single_phase_mixed_analyzer_run_once(self):
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'integerized_s':
              integerized_s,
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'd'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'b',
        }, {
            'x': 4,
            'y': -4,
            's': 'b',
        }],
        span_1_key: input_data,
    }

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))
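      # Pre-computed combiner accumulators for span 0, keyed by versioned cache
      # keys; span 1 has no cache entries.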
      cache_dict = {
          span_0_key: {
              b'__v0__CacheableCombineAccumulate[x_1/mean_and_var]-.\xc4t>ZBv\xea\xa5SU\xf4\x065\xc6\x1c\x81W\xf9\x1b':
                  p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
              b'__v0__CacheableCombineAccumulate[x/x]-\x95\xc5w\x88\x85\x8b5V\xc9\x00\xe0\x0f\x03\x1a\xdaL\x9d\xd5\xb3\xe3':
                  p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
              b'__v0__CacheableCombineAccumulate[y_1/mean_and_var]-E^\xb7VZ\xeew4rm\xab\xa3\xa4k|J\x80ck\x16':
                  p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
              b'__v0__CacheableCombineAccumulate[y/y]-\xdf\x1ey\x03\x1c\x96\xd5'
              b' e\x9bJ\xa1\xd2\xfc\x9c\x03\x0fM \xdb':
                  p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                       ]).to_string()
      self.WriteRenderedDotFile(dot_string)

      # The output cache should not contain entries that are already present
      # in the input cache.
      self.assertEqual(
          len(cache_output[span_0_key]),
          len(cache_output[span_1_key]) - 4)

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 1,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 2,
          },
      ]
      beam_test_util.assert_that(transformed_data,
                                 beam_test_util.equal_to(expected_transformed))

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

    # 4 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 6)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 4)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)