Example #1
  def test_perform_combiner_packing_optimization(
      self, feature_spec, preprocessing_fn, num_phases,
      expected_dot_graph_str_before_packing,
      expected_dot_graph_str_after_packing):

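    # Trace the preprocessing function to obtain a TF graph along with its
    # structured inputs and outputs.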
    graph, structured_inputs, structured_outputs = (
        impl_helper.trace_preprocessing_function(
            preprocessing_fn, feature_spec, use_tf_compat_v1=True))

    def _side_effect_fn(saved_model_future, cache_value_nodes,
                        unused_num_phases):
      return (saved_model_future, cache_value_nodes)

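    # Patch the packing optimization to a pass-through so that build() yields
    # the unpacked graph; the real optimization is then applied explicitly.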
    with mock.patch.object(
        combiner_packing_util,
        'perform_combiner_packing_optimization',
        side_effect=_side_effect_fn):
      transform_fn_future_before, unused_cache = analysis_graph_builder.build(
          graph, structured_inputs, structured_outputs)
    transform_fn_future_after, unused_cache = (
        combiner_packing_util.perform_combiner_packing_optimization(
            transform_fn_future_before, unused_cache, num_phases))
    dot_string_before = nodes.get_dot_graph(
        [transform_fn_future_before]).to_string()
    self.assertMultiLineEqual(
        msg='Result dot graph is:\n{}'.format(dot_string_before),
        first=dot_string_before,
        second=expected_dot_graph_str_before_packing)
    dot_string_after = nodes.get_dot_graph(
        [transform_fn_future_after]).to_string()
    self.WriteRenderedDotFile(dot_string_after)
    self.assertMultiLineEqual(
        msg='Result dot graph is:\n{}'.format(dot_string_after),
        first=dot_string_after,
        second=expected_dot_graph_str_after_packing)
Example #2
    def test_perform_combiner_packing_optimization(
            self, feature_spec, preprocessing_fn, num_phases,
            expected_dot_graph_str_before_packing,
            expected_dot_graph_str_after_packing):
        with tf.compat.v1.Graph().as_default() as graph:
            with tf.compat.v1.name_scope('inputs'):
                input_signature = impl_helper.feature_spec_as_batched_placeholders(
                    feature_spec)
            output_signature = preprocessing_fn(input_signature)

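            # Pass-through stub: makes build() skip combiner packing so the
            # "before" graph can be captured.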
            def _side_effect_fn(saved_model_future, cache_value_nodes,
                                unused_num_phases):
                return (saved_model_future, cache_value_nodes)

            with mock.patch.object(combiner_packing_util,
                                   'perform_combiner_packing_optimization',
                                   side_effect=_side_effect_fn):
                transform_fn_future_before, unused_cache = analysis_graph_builder.build(
                    graph, input_signature, output_signature)
            transform_fn_future_after, unused_cache = (
                combiner_packing_util.perform_combiner_packing_optimization(
                    transform_fn_future_before, unused_cache, num_phases))
        dot_string_before = nodes.get_dot_graph(
            [transform_fn_future_before]).to_string()
        self.assertMultiLineEqual(
            msg='Result dot graph is:\n{}'.format(dot_string_before),
            first=dot_string_before,
            second=expected_dot_graph_str_before_packing)
        dot_string_after = nodes.get_dot_graph(
            [transform_fn_future_after]).to_string()
        self.WriteRenderedDotFile(dot_string_after)
        self.assertMultiLineEqual(
            msg='Result dot graph is:\n{}'.format(dot_string_after),
            first=dot_string_after,
            second=expected_dot_graph_str_after_packing)
Example #3
    def test_get_analysis_cache_entry_keys(self, use_tf_compat_v1):
        if not use_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        full_dataset_keys = ['a', 'b']

        def preprocessing_fn(inputs):
            return {'x': tft.scale_to_0_1(inputs['x'])}

        mocked_cache_entry_key = 'A'

        def mocked_make_cache_entry_key(_):
            return mocked_cache_entry_key

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
        specs = (feature_spec if use_tf_compat_v1 else
                 impl_helper.get_type_specs_from_feature_specs(feature_spec))
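        # Patch make_cache_entry_key so every analyzer produces the same
        # mocked key, making the expected set of keys deterministic.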
        with mock.patch(
                'tensorflow_transform.beam.analysis_graph_builder.'
                'analyzer_cache.make_cache_entry_key',
                side_effect=mocked_make_cache_entry_key):
            cache_entry_keys = (
                analysis_graph_builder.get_analysis_cache_entry_keys(
                    preprocessing_fn,
                    specs,
                    full_dataset_keys,
                    force_tf_compat_v1=use_tf_compat_v1))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertCountEqual(cache_entry_keys, [mocked_cache_entry_key])
Example #4
    def testGetDotGraph(self):
        a = nodes.apply_operation(_Constant, value='a', label='Constant[a]')
        b = nodes.apply_operation(_Constant, value='b', label='Constant[b]')
        b_copy, a_copy = nodes.apply_multi_output_operation(
            _Swap, a, b, label='Swap[0]')
        b_copy2, unused_a_copy2 = nodes.apply_multi_output_operation(
            _Swap, a_copy, b_copy, label='Swap[1]')
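        # b_copy2 transitively depends on both constants and both swaps, so
        # rendering from it covers the whole graph.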
        dot_string = nodes.get_dot_graph([b_copy2]).to_string()
        self.WriteRenderedDotFile(dot_string)

        self.assertMultiLineEqual(
            dot_string,
            """\
digraph G {
directed=True;
node [shape=Mrecord];
"Constant[a]" [label="{_Constant|value: a|label: Constant[a]}"];
"Constant[b]" [label="{_Constant|value: b|label: Constant[b]}"];
"Swap[0]" [label="{_Swap|label: Swap[0]|{<0>0|<1>1}}"];
"Constant[a]" -> "Swap[0]";
"Constant[b]" -> "Swap[0]";
"Swap[1]" [label="{_Swap|label: Swap[1]|{<0>0|<1>1}}"];
"Swap[0]":1 -> "Swap[1]";
"Swap[0]":0 -> "Swap[1]";
}
""",
            msg='Result dot graph is:\n{}'.format(dot_string))
Example #5
    def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                                dataset_input_cache_dict,
                                expected_dot_graph_str):
        span_0_key, span_1_key = 'span-0', 'span-1'
        if dataset_input_cache_dict is not None:
            cache = {span_0_key: dataset_input_cache_dict}
        else:
            cache = {}

        with tf.compat.v1.name_scope('inputs'):
            input_signature = impl_helper.feature_spec_as_batched_placeholders(
                feature_spec)
        output_signature = preprocessing_fn(input_signature)
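        # Passing the dataset keys and input cache lets the builder optimize
        # the traversal to reuse cached analyzer outputs where possible.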
        transform_fn_future, cache_output_dict = analysis_graph_builder.build(
            tf.compat.v1.get_default_graph(), input_signature,
            output_signature, {span_0_key, span_1_key}, cache)

        leaf_nodes = [transform_fn_future] + sorted(cache_output_dict.values(),
                                                    key=str)
        dot_string = nodes.get_dot_graph(leaf_nodes).to_string()
        self.WriteRenderedDotFile(dot_string)

        self.assertSameElements(
            dot_string.split('\n'),
            expected_dot_graph_str.split('\n'),
            msg='Result dot graph is:\n{}'.format(dot_string))
Example #6
    def test_get_analysis_dataset_keys(self, preprocessing_fn,
                                       full_dataset_keys, cached_dataset_keys,
                                       expected_dataset_keys,
                                       expected_flat_data_required):
        # We force all dataset keys with entries in the cache dict to have a
        # cache hit.
        mocked_cache_entry_key = b'M'
        input_cache = {
            key: {
                mocked_cache_entry_key: 'C'
            }
            for key in cached_dataset_keys
        }
        feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
        with mock.patch(
                'tensorflow_transform.beam.analysis_graph_builder.'
                'analyzer_cache.make_cache_entry_key',
                return_value=mocked_cache_entry_key):
            dataset_keys, flat_data_required = (
                analysis_graph_builder.get_analysis_dataset_keys(
                    preprocessing_fn, feature_spec, full_dataset_keys,
                    input_cache))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        self.assertCountEqual(expected_dataset_keys, dataset_keys)
        self.assertEqual(expected_flat_data_required, flat_data_required)
Example #7
  def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
    graph, structured_inputs, structured_outputs = (
        impl_helper.trace_preprocessing_function(
            preprocessing_fn, feature_spec, use_tf_compat_v1=True))
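    # build() returns the transform_fn future together with a cache dict; the
    # cache is unused in this test.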
    transform_fn_future, unused_cache = analysis_graph_builder.build(
        graph, structured_inputs, structured_outputs)

    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)
    self.assertMultiLineEqual(
        msg='Result dot graph is:\n{}'.format(dot_string),
        first=dot_string,
        second=expected_dot_graph_str)
Example #8
  def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
    with tf.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
    output_signature = preprocessing_fn(input_signature)
    transform_fn_future = analysis_graph_builder.build(
        tf.get_default_graph(), input_signature, output_signature)

    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)

    self.assertMultiLineEqual(
        msg='Result dot graph is:\n{}'.format(dot_string),
        first=dot_string,
        second=expected_dot_graph_str)
Example #9
  def test_get_analysis_cache_entry_keys(self):
    full_dataset_keys = ['a', 'b']
    def preprocessing_fn(inputs):
      return {'x': tft.scale_to_0_1(inputs['x'])}
    mocked_cache_entry_key = 'A'
    def mocked_make_cache_entry_key(_):
      return mocked_cache_entry_key
    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
    with mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        side_effect=mocked_make_cache_entry_key):
      cache_entry_keys = (
          analysis_graph_builder.get_analysis_cache_entry_keys(
              preprocessing_fn, feature_spec, full_dataset_keys))

    dot_string = nodes.get_dot_graph(
        [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
    self.WriteRenderedDotFile(dot_string)
    self.assertCountEqual(cache_entry_keys, [mocked_cache_entry_key])
Example #10
  def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                              write_cache_fn, expected_dot_graph_str):
    cache_location = self._make_cache_location()
    span_0_key, span_1_key = 'span-0', 'span-1'
    if write_cache_fn is not None:
      write_cache_fn(cache_location.input_cache_dir, [span_0_key, span_1_key])

    with tf.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
    output_signature = preprocessing_fn(input_signature)
    transform_fn_future = analysis_graph_builder.build(
        tf.get_default_graph(), input_signature, output_signature,
        {span_0_key, span_1_key}, cache_location)

    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)

    self.assertSameElements(
        dot_string.split('\n'),
        expected_dot_graph_str.split('\n'),
        msg='Result dot graph is:\n{}'.format(dot_string))
Example #11
    def test_get_analysis_dataset_keys(self, preprocessing_fn,
                                       full_dataset_keys, cached_dataset_keys,
                                       expected_dataset_keys,
                                       use_tf_compat_v1):
        if not use_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        full_dataset_keys = [
            analysis_graph_builder.analyzer_cache.DatasetKey(k)
            for k in full_dataset_keys
        ]
        # We force all dataset keys with entries in the cache dict to have a
        # cache hit.
        mocked_cache_entry_key = b'M'
        input_cache = {
            key: {
                mocked_cache_entry_key: 'C'
            }
            for key in cached_dataset_keys
        }
        feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
        specs = (feature_spec if use_tf_compat_v1 else
                 impl_helper.get_type_specs_from_feature_specs(feature_spec))
        with mock.patch(
                'tensorflow_transform.beam.analysis_graph_builder.'
                'analyzer_cache.make_cache_entry_key',
                return_value=mocked_cache_entry_key):
            dataset_keys = (analysis_graph_builder.get_analysis_dataset_keys(
                preprocessing_fn,
                specs,
                full_dataset_keys,
                input_cache,
                force_tf_compat_v1=use_tf_compat_v1))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertCountEqual(expected_dataset_keys, dataset_keys)
Example #12
    def test_build(self, feature_spec, preprocessing_fn,
                   expected_dot_graph_str, expected_dot_graph_str_tf2,
                   use_tf_compat_v1):
        if not use_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        specs = (feature_spec if use_tf_compat_v1 else
                 impl_helper.get_type_specs_from_feature_specs(feature_spec))
        graph, structured_inputs, structured_outputs = (
            impl_helper.trace_preprocessing_function(
                preprocessing_fn,
                specs,
                use_tf_compat_v1=use_tf_compat_v1,
                base_temp_dir=os.path.join(self.get_temp_dir(),
                                           self._testMethodName)))
        transform_fn_future, unused_cache = analysis_graph_builder.build(
            graph, structured_inputs, structured_outputs)

        dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertMultiLineEqual(
            msg='Result dot graph is:\n{}'.format(dot_string),
            first=dot_string,
            second=(expected_dot_graph_str
                    if use_tf_compat_v1 else expected_dot_graph_str_tf2))
Example #13
  def test_single_phase_run_twice(self):

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

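      # The vocabulary and bucketize analyzers are evaluated only for their
      # side effects; their outputs are intentionally discarded.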
      _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          's_integerized':
              tft.compute_and_apply_vocabulary(
                  inputs['s'],
                  labels=inputs['label'],
                  use_adjusted_mutual_info=True),
      }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
            'label': 0,
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
            'label': 1,
        }, {
            'x': 5,
            'y': 11,
            's': 'a',
            'label': 1,
        }, {
            'x': 1,
            'y': -4,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 1,
        }],
        span_1_key: [{
            'x': 12,
            'y': 1,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 0
        }, {
            'x': 10,
            'y': 1,
            's': 'c',
            'label': 1
        }],
    }
    expected_vocabulary_contents = np.array(
        [b'a', u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'), b'c'],
        dtype=object)
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      transform_fn_1, cache_output = (
          (flat_data, input_data_pcoll_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = (
          cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
              self._cache_dir))

      transformed_dataset = ((
          (input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
                             | 'Transform' >> beam_impl.TransformDataset())

      del input_data_pcoll_dict
      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 0,
          },
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 2,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
      _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(7, len(cache_output[key]))

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # 6 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

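    # Second run: re-analyze the same data, this time feeding in the cache
    # written by the first pipeline.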
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          self._cache_dir, list(input_data_dict.keys()))

      transform_fn_2, second_output_cache = (
          (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
          | 'AnalyzeAgain' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn_2)
                             | 'TransformAgain' >> beam_impl.TransformDataset())
      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
      _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    self.assertFalse(second_output_cache)

    # Only 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

    # The root CreateSavedModel is optimized away because the data doesn't get
    # processed at all (only cache).
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)
Example #14
    def test_single_phase_run_twice(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x': tf.io.FixedLenFeature([], tf.float32),
                'y': tf.io.FixedLenFeature([], tf.float32),
                's': tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key: input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # wrap each value in input_data_dict as a pcoll.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                transform_fn_1, cache_output = (
                    (flat_data, input_data_pcoll_dict, {}, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = (cache_output | 'WriteCache' >>
                     analyzer_cache.WriteAnalysisCacheToFS(self._cache_dir))

                transformed_dataset = (
                    ((input_data_pcoll_dict[span_1_key], input_metadata),
                     transform_fn_1)
                    | 'Transform' >> beam_impl.TransformDataset())

                del input_data_pcoll_dict
                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed_data = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn_1')
                _ = transform_fn_1 | tft_beam.WriteTransformFn(
                    transform_fn_dir)

                for key in input_data_dict:
                    self.assertIn(key, cache_output)
                    self.assertEqual(6, len(cache_output[key]))

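        # Second run: the cache written above should satisfy all analyzers, so
        # no new cache output is expected.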
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # wrap each value in input_data_dict as a pcoll.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                    self._cache_dir, list(input_data_dict.keys()))

                transform_fn_2, second_output_cache = (
                    (flat_data, input_data_pcoll_dict, input_cache,
                     input_metadata)
                    | 'AnalyzeAgain' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn_2)
                    | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

        self.assertFalse(second_output_cache)
Example #15
    def test_single_phase_mixed_analyzer_run_once(self):
        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x': tf.io.FixedLenFeature([], tf.float32),
                'y': tf.io.FixedLenFeature([], tf.float32),
                's': tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key: input_data,
        }

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
                cache_dict = {
                    span_0_key: {
                        '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                            p | 'CreateA' >> beam.Create(
                                [b'[2.0, 1.0, 9.0, 0.0]']),
                        '__v0__CacheableCombineAccumulate--x-x--':
                            p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                        '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                            p | 'CreateC' >> beam.Create(
                                [b'[2.0, -1.5, 6.25, 0.0]']),
                        '__v0__CacheableCombineAccumulate--y-y--':
                            p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed))

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
Example #16
  def test_caching_vocab_for_integer_categorical(self):

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
      return {
          'x_vocab':
              tft.compute_and_apply_vocabulary(
                  inputs['x'], frequency_threshold=2)
      }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
        }, {
            'x': -4,
        }, {
            'x': -1,
        }, {
            'x': 4,
        }],
        span_1_key: [{
            'x': -2,
        }, {
            'x': -1,
        }, {
            'x': 6,
        }, {
            'x': 7,
        }],
    }
    expected_transformed_data = [{
        'x_vocab': 0,
    }, {
        'x_vocab': 1,
    }, {
        'x_vocab': -1,
    }, {
        'x_vocab': -1,
    }]
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

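      # Seed span 0 with a precomputed vocabulary accumulator so that span's
      # data never needs to be re-analyzed.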
      cache_dict = {
          span_0_key: {
              b'__v0__VocabularyAccumulate[compute_and_apply_vocabulary/vocabulary]-\x05e\xfe4\x03H.P\xb5\xcb\xd22\xe3\x16\x15\xf8\xf5\xe38\xd9':
                  p | 'CreateB' >> beam.Create(
                      [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      self.assertNotIn(span_0_key, cache_output)

      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, _ = transformed_dataset

      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

    # 4 from analysis since 1 span was completely cached, and 4 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 1)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 1)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)
Example #17
  def test_single_phase_mixed_analyzer_run_once(self):
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'integerized_s':
              integerized_s,
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'd'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'b',
        }, {
            'x': 4,
            'y': -4,
            's': 'b',
        }],
        span_1_key: input_data,
    }

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))
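      # Precomputed combiner accumulators for span 0, keyed by versioned cache
      # entry keys.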
      cache_dict = {
          span_0_key: {
              b'__v0__CacheableCombineAccumulate[x_1/mean_and_var]-.\xc4t>ZBv\xea\xa5SU\xf4\x065\xc6\x1c\x81W\xf9\x1b':
                  p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
              b'__v0__CacheableCombineAccumulate[x/x]-\x95\xc5w\x88\x85\x8b5V\xc9\x00\xe0\x0f\x03\x1a\xdaL\x9d\xd5\xb3\xe3':
                  p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
              b'__v0__CacheableCombineAccumulate[y_1/mean_and_var]-E^\xb7VZ\xeew4rm\xab\xa3\xa4k|J\x80ck\x16':
                  p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
              b'__v0__CacheableCombineAccumulate[y/y]-\xdf\x1ey\x03\x1c\x96\xd5'
              b' e\x9bJ\xa1\xd2\xfc\x9c\x03\x0fM \xdb':
                  p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      # The output cache should not contain entries that were already present
      # in the input cache.
      self.assertEqual(
          len(cache_output[span_0_key]),
          len(cache_output[span_1_key]) - 4)

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 1,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 2,
          },
      ]
      beam_test_util.assert_that(transformed_data,
                                 beam_test_util.equal_to(expected_transformed))

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

    # 4 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 6)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 4)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)