Example #1
    def testCreatePhasesWithTable(self):
        # Create a graph with a table that can only be run after the first
        # analyzer has run.  Note that converting an integerized string into a
        # float doesn't make much sense, but it is a legal TensorFlow computation.
        string_placeholder = tf.placeholder(tf.string, shape=(None, ))
        integerized = mappers.string_to_int(string_placeholder)
        integerized = tf.to_float(integerized)
        integerized / analyzers.max(integerized)  # pylint: disable=expression-not-assigned

        phases = impl_helper.create_phases()
        self.assertEqual(len(phases), 2)
        self.assertEqual(len(phases[0].analyzers), 1)
        self.assertEqual(len(phases[1].analyzers), 1)
        self.assertEqual(len(phases[0].table_initializers), 0)
        self.assertEqual(len(phases[1].table_initializers), 1)
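The two phases asserted above follow from the dependency chain in the graph; as a hedged annotation (not part of the original test):

    # Phase 0: the vocabulary analyzer created by mappers.string_to_int runs on
    #          the raw string input.
    # Phase 1: the lookup table is initialized from that vocabulary (hence the
    #          single table initializer asserted for phases[1]), and
    #          analyzers.max runs on the integerized, float-cast values.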
Example #2
  def test_create_phases_with_tf_cond(self):
    int_placeholder = tf.placeholder(tf.int64, shape=(None,))
    abs_int_placeholder = tf.cond(
        tf.reduce_sum(int_placeholder) > 0,
        lambda: int_placeholder,
        lambda: -int_placeholder)

    # We need to call an analyzer after the tf.cond because only the transitive
    # parents of analyzers are inspected by create_phases.
    mappers.scale_to_0_1(abs_int_placeholder)

    phases = impl_helper.create_phases({'x': int_placeholder})
    self.assertEqual(len(phases), 1)

    # tft.scale_to_0_1 uses a single analyzer: analyzers._min_and_max.
    self.assertEqual(len(phases[0].analyzer_infos), 1)
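For context on why a single phase suffices here, a hedged note on the computation behind scale_to_0_1 (the combined analyzer name is quoted from the comment above):

    # scale_to_0_1(x) is roughly (x - min(x)) / (max(x) - min(x)).  Both min and
    # max come from the single combined analyzers._min_and_max, and no analyzer
    # output feeds into another analyzer, so create_phases produces one phase.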
Example #3
    def testCreatePhasesWithUnwrappedTable(self):
        # Test a preprocessing function with a table that is not wrapped in
        # `apply_function`.
        def preprocessing_fn(inputs):
            table = lookup.string_to_index_table_from_tensor(['a', 'b'])
            integerized = table.lookup(inputs['x'])
            return {'integerized': integerized}

        input_schema = sch.Schema({
            'x':
            sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
        })
        graph, _, _ = impl_helper.run_preprocessing_fn(preprocessing_fn,
                                                       input_schema)
        with self.assertRaisesRegexp(ValueError, 'Found table initializers'):
            _ = impl_helper.create_phases(graph)
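For comparison, a minimal sketch of the wrapped variant that the comment above refers to.  The exact import path of apply_function differs between versions of the package, so the api alias below is an assumption:

    def preprocessing_fn(inputs):
        def lookup_fn(x):
            table = lookup.string_to_index_table_from_tensor(['a', 'b'])
            return table.lookup(x)
        # Wrapping the table construction in apply_function lets create_phases
        # assign the table initializer to a phase instead of raising ValueError.
        integerized = api.apply_function(lookup_fn, inputs['x'])  # assumed alias
        return {'integerized': integerized}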
Example #4
  def testCreatePhasesWithMultipleLevelsOfAnalyzers(self):
    # Test a preprocessing function similar to scale_to_0_1 except that it
    # involves multiple interleavings of analyzers and transforms.
    def preprocessing_fn(inputs):
      scaled_to_0 = inputs['x'] - analyzers.min(inputs['x'])
      scaled_to_0_1 = scaled_to_0 / analyzers.max(scaled_to_0)
      return {'x_scaled': scaled_to_0_1}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.float32, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)
    self.assertEqual(len(phases), 2)
    self.assertEqual(len(phases[0].analyzers), 1)
    self.assertEqual(len(phases[1].analyzers), 1)
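A hedged annotation of the expected two-phase split (not part of the original test):

    # Phase 0: analyzers.min runs directly on the raw input 'x'.
    # Phase 1: analyzers.max runs on scaled_to_0, which depends on the phase-0
    #          min result, so it cannot be evaluated until that value is known.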
Example #5
  def testCreatePhasesWithTable(self):
    # Test a preprocessing function with a table that can only be run after the
    # first analyzer has run.  Note that converting an integerized string into a
    # float doesn't make much sense, but it is a legal TensorFlow computation.
    def preprocessing_fn(inputs):
      integerized = mappers.string_to_int(inputs['x'])
      integerized = tf.to_float(integerized)
      scaled_to_0_1 = integerized / analyzers.max(integerized)
      return {'x_scaled': scaled_to_0_1}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)
    self.assertEqual(len(phases), 2)
    self.assertEqual(len(phases[0].analyzers), 1)
    self.assertEqual(len(phases[1].analyzers), 1)
    self.assertEqual(len(phases[0].table_initializers), 0)
    self.assertEqual(len(phases[1].table_initializers), 1)
Example #6
File: impl.py  Project: qipa/transform
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

      with tf.name_scope('inputs'):
        inputs = input_schema.as_batched_placeholders()
      # In order to avoid a bug where import_graph_def fails when the input_map
      # and return_elements of an imported graph are the same (b/34288791), we
      # avoid using the placeholder of an input column as an output of a graph.
      # We do this by applying tf.identity to all inputs of the
      # preprocessing_fn.  Note this applies at the level of raw tensors.
      outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

      # At this point we check that the preprocessing_fn has at least one
      # output. This is because if we allowed the output of preprocessing_fn to
      # be empty, we wouldn't be able to determine how many instances to
      # "unbatch" the output into.
      if not outputs:
        raise ValueError('The preprocessing function returned an empty dict')

      if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(
                graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

      # NOTE: it's important that create_phases is called directly after
      # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
      # collection which would break the logic in create_phases.
      phases = impl_helper.create_phases()

      # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
      # names to singleton PCollections containing a _TensorValue.  We compute
      # tensor_pcoll_mapping in phases, where at each phase we compute the
      # analyzers that are ready to run and update tensor_pcoll_mapping.
      tensor_pcoll_mapping = {}
      table_initializers = graph.get_collection_ref(
          tf.GraphKeys.TABLE_INITIALIZERS)
      original_table_initializers = list(table_initializers)
      del table_initializers[:]

      serialized_tf_config = (
          common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
              input_values.pipeline.runner))
      for level, phase in enumerate(phases):
        # Create a SavedModel that describes the mapping from the input data
        # to the inputs of the analyzers at this level.  The column names of the
        # outputs are the tensor names of the analyzer inputs in the graph.
        # This graph has the analyzer outputs computed so far replaced with
        # constants.
        analyzer_inputs = {}
        for analyzer in phase.analyzers:
          for input_tensor in analyzer.inputs:
            analyzer_inputs[input_tensor.name] = input_tensor
        table_initializers.extend(phase.table_initializers)
        unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, analyzer_inputs,
                               unbound_saved_model_dir)
        saved_model_dir = (
            tensor_pcoll_mapping
            | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
            _ReplaceTensorsWithConstants(unbound_saved_model_dir, base_temp_dir,
                                         input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to the
        # analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
            | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                _RunMetaGraphDoFn(
                    input_schema,
                    serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
        # map from tensor names to singleton PCollections of `_TensorValue`s.
        analyzer_outputs_dict = (
            analyzer_input_values
            | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
                phase.analyzers, base_temp_dir))

        # Update the mapping for all analyzers.
        tensor_pcoll_mapping.update(analyzer_outputs_dict)

      del table_initializers[:]
      table_initializers.extend(original_table_initializers)
      saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, outputs, saved_model_dir)
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              saved_model_dir, base_temp_dir, input_values.pipeline))

      # Infer metadata.  The metadata may contain Futures that refer to the
      # values of tensors in the graph.  In that case, the tensors must be
      # "constant" in that they don't depend on input data.  The tensors can
      # depend on analyzer outputs though.  This allows us to set metadata that
      # depends on analyzer outputs.
      #
      # We first extract the names of the tensors that are referenced by the
      # Futures, and then compute them by calling _ComputeScalarConstants with
      # the tensor-PCollection mapping representing the analyzer outputs.
      metadata = dataset_metadata.DatasetMetadata(
          schema=impl_helper.infer_feature_schema(outputs))

      deferred_metadata_tensor_names = {
          future.name
          for column_schema in metadata.schema.column_schemas.values()
          for future in column_schema.substitute_futures({})
      }
      name_pcoll_dict = (
          tensor_pcoll_mapping
          | 'ComputeTensorValues' >>
          _ComputeTensorValues(deferred_metadata_tensor_names, saved_model_dir,
                               input_values.pipeline))
      full_metadata = beam_metadata_io.BeamDatasetMetadata(
          metadata, name_pcoll_dict)

      _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

      return transform_fn, full_metadata
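The expand method above is the body of the analysis PTransform; a minimal usage sketch, assuming the conventional beam_impl module layout of this project (the input data, metadata, and preprocessing_fn names are illustrative):

    import tempfile
    import apache_beam as beam
    from tensorflow_transform.beam import impl as beam_impl

    with beam.Pipeline() as pipeline:
      with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        raw_data = pipeline | 'CreateData' >> beam.Create(
            [{'x': 'hello'}, {'x': 'world'}])
        # raw_data_metadata: a DatasetMetadata describing the raw 'x' column.
        transform_fn = (
            (raw_data, raw_data_metadata)
            | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))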
Example #7
File: impl.py  Project: robertwb/transform
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    # NOTE: it's important that create_phases is called directly after
    # run_preprocessing_fn, because we later mutate the graph's
    # TABLE_INITIALIZERS collection which would break the logic in
    # create_phases.
    graph, inputs, outputs = impl_helper.run_preprocessing_fn(
        self._preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)

    # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue.  We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level.  The column names of the
      # outputs are the tensor names of the analyzer inputs in the graph.  This
      # graph has the analyzer outputs computed so far replaced with constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzers:
        for input_tensor in analyzer.inputs:
          analyzer_inputs[input_tensor.name] = input_tensor
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(
          graph, inputs, analyzer_inputs, unbound_saved_model_dir)
      saved_model_dir = (
          tensor_pcoll_mapping
          | 'CreateSavedModelForAnalyzerInputs[%d]' % level
          >> _ReplaceTensorsWithConstants(
              unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

      # Run this saved model on the input dataset to obtain the inputs to the
      # analyzers.
      analyzer_input_values = (
          input_values
          | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
              _RunMetaGraphDoFn(
                  input_schema,
                  serialized_tf_config,
                  shared_graph_state_handle=shared.Shared()),
              saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

      # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
      # map from tensor names to singleton PCollections of `_TensorValue`s.
      analyzer_outputs_dict = (
          analyzer_input_values
          | 'ComputeAnalyzerOutputs[%d]' % level
          >> _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(analyzer_outputs_dict)

    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)
    transform_fn = (
        tensor_pcoll_mapping
        | 'ReplaceTensorsWithConstants'
        >> _ReplaceTensorsWithConstants(
            saved_model_dir, base_temp_dir, input_values.pipeline))

    # Infer metadata.  The metadata may contain Futures that refer to the values
    # of tensors in the graph.  In that case, the tensors must be "constant" in
    # that they don't depend on input data.  The tensors can depend on analyzer
    # outputs though.  This allows us to set metadata that depends on analyzer
    # outputs.
    #
    # We first extract the names of the tensors that are referenced by the
    # Futures, and then compute them by calling _ComputeScalarConstants with the
    # tensor-PCollection mapping representing the analyzer outputs.
    metadata = dataset_metadata.DatasetMetadata(
        schema=impl_helper.infer_feature_schema(graph, outputs))

    deferred_metadata_tensor_names = [
        future.name
        for column_schema in tft_api.get_column_schemas(graph).values()
        for future in column_schema.substitute_futures({})]
    name_pcoll_dict = (
        tensor_pcoll_mapping
        | 'ComputeTensorValues' >>
        _ComputeTensorValues(
            deferred_metadata_tensor_names, saved_model_dir,
            input_values.pipeline))
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, name_pcoll_dict)

    _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

    return transform_fn, full_metadata
Example #8
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

        input_values, input_metadata = dataset
        input_schema = input_metadata.schema

        base_temp_dir = Context.create_base_temp_dir()

        class _ReplaceTensorsWithConstants(beam.PTransform):
            """Bind statistics in a deferred manner.

      This transform fills in analyzer outputs with their actual computed
      values.

      Args:
        saved_model_dir: The directory containing the SavedModel.
      """
            def __init__(self, saved_model_dir):
                # Generally the pipeline is inferred from its inputs; however,
                # we need to know the pipeline explicitly for beam.Create.
                self.pipeline = input_values.pipeline
                self._saved_model_dir = saved_model_dir

            def expand(self, tensor_pcoll_mapping):
                """Converts a dict of statistics to a transform function.

        Args:
          tensor_pcoll_mapping: A dictionary mapping `Tensor`s to singleton
              `PCollection`s.

        Returns:
          A single-element PCollection containing the directory name with the
              SavedModel.
        """
                transform_fn = (self.pipeline | 'CreateTransformFn' >>
                                beam.Create([self._saved_model_dir]))

                if not tensor_pcoll_mapping:
                    return transform_fn

                # Convert tensor_pcoll_mapping into a DictPCollectionView so it
                # can be passed as a side input to the beam Map below.
                tensor_value_pairs = []
                for name, pcoll in six.iteritems(tensor_pcoll_mapping):
                    tensor_value_pairs.append(
                        pcoll
                        | 'AddName[%s]' % name >> beam.Map(lambda x, name=name:
                                                           (name, x)))
                tensor_value_mapping = beam.pvalue.AsDict(
                    tensor_value_pairs
                    | 'MergeTensorValuePairs' >> beam.Flatten())

                # Run a mapper that inserts statistic values into the graph.  We wrap
                # replace_tensors_with_constant_values in a wrapper that also creates
                # a temp dir.  This makes the wrapper idempotent since any retry will
                # use a different temp dir.
                def replace_tensors_with_constant_values(
                        saved_model_dir, tensor_value_mapping,
                        serialized_tf_config):

                    tf_config = _maybe_deserialize_tf_config(
                        serialized_tf_config)
                    with tf.Session(config=tf_config) as session:
                        temp_dir = _make_unique_temp_dir(base_temp_dir)
                        input_tensors, output_tensors = (
                            saved_transform_io.partially_apply_saved_transform(
                                saved_model_dir, {}, tensor_value_mapping))
                        saved_transform_io.write_saved_transform_from_session(
                            session, input_tensors, output_tensors, temp_dir)
                    return temp_dir

                serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
                    self.pipeline.runner)
                return (transform_fn | 'ReplaceTensorsWithConstantValues' >>
                        beam.Map(replace_tensors_with_constant_values,
                                 tensor_value_mapping=tensor_value_mapping,
                                 serialized_tf_config=serialized_tf_config))

        class _ComputeTensorPcollMappingUpdate(beam.PTransform):
            """Create a mapping from `Tensor`s to PCollections.

      Creates a mapping from `Tensor`s to PCollections for the outputs of the
      new analyzers.  An existing mapping is provided as part of the argument
      to the expand() method.

      Args:
        saved_model_dir: The directory containing the SavedModel that computes
            the inputs to the new analyzers.
        analyzer_inputs_schema: The schema of the analyzer inputs.
        analyzers: The analyzers whose outputs should be computed.
      """
            def __init__(self, saved_model_dir, analyzer_inputs_schema,
                         analyzers):
                self._saved_model_dir = saved_model_dir
                self._analyzer_inputs_schema = analyzer_inputs_schema
                self._analyzers = analyzers

            def expand(self, input_values_and_tensor_pcoll_mapping):
                input_values, tensor_pcoll_mapping = (
                    input_values_and_tensor_pcoll_mapping)

                # Create a transform_fn to produce inputs to new analyzers.
                transform_fn = (
                    tensor_pcoll_mapping
                    | 'ReplaceTensorsWithConstants' >>
                    _ReplaceTensorsWithConstants(self._saved_model_dir))

                # Run the transform_fn.
                serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
                    self.pipeline.runner)
                analyzer_input_values = (
                    input_values | 'ComputeAnalyzerInputs' >> beam.ParDo(
                        _RunMetaGraphDoFn(input_schema,
                                          self._analyzer_inputs_schema,
                                          serialized_tf_config),
                        saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))

                # For each analyzer output, look up its input values (by tensor name)
                # and run the analyzer on these values.
                #
                tensor_pcoll_mapping_update = {}
                for idx, analyzer in enumerate(self._analyzers):
                    # pylint: disable=protected-access
                    analyzer_impl = analyzer_impls._impl_for_analyzer(
                        analyzer.spec)
                    # pylint: enable=protected-access

                    assert len(analyzer.inputs) == 1
                    output_pcolls = (analyzer_input_values
                                     | 'Extract_%d' % idx >> beam.Map(
                                         lambda batch, key: batch[key],
                                         key=analyzer.inputs[0].name)
                                     | 'Analyze_%d' % idx >> analyzer_impl)
                    assert len(analyzer.outputs) == len(output_pcolls)
                    for tensor, pcoll in zip(analyzer.outputs, output_pcolls):
                        tensor_pcoll_mapping_update[tensor.name] = pcoll
                return tensor_pcoll_mapping_update

        # NOTE: it's important that create_phases is called directly after
        # run_preprocessing_fn, because we later mutate the graph's
        # TABLE_INITIALIZERS collection which would break the logic in
        # create_phases.
        graph, inputs, outputs = impl_helper.run_preprocessing_fn(
            self._preprocessing_fn, input_schema)
        phases = impl_helper.create_phases(graph)

        # Iterate through levels, generating PCollections for columns that are the
        # outputs of `Operations` that are not `MapOperation`s.
        tensor_pcoll_mapping = {}
        table_initializers = graph.get_collection_ref(
            tf.GraphKeys.TABLE_INITIALIZERS)
        original_table_initializers = list(table_initializers)
        del table_initializers[:]

        for level, phase in enumerate(phases):
            analyzer_inputs = {}
            for analyzer in phase.analyzers:
                for input_tensor in analyzer.inputs:
                    analyzer_inputs[input_tensor.name] = input_tensor
            analyzer_inputs_schema = impl_helper.infer_feature_schema(
                analyzer_inputs)
            table_initializers.extend(phase.table_initializers)
            saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, analyzer_inputs,
                                   saved_model_dir)

            tensor_pcoll_mapping_update = (
                (input_values, tensor_pcoll_mapping)
                | 'ComputeTensorPcollMappingUpdate_%d' % level >>
                _ComputeTensorPcollMappingUpdate(
                    saved_model_dir, analyzer_inputs_schema, phase.analyzers))
            tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

        output_metadata = dataset_metadata.DatasetMetadata(
            schema=impl_helper.infer_feature_schema(outputs))
        del table_initializers[:]
        table_initializers.extend(original_table_initializers)
        saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, outputs, saved_model_dir)
        transform_fn = (tensor_pcoll_mapping
                        | 'ReplaceTensorsWithConstants' >>
                        _ReplaceTensorsWithConstants(saved_model_dir))

        return transform_fn, output_metadata
Example #9
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
        input_values, input_metadata = dataset
        input_schema = input_metadata.schema

        base_temp_dir = Context.create_base_temp_dir()

        graph = tf.Graph()
        with graph.as_default():

            with tf.name_scope('inputs'):
                inputs = input_schema.as_batched_placeholders()
            # In order to avoid a bug where import_graph_def fails when the input_map
            # and return_elements of an imported graph are the same (b/34288791), we
            # avoid using the placeholder of an input column as an output of a graph.
            # We do this by applying tf.identity to all inputs of the
            # preprocessing_fn.  Note this applies at the level of raw tensors.
            outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

            # At this point we check that the preprocessing_fn has at least one
            # output. This is because if we allowed the output of preprocessing_fn to
            # be empty, we wouldn't be able to determine how many instances to
            # "unbatch" the output into.
            if not outputs:
                raise ValueError(
                    'The preprocessing function returned an empty dict')

            if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                raise ValueError(
                    'The preprocessing function contained trainable variables '
                    '{}'.format(
                        graph.get_collection_ref(
                            tf.GraphKeys.TRAINABLE_VARIABLES)))

            # NOTE: it's important that create_phases is called directly after
            # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
            # collection which would break the logic in create_phases.
            phases = impl_helper.create_phases()

            # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
            # names to singleton PCollections containing a _TensorValue.  We compute
            # tensor_pcoll_mapping in phases, where at each phase we compute the
            # analyzers that are ready to run and update tensor_pcoll_mapping.
            tensor_pcoll_mapping = {}
            table_initializers = graph.get_collection_ref(
                tf.GraphKeys.TABLE_INITIALIZERS)
            original_table_initializers = list(table_initializers)
            del table_initializers[:]

            serialized_tf_config = (
                common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                    input_values.pipeline.runner))
            for level, phase in enumerate(phases):
                # Create a SavedModel that describes the mapping from the input data
                # to the inputs of the analyzers at this level.  The column names of the
                # outputs are the tensor names of the analyzer inputs in the graph.
                # This graph has the analyzer outputs computed so far replaced with
                # constants.
                analyzer_inputs = {}
                for analyzer in phase.analyzer_infos:
                    for input_tensor_name in analyzer.input_tensor_names:
                        analyzer_inputs[
                            input_tensor_name] = graph.get_tensor_by_name(
                                input_tensor_name)
                table_initializers.extend(phase.table_initializers)
                unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
                _write_saved_transform(graph, inputs, analyzer_inputs,
                                       unbound_saved_model_dir)

                tensor_pcoll_mapping_update = (
                    (input_values, tensor_pcoll_mapping)
                    | 'RunPhase[{}]'.format(level) >> _RunPhase(
                        phase.analyzer_infos, unbound_saved_model_dir,
                        base_temp_dir, input_schema, serialized_tf_config))

                # Update the mapping for all analyzers.
                tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

            del table_initializers[:]
            table_initializers.extend(original_table_initializers)
            saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, outputs, saved_model_dir)
            transform_fn = (
                tensor_pcoll_mapping
                | 'ReplaceTensorsWithConstants' >>
                _ReplaceTensorsWithConstants(
                    saved_model_dir, base_temp_dir, input_values.pipeline))

            # Infer metadata.  We take the inferred metadata and apply overrides that
            # refer to values of tensors in the graph.  The override tensors must
            # be "constant" in that they don't depend on input data.  The tensors can
            # depend on analyzer outputs though.  This allows us to set metadata that
            # depends on analyzer outputs. _augment_metadata will use the analyzer
            # outputs stored in `transform_fn` to compute the metadata in a
            # deferred manner, once the analyzer outputs are known.
            metadata = dataset_metadata.DatasetMetadata(
                schema=impl_helper.infer_feature_schema(outputs))

            deferred_metadata = (transform_fn
                                 | 'ComputeDeferredMetadata' >> beam.Map(
                                     _augment_metadata, metadata))

            full_metadata = beam_metadata_io.BeamDatasetMetadata(
                metadata, deferred_metadata)

            _clear_shared_state_after_barrier(input_values.pipeline,
                                              transform_fn)

            return transform_fn, full_metadata