Example #1
    def testWriteTransformFn(self):
        path = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                })

            _ = ((saved_model_dir_pcoll, metadata)
                 | transform_fn_io.WriteTransformFn(path))

        transformed_metadata_dir = os.path.join(
            path, transform_fn_io.TRANSFORMED_METADATA_DIR)
        metadata = metadata_io.read_metadata(transformed_metadata_dir)
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = os.path.join(path, transform_fn_io.TRANSFORM_FN_DIR)
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #2
  def testWriteTransformFn(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      file_io.recursive_create_dir(saved_model_dir)
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
      # Combine test metadata with a dict of PCollections resolving futures.
      deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
          [test_metadata.COMPLETE_METADATA])
      metadata = beam_metadata_io.BeamDatasetMetadata(
          test_metadata.INCOMPLETE_METADATA, deferred_metadata)

      _ = ((saved_model_dir_pcoll, metadata)
           | transform_fn_io.WriteTransformFn(transform_output_dir))

    # Test reading with TFTransformOutput
    tf_transform_output = tft.TFTransformOutput(transform_output_dir)
    metadata = tf_transform_output.transformed_metadata
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

    transform_fn_dir = tf_transform_output.transform_savedmodel_dir
    self.assertTrue(file_io.file_exists(transform_fn_dir))
    self.assertTrue(file_io.is_directory(transform_fn_dir))
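The assertions above only check that WriteTransformFn materialized the transformed metadata and the transform SavedModel. As a rough, hedged sketch of how such an output directory is typically consumed later (not part of this test), the snippet below loads it with tft.TFTransformOutput; the helper name parse_fn and the use of tf.io.parse_single_example are illustrative assumptions.

import tensorflow as tf
import tensorflow_transform as tft

# Load the artifacts materialized by WriteTransformFn above.
tf_transform_output = tft.TFTransformOutput(transform_output_dir)

# Feature spec of the transformed data, derived from transformed_metadata.
feature_spec = tf_transform_output.transformed_feature_spec()

# Hypothetical helper: parse one serialized, already-transformed tf.Example.
def parse_fn(serialized_example):
  return tf.io.parse_single_example(serialized_example, feature_spec)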
Example #3
    def testWriteTransformFn(self):
        transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                })

            _ = ((saved_model_dir_pcoll, metadata)
                 | transform_fn_io.WriteTransformFn(transform_output_dir))

        # Test reading with TFTransformOutput
        tf_transform_output = tft.TFTransformOutput(transform_output_dir)
        metadata = tf_transform_output.transformed_metadata
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = tf_transform_output.transform_savedmodel_dir
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #4
  def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    if self._use_tfxio:
      input_schema = None
      input_tensor_adapter_config = input_metadata
    else:
      input_schema = input_metadata.schema
      input_tensor_adapter_config = None

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        new_metadata = _remove_columns_from_metadata(
            output_metadata.dataset_metadata, self._exclude_outputs)
        new_deferred_metadata = (
            output_metadata.deferred_metadata
            | 'RemoveColumms' >> beam.Map(_remove_columns_from_metadata,
                                          self._exclude_outputs))
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            new_metadata, new_deferred_metadata)
      else:
        output_metadata = _remove_columns_from_metadata(
            output_metadata, self._exclude_outputs)

    tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
        type(self.pipeline.runner))
    if not self._use_tfxio:
      input_values |= 'Batch' >> _BatchElements()
    output_instances = (
        input_values
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                tf_config,
                input_schema=input_schema,
                input_tensor_adapter_config=input_tensor_adapter_config,
                use_tfxio=self._use_tfxio,
                shared_graph_state_handle=shared.Shared(),
                passthrough_keys=Context.get_passthrough_keys(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(
            _convert_and_unbatch_to_instance_dicts,
            schema=output_metadata.schema,
            passthrough_keys=Context.get_passthrough_keys()))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
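The expand above is the body of a TransformDataset-style PTransform: it takes (dataset, transform_fn) and applies the saved transform to each batch. A minimal, hedged sketch of how this is usually driven from user code follows; raw_data, raw_metadata, temp_dir, and the feature name 'x' are placeholders, not values from this file.

import apache_beam as beam
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

def preprocessing_fn(inputs):
  # Must return at least one output (see the empty-dict check in AnalyzeDataset).
  return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

with beam.Pipeline() as pipeline:
  with tft_beam.Context(temp_dir=temp_dir):
    # Analysis produces the deferred transform_fn consumed by TransformDataset.
    transform_fn = (
        (raw_data, raw_metadata)
        | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))
    # ((dataset), (transform_fn)) is the dataset_and_transform_fn tuple
    # that expand() destructures.
    transformed_data, transformed_metadata = (
        ((raw_data, raw_metadata), transform_fn)
        | 'Transform' >> tft_beam.TransformDataset())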
Example #5
    def testWriteMetadataDeferred(self):
        # Write metadata to disk using WriteMetadata PTransform, combining
        # incomplete metadata with (deferred) complete metadata.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [test_metadata.COMPLETE_METADATA])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                test_metadata.INCOMPLETE_METADATA, deferred_metadata)
            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #6
    def testWriteTransformFnIsRetryable(self):
        tft.test_case.skip_if_external_environment(
            'Retries are currently not available on this environment.')
        original_copy_tree_to_unique_temp_dir = (
            transform_fn_io._copy_tree_to_unique_temp_dir)

        def mock_copy_tree_to_unique_temp_dir(source, base_temp_dir_path):
            """Mocks transform_fn_io._copy_tree to fail the first time it is called by this test, thus forcing a retry which should succeed."""
            global _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED
            if not _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED:
                _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED = True
                original_copy_tree_to_unique_temp_dir(source,
                                                      base_temp_dir_path)
                raise ArithmeticError('Some error')
            return original_copy_tree_to_unique_temp_dir(
                source, base_temp_dir_path)

        with self._makeTestPipeline() as pipeline:
            transform_output_dir = os.path.join(self.get_temp_dir(), 'output')
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_path = os.path.join(saved_model_dir, 'saved_model')
            with file_io.FileIO(saved_model_path, mode='w') as f:
                f.write('some content')
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            # Combine test metadata with a dict of PCollections resolving futures.
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [test_metadata.COMPLETE_METADATA])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                test_metadata.INCOMPLETE_METADATA, deferred_metadata)
            with mock.patch.object(transform_fn_io,
                                   '_copy_tree_to_unique_temp_dir',
                                   mock_copy_tree_to_unique_temp_dir):
                _ = ((saved_model_dir_pcoll, metadata)
                     | transform_fn_io.WriteTransformFn(transform_output_dir))

        # Test reading with TFTransformOutput
        tf_transform_output = tft.TFTransformOutput(transform_output_dir)
        metadata = tf_transform_output.transformed_metadata
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

        transform_fn_dir = tf_transform_output.transform_savedmodel_dir
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
        # Check temp directory created by failed run was cleaned up.
        self.assertEqual(2, len(file_io.list_directory(transform_output_dir)))
Example #7
    def testWriteMetadataDeferredProperties(self):
        # Write deferred properties as metadata to disk.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()

            # Combine _TEST_METADATA with the complete (deferred) metadata.
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [_TEST_METADATA_COMPLETE])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA, deferred_metadata)

            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)
        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
Example #8
    def testWriteMetadataDeferredProperties(self):
        # Write deferred properties as metadata to disk.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()

            # Combine test metadata with a dict of PCollections resolving futures.
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                    'b': pipeline | 'CreateB' >> beam.Create([5])
                })

            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)
        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertMetadataEqual(metadata, _TEST_METADATA)
Example #9
    def testWriteMetadataDeferred(self):
        # Write metadata to disk using WriteMetadata PTransform, combining
        # incomplete metadata with (deferred) complete metadata.
        expected_asset_map = {'key': 'value'}
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [test_metadata.COMPLETE_METADATA])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                test_metadata.INCOMPLETE_METADATA, deferred_metadata,
                expected_asset_map)
            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

        with tf.io.gfile.GFile(
                os.path.join(path,
                             output_wrapper.TFTransformOutput.ASSET_MAP)) as f:
            asset_map = json.loads(f.read())
            self.assertDictEqual(asset_map, expected_asset_map)
Example #10
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    flattened_pcoll, input_values_pcoll_dict, input_metadata = dataset
    input_schema = input_metadata.schema

    input_values_pcoll_dict = input_values_pcoll_dict or dict()

    analyzer_cache.validate_dataset_keys(input_values_pcoll_dict.keys())

    with tf.Graph().as_default() as graph:

      with tf.name_scope('inputs'):
        feature_spec = input_schema.as_feature_spec()
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as an
        # output of a graph. We do this by applying tf.identity to all inputs of
        # the preprocessing_fn.  Note this applies at the level of raw tensors.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead.  A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

    pipeline = flattened_pcoll.pipeline
    serialized_tf_config = common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
        pipeline.runner)
    extra_args = common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        serialized_tf_config=serialized_tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        cache_location=self._cache_location)

    transform_fn_future = analysis_graph_builder.build(
        graph, input_signature, output_signature,
        input_values_pcoll_dict.keys(), self._cache_location)

    transform_fn_pcoll = nodes.Traverser(
        common.ConstructBeamPipelineVisitor(extra_args)).visit_value_node(
            transform_fn_future)

    # Infer metadata.  We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph.  The override tensors must
    # be "constant" in that they don't depend on input data.  The tensors can
    # depend on analyzer outputs though.  This allows us to set metadata that
    # depends on analyzer outputs. _augment_metadata will use the analyzer
    # outputs stored in `transform_fn` to compute the metadata in a
    # deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        |
        'ComputeDeferredMetadata' >> beam.Map(_infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return transform_fn_pcoll, full_metadata
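The two ValueError branches above reject a preprocessing_fn that returns an empty dict or that creates trainable variables. A minimal, hedged sketch of a preprocessing_fn that satisfies both checks (the feature names 'x' and 's' are hypothetical):

import tensorflow_transform as tft

def preprocessing_fn(inputs):
  """Returns a non-empty output dict and creates no trainable variables."""
  return {
      # Analyzer-backed mapper: the min and max are full-pass statistics.
      'x_scaled': tft.scale_to_0_1(inputs['x']),
      # The vocabulary is computed by an analyzer, then applied as a lookup.
      's_integerized': tft.compute_and_apply_vocabulary(inputs['s']),
  }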
Example #11
File: impl.py Project: qipa/transform
  def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict.
        output_metadata, pcollections = output_metadata
        schema = output_metadata.schema
        # Update DatasetMetadata to remove excluded outputs
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))
        # Update pcollections to keep only pcollections that resolve futures in
        # the updated metadata.
        unresolved_future_names = set(
            future.name for future in output_metadata.substitute_futures({}))
        pcollections = {
            name: pcollection
            for name, pcollection in six.iteritems(pcollections)
            if name in unresolved_future_names
        }
        # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            output_metadata, pcollections)
      else:
        schema = output_metadata.schema
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))

    serialized_tf_config = (
        common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            self.pipeline.runner))

    output_instances = (
        input_values
        | 'Batch' >> _BatchElements()
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_metadata.schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared(),
                passthrough_keys=Context.get_passthrough_keys(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(
            _convert_and_unbatch_to_instance_dicts,
            schema=output_metadata.schema,
            passthrough_keys=Context.get_passthrough_keys()))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
Example #12
File: impl.py Project: qipa/transform
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

      with tf.name_scope('inputs'):
        inputs = input_schema.as_batched_placeholders()
      # In order to avoid a bug where import_graph_def fails when the input_map
      # and return_elements of an imported graph are the same (b/34288791), we
      # avoid using the placeholder of an input column as an output of a graph.
      # We do this by applying tf.identity to all inputs of the
      # preprocessing_fn.  Note this applies at the level of raw tensors.
      outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

      # At this point we check that the preprocessing_fn has at least one
      # output. This is because if we allowed the output of preprocessing_fn to
      # be empty, we wouldn't be able to determine how many instances to
      # "unbatch" the output into.
      if not outputs:
        raise ValueError('The preprocessing function returned an empty dict')

      if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(
                graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

      # NOTE: it's important that create_phases is called directly after
      # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
      # collection which would break the logic in create_phases.
      phases = impl_helper.create_phases()

      # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
      # names to singleton PCollections containing a _TensorValue.  We compute
      # tensor_pcoll_mapping in phases, where at each phase we compute the
      # analyzers that are ready to run and update tensor_pcoll_mapping.
      tensor_pcoll_mapping = {}
      table_initializers = graph.get_collection_ref(
          tf.GraphKeys.TABLE_INITIALIZERS)
      original_table_initializers = list(table_initializers)
      del table_initializers[:]

      serialized_tf_config = (
          common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
              input_values.pipeline.runner))
      for level, phase in enumerate(phases):
        # Create a SavedModel that describes the mapping from the input data
        # to the inputs of the analyzers at this level.  The column names of the
        # outputs are the tensor names of the analyzer inputs in the graph.
        # This graph has the analyzer outputs computed so far replaced with
        # constants.
        analyzer_inputs = {}
        for analyzer in phase.analyzers:
          for input_tensor in analyzer.inputs:
            analyzer_inputs[input_tensor.name] = input_tensor
        table_initializers.extend(phase.table_initializers)
        unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, analyzer_inputs,
                               unbound_saved_model_dir)
        saved_model_dir = (
            tensor_pcoll_mapping
            | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
            _ReplaceTensorsWithConstants(unbound_saved_model_dir, base_temp_dir,
                                         input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to the
        # analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
            | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                _RunMetaGraphDoFn(
                    input_schema,
                    serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
        # map from tensor names to singleton PCollections of `_TensorValue`s.
        analyzer_outputs_dict = (
            analyzer_input_values
            | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
                phase.analyzers, base_temp_dir))

        # Update the mapping for all analyzers.
        tensor_pcoll_mapping.update(analyzer_outputs_dict)

      del table_initializers[:]
      table_initializers.extend(original_table_initializers)
      saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, outputs, saved_model_dir)
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              saved_model_dir, base_temp_dir, input_values.pipeline))

      # Infer metadata.  The metadata may contain Futures that refer to the
      # values of tensors in the graph.  In that case, the tensors must be
      # "constant" in that they don't depend on input data.  The tensors can
      # depend on analyzer outputs though.  This allows us to set metadata that
      # depends on analyzer outputs.
      #
      # We first extract the names of the tensors that are referenced by the
      # Futures, and then compute them by calling _ComputeScalarConstants with
      # the tensor-PCollection mapping representing the analyzer outputs.
      metadata = dataset_metadata.DatasetMetadata(
          schema=impl_helper.infer_feature_schema(outputs))

      deferred_metadata_tensor_names = {
          future.name
          for column_schema in metadata.schema.column_schemas.values()
          for future in column_schema.substitute_futures({})
      }
      name_pcoll_dict = (
          tensor_pcoll_mapping
          | 'ComputeTensorValues' >>
          _ComputeTensorValues(deferred_metadata_tensor_names, saved_model_dir,
                               input_values.pipeline))
      full_metadata = beam_metadata_io.BeamDatasetMetadata(
          metadata, name_pcoll_dict)

      _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

      return transform_fn, full_metadata
Example #13
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    # NOTE: it's important that create_phases is called directly after
    # run_preprocessing_fn, because we later mutate the graph's
    # TABLE_INITIALIZERS collection which would break the logic in
    # create_phases.
    graph, inputs, outputs = impl_helper.run_preprocessing_fn(
        self._preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)

    # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue.  We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level.  The column names of the
      # outputs are the tensor names of the analyzer inputs in the graph.  This
      # graph has the analyzer outputs computed so far replaced with constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzers:
        for input_tensor in analyzer.inputs:
          analyzer_inputs[input_tensor.name] = input_tensor
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(
          graph, inputs, analyzer_inputs, unbound_saved_model_dir)
      saved_model_dir = (
          tensor_pcoll_mapping
          | 'CreateSavedModelForAnaylzerInputs[%d]' % level
          >> _ReplaceTensorsWithConstants(
              unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

      # Run this saved model on the input dataset to obtain the inputs to the
      # analyzers.
      analyzer_input_values = (
          input_values
          | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
              _RunMetaGraphDoFn(
                  input_schema,
                  serialized_tf_config,
                  shared_graph_state_handle=shared.Shared()),
              saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

      # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
      # map from tensor names to singleton PCollections of `_TensorValue`s.
      analyzer_outputs_dict = (
          analyzer_input_values
          | 'ComputeAnalyzerOutputs[%d]' % level
          >> _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(analyzer_outputs_dict)

    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)
    transform_fn = (
        tensor_pcoll_mapping
        | 'ReplaceTensorsWithConstants'
        >> _ReplaceTensorsWithConstants(
            saved_model_dir, base_temp_dir, input_values.pipeline))

    # Infer metadata.  The metadata may contain Futures that refer to the values
    # of tensors in the graph.  In that case, the tensors must be "constant" in
    # that they don't depend on input data.  The tensors can depend on analyzer
    # outputs though.  This allows us to set metadata that depends on analyzer
    # outputs.
    #
    # We first extract the names of the tensors that are referenced by the
    # Futures, and then compute them by calling _ComputeScalarConstants with the
    # tensor-PCollection mapping representing the analyzer outputs.
    metadata = dataset_metadata.DatasetMetadata(
        schema=impl_helper.infer_feature_schema(graph, outputs))

    deferred_metadata_tensor_names = [
        future.name
        for column_schema in tft_api.get_column_schemas(graph).values()
        for future in column_schema.substitute_futures({})]
    name_pcoll_dict = (
        tensor_pcoll_mapping
        | 'ComputeTensorValues' >>
        _ComputeTensorValues(
            deferred_metadata_tensor_names, saved_model_dir,
            input_values.pipeline))
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, name_pcoll_dict)

    _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

    return transform_fn, full_metadata
Example #14
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict,
     input_metadata) = dataset
    if self._use_tfxio:
      input_schema = None
      input_tensor_adapter_config = input_metadata
    else:
      input_schema = input_metadata.schema
      input_tensor_adapter_config = None

    input_values_pcoll_dict = input_values_pcoll_dict or dict()

    with tf.compat.v1.Graph().as_default() as graph:

      with tf.compat.v1.name_scope('inputs'):
        if self._use_tfxio:
          specs = TensorAdapter(input_tensor_adapter_config).OriginalTypeSpecs()
        else:
          specs = schema_utils.schema_as_feature_spec(input_schema).feature_spec
        input_signature = impl_helper.batched_placeholders_from_specs(specs)
        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as an
        # output of a graph. We do this by applying tf.identity to all inputs of
        # the preprocessing_fn.  Note this applies at the level of raw tensors.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead.  A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(
                  tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)))

    pipeline = self.pipeline or (flattened_pcoll or next(
        v for v in input_values_pcoll_dict.values() if v is not None)).pipeline

    # Add a stage that inspects graph collections for API use counts and logs
    # them as a beam metric.
    _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph))

    tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
        type(pipeline.runner))
    extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        tf_config=tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        input_tensor_adapter_config=input_tensor_adapter_config,
        use_tfxio=self._use_tfxio,
        cache_pcoll_dict=dataset_cache_dict)

    transform_fn_future, cache_value_nodes = analysis_graph_builder.build(
        graph,
        input_signature,
        output_signature,
        input_values_pcoll_dict.keys(),
        cache_dict=dataset_cache_dict)
    traverser = nodes.Traverser(
        beam_common.ConstructBeamPipelineVisitor(extra_args))
    transform_fn_pcoll = traverser.visit_value_node(transform_fn_future)

    if cache_value_nodes is not None:
      output_cache_pcoll_dict = {}
      for (dataset_key,
           cache_key), value_node in six.iteritems(cache_value_nodes):
        if dataset_key not in output_cache_pcoll_dict:
          output_cache_pcoll_dict[dataset_key] = {}
        output_cache_pcoll_dict[dataset_key][cache_key] = (
            traverser.visit_value_node(value_node))
    else:
      output_cache_pcoll_dict = None

    # Infer metadata.  We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph.  The override tensors must
    # be "constant" in that they don't depend on input data.  The tensors can
    # depend on analyzer outputs though.  This allows us to set metadata that
    # depends on analyzer outputs. _infer_metadata_from_saved_model will use the
    # analyzer outputs stored in `transform_fn` to compute the metadata in a
    # deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        |
        'ComputeDeferredMetadata' >> beam.Map(_infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict
Example #15
    def expand(self, dataset):
        """Analyze the dataset.

        Args:
          dataset: A dataset.

        Returns:
          A TransformFn containing the deferred transform function.

        Raises:
          ValueError: If preprocessing_fn has no outputs.
        """
        input_values, input_metadata = dataset
        input_schema = input_metadata.schema

        base_temp_dir = Context.create_base_temp_dir()

        graph = tf.Graph()
        with graph.as_default():

            with tf.name_scope('inputs'):
                inputs = input_schema.as_batched_placeholders()
            # In order to avoid a bug where import_graph_def fails when the input_map
            # and return_elements of an imported graph are the same (b/34288791), we
            # avoid using the placeholder of an input column as an output of a graph.
            # We do this by applying tf.identity to all inputs of the
            # preprocessing_fn.  Note this applies at the level of raw tensors.
            outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

            # At this point we check that the preprocessing_fn has at least one
            # output. This is because if we allowed the output of preprocessing_fn to
            # be empty, we wouldn't be able to determine how many instances to
            # "unbatch" the output into.
            if not outputs:
                raise ValueError(
                    'The preprocessing function returned an empty dict')

            if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                raise ValueError(
                    'The preprocessing function contained trainable variables '
                    '{}'.format(
                        graph.get_collection_ref(
                            tf.GraphKeys.TRAINABLE_VARIABLES)))

            # NOTE: it's important that create_phases is called directly after
            # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
            # collection which would break the logic in create_phases.
            phases = impl_helper.create_phases()

            # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
            # names to singleton PCollections containing a _TensorValue.  We compute
            # tensor_pcoll_mapping in phases, where at each phase we compute the
            # analyzers that are ready to run and update tensor_pcoll_mapping.
            tensor_pcoll_mapping = {}
            table_initializers = graph.get_collection_ref(
                tf.GraphKeys.TABLE_INITIALIZERS)
            original_table_initializers = list(table_initializers)
            del table_initializers[:]

            serialized_tf_config = (
                common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                    input_values.pipeline.runner))
            for level, phase in enumerate(phases):
                # Create a SavedModel that describes the mapping from the input data
                # to the inputs of the analyzers at this level.  The column names of the
                # outputs are the tensor names of the analyzer inputs in the graph.
                # This graph has the analyzer outputs computed so far replaced with
                # constants.
                analyzer_inputs = {}
                for analyzer in phase.analyzer_infos:
                    for input_tensor_name in analyzer.input_tensor_names:
                        analyzer_inputs[
                            input_tensor_name] = graph.get_tensor_by_name(
                                input_tensor_name)
                table_initializers.extend(phase.table_initializers)
                unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
                _write_saved_transform(graph, inputs, analyzer_inputs,
                                       unbound_saved_model_dir)

                tensor_pcoll_mapping_update = (
                    (input_values, tensor_pcoll_mapping)
                    | 'RunPhase[{}]'.format(level) >> _RunPhase(
                        phase.analyzer_infos, unbound_saved_model_dir,
                        base_temp_dir, input_schema, serialized_tf_config))

                # Update the mapping for all analyzers.
                tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

            del table_initializers[:]
            table_initializers.extend(original_table_initializers)
            saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, outputs, saved_model_dir)
            transform_fn = (
                tensor_pcoll_mapping
                |
                'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
                    saved_model_dir, base_temp_dir, input_values.pipeline))

            # Infer metadata.  We take the inferred metadata and apply overrides that
            # refer to values of tensors in the graph.  The override tensors must
            # be "constant" in that they don't depend on input data.  The tensors can
            # depend on analyzer outputs though.  This allows us to set metadata that
            # depends on analyzer outputs. _augment_metadata will use the analyzer
            # outputs stored in `transform_fn` to compute the metadata in a
            # deferred manner, once the analyzer outputs are known.
            metadata = dataset_metadata.DatasetMetadata(
                schema=impl_helper.infer_feature_schema(outputs))

            deferred_metadata = (transform_fn
                                 | 'ComputeDeferredMetadata' >> beam.Map(
                                     _augment_metadata, metadata))

            full_metadata = beam_metadata_io.BeamDatasetMetadata(
                metadata, deferred_metadata)

            _clear_shared_state_after_barrier(input_values.pipeline,
                                              transform_fn)

            return transform_fn, full_metadata