Example #1
  def default_transforming_serving_input_receiver_fn():
    """Serving Input Receiver that applies transforms to raw data in Tensors."""

    feature_spec = raw_metadata.schema.as_feature_spec()
    batched_placeholders = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
    raw_serving_features = {
        k: v
        for k, v in six.iteritems(batched_placeholders)
        if k in include_raw_keys}

    # Iterate over the tensors (dict values), not the keys, so the
    # SparseTensor check below can actually fire.
    sparse_serving_features = [t for t in six.itervalues(raw_serving_features)
                               if isinstance(t, tf.SparseTensor)]
    if sparse_serving_features:
      raise ValueError("Feeding sparse tensors directly at serving time is not "
                       "supported.")

    _, transformed_features = (
        saved_transform_io.partially_apply_saved_transform_internal(
            transform_savedmodel_dir, raw_serving_features))

    if convert_scalars_to_vectors:
      transformed_features = _convert_scalars_to_vectors(transformed_features)

    return tf.estimator.export.ServingInputReceiver(
        transformed_features, raw_serving_features)
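Note: a receiver fn like this is typically handed to an Estimator export call. A minimal usage sketch, where `estimator` and `export_dir` are hypothetical stand-ins for a trained tf.estimator.Estimator and an output directory:

# Hypothetical sketch: `estimator` and `export_dir` are assumed to exist.
estimator.export_saved_model(
    export_dir,
    serving_input_receiver_fn=default_transforming_serving_input_receiver_fn)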
Example #2
    def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                                dataset_input_cache_dict,
                                expected_dot_graph_str):
        span_0_key, span_1_key = 'span-0', 'span-1'
        if dataset_input_cache_dict is not None:
            cache = {span_0_key: dataset_input_cache_dict}
        else:
            cache = {}

        with tf.compat.v1.name_scope('inputs'):
            input_signature = impl_helper.feature_spec_as_batched_placeholders(
                feature_spec)
        output_signature = preprocessing_fn(input_signature)
        transform_fn_future, cache_output_dict = analysis_graph_builder.build(
            tf.compat.v1.get_default_graph(), input_signature,
            output_signature, {span_0_key, span_1_key}, cache)

        leaf_nodes = [transform_fn_future] + sorted(cache_output_dict.values(),
                                                    key=str)
        dot_string = nodes.get_dot_graph(leaf_nodes).to_string()
        self.WriteRenderedDotFile(dot_string)

        self.assertSameElements(
            dot_string.split('\n'),
            expected_dot_graph_str.split('\n'),
            msg='Result dot graph is:\n{}'.format(dot_string))

    def test_perform_combiner_packing_optimization(
            self, feature_spec, preprocessing_fn, num_phases,
            expected_dot_graph_str_before_packing,
            expected_dot_graph_str_after_packing):
        with tf.compat.v1.Graph().as_default() as graph:
            with tf.compat.v1.name_scope('inputs'):
                input_signature = impl_helper.feature_spec_as_batched_placeholders(
                    feature_spec)
            output_signature = preprocessing_fn(input_signature)

            def _side_effect_fn(saved_model_future, cache_value_nodes,
                                unused_num_phases):
                return (saved_model_future, cache_value_nodes)

            with mock.patch.object(combiner_packing_util,
                                   'perform_combiner_packing_optimization',
                                   side_effect=_side_effect_fn):
                transform_fn_future_before, unused_cache = analysis_graph_builder.build(
                    graph, input_signature, output_signature)
            transform_fn_future_after, unused_cache = (
                combiner_packing_util.perform_combiner_packing_optimization(
                    transform_fn_future_before, unused_cache, num_phases))
        dot_string_before = nodes.get_dot_graph([transform_fn_future_before
                                                 ]).to_string()
        self.assertMultiLineEqual(
            msg='Result dot graph is:\n{}'.format(dot_string_before),
            first=dot_string_before,
            second=expected_dot_graph_str_before_packing)
        dot_string_after = nodes.get_dot_graph([transform_fn_future_after
                                                ]).to_string()
        self.WriteRenderedDotFile(dot_string_after)
        self.assertMultiLineEqual(
            msg='Result dot graph is:\n{}'.format(dot_string_after),
            first=dot_string_after,
            second=expected_dot_graph_str_after_packing)
Example #4
 def test_feature_spec_as_batched_placeholders(self):
   feature_spec = {
       'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
       'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
       '_var_len_underscored': tf.io.VarLenFeature(tf.string),
       'var_len_int': tf.io.VarLenFeature(tf.int64)
   }
   with tf.compat.v1.Graph().as_default():
     features = impl_helper.feature_spec_as_batched_placeholders(feature_spec)
   self.assertCountEqual(features.keys(), [
       'fixed_len_float', 'fixed_len_string', 'var_len_int',
       '_var_len_underscored'
   ])
   self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
   self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                    [None, 2, 3])
   self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
   self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                    [None])
   self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
   self.assertEqual(features['var_len_int'].get_shape().as_list(),
                    [None, None])
   self.assertEqual(type(features['_var_len_underscored']), tf.SparseTensor)
   self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                    [None, None])
Example #5
File: executor.py  Project: rummens/tfx
    def _RunInPlaceImpl(self, preprocessing_fn: Any,
                        metadata: dataset_metadata.DatasetMetadata,
                        transform_output_path: Text) -> _Status:
        """Runs a transformation iteration in-place without looking at the data.

    Args:
      preprocessing_fn: The tf.Transform preprocessing_fn.
      metadata: A DatasetMetadata object for the input data.
      transform_output_path: An absolute path to write the output to.

    Returns:
      Status of the execution.
    """

        tf.logging.info('Processing an in-place transform')

        raw_metadata_dir = os.path.join(transform_output_path,
                                        tft.TFTransformOutput.RAW_METADATA_DIR)
        metadata_io.write_metadata(metadata, raw_metadata_dir)

        with tf.Graph().as_default() as graph:
            with tf.Session(graph=graph) as sess:

                input_signature = impl_helper.feature_spec_as_batched_placeholders(
                    schema_utils.schema_as_feature_spec(
                        _GetSchemaProto(metadata)).feature_spec)

                # In order to avoid a bug where import_graph_def fails when the
                # input_map and return_elements of an imported graph are the same
                # (b/34288791), we avoid using the placeholder of an input column as an
                # output of a graph. We do this by applying tf.identity to all inputs of
                # the preprocessing_fn.  Note this applies at the level of raw tensors.
                # TODO(b/34288791): Remove this workaround and use a shallow copy of
                # inputs instead.  A shallow copy is needed in case
                # self._preprocessing_fn mutates its input.
                copied_inputs = impl_helper.copy_tensors(input_signature)

                output_signature = preprocessing_fn(copied_inputs)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                transform_fn_path = os.path.join(
                    transform_output_path,
                    tft.TFTransformOutput.TRANSFORM_FN_DIR)
                saved_transform_io.write_saved_transform_from_session(
                    sess, input_signature, output_signature, transform_fn_path)

                transformed_metadata = dataset_metadata.DatasetMetadata(
                    schema=tft.schema_inference.infer_feature_schema(
                        output_signature, graph, sess))

        transformed_metadata_dir = os.path.join(
            transform_output_path,
            tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
        metadata_io.write_metadata(transformed_metadata,
                                   transformed_metadata_dir)

        return _Status.OK()
Example #6
File: executor.py  Project: luvneries/tfx
  def _RunInPlaceImpl(self, preprocessing_fn,
                      metadata,
                      transform_output_path):
    """Runs a transformation iteration in-place without looking at the data.

    Args:
      preprocessing_fn: The tf.Transform preprocessing_fn.
      metadata: A DatasetMetadata object for the input data.
      transform_output_path: An absolute path to write the output to.

    Returns:
      Status of the execution.
    """

    tf.logging.info('Processing an in-place transform')

    raw_metadata_dir = os.path.join(transform_output_path,
                                    tft.TFTransformOutput.RAW_METADATA_DIR)
    metadata_io.write_metadata(metadata, raw_metadata_dir)

    with tf.Graph().as_default() as graph:
      with tf.Session(graph=graph) as sess:

        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            metadata.schema.as_feature_spec())

        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as an
        # output of a graph. We do this by applying tf.identity to all inputs of
        # the preprocessing_fn.  Note this applies at the level of raw tensors.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead.  A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        copied_inputs = impl_helper.copy_tensors(input_signature)

        output_signature = preprocessing_fn(copied_inputs)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        transform_fn_path = os.path.join(transform_output_path,
                                         tft.TFTransformOutput.TRANSFORM_FN_DIR)
        saved_transform_io.write_saved_transform_from_session(
            sess, input_signature, output_signature, transform_fn_path)

        transformed_metadata = dataset_metadata.DatasetMetadata(
            schema=tft.schema_inference.infer_feature_schema(
                output_signature, graph, sess))

    transformed_metadata_dir = os.path.join(
        transform_output_path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)

    return _Status.OK()
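Note: both in-place executors above write the standard tf.Transform output layout (raw metadata, the transform_fn SavedModel, and transformed metadata) under transform_output_path. A minimal read-back sketch, assuming such a run completed; the path below is hypothetical:

import tensorflow_transform as tft

# Hypothetical path: wherever transform_output_path pointed during the run.
tft_output = tft.TFTransformOutput('/tmp/transform_output')
print(tft_output.transformed_feature_spec())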
Example #7
  def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
    with tf.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
    output_signature = preprocessing_fn(input_signature)
    transform_fn_future = analysis_graph_builder.build(
        tf.get_default_graph(), input_signature, output_signature)

    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)

    self.assertMultiLineEqual(
        msg='Result dot graph is:\n{}'.format(dot_string),
        first=dot_string,
        second=expected_dot_graph_str)
Example #8
def get_transform_input_columns(preprocessing_fn, feature_spec):
    """Return columns that are required inputs of `TransformDataset`.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.

  Returns:
    A list of columns that are required inputs of the transform `tf.Graph`
    defined by `preprocessing_fn`.
  """
    with tf.compat.v1.Graph().as_default() as graph:
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
        output_signature = preprocessing_fn(input_signature.copy())
        transform_input_tensors = graph_tools.get_dependent_inputs(
            graph, input_signature, output_signature)
        # Wrap in list() so the return value matches the docstring (dict.keys()
        # returns a view in Python 3).
        return list(transform_input_tensors.keys())
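Note: a minimal sketch of calling this helper; the toy preprocessing_fn and feature names below are hypothetical:

import tensorflow as tf
import tensorflow_transform as tft

def _toy_preprocessing_fn(inputs):
  # The mean is baked into the transform graph as a constant, so only 'x'
  # is needed as a transform-time input; 'unused' should not be reported.
  return {'x_centered': inputs['x'] - tft.mean(inputs['x'])}

toy_feature_spec = {
    'x': tf.io.FixedLenFeature([], tf.float32),
    'unused': tf.io.FixedLenFeature([], tf.string),
}
print(get_transform_input_columns(_toy_preprocessing_fn, toy_feature_spec))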
Example #9
def _build_analysis_graph_for_inspection(
    preprocessing_fn, feature_spec, dataset_keys, input_cache):
  """Builds the analysis graph for inspection."""
  with tf.compat.v1.Graph().as_default() as graph:
    with tf.compat.v1.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
      # TODO(b/34288791): This needs to be exactly the same as in impl.py
      copied_inputs = impl_helper.copy_tensors(input_signature)

    output_signature = preprocessing_fn(copied_inputs)
  transform_fn_future, cache_dict = build(
      graph,
      input_signature,
      output_signature,
      dataset_keys=dataset_keys,
      cache_dict=input_cache)
  return transform_fn_future, cache_dict
Example #10
def get_analysis_dataset_keys(preprocessing_fn, feature_spec, dataset_keys,
                              input_cache):
  """Computes the dataset keys that are required in order to perform analysis.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.
    dataset_keys: A set of strings that serve as dataset keys; they uniquely
      identify these datasets across analysis runs.
    input_cache: A cache dictionary.

  Returns:
    A pair of:
      - A set of dataset keys that are required for analysis.
      - A boolean indicating whether or not a flattened version of the entire
        dataset is required. See the `flat_data` input to
        `AnalyzeDatasetWithCache`.
  """
  with tf.Graph().as_default() as graph:
    with tf.compat.v1.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
      # TODO(b/34288791): This needs to be exactly the same as in impl.py
      copied_inputs = impl_helper.copy_tensors(input_signature)

    output_signature = preprocessing_fn(copied_inputs)
  transform_fn_future, _ = build(
      graph,
      input_signature,
      output_signature,
      dataset_keys=dataset_keys,
      cache_dict=input_cache)

  required_dataset_keys_result = set()
  inspect_visitor = _InspectVisitor(required_dataset_keys_result)
  inspect_traverser = nodes.Traverser(inspect_visitor)
  _ = inspect_traverser.visit_value_node(transform_fn_future)

  # If None is present, a flattened version of the entire dataset is required,
  # so all of the given dataset_keys are returned.
  flat_data_required = None in required_dataset_keys_result
  if flat_data_required:
    required_dataset_keys_result = dataset_keys
  return required_dataset_keys_result, flat_data_required
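Note: a hedged sketch of the call shape, using a hypothetical pass-through preprocessing_fn (it contains no analyzers, so no per-dataset analysis should be required):

import tensorflow as tf

def _passthrough_fn(inputs):
  return {'x_copy': tf.identity(inputs['x'])}

required_keys, flat_required = get_analysis_dataset_keys(
    _passthrough_fn,
    {'x': tf.io.FixedLenFeature([], tf.float32)},
    dataset_keys={'span-0', 'span-1'},
    input_cache={})
print(required_keys, flat_required)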
Example #11
  def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                              write_cache_fn, expected_dot_graph_str):
    cache_location = self._make_cache_location()
    span_0_key, span_1_key = 'span-0', 'span-1'
    if write_cache_fn is not None:
      write_cache_fn(cache_location.input_cache_dir, [span_0_key, span_1_key])

    with tf.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
    output_signature = preprocessing_fn(input_signature)
    transform_fn_future = analysis_graph_builder.build(
        tf.get_default_graph(), input_signature, output_signature,
        {span_0_key, span_1_key}, cache_location)

    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)

    self.assertSameElements(
        dot_string.split('\n'),
        expected_dot_graph_str.split('\n'),
        msg='Result dot graph is:\n{}'.format(dot_string))
Example #12
def get_analyze_input_columns(preprocessing_fn, feature_spec):
    """Return columns that are required inputs of `AnalyzeDataset`.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.

  Returns:
    A list of columns that are required inputs of analyzers.
  """
    with tf.compat.v1.Graph().as_default() as graph:
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
        _ = preprocessing_fn(input_signature.copy())

        tensor_sinks = graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS)
        visitor = _SourcedTensorsVisitor()
        for tensor_sink in tensor_sinks:
            nodes.Traverser(visitor).visit_value_node(tensor_sink.future)

        analyze_input_tensors = graph_tools.get_dependent_inputs(
            graph, input_signature, visitor.sourced_tensors)
        # Wrap in list() so the return value matches the docstring (dict.keys()
        # returns a view in Python 3).
        return list(analyze_input_tensors.keys())
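Note: this is the analysis-time counterpart of get_transform_input_columns in Example #8. For a toy preprocessing_fn, a feature consumed only by an analyzer shows up here but not among the transform inputs. A hedged sketch with hypothetical names:

import tensorflow as tf
import tensorflow_transform as tft

def _toy_preprocessing_fn(inputs):
  # 's' feeds only the analyzer; 'x' is also needed at transform time.
  return {'x_scaled': inputs['x'] / tft.mean(inputs['s'])}

toy_feature_spec = {
    'x': tf.io.FixedLenFeature([], tf.float32),
    's': tf.io.FixedLenFeature([], tf.float32),
}
print(get_analyze_input_columns(_toy_preprocessing_fn, toy_feature_spec))    # expect ['s']
print(get_transform_input_columns(_toy_preprocessing_fn, toy_feature_spec))  # expect ['x']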
Example #13
File: impl.py  Project: sswapnil2/transform
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    flattened_pcoll, input_values_pcoll_dict, input_metadata = dataset
    input_schema = input_metadata.schema

    input_values_pcoll_dict = input_values_pcoll_dict or dict()

    analyzer_cache.validate_dataset_keys(input_values_pcoll_dict.keys())

    with tf.Graph().as_default() as graph:

      with tf.name_scope('inputs'):
        feature_spec = input_schema.as_feature_spec()
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as an
        # output of a graph. We do this by applying tf.identity to all inputs of
        # the preprocessing_fn.  Note this applies at the level of raw tensors.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead.  A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

    pipeline = flattened_pcoll.pipeline
    serialized_tf_config = common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
        pipeline.runner)
    extra_args = common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        serialized_tf_config=serialized_tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        cache_location=self._cache_location)

    transform_fn_future = analysis_graph_builder.build(
        graph, input_signature, output_signature,
        input_values_pcoll_dict.keys(), self._cache_location)

    transform_fn_pcoll = nodes.Traverser(
        common.ConstructBeamPipelineVisitor(extra_args)).visit_value_node(
            transform_fn_future)

    # Infer metadata.  We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph.  The override tensors must
    # be "constant" in that they don't depend on input data.  The tensors can
    # depend on analyzer outputs though.  This allows us to set metadata that
    # depends on analyzer outputs. _augment_metadata will use the analyzer
    # outputs stored in `transform_fn` to compute the metadata in a
    # deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        |
        'ComputeDeferredMetadata' >> beam.Map(_infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return transform_fn_pcoll, full_metadata
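Note: this expand method is what runs when tft_beam.AnalyzeDataset is applied in a Beam pipeline. A minimal driver sketch under that assumption, with toy data, following the canonical tf.Transform usage:

import tempfile

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

def preprocessing_fn(inputs):
  return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

raw_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))

with beam.Pipeline() as pipeline:
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transform_fn = (
        (pipeline | 'CreateRawData' >> beam.Create(raw_data), raw_metadata)
        | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))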
Example #14
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
        input_values, input_metadata = dataset
        input_schema = input_metadata.schema

        base_temp_dir = Context.create_base_temp_dir()

        with tf.Graph().as_default() as graph:

            with tf.name_scope('inputs'):
                feature_spec = input_schema.as_feature_spec()
                inputs = impl_helper.feature_spec_as_batched_placeholders(
                    feature_spec)
            # In order to avoid a bug where import_graph_def fails when the input_map
            # and return_elements of an imported graph are the same (b/34288791), we
            # avoid using the placeholder of an input column as an output of a graph.
            # We do this by applying tf.identity to all inputs of the
            # preprocessing_fn.  Note this applies at the level of raw tensors.
            outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

            # At this point we check that the preprocessing_fn has at least one
            # output. This is because if we allowed the output of preprocessing_fn to
            # be empty, we wouldn't be able to determine how many instances to
            # "unbatch" the output into.
            if not outputs:
                raise ValueError(
                    'The preprocessing function returned an empty dict')

            if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                raise ValueError(
                    'The preprocessing function contained trainable variables '
                    '{}'.format(
                        graph.get_collection_ref(
                            tf.GraphKeys.TRAINABLE_VARIABLES)))

            # NOTE: it's important that create_phases is called directly after
            # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
            # collection which would break the logic in create_phases.
            phases = impl_helper.create_phases(inputs)

            # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
            # names to singleton PCollections containing a _TensorValue.  We compute
            # tensor_pcoll_mapping in phases, where at each phase we compute the
            # analyzers that are ready to run and update tensor_pcoll_mapping.
            tensor_pcoll_mapping = {}
            table_initializers = graph.get_collection_ref(
                tf.GraphKeys.TABLE_INITIALIZERS)
            original_table_initializers = list(table_initializers)
            del table_initializers[:]

            serialized_tf_config = (
                common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                    input_values.pipeline.runner))
            for level, phase in enumerate(phases):
                # Create a SavedModel that describes the mapping from the input data
                # to the inputs of the analyzers at this level.  The column names of
                # the outputs are the tensor names of the analyzer inputs in the
                # graph.  This graph has the analyzer outputs computed so far
                # replaced with constants.
                analyzer_inputs = {}
                for analyzer in phase.analyzer_infos:
                    for input_tensor_name in analyzer.input_tensor_names:
                        analyzer_inputs[
                            input_tensor_name] = graph.get_tensor_by_name(
                                input_tensor_name)
                table_initializers.extend(phase.table_initializers)
                unbound_saved_model_dir = common.make_unique_temp_dir(
                    base_temp_dir)
                _write_saved_transform(graph, inputs, analyzer_inputs,
                                       unbound_saved_model_dir)

                tensor_pcoll_mapping_update = (
                    (input_values, tensor_pcoll_mapping)
                    | 'RunPhase[{}]'.format(level) >> _RunPhase(
                        phase.analyzer_infos, unbound_saved_model_dir,
                        base_temp_dir, input_schema, serialized_tf_config,
                        level))

                # Update the mapping for all analyzers.
                tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

            del table_initializers[:]
            table_initializers.extend(original_table_initializers)
            saved_model_dir = common.make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, outputs, saved_model_dir)
            transform_fn = (
                tensor_pcoll_mapping
                |
                'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
                    saved_model_dir, base_temp_dir, input_values.pipeline))

            # Infer metadata.  We take the inferred metadata and apply overrides that
            # refer to values of tensors in the graph.  The override tensors must
            # be "constant" in that they don't depend on input data.  The tensors can
            # depend on analyzer outputs though.  This allows us to set metadata that
            # depends on analyzer outputs. _augment_metadata will use the analyzer
            # outputs stored in `transform_fn` to compute the metadata in a
            # deferred manner, once the analyzer outputs are known.
            metadata = dataset_metadata.DatasetMetadata(
                schema=schema_inference.infer_feature_schema(outputs, graph))

            deferred_metadata = (transform_fn
                                 | 'ComputeDeferredMetadata' >>
                                 beam.Map(_infer_metadata_from_saved_model))

            full_metadata = beam_metadata_io.BeamDatasetMetadata(
                metadata, deferred_metadata)

            _clear_shared_state_after_barrier(input_values.pipeline,
                                              transform_fn)

            return transform_fn, full_metadata