def default_transforming_serving_input_receiver_fn():
  """Serving Input Receiver that applies transforms to raw data in Tensors."""
  feature_spec = raw_metadata.schema.as_feature_spec()
  batched_placeholders = impl_helper.feature_spec_as_batched_placeholders(
      feature_spec)
  raw_serving_features = {
      k: v
      for k, v in six.iteritems(batched_placeholders)
      if k in include_raw_keys
  }
  # Iterate over the placeholder values (not the dict keys) when checking for
  # sparse features; iterating the dict directly would only yield key strings.
  sparse_serving_features = [
      t for t in six.itervalues(raw_serving_features)
      if isinstance(t, tf.SparseTensor)
  ]
  if sparse_serving_features:
    raise ValueError("Feeding sparse tensors directly at serving time is not "
                     "supported.")
  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform_internal(
          transform_savedmodel_dir, raw_serving_features))
  if convert_scalars_to_vectors:
    transformed_features = _convert_scalars_to_vectors(transformed_features)
  return tf.estimator.export.ServingInputReceiver(transformed_features,
                                                  raw_serving_features)
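# Hedged usage sketch (illustrative, not from the original source): a serving
# input receiver fn like the one above is typically handed to the TF 1.x
# Estimator export API. `estimator` and the export path are hypothetical.
#
#   estimator.export_saved_model(
#       '/tmp/exported_serving_model',
#       default_transforming_serving_input_receiver_fn)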
def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                            dataset_input_cache_dict, expected_dot_graph_str):
  span_0_key, span_1_key = 'span-0', 'span-1'
  if dataset_input_cache_dict is not None:
    cache = {span_0_key: dataset_input_cache_dict}
  else:
    cache = {}

  with tf.compat.v1.name_scope('inputs'):
    input_signature = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
  output_signature = preprocessing_fn(input_signature)
  transform_fn_future, cache_output_dict = analysis_graph_builder.build(
      tf.compat.v1.get_default_graph(), input_signature, output_signature,
      {span_0_key, span_1_key}, cache)

  leaf_nodes = [transform_fn_future] + sorted(
      cache_output_dict.values(), key=str)
  dot_string = nodes.get_dot_graph(leaf_nodes).to_string()
  self.WriteRenderedDotFile(dot_string)

  self.assertSameElements(
      dot_string.split('\n'),
      expected_dot_graph_str.split('\n'),
      msg='Result dot graph is:\n{}'.format(dot_string))
def test_perform_combiner_packing_optimization(
    self, feature_spec, preprocessing_fn, num_phases,
    expected_dot_graph_str_before_packing,
    expected_dot_graph_str_after_packing):
  with tf.compat.v1.Graph().as_default() as graph:
    with tf.compat.v1.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
    output_signature = preprocessing_fn(input_signature)

    def _side_effect_fn(saved_model_future, cache_value_nodes,
                        unused_num_phases):
      return (saved_model_future, cache_value_nodes)

    with mock.patch.object(
        combiner_packing_util,
        'perform_combiner_packing_optimization',
        side_effect=_side_effect_fn):
      transform_fn_future_before, unused_cache = analysis_graph_builder.build(
          graph, input_signature, output_signature)
  transform_fn_future_after, unused_cache = (
      combiner_packing_util.perform_combiner_packing_optimization(
          transform_fn_future_before, unused_cache, num_phases))
  dot_string_before = nodes.get_dot_graph([transform_fn_future_before
                                          ]).to_string()
  self.assertMultiLineEqual(
      msg='Result dot graph is:\n{}'.format(dot_string_before),
      first=dot_string_before,
      second=expected_dot_graph_str_before_packing)
  dot_string_after = nodes.get_dot_graph([transform_fn_future_after
                                         ]).to_string()
  self.WriteRenderedDotFile(dot_string_after)
  self.assertMultiLineEqual(
      msg='Result dot graph is:\n{}'.format(dot_string_after),
      first=dot_string_after,
      second=expected_dot_graph_str_after_packing)
def test_feature_spec_as_batched_placeholders(self):
  feature_spec = {
      'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
      'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
      '_var_len_underscored': tf.io.VarLenFeature(tf.string),
      'var_len_int': tf.io.VarLenFeature(tf.int64)
  }
  with tf.compat.v1.Graph().as_default():
    features = impl_helper.feature_spec_as_batched_placeholders(feature_spec)
  self.assertCountEqual(features.keys(), [
      'fixed_len_float', 'fixed_len_string', 'var_len_int',
      '_var_len_underscored'
  ])
  self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
  self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                   [None, 2, 3])
  self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
  self.assertEqual(features['fixed_len_string'].get_shape().as_list(), [None])
  self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
  self.assertEqual(features['var_len_int'].get_shape().as_list(),
                   [None, None])
  self.assertEqual(type(features['_var_len_underscored']), tf.SparseTensor)
  self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                   [None, None])
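# Hedged standalone sketch (not part of the original test): it mirrors the
# assertions above, assuming impl_helper is importable from
# tensorflow_transform and a TF 1.x-style graph via tf.compat.v1.
import tensorflow as tf
from tensorflow_transform import impl_helper

def demo_batched_placeholders():
  feature_spec = {
      'dense': tf.io.FixedLenFeature([2], tf.float32),
      'sparse': tf.io.VarLenFeature(tf.string),
  }
  with tf.compat.v1.Graph().as_default():
    placeholders = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
    # FixedLenFeature yields a dense placeholder with a leading batch
    # dimension ([None, 2]); VarLenFeature yields a SparseTensor placeholder
    # of shape [None, None].
    for name, tensor in sorted(placeholders.items()):
      print(name, type(tensor).__name__, tensor.get_shape().as_list())

demo_batched_placeholders()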
def _RunInPlaceImpl(self, preprocessing_fn: Any,
                    metadata: dataset_metadata.DatasetMetadata,
                    transform_output_path: Text) -> _Status:
  """Runs a transformation iteration in-place without looking at the data.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    metadata: A DatasetMetadata object for the input data.
    transform_output_path: An absolute path to write the output to.

  Returns:
    Status of the execution.
  """
  tf.logging.info('Processing an in-place transform')

  raw_metadata_dir = os.path.join(transform_output_path,
                                  tft.TFTransformOutput.RAW_METADATA_DIR)
  metadata_io.write_metadata(metadata, raw_metadata_dir)

  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as sess:

      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          schema_utils.schema_as_feature_spec(
              _GetSchemaProto(metadata)).feature_spec)

      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead. A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = preprocessing_fn(copied_inputs)
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      transform_fn_path = os.path.join(transform_output_path,
                                       tft.TFTransformOutput.TRANSFORM_FN_DIR)
      saved_transform_io.write_saved_transform_from_session(
          sess, input_signature, output_signature, transform_fn_path)

      transformed_metadata = dataset_metadata.DatasetMetadata(
          schema=tft.schema_inference.infer_feature_schema(
              output_signature, graph, sess))

  transformed_metadata_dir = os.path.join(
      transform_output_path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)

  return _Status.OK()
def _RunInPlaceImpl(self, preprocessing_fn, metadata, transform_output_path):
  """Runs a transformation iteration in-place without looking at the data.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    metadata: A DatasetMetadata object for the input data.
    transform_output_path: An absolute path to write the output to.

  Returns:
    Status of the execution.
  """
  tf.logging.info('Processing an in-place transform')

  raw_metadata_dir = os.path.join(transform_output_path,
                                  tft.TFTransformOutput.RAW_METADATA_DIR)
  metadata_io.write_metadata(metadata, raw_metadata_dir)

  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as sess:

      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          metadata.schema.as_feature_spec())

      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead. A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = preprocessing_fn(copied_inputs)
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      transform_fn_path = os.path.join(transform_output_path,
                                       tft.TFTransformOutput.TRANSFORM_FN_DIR)
      saved_transform_io.write_saved_transform_from_session(
          sess, input_signature, output_signature, transform_fn_path)

      transformed_metadata = dataset_metadata.DatasetMetadata(
          schema=tft.schema_inference.infer_feature_schema(
              output_signature, graph, sess))

  transformed_metadata_dir = os.path.join(
      transform_output_path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)

  return _Status.OK()
def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
  with tf.name_scope('inputs'):
    input_signature = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
  output_signature = preprocessing_fn(input_signature)
  transform_fn_future = analysis_graph_builder.build(
      tf.get_default_graph(), input_signature, output_signature)
  dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
  self.WriteRenderedDotFile(dot_string)
  self.assertMultiLineEqual(
      msg='Result dot graph is:\n{}'.format(dot_string),
      first=dot_string,
      second=expected_dot_graph_str)
def get_transform_input_columns(preprocessing_fn, feature_spec):
  """Return columns that are required inputs of `TransformDataset`.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.

  Returns:
    A list of columns that are required inputs of the transform `tf.Graph`
    defined by `preprocessing_fn`.
  """
  with tf.compat.v1.Graph().as_default() as graph:
    input_signature = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
    output_signature = preprocessing_fn(input_signature.copy())
    transform_input_tensors = graph_tools.get_dependent_inputs(
        graph, input_signature, output_signature)
    # Return a list (not a dict view) to match the docstring.
    return list(transform_input_tensors.keys())
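# Hedged usage sketch for get_transform_input_columns above, assuming the
# snippet's module imports are in scope. The preprocessing_fn and feature
# names here are illustrative; tft.scale_to_0_1 is a real tf.Transform mapper.
import tensorflow as tf
import tensorflow_transform as tft

def _example_preprocessing_fn(inputs):
  # Consumes only 'a'; 'b' is never read by the transform graph.
  return {'a_scaled': tft.scale_to_0_1(inputs['a'])}

_example_feature_spec = {
    'a': tf.io.FixedLenFeature([], tf.float32),
    'b': tf.io.FixedLenFeature([], tf.float32),
}
# Expected to report 'a' but not 'b'.
print(get_transform_input_columns(_example_preprocessing_fn,
                                  _example_feature_spec))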
def _build_analysis_graph_for_inspection(preprocessing_fn, feature_spec,
                                         dataset_keys, input_cache):
  """Builds the analysis graph for inspection."""
  with tf.compat.v1.Graph().as_default() as graph:
    with tf.compat.v1.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
      # TODO(b/34288791): This needs to be exactly the same as in impl.py
      copied_inputs = impl_helper.copy_tensors(input_signature)

    output_signature = preprocessing_fn(copied_inputs)
    transform_fn_future, cache_dict = build(
        graph,
        input_signature,
        output_signature,
        dataset_keys=dataset_keys,
        cache_dict=input_cache)
  return transform_fn_future, cache_dict
def get_analysis_dataset_keys(preprocessing_fn, feature_spec, dataset_keys,
                              input_cache):
  """Computes the dataset keys that are required in order to perform analysis.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.
    dataset_keys: A set of strings which are dataset keys; they uniquely
      identify these datasets across analysis runs.
    input_cache: A cache dictionary.

  Returns:
    A pair of:
      - A set of dataset keys that are required for analysis.
      - A boolean indicating whether or not a flattened version of the entire
        dataset is required. See the `flat_data` input to
        `AnalyzeDatasetWithCache`.
  """
  with tf.Graph().as_default() as graph:
    with tf.compat.v1.name_scope('inputs'):
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
      # TODO(b/34288791): This needs to be exactly the same as in impl.py
      copied_inputs = impl_helper.copy_tensors(input_signature)

    output_signature = preprocessing_fn(copied_inputs)
    transform_fn_future, _ = build(
        graph,
        input_signature,
        output_signature,
        dataset_keys=dataset_keys,
        cache_dict=input_cache)

  required_dataset_keys_result = set()
  inspect_visitor = _InspectVisitor(required_dataset_keys_result)
  inspect_traverser = nodes.Traverser(inspect_visitor)
  _ = inspect_traverser.visit_value_node(transform_fn_future)

  # If None is present, a flattened version of the entire dataset is required,
  # so all of the given dataset_keys are returned.
  flat_data_required = None in required_dataset_keys_result
  if flat_data_required:
    required_dataset_keys_result = dataset_keys
  return required_dataset_keys_result, flat_data_required
def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                            write_cache_fn, expected_dot_graph_str):
  cache_location = self._make_cache_location()

  span_0_key, span_1_key = 'span-0', 'span-1'
  if write_cache_fn is not None:
    write_cache_fn(cache_location.input_cache_dir, [span_0_key, span_1_key])

  with tf.name_scope('inputs'):
    input_signature = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
  output_signature = preprocessing_fn(input_signature)
  transform_fn_future = analysis_graph_builder.build(
      tf.get_default_graph(), input_signature, output_signature,
      {span_0_key, span_1_key}, cache_location)

  dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
  self.WriteRenderedDotFile(dot_string)

  self.assertSameElements(
      dot_string.split('\n'),
      expected_dot_graph_str.split('\n'),
      msg='Result dot graph is:\n{}'.format(dot_string))
def get_analyze_input_columns(preprocessing_fn, feature_spec):
  """Return columns that are required inputs of `AnalyzeDataset`.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.

  Returns:
    A list of columns that are required inputs of analyzers.
  """
  with tf.compat.v1.Graph().as_default() as graph:
    input_signature = impl_helper.feature_spec_as_batched_placeholders(
        feature_spec)
    _ = preprocessing_fn(input_signature.copy())

    tensor_sinks = graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS)
    visitor = _SourcedTensorsVisitor()
    for tensor_sink in tensor_sinks:
      nodes.Traverser(visitor).visit_value_node(tensor_sink.future)

    analyze_input_tensors = graph_tools.get_dependent_inputs(
        graph, input_signature, visitor.sourced_tensors)
    # Return a list (not a dict view) to match the docstring.
    return list(analyze_input_tensors.keys())
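# Companion sketch for get_analyze_input_columns above, with the same
# illustrative names as the previous example. tft.scale_to_z_score introduces
# mean/variance analyzers over 'a', so 'a' is an analyze input, while 'b' is
# only passed through and should not be reported.
import tensorflow as tf
import tensorflow_transform as tft

def _example_preprocessing_fn(inputs):
  return {
      'a_z': tft.scale_to_z_score(inputs['a']),
      'b_copy': tf.identity(inputs['b']),  # no analyzer reads 'b'
  }

_example_feature_spec = {
    'a': tf.io.FixedLenFeature([], tf.float32),
    'b': tf.io.FixedLenFeature([], tf.float32),
}
# Expected to report 'a' only.
print(get_analyze_input_columns(_example_preprocessing_fn,
                                _example_feature_spec))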
def expand(self, dataset):
  """Analyze the dataset.

  Args:
    dataset: A dataset.

  Returns:
    A TransformFn containing the deferred transform function.

  Raises:
    ValueError: If preprocessing_fn has no outputs.
  """
  flattened_pcoll, input_values_pcoll_dict, input_metadata = dataset
  input_schema = input_metadata.schema
  input_values_pcoll_dict = input_values_pcoll_dict or dict()

  analyzer_cache.validate_dataset_keys(input_values_pcoll_dict.keys())

  with tf.Graph().as_default() as graph:

    with tf.name_scope('inputs'):
      feature_spec = input_schema.as_feature_spec()
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          feature_spec)
      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead. A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

    output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

    pipeline = flattened_pcoll.pipeline
    serialized_tf_config = common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
        pipeline.runner)
    extra_args = common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        serialized_tf_config=serialized_tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        cache_location=self._cache_location)

    transform_fn_future = analysis_graph_builder.build(
        graph, input_signature, output_signature,
        input_values_pcoll_dict.keys(), self._cache_location)

  transform_fn_pcoll = nodes.Traverser(
      common.ConstructBeamPipelineVisitor(extra_args)).visit_value_node(
          transform_fn_future)

  # Infer metadata. We take the inferred metadata and apply overrides that
  # refer to values of tensors in the graph. The override tensors must
  # be "constant" in that they don't depend on input data. The tensors can
  # depend on analyzer outputs though. This allows us to set metadata that
  # depends on analyzer outputs. _augment_metadata will use the analyzer
  # outputs stored in `transform_fn` to compute the metadata in a
  # deferred manner, once the analyzer outputs are known.
  metadata = dataset_metadata.DatasetMetadata(
      schema=schema_inference.infer_feature_schema(output_signature, graph))

  deferred_metadata = (
      transform_fn_pcoll
      | 'ComputeDeferredMetadata' >> beam.Map(
          _infer_metadata_from_saved_model))

  full_metadata = beam_metadata_io.BeamDatasetMetadata(
      metadata, deferred_metadata)

  _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

  return transform_fn_pcoll, full_metadata
def expand(self, dataset):
  """Analyze the dataset.

  Args:
    dataset: A dataset.

  Returns:
    A TransformFn containing the deferred transform function.

  Raises:
    ValueError: If preprocessing_fn has no outputs.
  """
  input_values, input_metadata = dataset
  input_schema = input_metadata.schema

  base_temp_dir = Context.create_base_temp_dir()

  with tf.Graph().as_default() as graph:

    with tf.name_scope('inputs'):
      feature_spec = input_schema.as_feature_spec()
      inputs = impl_helper.feature_spec_as_batched_placeholders(feature_spec)
    # In order to avoid a bug where import_graph_def fails when the input_map
    # and return_elements of an imported graph are the same (b/34288791), we
    # avoid using the placeholder of an input column as an output of a graph.
    # We do this by applying tf.identity to all inputs of the
    # preprocessing_fn. Note this applies at the level of raw tensors.
    outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not outputs:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

    # NOTE: it's important that create_phases is called directly after
    # preprocessing_fn, because we later mutate the graph's
    # TABLE_INITIALIZERS collection which would break the logic in
    # create_phases.
    phases = impl_helper.create_phases(inputs)

    # Iterate through levels. tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue. We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    serialized_tf_config = (
        common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level. The column names of the
      # outputs are the tensor names of the analyzer inputs in the graph.
      # This graph has the analyzer outputs computed so far replaced with
      # constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzer_infos:
        for input_tensor_name in analyzer.input_tensor_names:
          analyzer_inputs[input_tensor_name] = graph.get_tensor_by_name(
              input_tensor_name)
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = common.make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, analyzer_inputs,
                             unbound_saved_model_dir)

      tensor_pcoll_mapping_update = (
          (input_values, tensor_pcoll_mapping)
          | 'RunPhase[{}]'.format(level) >> _RunPhase(
              phase.analyzer_infos, unbound_saved_model_dir, base_temp_dir,
              input_schema, serialized_tf_config, level))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = common.make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)

  transform_fn = (
      tensor_pcoll_mapping
      | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
          saved_model_dir, base_temp_dir, input_values.pipeline))

  # Infer metadata. We take the inferred metadata and apply overrides that
  # refer to values of tensors in the graph. The override tensors must
  # be "constant" in that they don't depend on input data. The tensors can
  # depend on analyzer outputs though. This allows us to set metadata that
  # depends on analyzer outputs. _augment_metadata will use the analyzer
  # outputs stored in `transform_fn` to compute the metadata in a
  # deferred manner, once the analyzer outputs are known.
  metadata = dataset_metadata.DatasetMetadata(
      schema=schema_inference.infer_feature_schema(outputs, graph))

  deferred_metadata = (
      transform_fn
      | 'ComputeDeferredMetadata' >> beam.Map(
          _infer_metadata_from_saved_model))

  full_metadata = beam_metadata_io.BeamDatasetMetadata(
      metadata, deferred_metadata)

  _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

  return transform_fn, full_metadata
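# Hedged end-to-end sketch of how an AnalyzeDataset-style expand() like the
# one above is typically driven through tf.Transform's public Beam API. The
# data, temp dir, and preprocessing_fn are illustrative, and the exact public
# surface varied across TFT 0.x releases.
import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))

def _toy_preprocessing_fn(inputs):
  return {'x_centered': inputs['x'] - tft.mean(inputs['x'])}

with beam.Pipeline() as pipeline:
  with tft_beam.Context(temp_dir='/tmp/tft_tmp'):
    data = pipeline | beam.Create([{'x': 1.0}, {'x': 2.0}, {'x': 3.0}])
    # AnalyzeDataset returns the deferred transform fn (a PCollection holding
    # a SavedModel path) together with its metadata, mirroring the
    # (transform_fn, full_metadata) pair returned by expand() above.
    transform_fn = (
        (data, _metadata) | tft_beam.AnalyzeDataset(_toy_preprocessing_fn))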