def test_copy_tensors_produces_equivalent_tensors(self):
    """Copied tensors evaluate to the values fed into the originals."""
    originals = {
        'dense': tf.compat.v1.placeholder(
            tf.int64, (None,), name='my_dense_input'),
        'sparse': tf.compat.v1.sparse_placeholder(
            tf.int64, name='my_sparse_input'),
    }
    copies = impl_helper.copy_tensors(originals)

    with tf.compat.v1.Session() as session:
        fed_dense = [1, 2]
        fed_sparse = tf.compat.v1.SparseTensorValue(
            indices=[[0, 0], [0, 2], [1, 1]],
            values=[3, 4, 5],
            dense_shape=[2, 3])
        # Values are fed through the ORIGINAL placeholders while the COPIES
        # are fetched, so a match shows the copies depend on the originals.
        feeds = {
            originals['dense']: fed_dense,
            originals['sparse']: fed_sparse,
        }
        fetched = session.run(copies, feed_dict=feeds)

        self.assertAllEqual(fetched['dense'], fed_dense)
        # Compare the sparse result component by component.
        for component in ('indices', 'values', 'dense_shape'):
            self.assertAllEqual(
                getattr(fetched['sparse'], component),
                getattr(fed_sparse, component))
def test_copy_tensors_produces_different_tensors(self):
    """Copies are not equal to the tensors they were copied from."""
    with tf.compat.v1.Graph().as_default():
        originals = {
            'dense': tf.compat.v1.placeholder(
                tf.int64, (None,), name='my_dense_input'),
            'sparse': tf.compat.v1.sparse_placeholder(
                tf.int64, name='my_sparse_input'),
            'ragged': tf.compat.v1.ragged.placeholder(
                tf.int64, ragged_rank=2, name='my_ragged_input'),
        }
        copies = impl_helper.copy_tensors(originals)

        self.assertNotEqual(originals['dense'], copies['dense'])

        # Composite tensors are checked through their components.
        component_paths = (
            ('sparse', 'indices'),
            ('sparse', 'values'),
            ('sparse', 'dense_shape'),
            ('ragged', 'values'),
            ('ragged', 'row_splits'),
        )
        for key, attribute in component_paths:
            self.assertNotEqual(
                getattr(originals[key], attribute),
                getattr(copies[key], attribute))
def _RunInPlaceImpl(self, preprocessing_fn: Any,
                    metadata: dataset_metadata.DatasetMetadata,
                    transform_output_path: Text) -> _Status:
    """Runs a transformation iteration in-place without looking at the data.

    The raw metadata is written first. `preprocessing_fn` is then applied to
    batched placeholders derived from the schema, the resulting transform is
    saved, and the metadata inferred from its outputs is written as well.

    Args:
      preprocessing_fn: The tf.Transform preprocessing_fn.
      metadata: A DatasetMetadata object for the input data.
      transform_output_path: An absolute path to write the output to.

    Returns:
      Status of the execution.
    """
    tf.logging.info('Processing an in-place transform')

    metadata_io.write_metadata(
        metadata,
        os.path.join(transform_output_path,
                     tft.TFTransformOutput.RAW_METADATA_DIR))

    with tf.Graph().as_default() as graph, tf.Session(graph=graph) as sess:
        feature_spec = schema_utils.schema_as_feature_spec(
            _GetSchemaProto(metadata)).feature_spec
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)

        # import_graph_def fails when the input_map and return_elements of an
        # imported graph are the same (b/34288791). To avoid ever using an
        # input placeholder as a graph output, the preprocessing_fn is handed
        # copies (tf.identity applied at the level of raw tensors) instead of
        # the placeholders themselves.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead. A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        output_signature = preprocessing_fn(
            impl_helper.copy_tensors(input_signature))

        # Variables first, then tables, each op created right before its run.
        for make_initializer in (tf.global_variables_initializer,
                                 tf.tables_initializer):
            sess.run(make_initializer())

        saved_transform_io.write_saved_transform_from_session(
            sess, input_signature, output_signature,
            os.path.join(transform_output_path,
                         tft.TFTransformOutput.TRANSFORM_FN_DIR))

        inferred_schema = tft.schema_inference.infer_feature_schema(
            output_signature, graph, sess)
        transformed_metadata = dataset_metadata.DatasetMetadata(
            schema=inferred_schema)

    metadata_io.write_metadata(
        transformed_metadata,
        os.path.join(transform_output_path,
                     tft.TFTransformOutput.TRANSFORMED_METADATA_DIR))

    return _Status.OK()
def _RunInPlaceImpl(self, preprocessing_fn, metadata, transform_output_path):
    """Runs a transformation iteration in-place without looking at the data.

    The raw metadata is written first. `preprocessing_fn` is then applied to
    batched placeholders derived from the schema, the resulting transform is
    saved, and the metadata inferred from its outputs is written as well.

    Args:
      preprocessing_fn: The tf.Transform preprocessing_fn.
      metadata: A DatasetMetadata object for the input data.
      transform_output_path: An absolute path to write the output to.

    Returns:
      Status of the execution.
    """
    tf.logging.info('Processing an in-place transform')

    metadata_io.write_metadata(
        metadata,
        os.path.join(transform_output_path,
                     tft.TFTransformOutput.RAW_METADATA_DIR))

    with tf.Graph().as_default() as graph, tf.Session(graph=graph) as sess:
        feature_spec = metadata.schema.as_feature_spec()
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)

        # import_graph_def fails when the input_map and return_elements of an
        # imported graph are the same (b/34288791). To avoid ever using an
        # input placeholder as a graph output, the preprocessing_fn is handed
        # copies (tf.identity applied at the level of raw tensors) instead of
        # the placeholders themselves.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead. A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        output_signature = preprocessing_fn(
            impl_helper.copy_tensors(input_signature))

        # Variables first, then tables, each op created right before its run.
        for make_initializer in (tf.global_variables_initializer,
                                 tf.tables_initializer):
            sess.run(make_initializer())

        saved_transform_io.write_saved_transform_from_session(
            sess, input_signature, output_signature,
            os.path.join(transform_output_path,
                         tft.TFTransformOutput.TRANSFORM_FN_DIR))

        inferred_schema = tft.schema_inference.infer_feature_schema(
            output_signature, graph, sess)
        transformed_metadata = dataset_metadata.DatasetMetadata(
            schema=inferred_schema)

    metadata_io.write_metadata(
        transformed_metadata,
        os.path.join(transform_output_path,
                     tft.TFTransformOutput.TRANSFORMED_METADATA_DIR))

    return _Status.OK()
def testCopyTensorsCopiesProducesDifferentTensors(self):
    """Copies are not equal to the tensors they were copied from."""
    originals = {
        'dense': tf.placeholder(tf.int64, (None,), name='my_dense_input'),
        'sparse': tf.sparse_placeholder(tf.int64, name='my_sparse_input'),
    }
    copies = impl_helper.copy_tensors(originals)

    self.assertNotEqual(originals['dense'], copies['dense'])
    # The sparse tensor is checked through each of its component tensors.
    for component in ('indices', 'values', 'dense_shape'):
        self.assertNotEqual(
            getattr(originals['sparse'], component),
            getattr(copies['sparse'], component))
def _build_analysis_graph_for_inspection(preprocessing_fn, specs,
                                         dataset_keys, input_cache):
    """Builds the analysis graph for inspection.

    Applies `preprocessing_fn` to copies of batched placeholders created from
    `specs` in a fresh graph, then hands the graph and both signatures to
    `build`.

    Args:
      preprocessing_fn: Callable applied to the copied input tensors.
      specs: Passed to `impl_helper.batched_placeholders_from_specs`.
      dataset_keys: Forwarded to `build` as `dataset_keys`.
      input_cache: Forwarded to `build` as `cache_dict`.

    Returns:
      The pair returned by `build`: `(transform_fn_future, cache_dict)`.
    """
    with tf.compat.v1.Graph().as_default() as graph:
        with tf.compat.v1.name_scope('inputs'):
            input_signature = impl_helper.batched_placeholders_from_specs(
                specs)
            # TODO(b/34288791): This needs to be exactly the same as in impl.py
            inputs_for_fn = impl_helper.copy_tensors(input_signature)

        output_signature = preprocessing_fn(inputs_for_fn)

    future, cache = build(
        graph,
        input_signature,
        output_signature,
        dataset_keys=dataset_keys,
        cache_dict=input_cache)
    return future, cache
def test_copy_tensors_produces_equivalent_tensors(self):
    """Copied tensors evaluate to the values fed into the originals."""
    with tf.compat.v1.Graph().as_default():
        originals = {
            'dense': tf.compat.v1.placeholder(
                tf.int64, (None,), name='my_dense_input'),
            'sparse': tf.compat.v1.sparse_placeholder(
                tf.int64, name='my_sparse_input'),
            'ragged': tf.compat.v1.ragged.placeholder(
                tf.int64, ragged_rank=1, name='my_ragged_input'),
        }
        copies = impl_helper.copy_tensors(originals)

        with tf.compat.v1.Session() as session:
            fed = {
                'dense': [1, 2],
                'sparse': tf.compat.v1.SparseTensorValue(
                    indices=[[0, 0], [0, 2], [1, 1]],
                    values=[3, 4, 5],
                    dense_shape=[2, 3]),
                'ragged': tf.compat.v1.ragged.RaggedTensorValue(
                    values=np.array([3, 4, 5], dtype=np.int64),
                    row_splits=np.array([0, 2, 3], dtype=np.int64)),
            }
            # Values are fed through the ORIGINAL placeholders while the
            # COPIES are fetched.
            feeds = {
                originals['dense']: fed['dense'],
                originals['sparse']: fed['sparse'],
                originals['ragged']: fed['ragged'],
            }
            fetched = session.run(copies, feed_dict=feeds)

            self.assertAllEqual(fetched['dense'], fed['dense'])

            # Composite results are compared component by component.
            component_paths = (
                ('sparse', 'indices'),
                ('sparse', 'values'),
                ('sparse', 'dense_shape'),
                ('ragged', 'values'),
                ('ragged', 'row_splits'),
            )
            for key, attribute in component_paths:
                self.assertAllEqual(
                    getattr(fetched[key], attribute),
                    getattr(fed[key], attribute))
def get_analysis_dataset_keys(preprocessing_fn, feature_spec, dataset_keys,
                              input_cache):
    """Computes the dataset keys that are required in order to perform analysis.

    Args:
      preprocessing_fn: A tf.transform preprocessing_fn.
      feature_spec: A dict of feature name to feature specification.
      dataset_keys: A set of strings which are dataset keys, they uniquely
        identify these datasets across analysis runs.
      input_cache: A cache dictionary.

    Returns:
      A pair of:
        - The dataset keys that are required for analysis. When flattened
          data is required this is the `dataset_keys` argument itself.
        - A boolean indicating whether or not a flattened version of the
          entire dataset is required. See the `flat_data` input to
          `AnalyzeDatasetWithCache`.
    """
    with tf.Graph().as_default() as graph:
        with tf.compat.v1.name_scope('inputs'):
            input_signature = (
                impl_helper.feature_spec_as_batched_placeholders(feature_spec))
            # TODO(b/34288791): This needs to be exactly the same as in impl.py
            inputs_for_fn = impl_helper.copy_tensors(input_signature)

        output_signature = preprocessing_fn(inputs_for_fn)

    future, _unused_cache = build(
        graph,
        input_signature,
        output_signature,
        dataset_keys=dataset_keys,
        cache_dict=input_cache)

    # The visitor fills this set in while the traverser walks the graph.
    found_keys = set()
    nodes.Traverser(_InspectVisitor(found_keys)).visit_value_node(future)

    # A `None` entry means a flattened version of the entire dataset is
    # required, in which case every given dataset key is reported.
    needs_flat_data = None in found_keys
    return (dataset_keys if needs_flat_data else found_keys), needs_flat_data
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A triple `(flattened_pcoll, input_values_pcoll_dict,
        input_metadata)`. `input_values_pcoll_dict` may be falsy, in which
        case an empty dict is used.

    Returns:
      A pair `(transform_fn_pcoll, full_metadata)`: the deferred transform
      function and a `BeamDatasetMetadata` combining the inferred metadata
      with its deferred part.

    Raises:
      ValueError: If preprocessing_fn has no outputs, or if the graph it
        built contains trainable variables.
    """
    flattened_pcoll, pcolls_by_key, input_metadata = dataset
    input_schema = input_metadata.schema
    pcolls_by_key = pcolls_by_key or {}

    analyzer_cache.validate_dataset_keys(pcolls_by_key.keys())

    with tf.Graph().as_default() as graph:
        with tf.name_scope('inputs'):
            input_signature = (
                impl_helper.feature_spec_as_batched_placeholders(
                    input_schema.as_feature_spec()))
            # import_graph_def fails when the input_map and return_elements
            # of an imported graph are the same (b/34288791). To avoid ever
            # using an input placeholder as a graph output, the
            # preprocessing_fn is handed copies (tf.identity applied at the
            # level of raw tensors) instead of the placeholders themselves.
            # TODO(b/34288791): Remove this workaround and use a shallow copy
            # of inputs instead. A shallow copy is needed in case
            # self._preprocessing_fn mutates its input.
            inputs_for_fn = impl_helper.copy_tensors(input_signature)

        output_signature = self._preprocessing_fn(inputs_for_fn)

    # The preprocessing_fn must have at least one output: with an empty
    # output there would be no way to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
        raise ValueError('The preprocessing function returned an empty dict')

    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    if graph.get_collection(trainable_key):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(graph.get_collection_ref(trainable_key)))

    pipeline = flattened_pcoll.pipeline
    # pylint: disable=protected-access
    serialized_tf_config = common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
        pipeline.runner)
    # pylint: enable=protected-access
    visitor_args = common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        serialized_tf_config=serialized_tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=pcolls_by_key,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        cache_location=self._cache_location)

    transform_fn_future = analysis_graph_builder.build(
        graph, input_signature, output_signature, pcolls_by_key.keys(),
        self._cache_location)

    beam_visitor = common.ConstructBeamPipelineVisitor(visitor_args)
    transform_fn_pcoll = nodes.Traverser(beam_visitor).visit_value_node(
        transform_fn_future)

    # Infer metadata. We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph. The override tensors must be
    # "constant" in that they don't depend on input data. The tensors can
    # depend on analyzer outputs though. This allows us to set metadata that
    # depends on analyzer outputs. _augment_metadata will use the analyzer
    # outputs stored in `transform_fn` to compute the metadata in a deferred
    # manner, once the analyzer outputs are known.
    inferred_schema = schema_inference.infer_feature_schema(
        output_signature, graph)
    metadata = dataset_metadata.DatasetMetadata(schema=inferred_schema)

    deferred_metadata = (
        transform_fn_pcoll
        | 'ComputeDeferredMetadata' >> beam.Map(
            _infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return transform_fn_pcoll, full_metadata
def expand(self, dataset):
    """Analyze the dataset.

    The preprocessing_fn is applied once to placeholder inputs. Its analyzers
    are then computed phase by phase, with one saved model written per phase,
    and a final saved model is written for the full outputs.

    Args:
      dataset: A pair `(input_values, input_metadata)`. `input_values` is the
        PCollection the per-phase transforms are applied to, and
        `input_metadata.schema` provides the batched placeholders.

    Returns:
      A pair `(transform_fn, full_metadata)`: the deferred transform function
      and a `BeamDatasetMetadata` built from the inferred metadata plus the
      deferred tensor values.

    Raises:
      ValueError: If preprocessing_fn has no outputs, or if the graph it
        built contains trainable variables.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

        with tf.name_scope('inputs'):
            inputs = input_schema.as_batched_placeholders()
        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as
        # an output of a graph. We do this by applying tf.identity to all
        # inputs of the preprocessing_fn. Note this applies at the level of
        # raw tensors.
        outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

        # At this point we check that the preprocessing_fn has at least one
        # output. This is because if we allowed the output of
        # preprocessing_fn to be empty, we wouldn't be able to determine how
        # many instances to "unbatch" the output into.
        if not outputs:
            raise ValueError(
                'The preprocessing function returned an empty dict')

        if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            raise ValueError(
                'The preprocessing function contained trainable variables '
                '{}'.format(
                    graph.get_collection_ref(
                        tf.GraphKeys.TRAINABLE_VARIABLES)))

        # NOTE: it's important that create_phases is called directly after
        # preprocessing_fn, because we later mutate the graph's
        # TABLE_INITIALIZERS collection which would break the logic in
        # create_phases.
        phases = impl_helper.create_phases()

        # Iterate through levels. tensor_pcoll_mapping is a mapping from
        # tensor names to singleton PCollections containing a _TensorValue.
        # We compute tensor_pcoll_mapping in phases, where at each phase we
        # compute the analyzers that are ready to run and update
        # tensor_pcoll_mapping.
        tensor_pcoll_mapping = {}
        # `table_initializers` is the graph's live collection list, not a
        # copy. It is emptied here, refilled phase by phase inside the loop,
        # and restored from `original_table_initializers` after the loop.
        table_initializers = graph.get_collection_ref(
            tf.GraphKeys.TABLE_INITIALIZERS)
        original_table_initializers = list(table_initializers)
        del table_initializers[:]

        serialized_tf_config = (
            common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                input_values.pipeline.runner))
        for level, phase in enumerate(phases):
            # Create a SavedModel that describes the mapping from the input
            # data to the inputs of the analyzers at this level. The column
            # names of the outputs are the tensor names of the analyzer
            # inputs in the graph. This graph has the analyzer outputs
            # computed so far replaced with constants.
            analyzer_inputs = {}
            for analyzer in phase.analyzers:
                for input_tensor in analyzer.inputs:
                    analyzer_inputs[input_tensor.name] = input_tensor
            # Initializers accumulate across phases: each phase adds its own
            # to those already added by earlier phases.
            table_initializers.extend(phase.table_initializers)
            unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, analyzer_inputs,
                                   unbound_saved_model_dir)
            saved_model_dir = (
                tensor_pcoll_mapping
                | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
                _ReplaceTensorsWithConstants(unbound_saved_model_dir,
                                             base_temp_dir,
                                             input_values.pipeline))

            # Run this saved model on the input dataset to obtain the inputs
            # to the analyzers.
            analyzer_input_values = (
                input_values
                | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
                | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                    _RunMetaGraphDoFn(
                        input_schema,
                        serialized_tf_config,
                        shared_graph_state_handle=shared.Shared(),
                        passthrough_keys=Context.get_passthrough_keys()),
                    saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

            # Compute the analyzers from their inputs.
            # `analyzer_outputs_dict` is a map from tensor names to singleton
            # PCollections of `_TensorValue`s.
            analyzer_outputs_dict = (
                analyzer_input_values
                | 'ComputeAnalyzerOutputs[%d]' % level >>
                _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

            # Update the mapping for all analyzers.
            tensor_pcoll_mapping.update(analyzer_outputs_dict)

        # Restore the collection to its original contents before writing the
        # final saved model.
        del table_initializers[:]
        table_initializers.extend(original_table_initializers)
        saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, outputs, saved_model_dir)
        transform_fn = (
            tensor_pcoll_mapping
            | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
                saved_model_dir, base_temp_dir, input_values.pipeline))

        # Infer metadata. The metadata may contain Futures that refer to the
        # values of tensors in the graph. In that case, the tensors must be
        # "constant" in that they don't depend on input data. The tensors can
        # depend on analyzer outputs though. This allows us to set metadata
        # that depends on analyzer outputs.
        #
        # We first extract the names of the tensors that are referenced by
        # the Futures, and then compute them by calling _ComputeTensorValues
        # with the tensor-PCollection mapping representing the analyzer
        # outputs.
        metadata = dataset_metadata.DatasetMetadata(
            schema=impl_helper.infer_feature_schema(outputs))

        deferred_metadata_tensor_names = {
            future.name
            for column_schema in metadata.schema.column_schemas.values()
            for future in column_schema.substitute_futures({})
        }
        name_pcoll_dict = (
            tensor_pcoll_mapping
            | 'ComputeTensorValues' >> _ComputeTensorValues(
                deferred_metadata_tensor_names, saved_model_dir,
                input_values.pipeline))
        full_metadata = beam_metadata_io.BeamDatasetMetadata(
            metadata, name_pcoll_dict)

        _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

        return transform_fn, full_metadata
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A 4-tuple `(flattened_pcoll, input_values_pcoll_dict,
        dataset_cache_dict, input_metadata)`. When `self._use_tfxio` is set,
        `input_metadata` is used as the tensor adapter config; otherwise its
        `.schema` attribute is used as the input schema.
        `input_values_pcoll_dict` may be falsy, in which case an empty dict
        is used.

    Returns:
      A pair `((transform_fn_pcoll, full_metadata), output_cache_pcoll_dict)`.
      The first element holds the deferred transform function and its
      metadata. The second is a dict keyed by dataset key and then cache key,
      or `None` when the graph builder returned no cache value nodes.

    Raises:
      ValueError: If preprocessing_fn has no outputs, or if the graph it
        built contains trainable variables.
    """
    (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict,
     input_metadata) = dataset
    # Exactly one of `input_schema` / `input_tensor_adapter_config` is set,
    # depending on the input mode.
    if self._use_tfxio:
        input_schema = None
        input_tensor_adapter_config = input_metadata
    else:
        input_schema = input_metadata.schema
        input_tensor_adapter_config = None

    input_values_pcoll_dict = input_values_pcoll_dict or dict()

    with tf.compat.v1.Graph().as_default() as graph:

        with tf.compat.v1.name_scope('inputs'):
            if self._use_tfxio:
                specs = TensorAdapter(
                    input_tensor_adapter_config).OriginalTypeSpecs()
            else:
                specs = schema_utils.schema_as_feature_spec(
                    input_schema).feature_spec
            input_signature = impl_helper.batched_placeholders_from_specs(
                specs)
            # In order to avoid a bug where import_graph_def fails when the
            # input_map and return_elements of an imported graph are the same
            # (b/34288791), we avoid using the placeholder of an input column
            # as an output of a graph. We do this by applying tf.identity to
            # all inputs of the preprocessing_fn. Note this applies at the
            # level of raw tensors.
            # TODO(b/34288791): Remove this workaround and use a shallow copy
            # of inputs instead. A shallow copy is needed in case
            # self._preprocessing_fn mutates its input.
            copied_inputs = impl_helper.copy_tensors(input_signature)

        output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
        raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(
                graph.get_collection_ref(
                    tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)))

    # Pipeline lookup order: this transform's own pipeline, then the
    # flattened PCollection, then the first non-None PCollection in the dict.
    pipeline = self.pipeline or (flattened_pcoll or next(
        v for v in input_values_pcoll_dict.values() if v is not None)).pipeline

    # Add a stage that inspects graph collections for API use counts and logs
    # them as a beam metric.
    _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph))

    tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
        type(pipeline.runner))
    extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        tf_config=tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        input_tensor_adapter_config=input_tensor_adapter_config,
        use_tfxio=self._use_tfxio,
        cache_pcoll_dict=dataset_cache_dict)

    transform_fn_future, cache_value_nodes = analysis_graph_builder.build(
        graph,
        input_signature,
        output_signature,
        input_values_pcoll_dict.keys(),
        cache_dict=dataset_cache_dict)
    # The same traverser instance is reused for the transform function and
    # for every cache value node below.
    traverser = nodes.Traverser(
        beam_common.ConstructBeamPipelineVisitor(extra_args))
    transform_fn_pcoll = traverser.visit_value_node(transform_fn_future)

    if cache_value_nodes is not None:
        # Regroup the flat {(dataset_key, cache_key): node} mapping into a
        # nested {dataset_key: {cache_key: visited value}} dict.
        output_cache_pcoll_dict = {}
        for (dataset_key,
             cache_key), value_node in six.iteritems(cache_value_nodes):
            if dataset_key not in output_cache_pcoll_dict:
                output_cache_pcoll_dict[dataset_key] = {}
            output_cache_pcoll_dict[dataset_key][cache_key] = (
                traverser.visit_value_node(value_node))
    else:
        output_cache_pcoll_dict = None

    # Infer metadata. We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph. The override tensors must
    # be "constant" in that they don't depend on input data. The tensors can
    # depend on analyzer outputs though. This allows us to set metadata that
    # depends on analyzer outputs. _infer_metadata_from_saved_model will use
    # the analyzer outputs stored in `transform_fn` to compute the metadata
    # in a deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        | 'ComputeDeferredMetadata' >>
        beam.Map(_infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict
def expand(self, dataset):
    """Analyze the dataset.

    The preprocessing_fn is applied once to placeholder inputs. Its analyzers
    are then computed phase by phase through `_RunPhase`, and a final saved
    model is written for the full outputs.

    Args:
      dataset: A pair `(input_values, input_metadata)`. `input_values` is the
        PCollection handed to each phase, and `input_metadata.schema`
        provides the batched placeholders.

    Returns:
      A pair `(transform_fn, full_metadata)`: the deferred transform function
      and a `BeamDatasetMetadata` combining the inferred metadata with its
      deferred part.

    Raises:
      ValueError: If preprocessing_fn has no outputs, or if the graph it
        built contains trainable variables.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

        with tf.name_scope('inputs'):
            inputs = input_schema.as_batched_placeholders()
        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as
        # an output of a graph. We do this by applying tf.identity to all
        # inputs of the preprocessing_fn. Note this applies at the level of
        # raw tensors.
        outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

        # At this point we check that the preprocessing_fn has at least one
        # output. This is because if we allowed the output of
        # preprocessing_fn to be empty, we wouldn't be able to determine how
        # many instances to "unbatch" the output into.
        if not outputs:
            raise ValueError(
                'The preprocessing function returned an empty dict')

        if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            raise ValueError(
                'The preprocessing function contained trainable variables '
                '{}'.format(
                    graph.get_collection_ref(
                        tf.GraphKeys.TRAINABLE_VARIABLES)))

        # NOTE: it's important that create_phases is called directly after
        # preprocessing_fn, because we later mutate the graph's
        # TABLE_INITIALIZERS collection which would break the logic in
        # create_phases.
        phases = impl_helper.create_phases()

        # Iterate through levels. tensor_pcoll_mapping is a mapping from
        # tensor names to singleton PCollections containing a _TensorValue.
        # We compute tensor_pcoll_mapping in phases, where at each phase we
        # compute the analyzers that are ready to run and update
        # tensor_pcoll_mapping.
        tensor_pcoll_mapping = {}
        # `table_initializers` is the graph's live collection list, not a
        # copy. It is emptied here, refilled phase by phase inside the loop,
        # and restored from `original_table_initializers` after the loop.
        table_initializers = graph.get_collection_ref(
            tf.GraphKeys.TABLE_INITIALIZERS)
        original_table_initializers = list(table_initializers)
        del table_initializers[:]

        serialized_tf_config = (
            common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                input_values.pipeline.runner))
        for level, phase in enumerate(phases):
            # Create a SavedModel that describes the mapping from the input
            # data to the inputs of the analyzers at this level. The column
            # names of the outputs are the tensor names of the analyzer
            # inputs in the graph. This graph has the analyzer outputs
            # computed so far replaced with constants.
            analyzer_inputs = {}
            for analyzer in phase.analyzer_infos:
                for input_tensor_name in analyzer.input_tensor_names:
                    # Analyzer inputs are recorded by name here, so each name
                    # is resolved back to its tensor in the graph.
                    analyzer_inputs[
                        input_tensor_name] = graph.get_tensor_by_name(
                            input_tensor_name)
            # Initializers accumulate across phases: each phase adds its own
            # to those already added by earlier phases.
            table_initializers.extend(phase.table_initializers)
            unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, analyzer_inputs,
                                   unbound_saved_model_dir)

            tensor_pcoll_mapping_update = (
                (input_values, tensor_pcoll_mapping)
                | 'RunPhase[{}]'.format(level) >> _RunPhase(
                    phase.analyzer_infos, unbound_saved_model_dir,
                    base_temp_dir, input_schema, serialized_tf_config))

            # Update the mapping for all analyzers.
            tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

        # Restore the collection to its original contents before writing the
        # final saved model.
        del table_initializers[:]
        table_initializers.extend(original_table_initializers)
        saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, outputs, saved_model_dir)
        transform_fn = (
            tensor_pcoll_mapping
            | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
                saved_model_dir, base_temp_dir, input_values.pipeline))

        # Infer metadata. We take the inferred metadata and apply overrides
        # that refer to values of tensors in the graph. The override tensors
        # must be "constant" in that they don't depend on input data. The
        # tensors can depend on analyzer outputs though. This allows us to
        # set metadata that depends on analyzer outputs. _augment_metadata
        # will use the analyzer outputs stored in `transform_fn` to compute
        # the metadata in a deferred manner, once the analyzer outputs are
        # known.
        metadata = dataset_metadata.DatasetMetadata(
            schema=impl_helper.infer_feature_schema(outputs))

        deferred_metadata = (transform_fn | 'ComputeDeferredMetadata' >>
                             beam.Map(_augment_metadata, metadata))

        full_metadata = beam_metadata_io.BeamDatasetMetadata(
            metadata, deferred_metadata)

        _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

        return transform_fn, full_metadata