def test_batched_placeholders_from_specs_invalid_dtype(self):
  """Checks that an int32 spec is rejected with an 'invalid dtype' error.

  The same failure is expected whether the offending entry is a
  `tf.TensorSpec` or a `tf.io.FixedLenFeature`.
  """
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which no longer exists in `unittest` as of Python 3.12.
  with self.assertRaisesRegex(ValueError, 'had invalid dtype'):
    impl_helper.batched_placeholders_from_specs(
        {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
  with self.assertRaisesRegex(ValueError, 'had invalid dtype'):
    impl_helper.batched_placeholders_from_specs(
        {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})
def test_batched_placeholders_from_specs_invalid_mixing(self):
  """Checks that mixing TypeSpecs and feature specs raises a TypeError."""
  # One entry is a TypeSpec and the other a feature spec; the call is expected
  # to refuse the mixture.
  mixed_specs = {
      'f1': tf.TensorSpec(dtype=tf.int64, shape=[None]),
      'f2': tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
  }
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which no longer exists in `unittest` as of Python 3.12.
  with self.assertRaisesRegex(TypeError, 'Specs must be all'):
    impl_helper.batched_placeholders_from_specs(mixed_specs)
def test_batched_placeholders_from_feature_spec(self):
  """Checks placeholder kinds and static shapes built from a feature spec.

  Fixed-length features are expected to become dense tensors with a leading
  unknown batch dimension; variable-length features are expected to become
  rank-2 sparse tensors with fully unknown shape.
  """
  feature_spec = {
      'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
      'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
      '_var_len_underscored': tf.io.VarLenFeature(tf.string),
      'var_len_int': tf.io.VarLenFeature(tf.int64)
  }
  with tf.compat.v1.Graph().as_default():
    features = impl_helper.batched_placeholders_from_specs(feature_spec)
  self.assertCountEqual(features.keys(), [
      'fixed_len_float', 'fixed_len_string', 'var_len_int',
      '_var_len_underscored'
  ])
  # isinstance-based checks are used instead of comparing `type(...)` exactly:
  # graph tensors may be instances of a `tf.Tensor` subclass, while
  # `tf.SparseTensor` is a separate class, so dense and sparse outputs are
  # still told apart.
  self.assertIsInstance(features['fixed_len_float'], tf.Tensor)
  self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                   [None, 2, 3])
  self.assertIsInstance(features['fixed_len_string'], tf.Tensor)
  self.assertEqual(features['fixed_len_string'].get_shape().as_list(), [None])
  self.assertIsInstance(features['var_len_int'], tf.SparseTensor)
  self.assertEqual(features['var_len_int'].get_shape().as_list(),
                   [None, None])
  self.assertIsInstance(features['_var_len_underscored'], tf.SparseTensor)
  self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                   [None, None])
def test_batched_placeholders_from_typespecs(self):
  """Checks placeholder kind, static shape and dtype built from TypeSpecs."""
  typespecs = {
      'dense_float':
          tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
      'dense_string':
          tf.TensorSpec(shape=[None], dtype=tf.string),
      '_sparse_underscored':
          tf.SparseTensorSpec(dtype=tf.string, shape=[None, None]),
  }
  with tf.compat.v1.Graph().as_default():
    features = impl_helper.batched_placeholders_from_specs(typespecs)
  self.assertCountEqual(features.keys(), [
      'dense_float',
      'dense_string',
      '_sparse_underscored',
  ])
  # isinstance-based checks are used instead of comparing `type(...)` exactly:
  # graph tensors may be instances of a `tf.Tensor` subclass, while
  # `tf.SparseTensor` is a separate class, so dense and sparse outputs are
  # still told apart.
  self.assertIsInstance(features['dense_float'], tf.Tensor)
  self.assertEqual(features['dense_float'].get_shape().as_list(),
                   [None, 2, 3])
  self.assertEqual(features['dense_float'].dtype, tf.float32)

  self.assertIsInstance(features['dense_string'], tf.Tensor)
  self.assertEqual(features['dense_string'].get_shape().as_list(), [None])
  self.assertEqual(features['dense_string'].dtype, tf.string)

  self.assertIsInstance(features['_sparse_underscored'], tf.SparseTensor)
  self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                   [None, None])
  self.assertEqual(features['_sparse_underscored'].dtype, tf.string)
def test_batched_placeholders_from_typespecs(self):
  """Checks placeholders built from dense, sparse and ragged TypeSpecs.

  For every entry the test verifies the kind of placeholder returned, its
  static shape and its dtype; ragged entries are also checked for their
  ragged rank.
  """
  typespecs = {
      'dense_float':
          tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
      'dense_string':
          tf.TensorSpec(shape=[None], dtype=tf.string),
      '_sparse_underscored':
          tf.SparseTensorSpec(dtype=tf.string, shape=[None, None, 17]),
      'ragged_string':
          tf.RaggedTensorSpec(
              dtype=tf.string, ragged_rank=1, shape=[None, None]),
      'ragged_multi_dimension':
          tf.RaggedTensorSpec(
              dtype=tf.int64,
              ragged_rank=3,
              shape=[None, None, None, None, 5]),
  }
  with tf.compat.v1.Graph().as_default():
    features = impl_helper.batched_placeholders_from_specs(typespecs)
  self.assertCountEqual(features.keys(), [
      'dense_float',
      'dense_string',
      '_sparse_underscored',
      'ragged_string',
      'ragged_multi_dimension',
  ])
  # isinstance-based checks are used instead of comparing `type(...)` exactly:
  # graph tensors may be instances of a `tf.Tensor` subclass, while
  # `tf.SparseTensor` and `tf.RaggedTensor` are separate classes, so the
  # different kinds of output are still told apart.
  self.assertIsInstance(features['dense_float'], tf.Tensor)
  self.assertEqual(features['dense_float'].get_shape().as_list(),
                   [None, 2, 3])
  self.assertEqual(features['dense_float'].dtype, tf.float32)

  self.assertIsInstance(features['dense_string'], tf.Tensor)
  self.assertEqual(features['dense_string'].get_shape().as_list(), [None])
  self.assertEqual(features['dense_string'].dtype, tf.string)

  self.assertIsInstance(features['_sparse_underscored'], tf.SparseTensor)
  # TODO(zoyahav): Change last dimension size to 17 once SparseTensors
  # propagate static dense_shape from typespec correctly.
  self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                   [None, None, None])
  self.assertEqual(features['_sparse_underscored'].dtype, tf.string)

  self.assertIsInstance(features['ragged_string'], tf.RaggedTensor)
  self.assertEqual(features['ragged_string'].shape.as_list(), [None, None])
  self.assertEqual(features['ragged_string'].ragged_rank, 1)
  self.assertEqual(features['ragged_string'].dtype, tf.string)

  self.assertIsInstance(features['ragged_multi_dimension'], tf.RaggedTensor)
  self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                   [None, None, None, None, 5])
  self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
  self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)
def test_batched_placeholders_from_feature_spec(self):
  """Checks placeholders built from fixed, var-len and sparse feature specs.

  Besides the kind of each returned placeholder, the static shapes are
  verified. For `tf.io.SparseFeature` entries the expected static shape
  depends on the installed TensorFlow major version.
  """
  feature_spec = {
      'fixed_len_float':
          tf.io.FixedLenFeature([2, 3], tf.float32),
      'fixed_len_string':
          tf.io.FixedLenFeature([], tf.string),
      '_var_len_underscored':
          tf.io.VarLenFeature(tf.string),
      'var_len_int':
          tf.io.VarLenFeature(tf.int64),
      'sparse_1d':
          tf.io.SparseFeature('1d_idx', '1d_value', tf.int64, 7),
      'sparse_2d':
          tf.io.SparseFeature(['2d_idx0', '2d_idx1'], '2d_value', tf.int64,
                              [2, 17]),
  }
  with tf.compat.v1.Graph().as_default():
    features = impl_helper.batched_placeholders_from_specs(feature_spec)
  self.assertCountEqual(features.keys(), [
      'fixed_len_float',
      'fixed_len_string',
      'var_len_int',
      '_var_len_underscored',
      'sparse_1d',
      'sparse_2d',
  ])
  # isinstance-based checks are used instead of comparing `type(...)` exactly:
  # graph tensors may be instances of a `tf.Tensor` subclass, while
  # `tf.SparseTensor` is a separate class, so dense and sparse outputs are
  # still told apart.
  self.assertIsInstance(features['fixed_len_float'], tf.Tensor)
  self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                   [None, 2, 3])
  self.assertIsInstance(features['fixed_len_string'], tf.Tensor)
  self.assertEqual(features['fixed_len_string'].get_shape().as_list(), [None])
  self.assertIsInstance(features['var_len_int'], tf.SparseTensor)
  self.assertEqual(features['var_len_int'].get_shape().as_list(),
                   [None, None])
  self.assertIsInstance(features['_var_len_underscored'], tf.SparseTensor)
  self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                   [None, None])
  self.assertIsInstance(features['sparse_1d'], tf.SparseTensor)
  self.assertIsInstance(features['sparse_2d'], tf.SparseTensor)
  if version.parse(tf.__version__) >= version.parse('2'):
    # Under TF 2+ the sizes given in the SparseFeature are expected to show up
    # in the static shape.
    self.assertEqual(features['sparse_1d'].get_shape().as_list(), [None, 7])
    self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                     [None, 2, 17])
  else:
    # Older versions are expected to report only the rank.
    self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                     [None, None, None])
def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
  """Builds the analysis graph and compares its dot rendering to a golden.

  Args:
    feature_spec: Specs used to create the batched input placeholders.
    preprocessing_fn: Callable applied to the placeholders to obtain the
      output signature.
    expected_dot_graph_str: The dot-graph text the built graph must render to.
  """
  with tf.compat.v1.Graph().as_default() as tf_graph:
    # The placeholders are created under the 'inputs' name scope.
    with tf.compat.v1.name_scope('inputs'):
      placeholders = impl_helper.batched_placeholders_from_specs(feature_spec)
    outputs = preprocessing_fn(placeholders)
    transform_fn_future, unused_cache = analysis_graph_builder.build(
        tf_graph, placeholders, outputs)

  rendered = nodes.get_dot_graph([transform_fn_future]).to_string()
  self.WriteRenderedDotFile(rendered)

  # The rendered graph is included in the failure message to ease updating
  # the expectation.
  failure_message = 'Result dot graph is:\n{}'.format(rendered)
  self.assertMultiLineEqual(
      first=rendered, second=expected_dot_graph_str, msg=failure_message)
def _build_analysis_graph_for_inspection(preprocessing_fn, specs, dataset_keys,
                                         input_cache):
  """Traces `preprocessing_fn` in a fresh graph and builds its analysis graph.

  Args:
    preprocessing_fn: Callable invoked on copies of the input placeholders.
    specs: Specs handed to `impl_helper.batched_placeholders_from_specs`.
    dataset_keys: Forwarded to `build` as `dataset_keys`.
    input_cache: Forwarded to `build` as `cache_dict`.

  Returns:
    The `(transform_fn_future, cache_dict)` pair produced by `build`.
  """
  with tf.compat.v1.Graph().as_default() as inspection_graph:
    with tf.compat.v1.name_scope('inputs'):
      placeholders = impl_helper.batched_placeholders_from_specs(specs)
      # TODO(b/34288791): This needs to be exactly the same as in impl.py
      placeholder_copies = impl_helper.copy_tensors(placeholders)

    outputs = preprocessing_fn(placeholder_copies)

  # `build` receives the original placeholders as the input signature, not the
  # copies that were passed to `preprocessing_fn`.
  transform_fn_future, cache_dict = build(
      inspection_graph,
      placeholders,
      outputs,
      dataset_keys=dataset_keys,
      cache_dict=input_cache)
  return transform_fn_future, cache_dict
def get_transform_input_columns(preprocessing_fn, specs):
  """Lists the input columns that the transform graph depends on.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict mapping feature names to feature specifications or to
      tf.TypeSpecs.

  Returns:
    A list with the names of the inputs that the outputs of
    `preprocessing_fn` depend on, as reported by
    `graph_tools.get_dependent_inputs`.
  """
  with tf.compat.v1.Graph().as_default() as tracing_graph:
    placeholders = impl_helper.batched_placeholders_from_specs(specs)
    # preprocessing_fn gets a copy; presumably so that any mutation it does
    # cannot leak into `placeholders`, which is inspected afterwards.
    outputs = preprocessing_fn(placeholders.copy())
    required_inputs = graph_tools.get_dependent_inputs(
        tracing_graph, placeholders, outputs)
    column_names = list(required_inputs.keys())
    return column_names
def get_analyze_input_columns(preprocessing_fn, specs):
  """Lists the input columns that the analyzers depend on.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict mapping feature names to feature specifications or to
      tf.TypeSpecs.

  Returns:
    A list with the names of the inputs that the tensors collected by the
    visitor depend on, as reported by `graph_tools.get_dependent_inputs`.
  """
  with tf.compat.v1.Graph().as_default() as tracing_graph:
    placeholders = impl_helper.batched_placeholders_from_specs(specs)
    # Called only for its effect on the graph; the outputs are not needed.
    preprocessing_fn(placeholders.copy())

    source_collector = _SourcedTensorsVisitor()
    replacements = tracing_graph.get_collection(
        analyzer_nodes.TENSOR_REPLACEMENTS)
    for replacement in replacements:
      # A new Traverser is created for every sink, as in the original code.
      sink_traverser = nodes.Traverser(source_collector)
      sink_traverser.visit_value_node(replacement.future)

    required_inputs = graph_tools.get_dependent_inputs(
        tracing_graph, placeholders, source_collector.sourced_tensors)
    column_names = list(required_inputs.keys())
    return column_names
def expand(self, dataset):
  """Analyze the dataset.

  Traces `self._preprocessing_fn` in a fresh graph, validates the result,
  translates the analysis graph into Beam stages and assembles the output
  metadata.

  Args:
    dataset: A 4-tuple `(flattened_pcoll, input_values_pcoll_dict,
      dataset_cache_dict, input_metadata)`. When `self._use_tfxio` is set,
      `input_metadata` is used as the TensorAdapter config; otherwise its
      `schema` attribute is used as the input schema.
      `input_values_pcoll_dict` may be falsy, in which case an empty dict is
      used instead.

  Returns:
    A pair `((transform_fn_pcoll, full_metadata), output_cache_pcoll_dict)`.
    `transform_fn_pcoll` is the result of visiting the deferred transform
    function, `full_metadata` is a `BeamDatasetMetadata` combining the
    inferred schema with deferred metadata, and `output_cache_pcoll_dict` is
    either None (when the graph builder returned no cache nodes) or a dict
    keyed by dataset key and then by cache key.

  Raises:
    ValueError: If preprocessing_fn has no outputs, or if it added trainable
      variables to the graph.
  """
  (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict,
   input_metadata) = dataset
  # Exactly one of `input_schema` / `input_tensor_adapter_config` is set,
  # depending on the input mode.
  if self._use_tfxio:
    input_schema = None
    input_tensor_adapter_config = input_metadata
  else:
    input_schema = input_metadata.schema
    input_tensor_adapter_config = None

  input_values_pcoll_dict = input_values_pcoll_dict or dict()

  with tf.compat.v1.Graph().as_default() as graph:

    with tf.compat.v1.name_scope('inputs'):
      if self._use_tfxio:
        specs = TensorAdapter(input_tensor_adapter_config).OriginalTypeSpecs()
      else:
        specs = schema_utils.schema_as_feature_spec(input_schema).feature_spec
      input_signature = impl_helper.batched_placeholders_from_specs(specs)
      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead. A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

    output_signature = self._preprocessing_fn(copied_inputs)

  # At this point we check that the preprocessing_fn has at least one
  # output. This is because if we allowed the output of preprocessing_fn to
  # be empty, we wouldn't be able to determine how many instances to
  # "unbatch" the output into.
  if not output_signature:
    raise ValueError('The preprocessing function returned an empty dict')

  if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES):
    raise ValueError(
        'The preprocessing function contained trainable variables '
        '{}'.format(
            graph.get_collection_ref(
                tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)))

  # Use the explicitly set pipeline if there is one; otherwise take it from
  # the flattened PCollection or, failing that, from the first non-None
  # per-dataset PCollection.
  pipeline = self.pipeline or (flattened_pcoll or next(
      v for v in input_values_pcoll_dict.values() if v is not None)).pipeline

  # Add a stage that inspects graph collections for API use counts and logs
  # them as a beam metric.
  _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph))

  # The TensorFlow config is looked up by the type of the Beam runner; `get`
  # yields None when there is no entry for that runner type.
  tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
      type(pipeline.runner))
  extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs(
      base_temp_dir=Context.create_base_temp_dir(),
      tf_config=tf_config,
      pipeline=pipeline,
      flat_pcollection=flattened_pcoll,
      pcollection_dict=input_values_pcoll_dict,
      graph=graph,
      input_signature=input_signature,
      input_schema=input_schema,
      input_tensor_adapter_config=input_tensor_adapter_config,
      use_tfxio=self._use_tfxio,
      cache_pcoll_dict=dataset_cache_dict)

  transform_fn_future, cache_value_nodes = analysis_graph_builder.build(
      graph,
      input_signature,
      output_signature,
      input_values_pcoll_dict.keys(),
      cache_dict=dataset_cache_dict)
  # The same traverser is used for the transform function and for every cache
  # node below.
  traverser = nodes.Traverser(
      beam_common.ConstructBeamPipelineVisitor(extra_args))
  transform_fn_pcoll = traverser.visit_value_node(transform_fn_future)

  if cache_value_nodes is not None:
    # Regroup the flat {(dataset_key, cache_key): node} mapping into a nested
    # {dataset_key: {cache_key: visited node}} dict.
    output_cache_pcoll_dict = {}
    for (dataset_key,
         cache_key), value_node in six.iteritems(cache_value_nodes):
      if dataset_key not in output_cache_pcoll_dict:
        output_cache_pcoll_dict[dataset_key] = {}
      output_cache_pcoll_dict[dataset_key][cache_key] = (
          traverser.visit_value_node(value_node))
  else:
    output_cache_pcoll_dict = None

  # Infer metadata. We take the inferred metadata and apply overrides that
  # refer to values of tensors in the graph. The override tensors must
  # be "constant" in that they don't depend on input data. The tensors can
  # depend on analyzer outputs though. This allows us to set metadata that
  # depends on analyzer outputs. _infer_metadata_from_saved_model will use the
  # analyzer outputs stored in `transform_fn` to compute the metadata in a
  # deferred manner, once the analyzer outputs are known.
  metadata = dataset_metadata.DatasetMetadata(
      schema=schema_inference.infer_feature_schema(output_signature, graph))

  deferred_metadata = (
      transform_fn_pcoll
      | 'ComputeDeferredMetadata' >> beam.Map(_infer_metadata_from_saved_model))

  full_metadata = beam_metadata_io.BeamDatasetMetadata(
      metadata, deferred_metadata)

  _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

  return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict