def testCrossNamedParameters(self):
  test_cases_1 = [
      {'testcase_name': 'a_1_b_1', 'a': 1, 'b': 1},
      {'testcase_name': 'a_3_b_3', 'a': 3, 'b': 3},
  ]
  test_cases_2 = [
      {'testcase_name': 'c_2', 'c': 2},
      {'testcase_name': 'c_4', 'c': 4},
  ]
  expected_cross = [
      {'testcase_name': 'a_1_b_1_c_2', 'a': 1, 'b': 1, 'c': 2},
      {'testcase_name': 'a_1_b_1_c_4', 'a': 1, 'b': 1, 'c': 4},
      {'testcase_name': 'a_3_b_3_c_2', 'a': 3, 'b': 3, 'c': 2},
      {'testcase_name': 'a_3_b_3_c_4', 'a': 3, 'b': 3, 'c': 4},
  ]
  self.assertEqual(
      test_case.cross_named_parameters(test_cases_1, test_cases_2),
      expected_cross)
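
# For reference, a minimal sketch of what `test_case.cross_named_parameters`
# could look like, given only the behavior asserted above: the cross product
# of the parameter lists, with merged dicts and underscore-joined testcase
# names. This is an illustrative reconstruction, not the actual source.
import itertools


def cross_named_parameters_sketch(*parameter_lists):
  """Crosses lists of named-parameter dicts, joining their testcase names."""
  result = []
  for combination in itertools.product(*parameter_lists):
    merged = {}
    names = []
    for params in combination:
      params = dict(params)  # Copy so we can pop without mutating the input.
      names.append(params.pop('testcase_name'))
      merged.update(params)
    merged['testcase_name'] = '_'.join(names)
    result.append(merged)
  return result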
class InspectPreprocessingFnTest(test_case.TransformTestCase):

  @test_case.named_parameters(*test_case.cross_named_parameters([
      dict(
          testcase_name='identity',
          preprocessing_fn=_identity_preprocessing_fn,
          expected_analyze_input_columns=[],
          expected_transform_input_columns=['x', 'y', 's']),
      dict(
          testcase_name='side_affect',
          preprocessing_fn=_side_affect_preprocessing_fn,
          expected_analyze_input_columns=['s'],
          expected_transform_input_columns=[]),
      dict(
          testcase_name='non_identity_ops',
          preprocessing_fn=_non_identity_ops_preprocessing_fn,
          expected_analyze_input_columns=[],
          expected_transform_input_columns=['x', 'y', 's']),
      dict(
          testcase_name='feature_renaming',
          preprocessing_fn=_renaming_preprocessing_fn,
          expected_analyze_input_columns=[],
          expected_transform_input_columns=['x', 'y', 's']),
      dict(
          testcase_name='one_phase',
          preprocessing_fn=_one_phase_preprocessing_fn,
          expected_analyze_input_columns=['x', 's'],
          expected_transform_input_columns=['y']),
      dict(
          testcase_name='two_phases',
          preprocessing_fn=_two_phases_preprocessing_fn,
          expected_analyze_input_columns=['x', 'y', 's'],
          expected_transform_input_columns=['x', 's'])
  ], [
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='tf2', force_tf_compat_v1=False)
  ]))
  def test_column_inference(self, preprocessing_fn,
                            expected_analyze_input_columns,
                            expected_transform_input_columns,
                            force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
      specs = _TYPE_SPEC
    else:
      specs = _FEATURE_SPEC
    analyze_input_columns = (
        inspect_preprocessing_fn.get_analyze_input_columns(
            preprocessing_fn, specs, force_tf_compat_v1))
    transform_input_columns = (
        inspect_preprocessing_fn.get_transform_input_columns(
            preprocessing_fn, specs, force_tf_compat_v1))
    self.assertCountEqual(analyze_input_columns,
                          expected_analyze_input_columns)
    self.assertCountEqual(transform_input_columns,
                          expected_transform_input_columns)
class ImplHelperTest(test_case.TransformTestCase):

  def test_batched_placeholders_from_feature_spec(self):
    feature_spec = {
        'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
        'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
        '_var_len_underscored': tf.io.VarLenFeature(tf.string),
        'var_len_int': tf.io.VarLenFeature(tf.int64)
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(feature_spec)
    self.assertCountEqual(features.keys(), [
        'fixed_len_float', 'fixed_len_string', 'var_len_int',
        '_var_len_underscored'
    ])
    self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
    self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
    self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                     [None])
    self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
    self.assertEqual(features['var_len_int'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(type(features['_var_len_underscored']), tf.SparseTensor)
    self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                     [None, None])

  def test_batched_placeholders_from_typespecs(self):
    typespecs = {
        'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
        'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
        '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None]),
        'ragged_string':
            tf.RaggedTensorSpec(
                dtype=tf.string, ragged_rank=1, shape=[None, None]),
        'ragged_multi_dimension':
            tf.RaggedTensorSpec(
                dtype=tf.int64,
                ragged_rank=3,
                shape=[None, None, None, None, 5]),
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(typespecs)
    self.assertCountEqual(features.keys(), [
        'dense_float',
        'dense_string',
        '_sparse_underscored',
        'ragged_string',
        'ragged_multi_dimension',
    ])
    self.assertEqual(type(features['dense_float']), tf.Tensor)
    self.assertEqual(features['dense_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(features['dense_float'].dtype, tf.float32)
    self.assertEqual(type(features['dense_string']), tf.Tensor)
    self.assertEqual(features['dense_string'].get_shape().as_list(), [None])
    self.assertEqual(features['dense_string'].dtype, tf.string)
    self.assertEqual(type(features['_sparse_underscored']), tf.SparseTensor)
    self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(features['_sparse_underscored'].dtype, tf.string)
    self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
    self.assertEqual(features['ragged_string'].shape.as_list(), [None, None])
    self.assertEqual(features['ragged_string'].ragged_rank, 1)
    self.assertEqual(features['ragged_string'].dtype, tf.string)
    self.assertEqual(type(features['ragged_multi_dimension']),
                     tf.RaggedTensor)
    self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                     [None, None, None, None, 5])
    self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
    self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

  def test_batched_placeholders_from_specs_invalid_dtype(self):
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

  def test_batched_placeholders_from_specs_invalid_mixing(self):
    with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
      impl_helper.batched_placeholders_from_specs({
          'f1': tf.TensorSpec(dtype=tf.int64, shape=[None]),
          'f2': tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
      })

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ROUNDTRIP_CASES, [
          dict(testcase_name='eager_tensors', feed_eager_tensors=True),
          dict(testcase_name='session_run_values', feed_eager_tensors=False)
      ]))
  def test_to_instance_dicts(self, feature_spec, instances, feed_dict,
                             feed_eager_tensors):
    if feed_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    feed_dict_local = copy.copy(feed_dict)
    if feed_eager_tensors:
      for key, value in six.iteritems(feed_dict_local):
        if isinstance(value, tf.compat.v1.SparseTensorValue):
          feed_dict_local[key] = tf.sparse.SparseTensor.from_value(value)
        else:
          feed_dict_local[key] = tf.constant(value)
    np.testing.assert_equal(
        instances, impl_helper.to_instance_dicts(schema, feed_dict_local))

  @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
  def test_to_instance_dicts_error(self, feature_spec, feed_dict, error_msg,
                                   error_type=ValueError):
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    with self.assertRaisesRegexp(error_type, error_msg):
      impl_helper.to_instance_dicts(schema, feed_dict)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='native_tf2', force_tf_compat_v1=False))
  def test_analyze_in_place(self, force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
      return {'x_add_1': inputs['x'] + 1}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
    type_spec = {'x': tf.TensorSpec(dtype=tf.int64, shape=[None])}
    output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
    impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                 feature_spec, type_spec, output_path)

    tft_output = TFTransformOutput(output_path)
    expected_value = np.array([2], dtype=np.int64)
    if force_tf_compat_v1:
      with tf.Graph().as_default() as graph:
        with tf.compat.v1.Session(graph=graph).as_default():
          transformed_features = tft_output.transform_raw_features(
              {'x': tf.constant([1], dtype=tf.int64)})
          transformed_value = transformed_features['x_add_1'].eval()
    else:
      transformed_features = tft_output.transform_raw_features(
          {'x': tf.constant([1], dtype=tf.int64)})
      transformed_value = transformed_features['x_add_1'].numpy()
    self.assertEqual(transformed_value, expected_value)
    transformed_feature_spec = tft_output.transformed_feature_spec()
    expected_feature_spec = {'x_add_1': tf.io.FixedLenFeature([], tf.int64)}
    self.assertEqual(transformed_feature_spec, expected_feature_spec)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='native_tf2', force_tf_compat_v1=False))
  def test_analyze_in_place_with_analyzers_raises_error(
      self, force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
      return {'x_add_1': analyzers.mean(inputs['x'])}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
    type_spec = {'x': tf.TensorSpec(dtype=tf.int64, shape=[None])}
    output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
    with self.assertRaisesRegexp(RuntimeError,
                                 'analyzers found when tracing'):
      impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                   feature_spec, type_spec, output_path)
class ImplHelperTest(test_case.TransformTestCase):

  def test_batched_placeholders_from_feature_spec(self):
    feature_spec = {
        'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
        'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
        '_var_len_underscored': tf.io.VarLenFeature(tf.string),
        'var_len_int': tf.io.VarLenFeature(tf.int64),
        'sparse_1d': tf.io.SparseFeature('1d_idx', '1d_value', tf.int64, 7),
        'sparse_2d': tf.io.SparseFeature(['2d_idx0', '2d_idx1'], '2d_value',
                                         tf.int64, [2, 17]),
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(feature_spec)
    self.assertCountEqual(features.keys(), [
        'fixed_len_float',
        'fixed_len_string',
        'var_len_int',
        '_var_len_underscored',
        'sparse_1d',
        'sparse_2d',
    ])
    self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
    self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
    self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                     [None])
    self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
    self.assertEqual(features['var_len_int'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(type(features['_var_len_underscored']), tf.SparseTensor)
    self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(type(features['sparse_1d']), tf.SparseTensor)
    self.assertEqual(type(features['sparse_2d']), tf.SparseTensor)
    if version.parse(tf.__version__) >= version.parse('2'):
      self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                       [None, 7])
      self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                       [None, 2, 17])
    else:
      self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                       [None, None])
      self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                       [None, None, None])

  def test_batched_placeholders_from_typespecs(self):
    typespecs = {
        'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
        'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
        '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None, 17]),
        'ragged_string':
            tf.RaggedTensorSpec(
                dtype=tf.string, ragged_rank=1, shape=[None, None]),
        'ragged_multi_dimension':
            tf.RaggedTensorSpec(
                dtype=tf.int64,
                ragged_rank=3,
                shape=[None, None, None, None, 5]),
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(typespecs)
    self.assertCountEqual(features.keys(), [
        'dense_float',
        'dense_string',
        '_sparse_underscored',
        'ragged_string',
        'ragged_multi_dimension',
    ])
    self.assertEqual(type(features['dense_float']), tf.Tensor)
    self.assertEqual(features['dense_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(features['dense_float'].dtype, tf.float32)
    self.assertEqual(type(features['dense_string']), tf.Tensor)
    self.assertEqual(features['dense_string'].get_shape().as_list(), [None])
    self.assertEqual(features['dense_string'].dtype, tf.string)
    self.assertEqual(type(features['_sparse_underscored']), tf.SparseTensor)
    # TODO(zoyahav): Change last dimension size to 17 once SparseTensors
    # propagate static dense_shape from typespec correctly.
    self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                     [None, None, None])
    self.assertEqual(features['_sparse_underscored'].dtype, tf.string)
    self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
    self.assertEqual(features['ragged_string'].shape.as_list(), [None, None])
    self.assertEqual(features['ragged_string'].ragged_rank, 1)
    self.assertEqual(features['ragged_string'].dtype, tf.string)
    self.assertEqual(type(features['ragged_multi_dimension']),
                     tf.RaggedTensor)
    self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                     [None, None, None, None, 5])
    self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
    self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

  def test_batched_placeholders_from_specs_invalid_dtype(self):
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

  def test_batched_placeholders_from_specs_invalid_mixing(self):
    with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
      impl_helper.batched_placeholders_from_specs({
          'f1': tf.TensorSpec(dtype=tf.int64, shape=[None]),
          'f2': tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
      })

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ROUNDTRIP_CASES, [
          dict(testcase_name='eager_tensors', feed_eager_tensors=True),
          dict(testcase_name='session_run_values', feed_eager_tensors=False)
      ]))
  def test_to_instance_dicts(self, feature_spec, instances, record_batch,
                             feed_dict, feed_eager_tensors):
    del record_batch
    if feed_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    feed_dict_local = (
        _eager_tensor_from_values(feed_dict)
        if feed_eager_tensors else copy.copy(feed_dict))
    result = impl_helper.to_instance_dicts(schema, feed_dict_local)
    np.testing.assert_equal(instances, result)

  @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
  def test_to_instance_dicts_error(self, feature_spec, feed_dict, error_msg,
                                   error_type=ValueError):
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    with self.assertRaisesRegexp(error_type, error_msg):
      impl_helper.to_instance_dicts(schema, feed_dict)

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ROUNDTRIP_CASES, [
          dict(testcase_name='eager_tensors', feed_eager_tensors=True),
          dict(testcase_name='session_run_values', feed_eager_tensors=False)
      ]))
  def test_convert_to_arrow(self, feature_spec, instances, record_batch,
                            feed_dict, feed_eager_tensors):
    del instances
    if feed_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    converter = impl_helper.make_tensor_to_arrow_converter(schema)
    feed_dict_local = (
        _eager_tensor_from_values(feed_dict)
        if feed_eager_tensors else copy.copy(feed_dict))
    arrow_columns, arrow_schema = impl_helper.convert_to_arrow(
        schema, converter, feed_dict_local)
    actual = pa.RecordBatch.from_arrays(arrow_columns, schema=arrow_schema)
    expected = pa.RecordBatch.from_arrays(
        list(record_batch.values()), names=list(record_batch.keys()))
    np.testing.assert_equal(actual.to_pydict(), expected.to_pydict())

  @test_case.named_parameters(*_CONVERT_TO_ARROW_ERROR_CASES)
  def test_convert_to_arrow_error(self, feature_spec, feed_dict, error_msg,
                                  error_type=ValueError):
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    converter = impl_helper.make_tensor_to_arrow_converter(schema)
    with self.assertRaisesRegexp(error_type, error_msg):
      impl_helper.convert_to_arrow(schema, converter, feed_dict)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='native_tf2', force_tf_compat_v1=False))
  def test_analyze_in_place(self, force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
      return {'x_add_1': inputs['x'] + 1}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
    type_spec = {'x': tf.TensorSpec(dtype=tf.int64, shape=[None])}
    output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
    impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                 feature_spec, type_spec, output_path)

    tft_output = TFTransformOutput(output_path)
    expected_value = np.array([2], dtype=np.int64)
    if force_tf_compat_v1:
      with tf.Graph().as_default() as graph:
        with tf.compat.v1.Session(graph=graph).as_default():
          transformed_features = tft_output.transform_raw_features(
              {'x': tf.constant([1], dtype=tf.int64)})
          transformed_value = transformed_features['x_add_1'].eval()
    else:
      transformed_features = tft_output.transform_raw_features(
          {'x': tf.constant([1], dtype=tf.int64)})
      transformed_value = transformed_features['x_add_1'].numpy()
    self.assertEqual(transformed_value, expected_value)
    transformed_feature_spec = tft_output.transformed_feature_spec()
    expected_feature_spec = {'x_add_1': tf.io.FixedLenFeature([], tf.int64)}
    self.assertEqual(transformed_feature_spec, expected_feature_spec)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='native_tf2', force_tf_compat_v1=False))
  def test_analyze_in_place_with_analyzers_raises_error(
      self, force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
      return {'x_add_1': analyzers.mean(inputs['x'])}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
    type_spec = {'x': tf.TensorSpec(dtype=tf.int64, shape=[None])}
    output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
    with self.assertRaisesRegexp(RuntimeError,
                                 'analyzers found when tracing'):
      impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                   feature_spec, type_spec, output_path)

  @test_case.named_parameters(
      dict(
          testcase_name='_3d',
          sparse_value=tf.compat.v1.SparseTensorValue(
              indices=np.array([[0, 0, 1], [0, 1, 2], [1, 1, 1]]),
              values=np.array([0, 1, 2]),
              dense_shape=np.array([2, 2, 3])),
          expected_indices=[[np.array([0, 1]), np.array([1, 2])],
                            [np.array([1]), np.array([1])]],
          expected_values=[np.array([0, 1]), np.array([2])]),
      dict(
          testcase_name='_4d',
          sparse_value=tf.compat.v1.SparseTensorValue(
              indices=np.array([[0, 0, 0, 1], [0, 1, 0, 2], [1, 1, 1, 1]]),
              values=np.array([0, 1, 2]),
              dense_shape=np.array([2, 2, 2, 3])),
          expected_indices=[[
              np.array([0, 1]),
              np.array([0, 0]),
              np.array([1, 2])
          ], [np.array([1]), np.array([1]), np.array([1])]],
          expected_values=[np.array([0, 1]), np.array([2])]),
  )
  def test_decompose_sparse_batch(self, sparse_value, expected_indices,
                                  expected_values):
    indices, values = impl_helper._decompose_sparse_batch(sparse_value)
    self.assertLen(indices, len(expected_indices))
    self.assertLen(values, len(expected_values))
    for idx, (a, b) in enumerate(zip(expected_indices, indices)):
      self.assertAllEqual(a, b,
                          'Indices are different at index {}'.format(idx))
    for idx, (a, b) in enumerate(zip(expected_values, values)):
      self.assertAllEqual(a, b,
                          'Values are different at index {}'.format(idx))

  def test_get_num_values_per_instance_in_sparse_batch(self):
    batch_indices = np.array([[idx % 4, 0, 1, 2] for idx in range(100)])
    num_values = impl_helper._get_num_values_per_instance_in_sparse_batch(
        batch_indices, 27)
    expected_num_values = [25, 25, 25, 25] + [0] * 23
    self.assertEqual(expected_num_values, num_values)

  @test_case.named_parameters(
      dict(
          testcase_name='_3d',
          ragged_tensor=tf.compat.v1.ragged.RaggedTensorValue(
              values=tf.compat.v1.ragged.RaggedTensorValue(
                  values=tf.compat.v1.ragged.RaggedTensorValue(
                      values=np.array([10., 20., 30.]),
                      row_splits=np.array([0, 0, 1, 3])),  # row_lengths2
                  row_splits=np.array([0, 1, 1, 3])),  # row_lengths1
              row_splits=np.array([0, 2, 3])),  # batch dimension
          # pytype: disable=attribute-error
          spec=tf.io.RaggedFeature(  # pylint: disable=g-long-ternary
              tf.float32,
              value_key='ragged_3d_val',
              partitions=[
                  tf.io.RaggedFeature.RowLengths('ragged_3d_row_lengths1'),
                  tf.io.RaggedFeature.RowLengths('ragged_3d_row_lengths2'),
              ]) if common_types.is_ragged_feature_available() else None,
          # pytype: enable=attribute-error
          expected_components={
              'ragged_3d_val': [
                  np.array([], dtype=np.float32),
                  np.array([10., 20., 30.])
              ],
              'ragged_3d_row_lengths1': [np.array([1, 0]), np.array([2])],
              'ragged_3d_row_lengths2': [np.array([0]), np.array([1, 2])],
          },
      ),
      dict(
          testcase_name='_4d',
          ragged_tensor=tf.compat.v1.ragged.RaggedTensorValue(
              values=tf.compat.v1.ragged.RaggedTensorValue(
                  values=tf.compat.v1.ragged.RaggedTensorValue(
                      values=tf.compat.v1.ragged.RaggedTensorValue(
                          values=np.array([b'a', b'b', b'c', b'd']),
                          row_splits=np.array([0, 1, 1, 3, 4])),  # row_lengths3
                      row_splits=np.array([0, 2, 2, 4])),  # row_lengths2
                  row_splits=np.array([0, 1, 1, 3])),  # row_lengths1
              row_splits=np.array([0, 2, 2, 3])),  # batch dimension
          # pytype: disable=attribute-error
          spec=tf.io.RaggedFeature(  # pylint: disable=g-long-ternary
              tf.float32,
              value_key='ragged_4d_val',
              partitions=[
                  tf.io.RaggedFeature.RowLengths('ragged_4d_row_lengths1'),
                  tf.io.RaggedFeature.RowLengths('ragged_4d_row_lengths2'),
                  tf.io.RaggedFeature.RowLengths('ragged_4d_row_lengths3'),
              ]) if common_types.is_ragged_feature_available() else None,
          # pytype: enable=attribute-error
          expected_components={
              'ragged_4d_val': [
                  np.array([b'a']),
                  np.array([], dtype=object),
                  np.array([b'b', b'c', b'd'])
              ],
              'ragged_4d_row_lengths1': [
                  np.array([1, 0]), np.array([]), np.array([2])
              ],
              'ragged_4d_row_lengths2': [
                  np.array([2]), np.array([]), np.array([0, 2])
              ],
              'ragged_4d_row_lengths3': [
                  np.array([1, 0]), np.array([]), np.array([2, 1])
              ],
          },
      ))
  def test_handle_ragged_batch(self, ragged_tensor, spec,
                               expected_components):
    test_case.skip_if_not_tf2('RaggedFeature is not available in TF 1.x')
    result = impl_helper._handle_ragged_batch(ragged_tensor, spec,
                                              name='ragged')
    np.testing.assert_equal(result, expected_components)
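
# A minimal numpy sketch of the `_decompose_sparse_batch` contract exercised
# above (illustrative only; the real helper is private to impl_helper and may
# differ). Given a batched SparseTensorValue whose first index column is the
# instance index, it yields, per instance, one coordinate array per remaining
# dimension plus the corresponding values:
def decompose_sparse_batch_sketch(sparse_value):
  batch_indices = np.asarray(sparse_value.indices)
  values = np.asarray(sparse_value.values)
  batch_size = int(sparse_value.dense_shape[0])
  instance_indices = []
  instance_values = []
  for row in range(batch_size):
    selector = batch_indices[:, 0] == row
    # One coordinate array per non-batch dimension, matching the
    # expected_indices structure asserted in the test above.
    instance_indices.append(list(batch_indices[selector, 1:].T))
    instance_values.append(values[selector])
  return instance_indices, instance_values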
class ImplHelperTest(test_case.TransformTestCase):

  def test_batched_placeholders_from_feature_spec(self):
    feature_spec = {
        'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
        'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
        '_var_len_underscored': tf.io.VarLenFeature(tf.string),
        'var_len_int': tf.io.VarLenFeature(tf.int64),
        'sparse_1d': tf.io.SparseFeature('1d_idx', '1d_value', tf.int64, 7),
        'sparse_2d': tf.io.SparseFeature(['2d_idx0', '2d_idx1'], '2d_value',
                                         tf.int64, [2, 17]),
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(feature_spec)
    self.assertCountEqual(features.keys(), [
        'fixed_len_float',
        'fixed_len_string',
        'var_len_int',
        '_var_len_underscored',
        'sparse_1d',
        'sparse_2d',
    ])
    self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
    self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
    self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                     [None])
    self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
    self.assertEqual(features['var_len_int'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(type(features['_var_len_underscored']), tf.SparseTensor)
    self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(type(features['sparse_1d']), tf.SparseTensor)
    self.assertEqual(type(features['sparse_2d']), tf.SparseTensor)
    if version.parse(tf.__version__) >= version.parse('2'):
      self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                       [None, 7])
      self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                       [None, 2, 17])
    else:
      self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                       [None, None])
      self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                       [None, None, None])

  def test_batched_placeholders_from_typespecs(self):
    typespecs = {
        'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
        'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
        '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None, 17]),
        'ragged_string':
            tf.RaggedTensorSpec(
                dtype=tf.string, ragged_rank=1, shape=[None, None]),
        'ragged_multi_dimension':
            tf.RaggedTensorSpec(
                dtype=tf.int64,
                ragged_rank=3,
                shape=[None, None, None, None, 5]),
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(typespecs)
    self.assertCountEqual(features.keys(), [
        'dense_float',
        'dense_string',
        '_sparse_underscored',
        'ragged_string',
        'ragged_multi_dimension',
    ])
    self.assertEqual(type(features['dense_float']), tf.Tensor)
    self.assertEqual(features['dense_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(features['dense_float'].dtype, tf.float32)
    self.assertEqual(type(features['dense_string']), tf.Tensor)
    self.assertEqual(features['dense_string'].get_shape().as_list(), [None])
    self.assertEqual(features['dense_string'].dtype, tf.string)
    self.assertEqual(type(features['_sparse_underscored']), tf.SparseTensor)
    # TODO(zoyahav): Change last dimension size to 17 once SparseTensors
    # propagate static dense_shape from typespec correctly.
    self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                     [None, None, None])
    self.assertEqual(features['_sparse_underscored'].dtype, tf.string)
    self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
    self.assertEqual(features['ragged_string'].shape.as_list(), [None, None])
    self.assertEqual(features['ragged_string'].ragged_rank, 1)
    self.assertEqual(features['ragged_string'].dtype, tf.string)
    self.assertEqual(type(features['ragged_multi_dimension']),
                     tf.RaggedTensor)
    self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                     [None, None, None, None, 5])
    self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
    self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

  def test_batched_placeholders_from_specs_invalid_dtype(self):
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

  def test_batched_placeholders_from_specs_invalid_mixing(self):
    with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
      impl_helper.batched_placeholders_from_specs({
          'f1': tf.TensorSpec(dtype=tf.int64, shape=[None]),
          'f2': tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
      })

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ROUNDTRIP_CASES, [
          dict(testcase_name='eager_tensors', feed_eager_tensors=True),
          dict(testcase_name='session_run_values', feed_eager_tensors=False)
      ]))
  def test_to_instance_dicts(self, feature_spec, instances, feed_dict,
                             feed_eager_tensors):
    if feed_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    feed_dict_local = copy.copy(feed_dict)
    if feed_eager_tensors:
      for key, value in six.iteritems(feed_dict_local):
        if isinstance(value, tf.compat.v1.SparseTensorValue):
          feed_dict_local[key] = tf.sparse.SparseTensor.from_value(value)
        else:
          feed_dict_local[key] = tf.constant(value)
    result = impl_helper.to_instance_dicts(schema, feed_dict_local)
    np.testing.assert_equal(instances, result)

  @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
  def test_to_instance_dicts_error(self, feature_spec, feed_dict, error_msg,
                                   error_type=ValueError):
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    with self.assertRaisesRegexp(error_type, error_msg):
      impl_helper.to_instance_dicts(schema, feed_dict)

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ROUNDTRIP_CASES, [
          dict(testcase_name='eager_tensors', feed_eager_tensors=True),
          dict(testcase_name='session_run_values', feed_eager_tensors=False)
      ]))
  def test_convert_to_arrow(self, feature_spec, instances, feed_dict,
                            feed_eager_tensors):
    if feed_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    converter = impl_helper.make_tensor_to_arrow_converter(schema)
    feed_dict_local = copy.copy(feed_dict)
    if feed_eager_tensors:
      for key, value in six.iteritems(feed_dict_local):
        if isinstance(value, tf.compat.v1.SparseTensorValue):
          feed_dict_local[key] = tf.sparse.SparseTensor.from_value(value)
        else:
          feed_dict_local[key] = tf.constant(value)
    arrow_columns, arrow_schema = impl_helper.convert_to_arrow(
        schema, converter, feed_dict_local)
    record_batch = pa.RecordBatch.from_arrays(
        arrow_columns, schema=arrow_schema)
    # Merge and flatten expected instance dicts.
    expected = collections.defaultdict(list)
    for instance_dict in instances:
      for key, value in instance_dict.items():
        expected[key].append(np.ravel(value))
    actual = record_batch.to_pydict()
    self.assertEqual(len(actual), len(expected))
    for key, expected_value in expected.items():
      # Floating-point error breaks exact equality for some floating values,
      # but approximate equality testing fails on strings.
      if np.issubdtype(expected_value[0].dtype, np.number):
        self.assertAllClose(actual[key], expected_value)
      else:
        np.testing.assert_equal(actual[key], expected_value)

  @test_case.named_parameters(*_CONVERT_TO_ARROW_ERROR_CASES)
  def test_convert_to_arrow_error(self, feature_spec, feed_dict, error_msg,
                                  error_type=ValueError):
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    converter = impl_helper.make_tensor_to_arrow_converter(schema)
    with self.assertRaisesRegexp(error_type, error_msg):
      impl_helper.convert_to_arrow(schema, converter, feed_dict)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='native_tf2', force_tf_compat_v1=False))
  def test_analyze_in_place(self, force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
      return {'x_add_1': inputs['x'] + 1}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
    type_spec = {'x': tf.TensorSpec(dtype=tf.int64, shape=[None])}
    output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
    impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                 feature_spec, type_spec, output_path)

    tft_output = TFTransformOutput(output_path)
    expected_value = np.array([2], dtype=np.int64)
    if force_tf_compat_v1:
      with tf.Graph().as_default() as graph:
        with tf.compat.v1.Session(graph=graph).as_default():
          transformed_features = tft_output.transform_raw_features(
              {'x': tf.constant([1], dtype=tf.int64)})
          transformed_value = transformed_features['x_add_1'].eval()
    else:
      transformed_features = tft_output.transform_raw_features(
          {'x': tf.constant([1], dtype=tf.int64)})
      transformed_value = transformed_features['x_add_1'].numpy()
    self.assertEqual(transformed_value, expected_value)
    transformed_feature_spec = tft_output.transformed_feature_spec()
    expected_feature_spec = {'x_add_1': tf.io.FixedLenFeature([], tf.int64)}
    self.assertEqual(transformed_feature_spec, expected_feature_spec)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
      dict(testcase_name='native_tf2', force_tf_compat_v1=False))
  def test_analyze_in_place_with_analyzers_raises_error(
      self, force_tf_compat_v1):
    if not force_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
      return {'x_add_1': analyzers.mean(inputs['x'])}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
    type_spec = {'x': tf.TensorSpec(dtype=tf.int64, shape=[None])}
    output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
    with self.assertRaisesRegexp(RuntimeError,
                                 'analyzers found when tracing'):
      impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                   feature_spec, type_spec, output_path)

  @test_case.named_parameters(
      dict(
          testcase_name='_3d',
          sparse_value=tf.compat.v1.SparseTensorValue(
              indices=np.array([[0, 0, 1], [0, 1, 2], [1, 1, 1]]),
              values=np.array([0, 1, 2]),
              dense_shape=np.array([2, 2, 3])),
          expected_indices=[[np.array([0, 1]), np.array([1, 2])],
                            [np.array([1]), np.array([1])]],
          expected_values=[np.array([0, 1]), np.array([2])]),
      dict(
          testcase_name='_4d',
          sparse_value=tf.compat.v1.SparseTensorValue(
              indices=np.array([[0, 0, 0, 1], [0, 1, 0, 2], [1, 1, 1, 1]]),
              values=np.array([0, 1, 2]),
              dense_shape=np.array([2, 2, 2, 3])),
          expected_indices=[[
              np.array([0, 1]),
              np.array([0, 0]),
              np.array([1, 2])
          ], [np.array([1]), np.array([1]), np.array([1])]],
          expected_values=[np.array([0, 1]), np.array([2])]),
  )
  def test_decompose_sparse_batch(self, sparse_value, expected_indices,
                                  expected_values):
    indices, values = impl_helper._decompose_sparse_batch(sparse_value)
    self.assertLen(indices, len(expected_indices))
    self.assertLen(values, len(expected_values))
    for idx, (a, b) in enumerate(zip(expected_indices, indices)):
      self.assertAllEqual(a, b,
                          'Indices are different at index {}'.format(idx))
    for idx, (a, b) in enumerate(zip(expected_values, values)):
      self.assertAllEqual(a, b,
                          'Values are different at index {}'.format(idx))

  def test_get_num_values_per_instance_in_sparse_batch(self):
    batch_indices = np.array([[idx % 4, 0, 1, 2] for idx in range(100)])
    num_values = impl_helper._get_num_values_per_instance_in_sparse_batch(
        batch_indices, 27)
    expected_num_values = [25, 25, 25, 25] + [0] * 23
    self.assertEqual(expected_num_values, num_values)
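
# The counting helper exercised just above reduces to a bincount over the
# first index column. A sketch of that contract (illustrative, not the actual
# private implementation): given batched sparse indices whose first column is
# the instance index, return the number of values per instance, padded with
# zeros up to the batch size.
def num_values_per_instance_sketch(batch_indices, batch_size):
  # batch_indices[:, 0] holds each value's instance index within the batch.
  return np.bincount(batch_indices[:, 0], minlength=batch_size).tolist()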
class AnalysisGraphBuilderTest(test_case.TransformTestCase):

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ANALYZE_TEST_CASES, [
          dict(testcase_name='tf_compat_v1', use_tf_compat_v1=True),
          dict(testcase_name='tf2', use_tf_compat_v1=False)
      ]))
  def test_build(self, feature_spec, preprocessing_fn,
                 expected_dot_graph_str, expected_dot_graph_str_tf2,
                 use_tf_compat_v1):
    if not use_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    specs = (
        feature_spec if use_tf_compat_v1 else
        impl_helper.get_type_specs_from_feature_specs(feature_spec))
    graph, structured_inputs, structured_outputs = (
        impl_helper.trace_preprocessing_function(
            preprocessing_fn,
            specs,
            use_tf_compat_v1=use_tf_compat_v1,
            base_temp_dir=os.path.join(self.get_temp_dir(),
                                       self._testMethodName)))
    transform_fn_future, unused_cache = analysis_graph_builder.build(
        graph, structured_inputs, structured_outputs)
    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)
    self.assertMultiLineEqual(
        msg='Result dot graph is:\n{}'.format(dot_string),
        first=dot_string,
        second=(expected_dot_graph_str
                if use_tf_compat_v1 else expected_dot_graph_str_tf2))

  @test_case.named_parameters(*test_case.cross_named_parameters([
      dict(
          testcase_name='one_dataset_cached_single_phase',
          preprocessing_fn=_preprocessing_fn_with_one_analyzer,
          full_dataset_keys=['a', 'b'],
          cached_dataset_keys=['a'],
          expected_dataset_keys=['b'],
      ),
      dict(
          testcase_name='all_datasets_cached_single_phase',
          preprocessing_fn=_preprocessing_fn_with_one_analyzer,
          full_dataset_keys=['a', 'b'],
          cached_dataset_keys=['a', 'b'],
          expected_dataset_keys=[],
      ),
      dict(
          testcase_name='mixed_single_phase',
          preprocessing_fn=lambda d: dict(  # pylint: disable=g-long-lambda
              list(_preprocessing_fn_with_chained_ptransforms(d).items()) +
              list(_preprocessing_fn_with_one_analyzer(d).items())),
          full_dataset_keys=['a', 'b'],
          cached_dataset_keys=['a', 'b'],
          expected_dataset_keys=['a', 'b'],
      ),
      dict(
          testcase_name='multi_phase',
          preprocessing_fn=_preprocessing_fn_with_two_phases,
          full_dataset_keys=['a', 'b'],
          cached_dataset_keys=['a', 'b'],
          expected_dataset_keys=['a', 'b'],
      )
  ], [
      dict(testcase_name='tf_compat_v1', use_tf_compat_v1=True),
      dict(testcase_name='tf2', use_tf_compat_v1=False)
  ]))
  def test_get_analysis_dataset_keys(self, preprocessing_fn,
                                     full_dataset_keys, cached_dataset_keys,
                                     expected_dataset_keys, use_tf_compat_v1):
    if not use_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    full_dataset_keys = [
        analysis_graph_builder.analyzer_cache.DatasetKey(k)
        for k in full_dataset_keys
    ]
    # Force all dataset keys with entries in the cache dict to have a cache
    # hit.
    mocked_cache_entry_key = b'M'
    input_cache = {
        key: {mocked_cache_entry_key: 'C'} for key in cached_dataset_keys
    }
    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
    specs = (
        feature_spec if use_tf_compat_v1 else
        impl_helper.get_type_specs_from_feature_specs(feature_spec))
    with mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        return_value=mocked_cache_entry_key):
      dataset_keys = analysis_graph_builder.get_analysis_dataset_keys(
          preprocessing_fn,
          specs,
          full_dataset_keys,
          input_cache,
          force_tf_compat_v1=use_tf_compat_v1)
    dot_string = nodes.get_dot_graph(
        [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
    self.WriteRenderedDotFile(dot_string)
    self.assertCountEqual(expected_dataset_keys, dataset_keys)

  @test_case.named_parameters(
      dict(testcase_name='tf_compat_v1', use_tf_compat_v1=True),
      dict(testcase_name='tf2', use_tf_compat_v1=False))
  def test_get_analysis_cache_entry_keys(self, use_tf_compat_v1):
    if not use_tf_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    full_dataset_keys = ['a', 'b']

    def preprocessing_fn(inputs):
      return {'x': tft.scale_to_0_1(inputs['x'])}

    mocked_cache_entry_key = 'A'

    def mocked_make_cache_entry_key(_):
      return mocked_cache_entry_key

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
    specs = (
        feature_spec if use_tf_compat_v1 else
        impl_helper.get_type_specs_from_feature_specs(feature_spec))
    with mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        side_effect=mocked_make_cache_entry_key):
      cache_entry_keys = (
          analysis_graph_builder.get_analysis_cache_entry_keys(
              preprocessing_fn,
              specs,
              full_dataset_keys,
              force_tf_compat_v1=use_tf_compat_v1))
    dot_string = nodes.get_dot_graph(
        [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
    self.WriteRenderedDotFile(dot_string)
    self.assertCountEqual(cache_entry_keys, [mocked_cache_entry_key])
class SchemaInferenceTest(test_case.TransformTestCase):

  def _get_schema(self,
                  preprocessing_fn,
                  use_compat_v1,
                  inputs=None,
                  input_signature=None,
                  create_session=False):
    if inputs is None:
      inputs = {}
    if input_signature is None:
      input_signature = {}
    if use_compat_v1:
      with tf.compat.v1.Graph().as_default() as graph:
        # Convert eager tensors to graph tensors.
        inputs_copy = {
            k: tf.constant(v, input_signature[k].dtype)
            for k, v in inputs.items()
        }
        tensors = preprocessing_fn(inputs_copy)
        if create_session:
          # Create a session to actually evaluate the annotations and extract
          # the output schema with annotations applied.
          with tf.compat.v1.Session(graph=graph) as session:
            schema = schema_inference.infer_feature_schema(
                tensors, graph, session)
        else:
          schema = schema_inference.infer_feature_schema(tensors, graph)
    else:
      tf_func = tf.function(
          preprocessing_fn,
          input_signature=[input_signature]).get_concrete_function()
      tensors = tf.nest.pack_sequence_as(
          structure=tf_func.structured_outputs,
          flat_sequence=tf_func.outputs,
          expand_composites=True)
      metadata_fn = schema_inference.get_traced_metadata_fn(
          tensor_replacement_map={},
          preprocessing_fn=preprocessing_fn,
          input_signature=input_signature,
          base_temp_dir=os.path.join(self.get_temp_dir(),
                                     self._testMethodName),
          evaluate_schema_overrides=create_session)
      schema = schema_inference.infer_feature_schema_v2(
          tensors,
          metadata_fn.get_concrete_function(),
          evaluate_schema_overrides=create_session)
    return schema

  # pylint: disable=g-long-lambda
  @test_case.named_parameters(*test_case.cross_named_parameters([
      dict(
          testcase_name='fixed_len_int',
          make_tensors_fn=_make_tensors,
          feature_spec={'x': tf.io.FixedLenFeature([], tf.int64)}),
      dict(
          testcase_name='fixed_len_string',
          make_tensors_fn=_make_tensors,
          feature_spec={'x': tf.io.FixedLenFeature([], tf.string)}),
      dict(
          testcase_name='fixed_len_float',
          make_tensors_fn=_make_tensors,
          feature_spec={'x': tf.io.FixedLenFeature([], tf.float32)}),
      dict(
          testcase_name='override',
          make_tensors_fn=_make_tensors_with_override,
          feature_spec={'x': tf.io.FixedLenFeature([], tf.int64)},
          domains={'x': schema_pb2.IntDomain(is_categorical=True)}),
      dict(
          testcase_name='override_with_session',
          make_tensors_fn=_make_tensors_with_override,
          feature_spec={'x': tf.io.FixedLenFeature([], tf.int64)},
          domains={
              'x': schema_pb2.IntDomain(min=5, max=6, is_categorical=True)
          },
          create_session=True)
  ], [
      dict(testcase_name='compat_v1', use_compat_v1=True),
      dict(testcase_name='v2', use_compat_v1=False)
  ]))
  # pylint: enable=g-long-lambda
  def test_infer_feature_schema(self,
                                make_tensors_fn,
                                feature_spec,
                                use_compat_v1,
                                domains=None,
                                create_session=False):
    if not use_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    x_val = '0' if feature_spec['x'].dtype == tf.string else 0
    inputs = {'x': [x_val]}
    input_signature = {
        'x': tf.TensorSpec([None], dtype=feature_spec['x'].dtype)
    }
    schema = self._get_schema(
        make_tensors_fn,
        use_compat_v1,
        inputs=inputs,
        input_signature=input_signature,
        create_session=create_session)
    expected_schema = schema_utils.schema_from_feature_spec(
        feature_spec, domains)
    self.assertEqual(schema, expected_schema)

  @test_case.named_parameters(
      dict(testcase_name='compat_v1', use_compat_v1=True),
      dict(testcase_name='v2', use_compat_v1=False))
  def test_infer_feature_schema_bad_rank(self, use_compat_v1):
    if not use_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    inputs = {'x': 0}
    input_signature = {'x': tf.TensorSpec([], dtype=tf.float32)}
    with self.assertRaises(ValueError):
      self._get_schema(
          _make_tensors,
          use_compat_v1,
          inputs=inputs,
          input_signature=input_signature)

  @unittest.skipIf(not common.IS_ANNOTATIONS_PB_AVAILABLE,
                   'Schema annotations are not available')
  @test_case.named_parameters(
      dict(testcase_name='compat_v1', use_compat_v1=True),
      dict(testcase_name='v2', use_compat_v1=False))
  def test_vocab_annotation(self, use_compat_v1):
    if not use_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(_):
      analyzers._maybe_annotate_vocab_metadata(
          'file1', tf.constant(100, dtype=tf.int64))
      analyzers._maybe_annotate_vocab_metadata(
          'file2', tf.constant(200, dtype=tf.int64))
      return {
          'foo': tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64),
      }

    schema = self._get_schema(
        preprocessing_fn, use_compat_v1, create_session=True)
    self.assertLen(schema.annotation.extra_metadata, 2)
    sizes = {}
    for annotation in schema.annotation.extra_metadata:
      message = annotations_pb2.VocabularyMetadata()
      annotation.Unpack(message)
      sizes[message.file_name] = message.unfiltered_vocabulary_size
    self.assertDictEqual(sizes, {'file1': 100, 'file2': 200})

  @unittest.skipIf(not common.IS_ANNOTATIONS_PB_AVAILABLE,
                   'Schema annotations are not available')
  @test_case.named_parameters(
      dict(testcase_name='compat_v1', use_compat_v1=True),
      dict(testcase_name='v2', use_compat_v1=False))
  def test_bucketization_annotation(self, use_compat_v1):
    if not use_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(_):
      inputs = {
          'foo': tf.convert_to_tensor([0, 1, 2, 3]),
          'bar': tf.convert_to_tensor([0, 2, 0, 2]),
      }
      boundaries_foo = tf.expand_dims(
          tf.convert_to_tensor([.5, 1.5]), axis=0)
      boundaries_bar = tf.expand_dims(
          tf.convert_to_tensor([.1, .2]), axis=0)
      outputs = {}
      # tft.apply_buckets will annotate the feature in the output schema to
      # indicate the bucket boundaries that were applied.
      outputs['Bucketized_foo'] = mappers.apply_buckets(
          inputs['foo'], boundaries_foo)
      outputs['Bucketized_bar'] = mappers.apply_buckets(
          inputs['bar'], boundaries_bar)
      return outputs

    schema = self._get_schema(
        preprocessing_fn, use_compat_v1, create_session=True)
    self.assertLen(schema.feature, 2)
    for feature in schema.feature:
      self.assertLen(feature.annotation.extra_metadata, 1)
      for annotation in feature.annotation.extra_metadata:
        # Extract the annotated message and validate its contents.
        message = annotations_pb2.BucketBoundaries()
        annotation.Unpack(message)
        if feature.name == 'Bucketized_foo':
          self.assertAllClose(list(message.boundaries), [.5, 1.5])
        elif feature.name == 'Bucketized_bar':
          self.assertAllClose(list(message.boundaries), [.1, .2])
        else:
          raise RuntimeError('Unexpected features in schema')

  @unittest.skipIf(not common.IS_ANNOTATIONS_PB_AVAILABLE,
                   'Schema annotations are not available')
  @test_case.named_parameters(
      dict(testcase_name='compat_v1', use_compat_v1=True),
      dict(testcase_name='v2', use_compat_v1=False))
  def test_global_annotation(self, use_compat_v1):
    if not use_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(_):
      # Annotate an arbitrary proto at the schema level (not sure what global
      # schema boundaries would mean, but hey I'm just a test).
      boundaries = tf.constant([[1.0]])
      message_type = annotations_pb2.BucketBoundaries.DESCRIPTOR.full_name
      sizes = tf.expand_dims([tf.size(boundaries)], axis=0)
      message_proto = tf.raw_ops.EncodeProto(
          sizes=sizes,
          values=[tf.cast(boundaries, tf.float32)],
          field_names=['boundaries'],
          message_type=message_type)[0]
      type_url = os.path.join('type.googleapis.com', message_type)
      schema_inference.annotate(type_url, message_proto)
      return {
          'foo': tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64),
          'bar': tf.convert_to_tensor([0, 2, 0, 2], dtype=tf.int64),
      }

    schema = self._get_schema(
        preprocessing_fn, use_compat_v1, create_session=True)
    self.assertLen(schema.annotation.extra_metadata, 1)
    for annotation in schema.annotation.extra_metadata:
      # Extract the annotated message and validate its contents.
      message = annotations_pb2.BucketBoundaries()
      annotation.Unpack(message)
      self.assertAllClose(list(message.boundaries), [1])

  @test_case.named_parameters(
      dict(testcase_name='compat_v1', use_compat_v1=True),
      dict(testcase_name='v2', use_compat_v1=False))
  def test_infer_feature_schema_with_ragged_tensor(self, use_compat_v1):
    if not use_compat_v1:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(_):
      return {
          'foo':
              tf.RaggedTensor.from_row_splits(
                  values=tf.constant([3, 1, 4, 1, 5, 9, 2, 6], tf.int64),
                  row_splits=[0, 4, 4, 7, 8, 8]),
      }

    schema = self._get_schema(
        preprocessing_fn, use_compat_v1, create_session=True)
    expected_schema_ascii = """
      feature {
        name: "foo"
        type: INT
        annotation {
          tag: "ragged_tensor"
        }
      }
    """
    expected_schema = text_format.Parse(expected_schema_ascii,
                                        schema_pb2.Schema())
    schema_utils_legacy.set_generate_legacy_feature_spec(
        expected_schema, False)
    self.assertProtoEquals(expected_schema, schema)
    with self.assertRaisesRegexp(ValueError,
                                 'Feature "foo" had tag "ragged_tensor"'):
      schema_utils.schema_as_feature_spec(schema)
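
# The annotation checks above rely on the standard protobuf `Any` round-trip:
# each schema annotation is an `Any` that is unpacked into a concrete message
# type via its type URL. A self-contained illustration using the well-known
# Any API (the IntDomain payload here is just a convenient stand-in, not what
# the tests above produce):
from google.protobuf import any_pb2


def _any_roundtrip_demo():
  payload = schema_pb2.IntDomain(min=5, max=6)
  any_msg = any_pb2.Any()
  any_msg.Pack(payload)  # Stores serialized payload bytes plus a type URL.
  unpacked = schema_pb2.IntDomain()
  assert any_msg.Unpack(unpacked)  # Succeeds because the type URL matches.
  assert unpacked.min == 5 and unpacked.max == 6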
class ImplHelperTest(test_case.TransformTestCase):

  def test_batched_placeholders_from_feature_spec(self):
    feature_spec = {
        'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
        'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
        '_var_len_underscored': tf.io.VarLenFeature(tf.string),
        'var_len_int': tf.io.VarLenFeature(tf.int64)
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(feature_spec)
    self.assertCountEqual(features.keys(), [
        'fixed_len_float', 'fixed_len_string', 'var_len_int',
        '_var_len_underscored'
    ])
    self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
    self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
    self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                     [None])
    self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
    self.assertEqual(features['var_len_int'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(type(features['_var_len_underscored']), tf.SparseTensor)
    self.assertEqual(features['_var_len_underscored'].get_shape().as_list(),
                     [None, None])

  def test_batched_placeholders_from_typespecs(self):
    typespecs = {
        'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
        'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
        '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None]),
        'ragged_string':
            tf.RaggedTensorSpec(
                dtype=tf.string, ragged_rank=1, shape=[None, None]),
        'ragged_multi_dimension':
            tf.RaggedTensorSpec(
                dtype=tf.int64,
                ragged_rank=3,
                shape=[None, None, None, None, 5]),
    }
    with tf.compat.v1.Graph().as_default():
      features = impl_helper.batched_placeholders_from_specs(typespecs)
    self.assertCountEqual(features.keys(), [
        'dense_float',
        'dense_string',
        '_sparse_underscored',
        'ragged_string',
        'ragged_multi_dimension',
    ])
    self.assertEqual(type(features['dense_float']), tf.Tensor)
    self.assertEqual(features['dense_float'].get_shape().as_list(),
                     [None, 2, 3])
    self.assertEqual(features['dense_float'].dtype, tf.float32)
    self.assertEqual(type(features['dense_string']), tf.Tensor)
    self.assertEqual(features['dense_string'].get_shape().as_list(), [None])
    self.assertEqual(features['dense_string'].dtype, tf.string)
    self.assertEqual(type(features['_sparse_underscored']), tf.SparseTensor)
    self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                     [None, None])
    self.assertEqual(features['_sparse_underscored'].dtype, tf.string)
    self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
    self.assertEqual(features['ragged_string'].shape.as_list(), [None, None])
    self.assertEqual(features['ragged_string'].ragged_rank, 1)
    self.assertEqual(features['ragged_string'].dtype, tf.string)
    self.assertEqual(type(features['ragged_multi_dimension']),
                     tf.RaggedTensor)
    self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                     [None, None, None, None, 5])
    self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
    self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

  def test_batched_placeholders_from_specs_invalid_dtype(self):
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
    with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
      impl_helper.batched_placeholders_from_specs(
          {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

  def test_batched_placeholders_from_specs_invalid_mixing(self):
    with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
      impl_helper.batched_placeholders_from_specs({
          'f1': tf.TensorSpec(dtype=tf.int64, shape=[None]),
          'f2': tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
      })

  @test_case.named_parameters(*test_case.cross_named_parameters(
      (_ROUNDTRIP_CASES + _MAKE_FEED_DICT_CASES), [
          dict(testcase_name='eager_tensors', produce_eager_tensors=True),
          dict(testcase_name='feed_values', produce_eager_tensors=False)
      ]))
  def test_make_feed_list(self, feature_spec, instances, feed_dict,
                          produce_eager_tensors):
    if produce_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    feature_names = list(feature_spec.keys())
    expected_feed_list = [feed_dict[key] for key in feature_names]
    evaluated_feed_list = impl_helper.make_feed_list(
        feature_names,
        schema,
        instances,
        produce_eager_tensors=produce_eager_tensors)
    np.testing.assert_equal(
        evaluated_feed_list if not produce_eager_tensors else
        _get_value_from_eager_tensors(evaluated_feed_list),
        expected_feed_list)

  @test_case.named_parameters(*_MAKE_FEED_LIST_ERROR_CASES)
  def test_make_feed_list_error(self, feature_spec, instances, error_msg,
                                error_type=ValueError):
    with tf.compat.v1.Graph().as_default():
      tensors = tf.io.parse_example(
          serialized=tf.compat.v1.placeholder(tf.string, [None]),
          features=feature_spec)
      schema = schema_utils.schema_from_feature_spec(feature_spec)
      with self.assertRaisesRegexp(error_type, error_msg):
        impl_helper.make_feed_list(tensors, schema, instances)

  @test_case.named_parameters(*test_case.cross_named_parameters(
      _ROUNDTRIP_CASES, [
          dict(testcase_name='eager_tensors', feed_eager_tensors=True),
          dict(testcase_name='session_run_values', feed_eager_tensors=False)
      ]))
  def test_to_instance_dicts(self, feature_spec, instances, feed_dict,
                             feed_eager_tensors):
    if feed_eager_tensors:
      test_case.skip_if_not_tf2('Tensorflow 2.x required')
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    feed_dict_local = copy.copy(feed_dict)
    if feed_eager_tensors:
      for key, value in six.iteritems(feed_dict_local):
        if isinstance(value, tf.compat.v1.SparseTensorValue):
          feed_dict_local[key] = tf.sparse.SparseTensor.from_value(value)
        else:
          feed_dict_local[key] = tf.constant(value)
    np.testing.assert_equal(
        instances, impl_helper.to_instance_dicts(schema, feed_dict_local))

  @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
  def test_to_instance_dicts_error(self, feature_spec, feed_dict, error_msg,
                                   error_type=ValueError):
    schema = schema_utils.schema_from_feature_spec(feature_spec)
    with self.assertRaisesRegexp(error_type, error_msg):
      impl_helper.to_instance_dicts(schema, feed_dict)

  def test_copy_tensors_produces_different_tensors(self):
    with tf.compat.v1.Graph().as_default():
      tensors = {
          'dense':
              tf.compat.v1.placeholder(
                  tf.int64, (None,), name='my_dense_input'),
          'sparse':
              tf.compat.v1.sparse_placeholder(
                  tf.int64, name='my_sparse_input'),
          'ragged':
              tf.compat.v1.ragged.placeholder(
                  tf.int64, ragged_rank=2, name='my_ragged_input')
      }
      copied_tensors = impl_helper.copy_tensors(tensors)
      self.assertNotEqual(tensors['dense'], copied_tensors['dense'])
      self.assertNotEqual(tensors['sparse'].indices,
                          copied_tensors['sparse'].indices)
      self.assertNotEqual(tensors['sparse'].values,
                          copied_tensors['sparse'].values)
      self.assertNotEqual(tensors['sparse'].dense_shape,
                          copied_tensors['sparse'].dense_shape)
      self.assertNotEqual(tensors['ragged'].values,
                          copied_tensors['ragged'].values)
      self.assertNotEqual(tensors['ragged'].row_splits,
                          copied_tensors['ragged'].row_splits)

  def test_copy_tensors_produces_equivalent_tensors(self):
    with tf.compat.v1.Graph().as_default():
      tensors = {
          'dense':
              tf.compat.v1.placeholder(
                  tf.int64, (None,), name='my_dense_input'),
          'sparse':
              tf.compat.v1.sparse_placeholder(
                  tf.int64, name='my_sparse_input'),
          'ragged':
              tf.compat.v1.ragged.placeholder(
                  tf.int64, ragged_rank=1, name='my_ragged_input')
      }
      copied_tensors = impl_helper.copy_tensors(tensors)
      with tf.compat.v1.Session() as session:
        dense_value = [1, 2]
        sparse_value = tf.compat.v1.SparseTensorValue(
            indices=[[0, 0], [0, 2], [1, 1]],
            values=[3, 4, 5],
            dense_shape=[2, 3])
        ragged_value = tf.compat.v1.ragged.RaggedTensorValue(
            values=np.array([3, 4, 5], dtype=np.int64),
            row_splits=np.array([0, 2, 3], dtype=np.int64))
        sample_tensors = session.run(
            copied_tensors,
            feed_dict={
                tensors['dense']: dense_value,
                tensors['sparse']: sparse_value,
                tensors['ragged']: ragged_value
            })
        self.assertAllEqual(sample_tensors['dense'], dense_value)
        self.assertAllEqual(sample_tensors['sparse'].indices,
                            sparse_value.indices)
        self.assertAllEqual(sample_tensors['sparse'].values,
                            sparse_value.values)
        self.assertAllEqual(sample_tensors['sparse'].dense_shape,
                            sparse_value.dense_shape)
        self.assertAllEqual(sample_tensors['ragged'].values,
                            ragged_value.values)
        self.assertAllEqual(sample_tensors['ragged'].row_splits,
                            ragged_value.row_splits)
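
# A minimal sketch of the copying behavior the two tests above pin down: the
# copies are new component tensors that carry the same values through a
# session run. The real impl_helper.copy_tensors may differ; this
# illustrative version assumes tf.identity-based copies of each component.
def copy_tensors_sketch(tensors):
  def _copy(tensor):
    if isinstance(tensor, tf.SparseTensor):
      return tf.SparseTensor(
          indices=tf.identity(tensor.indices),
          values=tf.identity(tensor.values),
          dense_shape=tf.identity(tensor.dense_shape))
    if isinstance(tensor, tf.RaggedTensor):
      # Recurse on values to handle ragged_rank > 1.
      return tf.RaggedTensor.from_row_splits(
          values=_copy(tensor.values),
          row_splits=tf.identity(tensor.row_splits),
          validate=False)
    return tf.identity(tensor)

  return {name: _copy(tensor) for name, tensor in tensors.items()}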