def testGaussianize(self, input_data, output_data, elementwise):

  def preprocessing_fn(inputs):
    x = inputs['x']
    x_cast = tf.cast(x, tf.as_dtype(input_data.dtype))
    x_gaussianized = tft.scale_to_gaussian(x_cast, elementwise=elementwise)
    self.assertEqual(x_gaussianized.dtype, tf.as_dtype(output_data.dtype))
    return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

  input_data_dicts = [{'x': x} for x in input_data]
  expected_data_dicts = [{
      'x_gaussianized': x_gaussianized
  } for x_gaussianized in output_data]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature(
              input_data.shape[1:],
              tft_unit.canonical_numeric_dtype(tf.as_dtype(input_data.dtype))),
  })
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'x_gaussianized':
          tf.io.FixedLenFeature(output_data.shape[1:], tf.float32),
  })
  self.assertAnalyzeAndTransformResults(
      input_data_dicts,
      input_metadata,
      preprocessing_fn,
      expected_data_dicts,
      expected_metadata,
      desired_batch_size=20,
      beam_pipeline=beam.Pipeline())

def testBasicType(self):
  config = {
      'timesteps': 3,
      'time_features': [],
      'features': ['a'],
      'enable_timestamp_features': False
  }
  input_data = [{'a': [1000.0, 2000.0, 3000.0]}]
  input_metadata = tft_unit.metadata_from_feature_spec(
      {'a': tf.io.VarLenFeature(tf.float32)})
  output = [[1000], [2000], [3000]]
  output = stats.zscore(output)
  expected_data = [{'Float32': output, 'LABEL': output}]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'Float32': tf.io.FixedLenFeature([config['timesteps'], 1], tf.float32),
      'LABEL': tf.io.FixedLenFeature([config['timesteps'], 1], tf.float32)
  })
  preprocessing_fn = functools.partial(
      encoder_decoder_preprocessing.preprocessing_fn, custom_config=config)
  self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                        preprocessing_fn, expected_data,
                                        expected_metadata)

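# Reference sketch (not part of the original suite): the expected output above
# is a column-wise z-score, i.e. z = (x - mean(x)) / std(x) with the population
# standard deviation, which is what scipy.stats.zscore computes by default.
# Assumes `numpy` is imported at module scope, as elsewhere in this file.
def _zscore_reference(column):
  # e.g. _zscore_reference([1000., 2000., 3000.]) -> [-1.2247, 0., 1.2247]
  values = numpy.asarray(column, dtype=numpy.float64)
  return (values - values.mean()) / values.std()
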
def testWithTimeStamps(self):
  config = {
      'timesteps': 2,
      'time_features': ['MINUTE', 'MONTH', 'HOUR', 'DAY', 'YEAR'],
      'features': ['float32', 'foo_TIMESTAMP'],
      'enable_timestamp_features': True
  }
  # The values need to differ enough that the z-score does not become NaN.
  timestamp_1 = int(datetime(2000, 1, 1, 0, 0, 0).timestamp())
  timestamp_2 = int(datetime(2001, 6, 15, 12, 30, 30).timestamp())
  input_data = [{
      'float32': [1000.0, 2000.0],
      'foo_TIMESTAMP': [timestamp_1 * 1000, timestamp_2 * 1000]
  }]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'float32': tf.io.VarLenFeature(tf.float32),
      'foo_TIMESTAMP': tf.io.VarLenFeature(tf.int64)
  })
  output_timestep_1 = self.create_transform_output(timestamp_1)
  output_timestep_2 = self.create_transform_output(timestamp_2)
  for i in range(len(output_timestep_1)):
    values = stats.zscore([output_timestep_1[i], output_timestep_2[i]])
    # A zero spread yields NaN z-scores; map those to 0.0.
    nan_mask = numpy.isnan(values)
    values[nan_mask] = 0.0
    output_timestep_1[i] = values[0]
    output_timestep_2[i] = values[1]
  values = stats.zscore([1000.0, 2000.0])
  output_timestep_1.insert(0, values[0])
  output_timestep_2.insert(0, values[1])
  output = [output_timestep_1, output_timestep_2]
  expected_data = [{'Float32': output, 'LABEL': output}]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'Float32': tf.io.FixedLenFeature([config['timesteps'], 11], tf.float32),
      'LABEL': tf.io.FixedLenFeature([config['timesteps'], 11], tf.float32)
  })
  preprocessing_fn = functools.partial(
      encoder_decoder_preprocessing.preprocessing_fn, custom_config=config)
  self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                        preprocessing_fn, expected_data,
                                        expected_metadata)

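# Illustrative note (an assumption for clarity, not taken from the original
# suite): the inputs above are millisecond timestamps (hence the `* 1000`),
# and the MINUTE/MONTH/HOUR/DAY/YEAR features correspond to calendar fields of
# the decoded time. A reference decoding consistent with the local-time
# datetime.timestamp() calls used above:
def _decode_ms_timestamp(ts_ms):
  dt = datetime.fromtimestamp(ts_ms / 1000)
  return {
      'MINUTE': dt.minute,
      'MONTH': dt.month,
      'HOUR': dt.hour,
      'DAY': dt.day,
      'YEAR': dt.year,
  }
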
def testGaussianizeSparse(self, input_dtype, elementwise):

  def preprocessing_fn(inputs):
    x_gaussianized = tf.sparse.to_dense(
        tft.scale_to_gaussian(
            tf.cast(inputs['x'], input_dtype), elementwise=elementwise),
        default_value=np.nan)
    x_gaussianized.set_shape([None, 4])
    self.assertEqual(x_gaussianized.dtype,
                     impl_test._mean_output_dtype(input_dtype))
    return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'idx': [0, 1],
        'val': [v] + [-input_data_values[-1 - idx]]
    })
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.SparseFeature('idx', 'val',
                              tft_unit.canonical_numeric_dtype(input_dtype), 4)
  })
  if elementwise:
    expected_data_values = [
        -0.09304726, -2.24682532, 1.56900163, -0.78244931, 0.48285998,
        0.47461339, -1.50929952, -0.39008015, 0.41659823, -0.81174337,
        0.54027596, 2.11624695, -1.72816411, -0.16046759, 1.13320023,
        0.74814557, -0.21014091, 0.04373742, -0.08454805, -0.39008015
    ]
  else:
    expected_data_values = [
        0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536,
        1.03443104, 0.26969729, 0.84990131, 1.02201077, 0.72569862,
        1.04862563, 1.49752966, -0.02838919, 0.90135672, 1.18702292,
        1.09475806, 0.89071077, 0.9439405, 0.91732564, 0.84990131
    ]
  expected_data = []
  for idx, v in enumerate(expected_data_values):
    expected_data.append({
        'x_gaussianized': ([v] + [-expected_data_values[-1 - idx]] +
                           [float('nan'), float('nan')])
    })
  expected_metadata = tft_unit.metadata_from_feature_spec(
      {'x_gaussianized': tf.io.FixedLenFeature([4], tf.float32)})
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=20,
      beam_pipeline=beam.Pipeline())

def testBucketizePerKey(self):

  def preprocessing_fn(inputs):
    x_bucketized = tft.bucketize_per_key(
        inputs['x'], inputs['key'], num_buckets=3, epsilon=0.00001)
    return {'x': inputs['x'], 'x_bucketized': x_bucketized}

  # NOTE: We force 10 batches: data has 99 elements and we request a batch
  # size of 10.
  input_data = [{
      'x': x,
      'key': 'a' if x < 50 else 'b'
  } for x in range(1, 100)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.FixedLenFeature([], tf.float32),
      'key': tf.io.FixedLenFeature([], tf.string)
  })

  def compute_quantile(instance):
    if instance['key'] == 'a':
      if instance['x'] < 17:
        return 0
      elif instance['x'] < 33:
        return 1
      else:
        return 2
    else:
      if instance['x'] < 66:
        return 0
      elif instance['x'] < 83:
        return 1
      else:
        return 2

  expected_data = [{
      'x_bucketized': compute_quantile(instance),
      'x': instance['x']
  } for instance in input_data]
  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'x': tf.io.FixedLenFeature([], tf.float32),
          'x_bucketized': tf.io.FixedLenFeature([], tf.int64),
      }, {
          'x_bucketized':
              schema_pb2.IntDomain(min=0, max=2, is_categorical=True),
      })
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=10)

def testQuantileBuckets(self, input_dtype):

  def analyzer_fn(inputs):
    return {
        'q_b':
            tft.quantiles(
                tf.cast(inputs['x'], input_dtype),
                num_buckets=3,
                epsilon=0.00001)
    }

  # NOTE: We force 3 batches: data has 2999 elements and we request a batch
  # size of 1000.
  input_data = [{'x': [x]} for x in range(1, 3000)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })
  # The expected output has 2 boundaries that divide the data into 3 buckets.
  expected_outputs = {'q_b': np.array([[1000, 2000]], np.float32)}
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=1000)

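# Reference sketch (not part of the original suite): `tft.quantiles` with
# num_buckets=N returns the N-1 interior quantile cut points, computed
# approximately (to within `epsilon` in rank fraction). A plain-NumPy analogue
# of the exact answer, assuming a 1-D numeric array:
def _reference_quantile_boundaries(values, num_buckets):
  # Interior percentile fractions; for num_buckets=3 these are the 33.3rd and
  # 66.7th percentiles, which for range(1, 3000) land near [1000, 2000],
  # matching `expected_outputs` above.
  fractions = [100.0 * i / num_buckets for i in range(1, num_buckets)]
  return np.percentile(values, fractions)
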
def testElementwiseQuantileBucketsWithWeights(self, input_dtype):

  def analyzer_fn(inputs):
    return {
        'q_b':
            tft.quantiles(
                tf.cast(inputs['x'], input_dtype),
                num_buckets=3,
                epsilon=0.00001,
                weights=inputs['weights'],
                reduce_instance_dims=False)
    }

  input_data = [{
      'x': [[x, 2 * x], [2 * x, x]],
      'weights': [x / 100.]
  } for x in range(1, 3000)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([2, 2],
                                tft_unit.canonical_numeric_dtype(input_dtype)),
      'weights':
          tf.io.FixedLenFeature([1], tf.float32)
  })
  # In elementwise mode each cell gets its own 2 boundaries, which divide that
  # cell's data into 3 weighted buckets.
  expected_outputs = {
      'q_b':
          np.array(
              [[[1732, 2449], [3464, 4898]], [[3464, 4898], [1732, 2449]]],
              np.float32)
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=1000)

def testBucketizeSparseInput(self):

  def preprocessing_fn(inputs):
    return {
        'x_bucketized':
            tft.bucketize(inputs['x'], num_buckets=3, epsilon=0.00001)
    }

  input_data = [{
      'val': [x],
      'idx0': [x % 4],
      'idx1': [x % 5]
  } for x in range(1, 10)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.SparseFeature(['idx0', 'idx1'], 'val', tf.float32, [4, 5]),
  })

  def compute_bucket(instance):
    if instance['val'][0] < 4:
      return 0
    if instance['val'][0] < 7:
      return 1
    return 2

  expected_data = [{
      'x_bucketized$sparse_values': [compute_bucket(instance)],
      'x_bucketized$sparse_indices_0': instance['idx0'],
      'x_bucketized$sparse_indices_1': instance['idx1']
  } for instance in input_data]
  self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                        preprocessing_fn, expected_data)

def _assert_quantile_boundaries(self,
                                test_inputs,
                                expected_boundaries,
                                input_dtype,
                                num_buckets=None,
                                num_expected_buckets=None):
  if not num_buckets:
    num_buckets = len(expected_boundaries) + 1
  if not num_expected_buckets:
    num_expected_buckets = num_buckets

  def analyzer_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    return {'q_b': tft.quantiles(x, num_buckets, epsilon=0.0001)}

  input_data = [{'x': [x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })
  expected_data = {'q_b': expected_boundaries}
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_data,
      desired_batch_size=1000)

def testTukeyHHAnalyzersWithNDDenseInputs(self):

  def analyzer_fn(inputs):
    a = inputs['a']
    return {
        'tukey_location': tft.tukey_location(a, reduce_instance_dims=False),
        'tukey_scale': tft.tukey_scale(a, reduce_instance_dims=False),
        'tukey_hl': tft.tukey_h_params(a, reduce_instance_dims=False)[0],
        'tukey_hr': tft.tukey_h_params(a, reduce_instance_dims=False)[1],
    }

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'a': [[v, -input_data_values[-1 - idx]],
              [2 * v, -2 * input_data_values[-1 - idx]]]
    })
  input_metadata = tft_unit.metadata_from_feature_spec(
      {'a': tf.io.FixedLenFeature([2, 2], tf.float32)})
  expected_outputs = {
      'tukey_location':
          np.array(
              [[526.89355, -526.89355], [2. * 526.89355, -2. * 526.89355]],
              np.float32),
      'tukey_scale':
          np.array([[116.73997, 116.73997], [2. * 116.73997, 2. * 116.73997]],
                   np.float32),
      'tukey_hl':
          np.array([[0.6629082, 0.11148566], [0.6629082, 0.11148566]],
                   np.float32),
      'tukey_hr':
          np.array([[0.11148566, 0.6629082], [0.11148566, 0.6629082]],
                   np.float32),
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=20,
      # Runs the test deterministically on the whole batch.
      beam_pipeline=beam.Pipeline())

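# Background sketch (an assumption for illustration, not asserted by this
# suite): `tft.tukey_location`, `tft.tukey_scale` and `tft.tukey_h_params` fit
# a Tukey h-h model, under which a standard normal z maps into data space
# roughly as x = loc + scale * z * exp(h * z**2 / 2), with hl applied on the
# left tail (z < 0) and hr on the right tail (z > 0); `tft.scale_to_gaussian`,
# tested above, applies the inverse mapping. A minimal forward transform for
# reference (assumes `math` is imported at module scope, as elsewhere in this
# file):
def _tukey_hh_forward(z, loc, scale, hl, hr):
  h = hl if z < 0 else hr
  return loc + scale * z * math.exp(h * z * z / 2.0)
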
def _assert_quantile_boundaries(self,
                                test_inputs,
                                expected_boundaries,
                                input_dtype,
                                num_buckets=None,
                                num_expected_buckets=None,
                                always_return_num_quantiles=True):
  if not num_buckets:
    num_buckets = len(expected_boundaries) + 1
  if not num_expected_buckets:
    num_expected_buckets = num_buckets

  def preprocessing_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    quantiles = tft.quantiles(
        x,
        num_buckets,
        epsilon=0.0001,
        always_return_num_quantiles=always_return_num_quantiles)
    quantiles.set_shape([1, num_expected_buckets - 1])
    return {'q_b': quantiles}

  input_data = [{'x': [x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })
  # The analyzer result is a constant of shape [1, num_expected_buckets - 1],
  # so the transform emits one row of boundaries per batch.
  batch_size = 1000
  expected_data = []
  num_batches = int(math.ceil(len(test_inputs) / float(batch_size)))
  for _ in range(num_batches):
    expected_data += [{'q_b': expected_boundaries}]
  expected_metadata = None
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=batch_size,
      # TODO(b/110855155): Remove this explicit use of DirectRunner.
      beam_pipeline=beam.Pipeline())

def testQuantilesPerKey(self):

  def analyzer_fn(inputs):
    key_vocab, q_b, scale_factor_per_key, shift_per_key, num_buckets = (
        analyzers._quantiles_per_key(
            inputs['x'], inputs['key'], num_buckets=3, epsilon=0.00001))
    return {
        'key_vocab': key_vocab,
        'q_b': q_b,
        'scale_factor_per_key': scale_factor_per_key,
        'shift_per_key': shift_per_key,
        'num_buckets': num_buckets,
    }

  # NOTE: We force 10 batches: data has 99 elements and we request a batch
  # size of 10.
  input_data = [{
      'x': [x],
      'key': 'a' if x < 50 else 'b'
  } for x in range(1, 100)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.FixedLenFeature([1], tf.int64),
      'key': tf.io.FixedLenFeature([], tf.string)
  })
  # The shared boundaries divide each key's data into 3 buckets once the
  # per-key scale and shift are applied.
  expected_outputs = {
      'key_vocab': np.array([b'a', b'b'], object),
      'q_b': np.array([0., 1., 2.], np.float32),
      'scale_factor_per_key': np.array([0.0625, 0.05882353], np.float32),
      'shift_per_key': np.array([-1.0625, -2.88235283], np.float32),
      'num_buckets': np.array(3, np.int64),
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=10)

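# Consistency sketch (derived from the expected values above, not from the
# `_quantiles_per_key` implementation): instead of storing separate boundary
# lists per key, the analyzer stores one shared `q_b` plus a per-key affine
# map, and the map places each key's own quantile boundaries onto consecutive
# integers of `q_b`. For key 'a' (x in [1, 50), tertile boundaries near 17
# and 33): 17 * 0.0625 - 1.0625 == 0.0 and 33 * 0.0625 - 1.0625 == 1.0. For
# key 'b' (x in [50, 100), boundaries near 66 and 83):
# 66 * 0.05882353 - 2.88235283 ~= 1.0 and 83 * 0.05882353 - 2.88235283 ~= 2.0.
def _normalize_per_key(x, scale, shift):
  # Maps a raw value into the shared boundary space for its key.
  return x * scale + shift
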
def testBucketization(self, test_inputs, expected_boundaries, do_shuffle,
                      epsilon, should_apply, is_manual_boundaries,
                      input_dtype):
  test_inputs = list(test_inputs)

  # Shuffle the input to add randomness to input generated with
  # simple range().
  if do_shuffle:
    random.shuffle(test_inputs)

  def preprocessing_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    num_buckets = len(expected_boundaries) + 1
    if should_apply:
      if is_manual_boundaries:
        bucket_boundaries = [expected_boundaries]
      else:
        bucket_boundaries = tft.quantiles(inputs['x'], num_buckets, epsilon)
      result = tft.apply_buckets(x, bucket_boundaries)
    else:
      result = tft.bucketize(x, num_buckets=num_buckets, epsilon=epsilon)
    return {'q_b': result}

  input_data = [{'x': [x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })

  # Sort the input based on value; the index is used to create expected_data.
  indexed_input = enumerate(test_inputs)
  sorted_list = sorted(indexed_input, key=lambda p: p[1])

  # Expected data has the same size as input, one bucket per input value.
  expected_data = [None] * len(test_inputs)
  bucket = 0
  for (index, x) in sorted_list:
    # Increment the bucket number when crossing the boundary.
    if (bucket < len(expected_boundaries) and
        x >= expected_boundaries[bucket]):
      bucket += 1
    expected_data[index] = {'q_b': [bucket]}

  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'q_b': tf.io.FixedLenFeature([1], tf.int64),
      }, {
          'q_b':
              schema_pb2.IntDomain(
                  min=0, max=len(expected_boundaries), is_categorical=True),
      })

  @contextlib.contextmanager
  def no_assert():
    yield None

  assertion = no_assert()
  if input_dtype == tf.float16:
    assertion = self.assertRaisesRegex(
        TypeError, '.*DataType float16 not in list of allowed values.*')

  with assertion:
    self.assertAnalyzeAndTransformResults(
        input_data,
        input_metadata,
        preprocessing_fn,
        expected_data,
        expected_metadata,
        desired_batch_size=1000)

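# Reference sketch (not part of the original suite): the expected-bucket loop
# above is equivalent to a sorted search over the boundary list, which NumPy
# exposes as searchsorted. Assumes 1-D numeric inputs:
def _reference_apply_buckets(values, boundaries):
  # side='right' reproduces the `x >= boundary` crossing rule used above:
  # values equal to a boundary fall into the higher bucket, e.g.
  # _reference_apply_buckets([1, 4, 7], [4, 7]) -> [0, 1, 2].
  return np.searchsorted(np.asarray(boundaries), values, side='right')
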
def testBucketizePerKeyWithInfrequentKeys(self):

  def preprocessing_fn(inputs):
    x_bucketized = tft.bucketize_per_key(
        inputs['x'], inputs['key'], num_buckets=4, epsilon=0.00001)
    return {'x': inputs['x'], 'x_bucketized': x_bucketized}

  input_data = [
      {'x': [], 'key': []},
      {'x': [5, 6], 'key': ['a', 'a']},
      {'x': [7], 'key': ['a']},
      {'x': [12], 'key': ['b']},
      {'x': [13], 'key': ['b']},
      {'x': [15], 'key': ['c']},
      {'x': [2], 'key': ['d']},
      {'x': [4], 'key': ['d']},
      {'x': [6], 'key': ['d']},
      {'x': [8], 'key': ['d']},
      {'x': [2], 'key': ['e']},
      {'x': [4], 'key': ['e']},
      {'x': [6], 'key': ['e']},
      {'x': [8], 'key': ['e']},
      {'x': [10], 'key': ['e']},
      {'x': [11], 'key': ['e']},
      {'x': [12], 'key': ['e']},
      {'x': [13], 'key': ['e']},
  ]  # pyformat: disable
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.VarLenFeature(tf.float32),
      'key': tf.io.VarLenFeature(tf.string)
  })
  expected_data = [
      {'x': [], 'x_bucketized': []},
      {'x': [5, 6], 'x_bucketized': [1, 2]},
      {'x': [7], 'x_bucketized': [3]},
      {'x': [12], 'x_bucketized': [1]},
      {'x': [13], 'x_bucketized': [3]},
      {'x': [15], 'x_bucketized': [1]},
      {'x': [2], 'x_bucketized': [0]},
      {'x': [4], 'x_bucketized': [1]},
      {'x': [6], 'x_bucketized': [2]},
      {'x': [8], 'x_bucketized': [3]},
      {'x': [2], 'x_bucketized': [0]},
      {'x': [4], 'x_bucketized': [0]},
      {'x': [6], 'x_bucketized': [1]},
      {'x': [8], 'x_bucketized': [1]},
      {'x': [10], 'x_bucketized': [2]},
      {'x': [11], 'x_bucketized': [2]},
      {'x': [12], 'x_bucketized': [3]},
      {'x': [13], 'x_bucketized': [2]},
  ]  # pyformat: disable
  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'x': tf.io.VarLenFeature(tf.float32),
          'x_bucketized': tf.io.VarLenFeature(tf.int64),
      }, {
          'x_bucketized':
              schema_pb2.IntDomain(min=0, max=3, is_categorical=True),
      })
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=10)

def testBucketizePerKeySparse(self):

  def preprocessing_fn(inputs):
    x_bucketized = tft.bucketize_per_key(
        inputs['x'], inputs['key'], num_buckets=3, epsilon=0.00001)
    return {'x_bucketized': x_bucketized}

  # NOTE: We force 10 batches: data has 99 elements and we request a batch
  # size of 10.
  input_data = [{
      'x': [x],
      'idx0': [0],
      'idx1': [0],
      'key': ['a'] if x < 50 else ['b']
  } for x in range(1, 100)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.SparseFeature(['idx0', 'idx1'], 'x', tf.float32, (2, 2)),
      'key': tf.io.VarLenFeature(tf.string)
  })

  def compute_bucket(instance):
    if instance['key'][0] == 'a':
      if instance['x'][0] < 17:
        return 0
      elif instance['x'][0] < 33:
        return 1
      else:
        return 2
    else:
      if instance['x'][0] < 66:
        return 0
      elif instance['x'][0] < 83:
        return 1
      else:
        return 2

  expected_data = [{
      'x_bucketized$sparse_values': [compute_bucket(instance)],
      'x_bucketized$sparse_indices_0': [0],
      'x_bucketized$sparse_indices_1': [0],
  } for instance in input_data]
  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'x_bucketized':
              tf.io.SparseFeature([
                  'x_bucketized$sparse_indices_0',
                  'x_bucketized$sparse_indices_1'
              ],
                                  'x_bucketized$sparse_values',
                                  tf.int64, (None, None),
                                  already_sorted=True),
      }, {
          'x_bucketized':
              schema_pb2.IntDomain(min=0, max=2, is_categorical=True),
      })
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=10)

def testTukeyHHAnalyzersWithSparseInputs(self,
                                         input_dtype,
                                         output_dtypes,
                                         elementwise=True):

  def analyzer_fn(inputs):
    a = tf.cast(inputs['a'], input_dtype)

    def assert_and_cast_dtype(tensor, out_dtype):
      self.assertEqual(tensor.dtype, out_dtype)
      return tf.cast(tensor, tft_unit.canonical_numeric_dtype(out_dtype))

    return {
        'tukey_location':
            assert_and_cast_dtype(
                tft.tukey_location(a, reduce_instance_dims=not elementwise),
                output_dtypes['tukey_location']),
        'tukey_scale':
            assert_and_cast_dtype(
                tft.tukey_scale(a, reduce_instance_dims=not elementwise),
                output_dtypes['tukey_scale']),
        'tukey_hl':
            assert_and_cast_dtype(
                tft.tukey_h_params(a, reduce_instance_dims=not elementwise)[0],
                output_dtypes['tukey_hl']),
        'tukey_hr':
            assert_and_cast_dtype(
                tft.tukey_h_params(a, reduce_instance_dims=not elementwise)[1],
                output_dtypes['tukey_hr']),
    }

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'idx0': [0, 0],
        'idx1': [0, 1],
        'val': [v, -input_data_values[-1 - idx]]
    })
  input_metadata = tft_unit.metadata_from_feature_spec({
      'a':
          tf.io.SparseFeature(['idx0', 'idx1'], 'val',
                              tft_unit.canonical_numeric_dtype(input_dtype),
                              (2, 2))
  })
  expected_outputs = {
      'tukey_location':
          np.array(
              [[526.89355, -526.89355], [0., 0.]] if elementwise else 0.0,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_location']).as_numpy_dtype),
      'tukey_scale':
          np.array(
              [[116.73997, 116.73997], [1., 1.]] if elementwise else 572.2776,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_scale']).as_numpy_dtype),
      'tukey_hl':
          np.array(
              [[0.6629082, 0.11148566], [0., 0.]] if elementwise else 0.0,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_hl']).as_numpy_dtype),
      'tukey_hr':
          np.array(
              [[0.11148566, 0.6629082], [0., 0.]] if elementwise else 0.0,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_hr']).as_numpy_dtype),
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=20,
      # Runs the test deterministically on the whole batch.
      beam_pipeline=beam.Pipeline())

def testBucketizationElementwise(self, test_inputs, expected_boundaries,
                                 do_shuffle, epsilon, should_apply,
                                 is_manual_boundaries, input_dtype):
  test_inputs = list(test_inputs)

  # Shuffle the input to add randomness to input generated with
  # simple range().
  if do_shuffle:
    random.shuffle(test_inputs)

  def preprocessing_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    num_buckets = len(expected_boundaries) + 1
    if should_apply:
      if is_manual_boundaries:
        bucket_boundaries = [
            expected_boundaries, [2 * b for b in expected_boundaries]
        ]
      else:
        bucket_boundaries = tft.quantiles(
            x, num_buckets, epsilon, reduce_instance_dims=False)
        bucket_boundaries = tf.unstack(bucket_boundaries, axis=0)

      result = []
      for i, boundaries in enumerate(bucket_boundaries):
        boundaries = tf.cast(boundaries, tf.float32)
        result.append(
            tft.apply_buckets(x[:, i], tf.expand_dims(boundaries, axis=0)))
      result = tf.stack(result, axis=1)
    else:
      result = tft.bucketize(
          x, num_buckets=num_buckets, epsilon=epsilon, elementwise=True)
    return {'q_b': result}

  input_data = [{'x': [x, 2 * x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([2],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })

  # Sort the input based on value; the index is used to create expected_data.
  sorted_list = sorted(enumerate(test_inputs), key=lambda p: p[1])

  # Expected data has the same size as input, one bucket per input value.
  expected_data = [[None, None]] * len(test_inputs)
  bucket = 0
  for (index, x) in sorted_list:
    # Increment the bucket number when crossing the boundary.
    if (bucket < len(expected_boundaries) and
        x >= expected_boundaries[bucket]):
      bucket += 1
    expected_data[index] = {'q_b': [bucket, bucket]}

  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'q_b': tf.io.FixedLenFeature([2], tf.int64),
      }, None)

  @contextlib.contextmanager
  def no_assert():
    yield None

  assertion = no_assert()
  if input_dtype == tf.float16:
    assertion = self.assertRaisesRegex(
        TypeError, '.*DataType float16 not in list of allowed values.*')

  with assertion:
    self.assertAnalyzeAndTransformResults(
        input_data,
        input_metadata,
        preprocessing_fn,
        expected_data,
        expected_metadata,
        desired_batch_size=1000,
        # TODO(b/110855155): Remove this explicit use of DirectRunner.
        beam_pipeline=beam.Pipeline())