def testElementwiseQuantileBucketsWithWeights(self, input_dtype):

  def analyzer_fn(inputs):
    return {
        'q_b':
            tft.quantiles(
                tf.cast(inputs['x'], input_dtype),
                num_buckets=3,
                epsilon=0.00001,
                weights=inputs['weights'],
                reduce_instance_dims=False)
    }

  input_data = [{
      'x': [[x, 2 * x], [2 * x, x]],
      'weights': [x / 100.]
  } for x in range(1, 3000)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([2, 2],
                                tft_unit.canonical_numeric_dtype(input_dtype)),
      'weights':
          tf.io.FixedLenFeature([1], tf.float32)
  })
  # The expected output has 2 boundaries per element, dividing the data into
  # 3 buckets.
  expected_outputs = {
      'q_b':
          np.array(
              [[[1732, 2449], [3464, 4898]], [[3464, 4898], [1732, 2449]]],
              np.float32)
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=1000)

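# A minimal sketch (hypothetical helper, not part of the original suite) that
# re-derives the expected boundaries above. The weights are proportional to x,
# so cumulative weight grows like x**2, and the k-th of the 2 boundaries lands
# near 2999 * sqrt(k / 3): ~1732 and ~2449. The doubled 2 * x columns see the
# same weights, so their boundaries are simply doubled: ~3464 and ~4898.
def _sketch_weighted_quantile_boundaries():
  x = np.arange(1, 3000, dtype=np.float64)
  cumulative = np.cumsum(x / 100.)
  cumulative /= cumulative[-1]
  # First value whose normalized cumulative weight reaches 1/3 and 2/3.
  return [x[np.searchsorted(cumulative, q)] for q in (1. / 3., 2. / 3.)]
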
def _assert_quantile_boundaries(self,
                                test_inputs,
                                expected_boundaries,
                                input_dtype,
                                num_buckets=None,
                                num_expected_buckets=None):
  if not num_buckets:
    num_buckets = len(expected_boundaries) + 1
  if not num_expected_buckets:
    num_expected_buckets = num_buckets

  def analyzer_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    return {'q_b': tft.quantiles(x, num_buckets, epsilon=0.0001)}

  input_data = [{'x': [x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })
  expected_data = {'q_b': expected_boundaries}
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_data,
      desired_batch_size=1000)

def testGaussianize(self, input_data, output_data, elementwise):

  def preprocessing_fn(inputs):
    x = inputs['x']
    x_cast = tf.cast(x, tf.as_dtype(input_data.dtype))
    x_gaussianized = tft.scale_to_gaussian(x_cast, elementwise=elementwise)
    self.assertEqual(x_gaussianized.dtype, tf.as_dtype(output_data.dtype))
    return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

  input_data_dicts = [{'x': x} for x in input_data]
  expected_data_dicts = [{
      'x_gaussianized': x_gaussianized
  } for x_gaussianized in output_data]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature(
              input_data.shape[1:],
              tft_unit.canonical_numeric_dtype(tf.as_dtype(input_data.dtype))),
  })
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'x_gaussianized':
          tf.io.FixedLenFeature(output_data.shape[1:], tf.float32),
  })
  self.assertAnalyzeAndTransformResults(
      input_data_dicts,
      input_metadata,
      preprocessing_fn,
      expected_data_dicts,
      expected_metadata,
      desired_batch_size=20,
      beam_pipeline=beam.Pipeline())

def testQuantileBuckets(self, input_dtype):

  def analyzer_fn(inputs):
    return {
        'q_b':
            tft.quantiles(
                tf.cast(inputs['x'], input_dtype),
                num_buckets=3,
                epsilon=0.00001)
    }

  # NOTE: We force 3 batches: the data has 2999 elements and we request a
  # batch size of 1000.
  input_data = [{'x': [x]} for x in range(1, 3000)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })
  # The expected output has 2 boundaries that divide the data into 3 buckets.
  expected_outputs = {'q_b': np.array([[1000, 2000]], np.float32)}
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=1000)

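# A quick sanity sketch (hypothetical helper, not part of the original suite):
# the unweighted 1/3 and 2/3 quantiles of range(1, 3000) land near 1000 and
# 2000, matching the expected boundaries above to within the analyzer's
# approximation error.
def _sketch_unweighted_quantile_boundaries():
  return np.percentile(np.arange(1, 3000), [100. / 3., 200. / 3.])
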
def testGaussianizeSparse(self, input_dtype, elementwise):

  def preprocessing_fn(inputs):
    x_gaussianized = tf.sparse.to_dense(
        tft.scale_to_gaussian(
            tf.cast(inputs['x'], input_dtype), elementwise=elementwise),
        default_value=np.nan)
    x_gaussianized.set_shape([None, 4])
    self.assertEqual(x_gaussianized.dtype,
                     impl_test._mean_output_dtype(input_dtype))
    return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'idx': [0, 1],
        'val': [v] + [-input_data_values[-1 - idx]]
    })
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.SparseFeature('idx', 'val',
                              tft_unit.canonical_numeric_dtype(input_dtype), 4)
  })
  if elementwise:
    expected_data_values = [
        -0.09304726, -2.24682532, 1.56900163, -0.78244931, 0.48285998,
        0.47461339, -1.50929952, -0.39008015, 0.41659823, -0.81174337,
        0.54027596, 2.11624695, -1.72816411, -0.16046759, 1.13320023,
        0.74814557, -0.21014091, 0.04373742, -0.08454805, -0.39008015
    ]
  else:
    expected_data_values = [
        0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536,
        1.03443104, 0.26969729, 0.84990131, 1.02201077, 0.72569862,
        1.04862563, 1.49752966, -0.02838919, 0.90135672, 1.18702292,
        1.09475806, 0.89071077, 0.9439405, 0.91732564, 0.84990131
    ]
  expected_data = []
  for idx, v in enumerate(expected_data_values):
    expected_data.append({
        'x_gaussianized': ([v] + [-expected_data_values[-1 - idx]] +
                           [float('nan'), float('nan')])
    })
  expected_metadata = tft_unit.metadata_from_feature_spec(
      {'x_gaussianized': tf.io.FixedLenFeature([4], tf.float32)})
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=20,
      beam_pipeline=beam.Pipeline())

def testGaussianizeRagged(self, input_dtype):
  tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')

  def preprocessing_fn(inputs):
    x_gaussianized = tft.scale_to_gaussian(tf.cast(inputs['x'], input_dtype))
    self.assertEqual(x_gaussianized.dtype,
                     impl_test._mean_output_dtype(input_dtype))
    return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'val': [v, -input_data_values[-1 - idx]],
        'row_lengths_1': [2, 1, 0],
        'row_lengths_2': [1, 0, 1],
    })
  input_metadata = tft.DatasetMetadata.from_feature_spec({
      'x':
          tf.io.RaggedFeature(
              tft_unit.canonical_numeric_dtype(input_dtype),
              value_key='val',
              partitions=[
                  tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
                  tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
              ]),
  })
  expected_data_values = [
      0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536, 1.03443104,
      0.26969729, 0.84990131, 1.02201077, 0.72569862, 1.04862563, 1.49752966,
      -0.02838919, 0.90135672, 1.18702292, 1.09475806, 0.89071077, 0.9439405,
      0.91732564, 0.84990131
  ]
  expected_data = []
  for idx, v in enumerate(expected_data_values):
    expected_data.append({
        'x_gaussianized$ragged_values': ([v, -expected_data_values[-1 - idx]]),
        'x_gaussianized$row_lengths_1': [2, 1, 0],
        'x_gaussianized$row_lengths_2': [1, 0, 1]
    })
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      desired_batch_size=20,
      # Runs the test deterministically on the whole batch.
      beam_pipeline=beam.Pipeline())

def _assert_quantile_boundaries(self,
                                test_inputs,
                                expected_boundaries,
                                input_dtype,
                                num_buckets=None,
                                num_expected_buckets=None,
                                always_return_num_quantiles=True):
  if not num_buckets:
    num_buckets = len(expected_boundaries) + 1
  if not num_expected_buckets:
    num_expected_buckets = num_buckets

  def preprocessing_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    quantiles = tft.quantiles(
        x,
        num_buckets,
        epsilon=0.0001,
        always_return_num_quantiles=always_return_num_quantiles)
    quantiles.set_shape([1, num_expected_buckets - 1])
    return {'q_b': quantiles}

  input_data = [{'x': [x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })
  # The boundaries are constant across instances, so the expected data has
  # one row of boundaries per batch.
  batch_size = 1000
  expected_data = []
  num_batches = int(math.ceil(len(test_inputs) / float(batch_size)))
  for _ in range(num_batches):
    expected_data += [{'q_b': expected_boundaries}]
  expected_metadata = None
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=batch_size,
      # TODO(b/110855155): Remove this explicit use of DirectRunner.
      beam_pipeline=beam.Pipeline())

def testQuantileBucketsWithWeights(self, input_dtype, with_nans):

  def analyzer_fn(inputs):
    return {
        'q_b':
            tft.quantiles(
                tf.cast(inputs['x'], input_dtype),
                num_buckets=3,
                epsilon=0.00001,
                weights=inputs['weights'])
    }

  input_data = [{
      'x': [x],
      'weights': [x / 100.]
  } for x in range(1, 3000)]
  if with_nans:
    # NaN values and NaN weights should be ignored: the expected boundaries
    # below are the same as in the case without NaNs.
    input_data += [{
        'x': [np.nan],
        'weights': [100000]
    }, {
        'x': [100000],
        'weights': [np.nan]
    }]
  input_metadata = tft.DatasetMetadata.from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype)),
      'weights':
          tf.io.FixedLenFeature([1], tf.float32)
  })
  # The expected output has 2 boundaries that divide the data into 3 buckets.
  expected_outputs = {'q_b': np.array([[1732, 2449]], np.float32)}
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=1000)

def testBucketizationElementwise(self, test_inputs, expected_boundaries,
                                 do_shuffle, epsilon, should_apply,
                                 is_manual_boundaries, input_dtype):
  test_inputs = list(test_inputs)

  # Shuffle the input to add randomness to input generated with
  # simple range().
  if do_shuffle:
    random.shuffle(test_inputs)

  def preprocessing_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    num_buckets = len(expected_boundaries) + 1
    if should_apply:
      if is_manual_boundaries:
        bucket_boundaries = [
            expected_boundaries, [2 * b for b in expected_boundaries]
        ]
      else:
        bucket_boundaries = tft.quantiles(
            x, num_buckets, epsilon, reduce_instance_dims=False)
        bucket_boundaries = tf.unstack(bucket_boundaries, axis=0)

      result = []
      for i, boundaries in enumerate(bucket_boundaries):
        boundaries = tf.cast(boundaries, tf.float32)
        result.append(
            tft.apply_buckets(x[:, i], tf.expand_dims(boundaries, axis=0)))
      result = tf.stack(result, axis=1)
    else:
      result = tft.bucketize(
          x, num_buckets=num_buckets, epsilon=epsilon, elementwise=True)
    return {'q_b': result}

  input_data = [{'x': [x, 2 * x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([2],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })

  # Sort the input based on value; the index is used to create expected_data.
  sorted_list = sorted(enumerate(test_inputs), key=lambda p: p[1])

  # Expected data has the same size as input, one bucket per input value.
  expected_data = [[None, None]] * len(test_inputs)
  bucket = 0
  for (index, x) in sorted_list:
    # Increment the bucket number when crossing the boundary.
    if (bucket < len(expected_boundaries) and
        x >= expected_boundaries[bucket]):
      bucket += 1
    expected_data[index] = {'q_b': [bucket, bucket]}

  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'q_b': tf.io.FixedLenFeature([2], tf.int64),
      }, None)

  @contextlib.contextmanager
  def no_assert():
    yield None

  assertion = no_assert()
  if input_dtype == tf.float16:
    assertion = self.assertRaisesRegex(
        TypeError, '.*DataType float16 not in list of allowed values.*')

  with assertion:
    self.assertAnalyzeAndTransformResults(
        input_data,
        input_metadata,
        preprocessing_fn,
        expected_data,
        expected_metadata,
        desired_batch_size=1000,
        # TODO(b/110855155): Remove this explicit use of DirectRunner.
        beam_pipeline=beam.Pipeline())

def testBucketization(self, test_inputs, expected_boundaries, do_shuffle,
                      epsilon, should_apply, is_manual_boundaries,
                      input_dtype):
  test_inputs = list(test_inputs)

  # Shuffle the input to add randomness to input generated with
  # simple range().
  if do_shuffle:
    random.shuffle(test_inputs)

  def preprocessing_fn(inputs):
    x = tf.cast(inputs['x'], input_dtype)
    num_buckets = len(expected_boundaries) + 1
    if should_apply:
      if is_manual_boundaries:
        bucket_boundaries = [expected_boundaries]
      else:
        # Compute the boundaries on the casted input for consistency with the
        # elementwise variant of this test.
        bucket_boundaries = tft.quantiles(x, num_buckets, epsilon)
      result = tft.apply_buckets(x, bucket_boundaries)
    else:
      result = tft.bucketize(x, num_buckets=num_buckets, epsilon=epsilon)
    return {'q_b': result}

  input_data = [{'x': [x]} for x in test_inputs]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'x':
          tf.io.FixedLenFeature([1],
                                tft_unit.canonical_numeric_dtype(input_dtype))
  })

  # Sort the input based on value; the index is used to create expected_data.
  indexed_input = enumerate(test_inputs)
  sorted_list = sorted(indexed_input, key=lambda p: p[1])

  # Expected data has the same size as input, one bucket per input value.
  expected_data = [None] * len(test_inputs)
  bucket = 0
  for (index, x) in sorted_list:
    # Increment the bucket number when crossing the boundary.
    if (bucket < len(expected_boundaries) and
        x >= expected_boundaries[bucket]):
      bucket += 1
    expected_data[index] = {'q_b': [bucket]}

  expected_metadata = tft_unit.metadata_from_feature_spec(
      {
          'q_b': tf.io.FixedLenFeature([1], tf.int64),
      }, {
          'q_b':
              schema_pb2.IntDomain(
                  min=0, max=len(expected_boundaries), is_categorical=True),
      })

  @contextlib.contextmanager
  def no_assert():
    yield None

  assertion = no_assert()
  if input_dtype == tf.float16:
    assertion = self.assertRaisesRegex(
        TypeError, '.*DataType float16 not in list of allowed values.*')

  with assertion:
    self.assertAnalyzeAndTransformResults(
        input_data,
        input_metadata,
        preprocessing_fn,
        expected_data,
        expected_metadata,
        desired_batch_size=1000)

def testTukeyHHAnalyzersWithSparseInputs(self,
                                         input_dtype,
                                         output_dtypes,
                                         elementwise=True):

  def analyzer_fn(inputs):
    a = tf.cast(inputs['a'], input_dtype)

    def assert_and_cast_dtype(tensor, out_dtype):
      self.assertEqual(tensor.dtype, out_dtype)
      return tf.cast(tensor, tft_unit.canonical_numeric_dtype(out_dtype))

    return {
        'tukey_location':
            assert_and_cast_dtype(
                tft.tukey_location(a, reduce_instance_dims=not elementwise),
                output_dtypes['tukey_location']),
        'tukey_scale':
            assert_and_cast_dtype(
                tft.tukey_scale(a, reduce_instance_dims=not elementwise),
                output_dtypes['tukey_scale']),
        'tukey_hl':
            assert_and_cast_dtype(
                tft.tukey_h_params(a, reduce_instance_dims=not elementwise)[0],
                output_dtypes['tukey_hl']),
        'tukey_hr':
            assert_and_cast_dtype(
                tft.tukey_h_params(a, reduce_instance_dims=not elementwise)[1],
                output_dtypes['tukey_hr']),
    }

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'idx0': [0, 0],
        'idx1': [0, 1],
        'val': [v, -input_data_values[-1 - idx]]
    })
  input_metadata = tft_unit.metadata_from_feature_spec({
      'a':
          tf.io.SparseFeature(['idx0', 'idx1'], 'val',
                              tft_unit.canonical_numeric_dtype(input_dtype),
                              (2, 2))
  })
  expected_outputs = {
      'tukey_location':
          np.array(
              [[526.89355, -526.89355], [0., 0.]] if elementwise else 0.0,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_location']).as_numpy_dtype),
      'tukey_scale':
          np.array(
              [[116.73997, 116.73997], [1., 1.]] if elementwise else 572.2776,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_scale']).as_numpy_dtype),
      'tukey_hl':
          np.array(
              [[0.6629082, 0.11148566], [0., 0.]] if elementwise else 0.0,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_hl']).as_numpy_dtype),
      'tukey_hr':
          np.array(
              [[0.11148566, 0.6629082], [0., 0.]] if elementwise else 0.0,
              tft_unit.canonical_numeric_dtype(
                  output_dtypes['tukey_hr']).as_numpy_dtype),
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=20,
      # Runs the test deterministically on the whole batch.
      beam_pipeline=beam.Pipeline())

def testTukeyHHAnalyzersWithRaggedInputs(self, input_dtype):
  tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')

  output_dtype = impl_test._mean_output_dtype(input_dtype)
  canonical_output_dtype = tft_unit.canonical_numeric_dtype(output_dtype)

  def analyzer_fn(inputs):
    a = tf.cast(inputs['a'], input_dtype)

    def assert_and_cast_dtype(tensor):
      self.assertEqual(tensor.dtype, output_dtype)
      return tf.cast(tensor, canonical_output_dtype)

    return {
        'tukey_location': assert_and_cast_dtype(tft.tukey_location(a)),
        'tukey_scale': assert_and_cast_dtype(tft.tukey_scale(a)),
        'tukey_hl': assert_and_cast_dtype(tft.tukey_h_params(a)[0]),
        'tukey_hr': assert_and_cast_dtype(tft.tukey_h_params(a)[1]),
    }

  input_data_values = [
      516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
      669, 617, 502, 532, 517, 479
  ]
  input_data = []
  for idx, v in enumerate(input_data_values):
    input_data.append({
        'val': [v, -input_data_values[-1 - idx]],
        'row_lengths_1': [2, 0, 1],
        'row_lengths_2': [0, 1, 1]
    })
  input_metadata = tft.DatasetMetadata.from_feature_spec({
      'a':
          tf.io.RaggedFeature(
              tft_unit.canonical_numeric_dtype(input_dtype),
              value_key='val',
              partitions=[
                  tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
                  tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
              ]),
  })
  expected_outputs = {
      'tukey_location': np.array(0.0, canonical_output_dtype.as_numpy_dtype),
      'tukey_scale': np.array(572.2776,
                              canonical_output_dtype.as_numpy_dtype),
      'tukey_hl': np.array(0.0, canonical_output_dtype.as_numpy_dtype),
      'tukey_hr': np.array(0.0, canonical_output_dtype.as_numpy_dtype),
  }
  self.assertAnalyzerOutputs(
      input_data,
      input_metadata,
      analyzer_fn,
      expected_outputs,
      desired_batch_size=20,
      # Runs the test deterministically on the whole batch.
      beam_pipeline=beam.Pipeline())

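# A minimal sanity sketch (hypothetical helper, not part of the original
# suite). Each input value above is paired with the negation of another, so
# the pooled sample is exactly symmetric about zero; that is why the expected
# Tukey location is ~0.0 and the left and right h parameters agree. The fitted
# Tukey scale (572.2776) is close to, though not identical to, the plain
# sample standard deviation computed here (~563.6).
def _sketch_symmetric_sample_stats(values):
  pooled = np.array(list(values) + [-v for v in values], dtype=np.float64)
  return pooled.mean(), pooled.std()
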