def test_strategy_with_file(self, strategy):
    """Checks that a file-backed vocabulary lookup works under a strategy scope."""
    # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
    if backend.is_tpu_strategy(strategy):
        self.skipTest("This test needs MLIR bridge on TPU.")

    # Write the vocabulary to disk so the layer loads it from a file path.
    vocab_file = self._write_to_temp_file("temp", ["earth", "wind", "and", "fire"])

    # "michigan" is out-of-vocabulary and should map to the OOV index (1).
    string_inputs = np.array([["earth", "wind", "and", "fire"],
                              ["fire", "and", "earth", "michigan"]])
    input_dataset = tf.data.Dataset.from_tensor_slices(string_inputs).batch(
        2, drop_remainder=True)
    expected_indices = [[2, 3, 4, 5], [5, 4, 2, 1]]

    tf.config.set_soft_device_placement(True)

    with strategy.scope():
        string_input = keras.Input(shape=(None,), dtype=tf.string)
        lookup = index_lookup.IndexLookup(
            max_tokens=None,
            num_oov_indices=1,
            mask_token="",
            oov_token="[OOV]",
            dtype=tf.string,
            vocabulary=vocab_file)
        indexed = lookup(string_input)
        model = keras.Model(inputs=string_input, outputs=indexed)
    model.compile(loss="mse")

    predictions = model.predict(input_dataset)
    self.assertAllEqual(expected_indices, predictions)
def bm_adapt_implementation(self, num_elements, batch_size, k):
    """Benchmarks IndexLookup.adapt and compares it to the numpy baseline."""
    word_ds = tf.data.Dataset.from_generator(word_gen, tf.string,
                                             tf.TensorShape([]))
    batched_words = word_ds.take(num_elements).batch(batch_size)

    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(
        max_tokens=k,
        num_oov_indices=0,
        mask_token=None,
        oov_token="OOV",
        dtype=tf.string)
    # Build the layer once before timing so construction cost is excluded.
    _ = layer(input_t)

    num_repeats = 5
    starts, ends = [], []
    for _ in range(num_repeats):
        starts.append(time.time())
        layer.adapt(batched_words)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))

    name = "index_lookup_adapt|%s_elements|vocab_size_%s|batch_%s" % (
        num_elements, k, batch_size)
    baseline = self.run_numpy_implementation(num_elements, batch_size, k)
    extras = {
        "numpy implementation baseline": baseline,
        "delta seconds": (baseline - avg_time),
        "delta percent": ((baseline - avg_time) / baseline) * 100,
    }
    self.report_benchmark(
        iters=num_repeats, wall_time=avg_time, extras=extras, name=name)
def test_strategy(self, strategy):
    """Checks adapt + lookup end-to-end inside a distribution strategy."""
    if (backend.is_tpu_strategy(strategy) and
            not tf_test_utils.is_mlir_bridge_enabled()):
        self.skipTest("TPU tests require MLIR bridge")

    # Token frequencies determine vocabulary order after adapt():
    # earth > wind > and > fire.
    adapt_tokens = [[
        "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
        "and", "fire"
    ]]
    adapt_ds = tf.data.Dataset.from_tensors(adapt_tokens)

    # "michigan" is out-of-vocabulary and should map to the OOV index (1).
    string_inputs = np.array([["earth", "wind", "and", "fire"],
                              ["fire", "and", "earth", "michigan"]])
    input_dataset = tf.data.Dataset.from_tensor_slices(string_inputs).batch(
        2, drop_remainder=True)
    expected_indices = [[2, 3, 4, 5], [5, 4, 2, 1]]

    tf.config.set_soft_device_placement(True)

    with strategy.scope():
        string_input = keras.Input(shape=(None,), dtype=tf.string)
        lookup = index_lookup.IndexLookup(
            max_tokens=None,
            num_oov_indices=1,
            mask_token="",
            oov_token="[OOV]",
            vocabulary_dtype=tf.string)
        lookup.adapt(adapt_ds)
        indexed = lookup(string_input)
        model = keras.Model(inputs=string_input, outputs=indexed)
    model.compile(loss="mse")

    predictions = model.predict(input_dataset)
    self.assertAllEqual(expected_indices, predictions)
def test_tpu_distribution(self, distribution):
    """Checks adapt + lookup end-to-end under a TPU distribution scope."""
    # Token frequencies determine vocabulary order after adapt():
    # earth > wind > and > fire.
    adapt_tokens = [[
        "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
        "and", "fire"
    ]]
    adapt_ds = tf.data.Dataset.from_tensors(adapt_tokens)

    # "michigan" is out-of-vocabulary and should map to the OOV index (1).
    string_inputs = np.array([["earth", "wind", "and", "fire"],
                              ["fire", "and", "earth", "michigan"]])
    input_dataset = tf.data.Dataset.from_tensor_slices(string_inputs).batch(
        2, drop_remainder=True)
    expected_indices = [[2, 3, 4, 5], [5, 4, 2, 1]]

    tf.config.set_soft_device_placement(True)

    with distribution.scope():
        string_input = keras.Input(shape=(None,), dtype=tf.string)
        lookup = index_lookup.IndexLookup(
            max_tokens=None,
            num_oov_indices=1,
            mask_token="",
            oov_token="[OOV]",
            dtype=tf.string)
        lookup.adapt(adapt_ds)
        indexed = lookup(string_input)
        model = keras.Model(inputs=string_input, outputs=indexed)
    model.compile(loss="mse")

    predictions = model.predict(input_dataset)
    self.assertAllEqual(expected_indices, predictions)
def bm_adapt_implementation(self, num_elements, batch_size):
    """Benchmarks the IndexLookup forward pass against the numpy baseline."""
    vocab = get_vocab()
    vocab_file = self._write_to_temp_file("vocab", vocab)
    # Initialize the table directly from the vocab file; index offset 2
    # leaves room for the mask (0) and OOV (1) entries.
    vocabulary_initializer = tf.lookup.TextFileInitializer(
        filename=vocab_file,
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER,
        value_index_offset=2,
    )

    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(
        vocabulary=vocabulary_initializer,
        max_tokens=None,
        num_oov_indices=1,
        mask_token="",
        oov_token="OOV",
        dtype=tf.string,
    )
    model = keras.Model(input_t, layer(input_t))

    data = tensor_gen(batch_size, num_elements)
    # Warm-up call so tracing/compilation is excluded from the timings.
    _ = model(data)

    num_repeats = 5
    starts, ends = [], []
    for _ in range(num_repeats):
        starts.append(time.time())
        _ = model(data)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))

    baseline, _ = self.run_numpy_implementation(data, vocab)
    extras = {
        "numpy implementation baseline": baseline,
        "delta seconds": (baseline - avg_time),
        "delta percent": ((baseline - avg_time) / baseline) * 100,
    }
    name = "index_lookup_forward|%s_elements|batch_%s" % (
        num_elements,
        batch_size,
    )
    self.report_benchmark(
        iters=num_repeats, wall_time=avg_time, extras=extras, name=name)
def run_numpy_implementation(self, data, vocab):
    """Times the baseline lookup model; returns (avg seconds, last output)."""
    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(
        vocabulary=vocab,
        max_tokens=None,
        num_oov_indices=1,
        mask_token="",
        oov_token="OOV",
        dtype=tf.string)
    model = keras.Model(input_t, layer(input_t))

    # Warm-up call so tracing/compilation is excluded from the timings.
    _ = model(data)

    num_repeats = 5
    starts, ends = [], []
    for _ in range(num_repeats):
        starts.append(time.time())
        out = model(data)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))
    return avg_time, out
def run_numpy_implementation(self, num_elements, batch_size, k):
    """Times the python top-k vocab path; returns the average seconds per run."""
    word_ds = tf.data.Dataset.from_generator(word_gen, tf.string,
                                             tf.TensorShape([]))
    batched_words = word_ds.take(num_elements).batch(batch_size)

    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(
        max_tokens=k,
        num_oov_indices=0,
        mask_token=None,
        oov_token="OOV",
        dtype=tf.string)
    # Build the layer once before timing so construction cost is excluded.
    _ = layer(input_t)

    num_repeats = 5
    starts, ends = [], []
    for _ in range(num_repeats):
        starts.append(time.time())
        # Recompute the top-k vocabulary and install it on every iteration;
        # this is the work being benchmarked.
        layer.set_vocabulary(get_top_k(batched_words, k))
        ends.append(time.time())
    return np.mean(np.array(ends) - np.array(starts))