Example #1
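These snippets exercise the Keras `index_lookup.IndexLookup` preprocessing layer and its benchmarks. They are method excerpts and assume surrounding test classes plus roughly the following imports; the exact module paths vary across Keras/TensorFlow versions, so treat the paths below as an assumption:

# Assumed imports; exact paths differ between Keras/TF versions.
import time

import numpy as np
import tensorflow as tf

import keras
from keras import backend
from keras.layers.preprocessing import index_lookup
from tensorflow.python.framework import test_util as tf_test_utils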
    def test_strategy_with_file(self, strategy):
        # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
        if backend.is_tpu_strategy(strategy):
            self.skipTest("This test needs MLIR bridge on TPU.")

        vocab_data = ["earth", "wind", "and", "fire"]
        vocab_file = self._write_to_temp_file("temp", vocab_data)

        input_array = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch(
            2, drop_remainder=True)
        # Index 0 is the mask token "" and index 1 is the OOV token, so the
        # vocab words map to 2..5 and the unknown "michigan" maps to 1.
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

        # Let ops without a TPU kernel (e.g. string lookups) fall back to CPU.
        tf.config.set_soft_device_placement(True)

        with strategy.scope():
            input_data = keras.Input(shape=(None,), dtype=tf.string)
            layer = index_lookup.IndexLookup(max_tokens=None,
                                             num_oov_indices=1,
                                             mask_token="",
                                             oov_token="[OOV]",
                                             dtype=tf.string,
                                             vocabulary=vocab_file)
            int_data = layer(input_data)
            model = keras.Model(inputs=input_data, outputs=int_data)
        model.compile(loss="mse")
        output_dataset = model.predict(input_dataset)
        self.assertAllEqual(expected_output, output_dataset)
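`self._write_to_temp_file` is not shown in these excerpts. A minimal sketch of what it is assumed to do, writing one vocabulary token per line and returning the file path (hypothetical helper, for illustration only):

import os
import tempfile

def _write_to_temp_file(self, file_name, vocab_list):
    # Hypothetical sketch: one token per line, the layout IndexLookup
    # expects for file-based vocabularies.
    vocab_path = os.path.join(tempfile.mkdtemp(), file_name + ".txt")
    with open(vocab_path, "w") as f:
        f.write("\n".join(vocab_list) + "\n")
    return vocab_path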
def bm_adapt_implementation(self, num_elements, batch_size, k):
    """Benchmark the KPL adapt implementation."""
    ds = tf.data.Dataset.from_generator(word_gen, tf.string,
                                        tf.TensorShape([]))
    batched_ds = ds.take(num_elements).batch(batch_size)
    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(max_tokens=k,
                                     num_oov_indices=0,
                                     mask_token=None,
                                     oov_token="OOV",
                                     dtype=tf.string)
    _ = layer(input_t)  # Build the layer before timing adapt().
    num_repeats = 5
    starts = []
    ends = []
    for _ in range(num_repeats):
        starts.append(time.time())
        layer.adapt(batched_ds)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))
    name = "index_lookup_adapt|%s_elements|vocab_size_%s|batch_%s" % (
        num_elements, k, batch_size)
    # Compare against the pure-Python vocabulary-building baseline below.
    baseline = self.run_numpy_implementation(num_elements, batch_size, k)
    extras = {
        "numpy implementation baseline": baseline,
        "delta seconds": (baseline - avg_time),
        "delta percent": ((baseline - avg_time) / baseline) * 100
    }
    self.report_benchmark(iters=num_repeats,
                          wall_time=avg_time,
                          extras=extras,
                          name=name)
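`word_gen` is not defined above; the `from_generator` call only requires an iterable of scalar strings. A plausible sketch (the real generator may sample words differently):

import random
import string

def word_gen():
    # Endless stream of random lowercase "words", one scalar string per
    # yield, matching the tf.TensorShape([]) signature used above.
    while True:
        yield "".join(random.choice(string.ascii_lowercase) for _ in range(5))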
  def test_strategy(self, strategy):
    if (backend.is_tpu_strategy(strategy) and
        not tf_test_utils.is_mlir_bridge_enabled()):
      self.skipTest("TPU tests require MLIR bridge")

    vocab_data = [[
        "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
        "and", "fire"
    ]]
    vocab_dataset = tf.data.Dataset.from_tensors(vocab_data)
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch(
        2, drop_remainder=True)
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    tf.config.set_soft_device_placement(True)

    with strategy.scope():
      input_data = keras.Input(shape=(None,), dtype=tf.string)
      layer = index_lookup.IndexLookup(
          max_tokens=None,
          num_oov_indices=1,
          mask_token="",
          oov_token="[OOV]",
          vocabulary_dtype=tf.string)
      layer.adapt(vocab_dataset)  # Learn the vocabulary inside the strategy scope.
      int_data = layer(input_data)
      model = keras.Model(inputs=input_data, outputs=int_data)
    model.compile(loss="mse")
    output_dataset = model.predict(input_dataset)
    self.assertAllEqual(expected_output, output_dataset)
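For reference, the same adapt-then-lookup flow outside the test harness, using the default strategy returned by `tf.distribute.get_strategy()`. With `mask_token=""` and one OOV index, index 0 is the mask, index 1 is OOV, and adapted tokens start at 2 in descending frequency order. A minimal sketch, assuming the newer `vocabulary_dtype` argument shown above:

strategy = tf.distribute.get_strategy()  # no-op strategy outside any scope
with strategy.scope():
    layer = index_lookup.IndexLookup(
        max_tokens=None, num_oov_indices=1, mask_token="",
        oov_token="[OOV]", vocabulary_dtype=tf.string)
    vocab_data = ["earth"] * 4 + ["wind"] * 3 + ["and"] * 2 + ["fire"]
    layer.adapt(tf.constant([vocab_data]))
print(layer(tf.constant([["fire", "michigan"]])))  # expected: [[5, 1]]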
  def test_tpu_distribution(self, distribution):
    vocab_data = [[
        "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
        "and", "fire"
    ]]
    vocab_dataset = tf.data.Dataset.from_tensors(vocab_data)
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch(
        2, drop_remainder=True)
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    tf.config.set_soft_device_placement(True)

    with distribution.scope():
      input_data = keras.Input(shape=(None,), dtype=tf.string)
      layer = index_lookup.IndexLookup(
          max_tokens=None,
          num_oov_indices=1,
          mask_token="",
          oov_token="[OOV]",
          dtype=tf.string)
      layer.adapt(vocab_dataset)
      int_data = layer(input_data)
      model = keras.Model(inputs=input_data, outputs=int_data)
    model.compile(loss="mse")
    output_dataset = model.predict(input_dataset)
    self.assertAllEqual(expected_output, output_dataset)
def bm_adapt_implementation(self, num_elements, batch_size):
    """Benchmark the KPL forward pass with a file-backed vocabulary."""
    vocab = get_vocab()
    vocab_file = self._write_to_temp_file("vocab", vocab)
    # Map each whole line of the vocab file to its line number, offset by 2
    # to reserve index 0 for the mask token and index 1 for the OOV token.
    vocabulary_initializer = tf.lookup.TextFileInitializer(
        filename=vocab_file,
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER,
        value_index_offset=2,
    )
    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(
        vocabulary=vocabulary_initializer,
        max_tokens=None,
        num_oov_indices=1,
        mask_token="",
        oov_token="OOV",
        dtype=tf.string,
    )
    out_t = layer(input_t)
    model = keras.Model(input_t, out_t)
    num_repeats = 5
    starts = []
    ends = []
    data = tensor_gen(batch_size, num_elements)
    _ = model(data)  # Warm-up call so tracing is not measured.
    for _ in range(num_repeats):
        starts.append(time.time())
        _ = model(data)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))
    baseline, _ = self.run_numpy_implementation(data, vocab)
    extras = {
        "numpy implementation baseline": baseline,
        "delta seconds": (baseline - avg_time),
        "delta percent": ((baseline - avg_time) / baseline) * 100,
    }
    name = "index_lookup_forward|%s_elements|batch_%s" % (
        num_elements,
        batch_size,
    )
    self.report_benchmark(iters=num_repeats,
                          wall_time=avg_time,
                          extras=extras,
                          name=name)
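`get_vocab` and `tensor_gen` are also not defined in these excerpts. Plausible sketches, assuming a fixed list of unique words and a `(batch_size, num_elements)` string tensor drawn from it (hypothetical, for illustration only):

import random
import string

def get_vocab(size=1000):
    # Unique random "words"; uniqueness keeps file line numbers usable as ids.
    vocab = set()
    while len(vocab) < size:
        vocab.add("".join(random.choice(string.ascii_lowercase)
                          for _ in range(5)))
    return sorted(vocab)

def tensor_gen(batch_size, num_elements):
    vocab = get_vocab()
    rows = [[random.choice(vocab) for _ in range(num_elements)]
            for _ in range(batch_size)]
    return tf.constant(rows)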
def run_numpy_implementation(self, data, vocab):
    """Benchmark the baseline with an in-memory (Python list) vocabulary."""
    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(vocabulary=vocab,
                                     max_tokens=None,
                                     num_oov_indices=1,
                                     mask_token="",
                                     oov_token="OOV",
                                     dtype=tf.string)
    out_t = layer(input_t)
    model = keras.Model(input_t, out_t)
    num_repeats = 5
    starts = []
    ends = []
    _ = model(data)  # Warm-up call so tracing is not measured.
    for _ in range(num_repeats):
        starts.append(time.time())
        out = model(data)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))
    return avg_time, out
def run_numpy_implementation(self, num_elements, batch_size, k):
    """Benchmark the pure-Python baseline for vocabulary building."""
    ds = tf.data.Dataset.from_generator(word_gen, tf.string,
                                        tf.TensorShape([]))
    batched_ds = ds.take(num_elements).batch(batch_size)
    input_t = keras.Input(shape=(), dtype=tf.string)
    layer = index_lookup.IndexLookup(max_tokens=k,
                                     num_oov_indices=0,
                                     mask_token=None,
                                     oov_token="OOV",
                                     dtype=tf.string)
    _ = layer(input_t)  # Build the layer before timing.
    num_repeats = 5
    starts = []
    ends = []
    for _ in range(num_repeats):
        starts.append(time.time())
        # Count token frequencies outside the layer, then load the top-k
        # tokens with set_vocabulary() instead of calling adapt().
        vocab = get_top_k(batched_ds, k)
        layer.set_vocabulary(vocab)
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))
    return avg_time
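`get_top_k` is assumed to count token frequencies in pure Python and return the `k` most common tokens, which `set_vocabulary` then loads; a sketch of that assumption:

import collections

def get_top_k(dataset, k):
    # Pure-Python frequency count over the batched string dataset.
    counts = collections.Counter()
    for batch in dataset:
        counts.update(t.decode("utf-8") for t in batch.numpy())
    return [token for token, _ in counts.most_common(k)]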