Example #1
def wmt_parsing_tokens(model_hparams, wrong_vocab_size):
  """English to parse tree translation benchmark.

  Args:
    model_hparams: a tf.contrib.training.HParams
    wrong_vocab_size: a number used in the filename indicating the approximate
      vocabulary size.  This is not to be confused with the actual vocabulary
      size.
  Returns:
    a tf.contrib.training.HParams
  """
  p = default_problem_hparams()
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir,
                                "tokens.vocab.%d" % wrong_vocab_size)
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  p.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size)
  }
  p.target_modality = modality.SymbolModality(model_hparams,
                                              subtokenizer.vocab_size)
  p.vocabulary = {
      "inputs": subtokenizer,
      "targets": subtokenizer,
  }
  p.input_space_id = 3
  p.target_space_id = 15
  return p
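For reference, a hedged sketch of how a builder like this would be invoked; the data directory and the vocab-file size below are hypothetical, and the call assumes a matching tokens.vocab.32768 file already exists under data_dir:

model_hparams = tf.contrib.training.HParams(data_dir="/tmp/t2t_data")  # hypothetical path
p = wmt_parsing_tokens(model_hparams, wrong_vocab_size=32768)
print(p.input_space_id, p.target_space_id)  # -> 3 15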
Example #2
def test_problem_hparams(model_hparams, input_vocab_size, target_vocab_size):
  """Problem hparams for testing model bodies."""
  p = default_problem_hparams()
  p.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, input_vocab_size)
  }
  p.target_modality = modality.SymbolModality(model_hparams, target_vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.TextEncoder()
  }
  return p
Example #3
def wmt_parsing_characters(model_hparams):
  """English to parse tree translation benchmark."""
  p = default_problem_hparams()
  p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
  p.target_modality = modality.SymbolModality(model_hparams, 256)
  p.vocabulary = {
      "inputs": text_encoder.ByteTextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  p.loss_multiplier = 2.0
  p.input_space_id = 2
  p.target_space_id = 14
  return p
Example #4
def algorithmic(vocab_size, model_hparams):
  """Default parameters for algorithmic tasks."""
  p = default_problem_hparams()
  p.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, vocab_size)
  }
  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(num_reserved_ids=1),
      "targets": text_encoder.TextEncoder(num_reserved_ids=1),
  }
  p.input_space_id = 10
  p.target_space_id = 11
  return p
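Here num_reserved_ids=1 reserves id 0 (conventionally padding). As best I recall, tensor2tensor's base TextEncoder treats text as space-separated integer ids shifted past the reserved range; a hedged sketch:

enc = text_encoder.TextEncoder(num_reserved_ids=1)  # id 0 reserved for padding
ids = enc.encode("3 1 4")  # -> [4, 2, 5]: each raw token shifted by one
text = enc.decode(ids)     # -> "3 1 4": the shift is undone on decode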
Example #5
def audio_timit_tokens(model_hparams, wrong_vocab_size):
  """English audio transcription benchmark.

  Args:
    model_hparams: a tf.contrib.training.HParams
    wrong_vocab_size: a number used in the filename indicating the approximate
      vocabulary size.  This is not to be confused with the actual vocabulary
      size.
  Returns:
    a tf.contrib.training.HParams
  """
  p = default_problem_hparams()
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir,
                                "tokens.vocab.%d" % wrong_vocab_size)
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  p.input_modality = {
      "inputs": modality.AudioModality(model_hparams),
  }
  p.target_modality = modality.SymbolModality(model_hparams,
                                              subtokenizer.vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": subtokenizer,
  }
  p.batch_size_multiplier = 256
  p.loss_multiplier = 2.0
  p.input_space_id = 13
  p.target_space_id = 3
  return p
Example #6
  def testSymbolModalityTargets(self):
    # A method of a tf.test.TestCase subclass in the original test suite.
    batch_size = 10
    num_datashards = 5
    length = 6
    height = 7
    hidden_size = 9
    vocab_size = 11
    model_hparams = tf.contrib.training.HParams(
        symbol_modality_num_shards=4,
        hidden_size=hidden_size,
        label_smoothing=0.2,
        shared_embedding_and_softmax_weights=0)
    # np.random.random_integers is deprecated; randint(high) draws from
    # [0, high), matching the original -1 + random_integers(high) range.
    body_output = np.random.randint(
        100, size=(batch_size, length, height, hidden_size))
    targets = np.random.randint(
        vocab_size, size=(batch_size, length, height, 1))
    m = modality.SymbolModality(model_hparams, vocab_size)
    data_parallelism = expert_utils.Parallelism(
        ["/device:CPU:0"] * num_datashards, reuse=True)
    with self.test_session() as session:
      sharded_body_output = tf.split(tf.to_float(body_output), num_datashards)
      sharded_targets = tf.split(targets, num_datashards)
      sharded_logits, train_loss = m.targets_top_sharded(
          sharded_body_output, sharded_targets, data_parallelism)
      logits = tf.concat(sharded_logits, 0)
      session.run(tf.global_variables_initializer())
      res1, res2 = session.run((logits, train_loss))
    self.assertEqual(res1.shape, (batch_size, length, height, 1, vocab_size))
    self.assertEqual(res2.shape, ())
Example #7
def wmt_ende_v2(model_hparams, vocab_size):
  """English to German translation benchmark with separate vocabularies."""
  p = default_problem_hparams()
  # These vocab files must be present within the data directory.
  source_vocab_filename = os.path.join(model_hparams.data_dir,
                                       "wmt_ende_v2.en.vocab.%d" % vocab_size)
  target_vocab_filename = os.path.join(model_hparams.data_dir,
                                       "wmt_ende_v2.de.vocab.%d" % vocab_size)
  p.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, vocab_size)
  }
  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.SubwordTextEncoder(source_vocab_filename),
      "targets": text_encoder.SubwordTextEncoder(target_vocab_filename),
  }
  p.input_space_id = 3
  p.target_space_id = 8
  return p
Example #8
def wmt_ende_tokens(model_hparams, wrong_vocab_size):
  """English to German translation benchmark."""
  p = default_problem_hparams()
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir,
                                "tokens.vocab.%d" % wrong_vocab_size)
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  p.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size)
  }
  p.target_modality = modality.SymbolModality(model_hparams,
                                              subtokenizer.vocab_size)
  p.vocabulary = {
      "inputs": subtokenizer,
      "targets": subtokenizer,
  }
  p.input_space_id = 3
  p.target_space_id = 8
  return p
Example #9
def image_mnist(model_hparams):
  """MNIST."""
  p = default_problem_hparams()
  p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
  p.target_modality = modality.ClassLabelModality(model_hparams, 10)
  p.batch_size_multiplier = 4
  p.max_expected_batch_size_per_shard = 8
  p.loss_multiplier = 3.0
  p.input_space_id = 1
  p.target_space_id = 1
  return p
Example #10
def lm1b_64k(model_hparams):
  """Billion-word language-modeling benchmark, 64k subtoken vocabulary."""
  p = default_problem_hparams()
  p.perplexity_exponent = 1.067068
  p.input_modality = {}
  p.target_modality = modality.SymbolModality(model_hparams, 65536)
  p.vocabulary = {
      "targets":
          text_encoder.SubwordTextEncoder(
              os.path.join(model_hparams.data_dir,
                           "lm1b_64k.subword_text_encoder"))
  }
  p.target_space_id = 3
  return p
Example #11
def image_mscoco_characters(model_hparams):
  """COCO image captioning with captions as characters."""
  p = default_problem_hparams()
  p.input_modality = {"inputs": modality.ImageModality(model_hparams)}
  p.target_modality = modality.SymbolModality(model_hparams, 256)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  p.batch_size_multiplier = 128
  p.max_expected_batch_size_per_shard = 2
  p.loss_multiplier = 2.0
  p.input_space_id = 1
  p.target_space_id = 2
  return p
Example #12
def audio_wsj_characters(model_hparams):
  """English audio transcription benchmark."""
  p = default_problem_hparams()
  p.input_modality = {
      "inputs": modality.AudioSpectralModality(model_hparams),
  }
  p.target_modality = modality.SymbolModality(model_hparams, 256)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  p.batch_size_multiplier = 512
  p.loss_multiplier = 2.0
  p.input_space_id = 13
  p.target_space_id = 2
  return p
Example #13
def image_mscoco_tokens(model_hparams, vocab_count):
  """COCO image captioning with captions as tokens."""
  p = default_problem_hparams()
  p.input_modality = {"inputs": modality.ImageModality(model_hparams)}
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir,
                                "tokens.vocab.%d" % vocab_count)
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  p.target_modality = modality.SymbolModality(model_hparams,
                                              subtokenizer.vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": subtokenizer,
  }
  p.batch_size_multiplier = 256
  p.max_expected_batch_size_per_shard = 2
  p.input_space_id = 1
  p.target_space_id = 3
  return p
Example #14
def wmt_ende_bpe32k(model_hparams):
  """English to German translation benchmark."""
  p = default_problem_hparams()
  # A single modality object enables embedding sharing between inputs and
  # targets when model_hparams.shared_source_target_embedding is True.
  vocab_size = 40960
  m = modality.SymbolModality(model_hparams, vocab_size)
  p.input_modality = {"inputs": m}
  p.target_modality = m
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir, "vocab.bpe.32000")
  p.vocabulary = {
      "inputs": text_encoder.TokenTextEncoder(vocab_filename=vocab_filename),
      "targets": text_encoder.TokenTextEncoder(vocab_filename=vocab_filename),
  }
  p.loss_multiplier = 1.4
  p.input_space_id = 4
  p.target_space_id = 9
  return p
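A quick, hypothetical check of the sharing described in the comment above (the path is made up, and a vocab.bpe.32000 file must exist under it):

hparams = tf.contrib.training.HParams(data_dir="/tmp/t2t_data")  # hypothetical
p = wmt_ende_bpe32k(hparams)
# One shared SymbolModality object, so embedding/softmax weights can be reused.
assert p.target_modality is p.input_modality["inputs"]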
Example #15
  def testSymbolModalityInputs(self):
    # A method of a tf.test.TestCase subclass in the original test suite.
    batch_size = 10
    num_datashards = 5
    length = 5
    vocab_size = 5000
    hidden_size = 9
    model_hparams = tf.contrib.training.HParams(
        symbol_modality_num_shards=4,
        hidden_size=hidden_size,
        multiply_embedding_mode="sqrt_depth",
        shared_embedding_and_softmax_weights=0)
    # np.random.random_integers is deprecated; randint(high) draws from
    # [0, high), matching the original -1 + random_integers(high) range.
    x = np.random.randint(vocab_size, size=(batch_size, length, 1, 1))
    m = modality.SymbolModality(model_hparams, vocab_size)
    data_parallelism = expert_utils.Parallelism(
        ["/device:CPU:0"] * num_datashards, reuse=True)
    with self.test_session() as session:
      xs = tf.split(x, num_datashards)
      sharded_output = m.inputs_bottom_sharded(xs, data_parallelism)
      output = tf.concat(sharded_output, 0)
      session.run(tf.global_variables_initializer())
      res = session.run(output)
    self.assertEqual(res.shape, (batch_size, length, 1, hidden_size))
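The test methods in Examples #6 and #15 read as members of a tf.test.TestCase subclass; a minimal, hypothetical harness for running them (the class name is an assumption):

class SymbolModalityTest(tf.test.TestCase):
  ...  # paste testSymbolModalityTargets / testSymbolModalityInputs here

if __name__ == "__main__":
  tf.test.main()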