def testCheckVocab(self):
  # Create a vocab file
  vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
  os.makedirs(vocab_dir)
  vocab_file = os.path.join(vocab_dir, "vocab_file")
  vocab = ["alpha", "beta", "charli", "delta"]
  # Use a codecs *writer* (not reader) to write the utf-8 vocab file.
  with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f:
    for word in vocab:
      f.write("%s\n" % word)

  # Call vocab_utils
  out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
  os.makedirs(out_dir)
  vocab_size, new_vocab_file = vocab_utils.check_vocab(
      vocab_file, out_dir)

  # Assert: we expect the code to add <unk>, <s>, </s> and
  # create a new vocab file
  self.assertEqual(len(vocab) + 3, vocab_size)
  self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)

  new_vocab = []
  with codecs.getreader("utf-8")(tf.gfile.GFile(new_vocab_file, "rb")) as f:
    for line in f:
      new_vocab.append(line.strip())
  self.assertEqual(
      [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab)
def testCheckVocab(self):
  # Create a vocab file
  vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
  os.makedirs(vocab_dir)
  vocab_file = os.path.join(vocab_dir, "vocab_file")
  vocab = ["a", "b", "c"]
  with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f:
    for word in vocab:
      f.write("%s\n" % word)

  # Call vocab_utils
  out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
  os.makedirs(out_dir)
  vocab_size, new_vocab_file = vocab_utils.check_vocab(
      vocab_file, out_dir)

  # Assert: we expect the code to add <unk>, <s>, </s> and
  # create a new vocab file
  self.assertEqual(len(vocab) + 3, vocab_size)
  self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)

  new_vocab, _ = vocab_utils.load_vocab(new_vocab_file)
  self.assertEqual(
      [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab)
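For context, the behavior these tests pin down can be sketched as follows. This is a simplified reconstruction from the assertions above, not the real vocab_utils.check_vocab: the argument names, default tokens, and file handling here are assumptions.

import codecs
import os


def check_vocab_sketch(vocab_file, out_dir, unk="<unk>", sos="<s>", eos="</s>"):
  """Ensure the vocab starts with unk/sos/eos; write a fixed copy to out_dir.

  Returns the (possibly extended) vocab size and the path of the file that
  should be used from now on. Sketch only; the real implementation may differ.
  """
  with codecs.open(vocab_file, "r", encoding="utf-8") as f:
    vocab = [line.strip() for line in f]

  if vocab[:3] != [unk, sos, eos]:
    # Prepend the special tokens and write a new vocab file into out_dir,
    # keeping the original base name (e.g. "vocab_file" in the tests above).
    vocab = [unk, sos, eos] + vocab
    vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
    with codecs.open(vocab_file, "w", encoding="utf-8") as f:
      for word in vocab:
        f.write("%s\n" % word)

  return len(vocab), vocab_file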
def create_standard_hparams(data_path, out_dir):
  hparams = tf.contrib.training.HParams(
      # Data
      src="vi",
      tgt="en",
      train_prefix=os.path.join(data_path, "train"),
      dev_prefix=os.path.join(data_path, "tst2012"),
      test_prefix=os.path.join(data_path, "tst2013"),
      vocab_prefix="",
      embed_prefix="",
      out_dir=out_dir,
      src_vocab_file=os.path.join(data_path, "vocab.vi"),
      tgt_vocab_file=os.path.join(data_path, "vocab.en"),
      src_embed_file="",
      tgt_embed_file="",
      src_file=os.path.join(data_path, "train.vi"),
      tgt_file=os.path.join(data_path, "train.en"),
      dev_src_file=os.path.join(data_path, "tst2012.vi"),
      dev_tgt_file=os.path.join(data_path, "tst2012.en"),
      test_src_file=os.path.join(data_path, "tst2013.vi"),
      test_tgt_file=os.path.join(data_path, "tst2013.en"),

      # Networks
      num_units=512,
      num_layers=1,
      num_encoder_layers=1,
      num_decoder_layers=1,
      num_encoder_residual_layers=0,
      num_decoder_residual_layers=0,
      dropout=0.2,
      encoder_type="uni",
      residual=False,
      time_major=True,
      num_embeddings_partitions=0,
      unit_type="custom",
      custom_cell=SkipLSTMCell,

      # Train
      optimizer="sgd",
      batch_size=128,
      init_op="uniform",
      init_weight=0.1,
      max_gradient_norm=100.0,
      learning_rate=1.0,
      warmup_steps=0,
      warmup_scheme="t2t",
      decay_scheme="luong234",
      colocate_gradients_with_ops=True,
      num_train_steps=20000,

      # Data constraints
      num_buckets=5,
      max_train=0,
      src_max_len=25,
      tgt_max_len=25,
      src_max_len_infer=0,
      tgt_max_len_infer=0,

      # Data format
      sos="<s>",
      eos="</s>",
      subword_option="",
      check_special_token=True,

      # Misc
      forget_bias=1.0,
      num_gpus=1,
      epoch_step=0,  # record where we were within an epoch.
      steps_per_stats=100,
      steps_per_eval=1000,
      steps_per_external_eval=500,
      share_vocab=False,
      metrics=["bleu"],
      log_device_placement=False,
      random_seed=None,
      # only enable beam search during inference when beam_width > 0.
      beam_width=0,
      length_penalty_weight=0.0,
      override_loaded_hparams=True,
      num_keep_ckpts=5,
      avg_ckpts=False,
      num_intra_threads=1,
      num_inter_threads=8,

      # For inference
      inference_indices=None,
      infer_batch_size=32,
      sampling_temperature=0.0,
      num_translations_per_input=1,
  )

  src_vocab_size, _ = vocab_utils.check_vocab(hparams.src_vocab_file,
                                              hparams.out_dir)
  tgt_vocab_size, _ = vocab_utils.check_vocab(hparams.tgt_vocab_file,
                                              hparams.out_dir)
  hparams.add_hparam("src_vocab_size", src_vocab_size)
  hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)

  out_dir = hparams.out_dir
  if not tf.gfile.Exists(out_dir):
    tf.gfile.MakeDirs(out_dir)

  for metric in hparams.metrics:
    hparams.add_hparam("best_" + metric, 0)  # larger is better
    best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
    hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
    tf.gfile.MakeDirs(best_metric_dir)

    if hparams.avg_ckpts:
      hparams.add_hparam("avg_best_" + metric, 0)  # larger is better
      best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
      hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
      tf.gfile.MakeDirs(best_metric_dir)

  return hparams
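A hypothetical call to create_standard_hparams as defined above. Both paths are placeholders, and an IWSLT-style English-Vietnamese layout (vocab.vi, vocab.en, train.*, tst2012.*, tst2013.*) is assumed under data_path, since check_vocab needs the vocab files to exist.

# Hypothetical usage; paths are placeholders.
data_path = "/tmp/nmt_data/iwslt15"  # must contain vocab.vi, vocab.en, etc.
out_dir = "/tmp/nmt_model"

hparams = create_standard_hparams(data_path, out_dir)
print(hparams.src_vocab_size, hparams.tgt_vocab_size)
print(hparams.best_bleu_dir)  # created by the metrics loop above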
def extend_hparams(hparams):
  """Extend training hparams."""
  assert hparams.num_encoder_layers and hparams.num_decoder_layers
  if hparams.num_encoder_layers != hparams.num_decoder_layers:
    hparams.pass_hidden_state = False
    utils.print_out("Num encoder layer %d is different from num decoder layer"
                    " %d, so set pass_hidden_state to False" % (
                        hparams.num_encoder_layers,
                        hparams.num_decoder_layers))

  # Sanity checks
  if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
    raise ValueError("For bi, num_encoder_layers %d should be even" %
                     hparams.num_encoder_layers)
  if (hparams.attention_architecture in ["gnmt"] and
      hparams.num_encoder_layers < 2):
    raise ValueError("For gnmt attention architecture, "
                     "num_encoder_layers %d should be >= 2" %
                     hparams.num_encoder_layers)

  # Set residual layers
  num_encoder_residual_layers = 0
  num_decoder_residual_layers = 0
  if hparams.residual:
    if hparams.num_encoder_layers > 1:
      num_encoder_residual_layers = hparams.num_encoder_layers - 1
    if hparams.num_decoder_layers > 1:
      num_decoder_residual_layers = hparams.num_decoder_layers - 1

    if hparams.encoder_type == "gnmt":
      # The first unidirectional layer (after the bi-directional layer) in
      # the GNMT encoder can't have a residual connection because its input
      # is the concatenation of fw_cell's and bw_cell's outputs.
      num_encoder_residual_layers = hparams.num_encoder_layers - 2

      # Compatible for GNMT models
      if hparams.num_encoder_layers == hparams.num_decoder_layers:
        num_decoder_residual_layers = num_encoder_residual_layers

  hparams.add_hparam("num_encoder_residual_layers",
                     num_encoder_residual_layers)
  hparams.add_hparam("num_decoder_residual_layers",
                     num_decoder_residual_layers)

  if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
    raise ValueError("subword option must be either spm, or bpe")

  # Flags
  utils.print_out("# hparams:")
  utils.print_out("  src=%s" % hparams.src)
  utils.print_out("  tgt=%s" % hparams.tgt)
  utils.print_out("  train_prefix=%s" % hparams.train_prefix)
  utils.print_out("  dev_prefix=%s" % hparams.dev_prefix)
  utils.print_out("  test_prefix=%s" % hparams.test_prefix)
  utils.print_out("  out_dir=%s" % hparams.out_dir)

  ## Vocab
  # Get vocab file names first
  if hparams.vocab_prefix:
    src_vocab_file = hparams.vocab_prefix + "." + hparams.src
    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
  else:
    raise ValueError("hparams.vocab_prefix must be provided.")

  # Source vocab
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.out_dir,
      check_special_token=hparams.check_special_token,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK)

  # Target vocab
  if hparams.share_vocab:
    utils.print_out("  using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
  else:
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)
  hparams.add_hparam("src_vocab_size", src_vocab_size)
  hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
  hparams.add_hparam("src_vocab_file", src_vocab_file)
  hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

  # Pretrained Embeddings:
  hparams.add_hparam("src_embed_file", "")
  hparams.add_hparam("tgt_embed_file", "")
  if hparams.embed_prefix:
    src_embed_file = hparams.embed_prefix + "." + hparams.src
    tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

    if tf.gfile.Exists(src_embed_file):
      hparams.src_embed_file = src_embed_file

    if tf.gfile.Exists(tgt_embed_file):
      hparams.tgt_embed_file = tgt_embed_file

  # Check out_dir
  if not tf.gfile.Exists(hparams.out_dir):
    utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
    tf.gfile.MakeDirs(hparams.out_dir)

  # Evaluation
  for metric in hparams.metrics:
    hparams.add_hparam("best_" + metric, 0)  # larger is better
    best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
    hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
    tf.gfile.MakeDirs(best_metric_dir)

    if hparams.avg_ckpts:
      hparams.add_hparam("avg_best_" + metric, 0)  # larger is better
      best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
      hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
      tf.gfile.MakeDirs(best_metric_dir)

  return hparams
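The second extend_hparams variant below calls an _add_argument helper that is not included in this excerpt. A minimal sketch of what it plausibly does, assuming the standard HParams add_hparam/setattr interface, is shown here; the real helper may differ.

def _add_argument(hparams, key, value, update=True):
  """Sketch of the helper used by extend_hparams below (assumed behavior).

  Adds key to hparams if it is missing; if it already exists, overwrite it
  only when update is True.
  """
  if hasattr(hparams, key):
    if update:
      setattr(hparams, key, value)
  else:
    hparams.add_hparam(key, value)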
def extend_hparams(hparams):
  """Add new arguments to hparams."""
  # Sanity checks
  if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
    raise ValueError("For bi, num_encoder_layers %d should be even" %
                     hparams.num_encoder_layers)
  if (hparams.attention_architecture in ["gnmt"] and
      hparams.num_encoder_layers < 2):
    raise ValueError("For gnmt attention architecture, "
                     "num_encoder_layers %d should be >= 2" %
                     hparams.num_encoder_layers)
  if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
    raise ValueError("subword option must be either spm, or bpe")
  if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
    raise ValueError("beam_width must be greater than 0 when using "
                     "beam_search decoder.")
  if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0:
    raise ValueError("sampling_temperature must be greater than 0.0 when "
                     "using sample decoder.")

  # Different number of encoder / decoder layers
  assert hparams.num_encoder_layers and hparams.num_decoder_layers
  if hparams.num_encoder_layers != hparams.num_decoder_layers:
    hparams.pass_hidden_state = False
    utils.print_out("Num encoder layer %d is different from num decoder layer"
                    " %d, so set pass_hidden_state to False" % (
                        hparams.num_encoder_layers,
                        hparams.num_decoder_layers))

  # Set residual layers
  num_encoder_residual_layers = 0
  num_decoder_residual_layers = 0
  if hparams.residual:
    if hparams.num_encoder_layers > 1:
      num_encoder_residual_layers = hparams.num_encoder_layers - 1
    if hparams.num_decoder_layers > 1:
      num_decoder_residual_layers = hparams.num_decoder_layers - 1

    if hparams.encoder_type == "gnmt":
      # The first unidirectional layer (after the bi-directional layer) in
      # the GNMT encoder can't have a residual connection because its input
      # is the concatenation of fw_cell's and bw_cell's outputs.
      num_encoder_residual_layers = hparams.num_encoder_layers - 2

      # Compatible for GNMT models
      if hparams.num_encoder_layers == hparams.num_decoder_layers:
        num_decoder_residual_layers = num_encoder_residual_layers

  _add_argument(hparams, "num_encoder_residual_layers",
                num_encoder_residual_layers)
  _add_argument(hparams, "num_decoder_residual_layers",
                num_decoder_residual_layers)

  # Language modeling
  if getattr(hparams, "language_model", None):
    hparams.attention = ""
    hparams.attention_architecture = ""
    hparams.pass_hidden_state = False
    hparams.share_vocab = True
    hparams.src = hparams.tgt
    utils.print_out("For language modeling, we turn off attention and "
                    "pass_hidden_state; turn on share_vocab; set src to tgt.")

  ## Vocab
  # Get vocab file names first
  if hparams.vocab_prefix:
    src_vocab_file = hparams.vocab_prefix + "." + hparams.src
    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
  else:
    raise ValueError("hparams.vocab_prefix must be provided.")

  # Source vocab
  check_special_token = getattr(hparams, "check_special_token", True)
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.out_dir,
      check_special_token=check_special_token,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK)

  # Target vocab
  if hparams.share_vocab:
    utils.print_out("  using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
  else:
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        hparams.out_dir,
        check_special_token=check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)
  _add_argument(hparams, "src_vocab_size", src_vocab_size)
  _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
  _add_argument(hparams, "src_vocab_file", src_vocab_file)
  _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

  # Num embedding partitions
  num_embeddings_partitions = getattr(hparams, "num_embeddings_partitions", 0)
  _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions)
  _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions)

  # Pretrained Embeddings
  _add_argument(hparams, "src_embed_file", "")
  _add_argument(hparams, "tgt_embed_file", "")
  if getattr(hparams, "embed_prefix", None):
    src_embed_file = hparams.embed_prefix + "." + hparams.src
    tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

    if tf.gfile.Exists(src_embed_file):
      utils.print_out("  src_embed_file %s exists" % src_embed_file)
      hparams.src_embed_file = src_embed_file

      utils.print_out(
          "For pretrained embeddings, set num_enc_emb_partitions to 1")
      hparams.num_enc_emb_partitions = 1
    else:
      utils.print_out("  src_embed_file %s doesn't exist" % src_embed_file)

    if tf.gfile.Exists(tgt_embed_file):
      utils.print_out("  tgt_embed_file %s exists" % tgt_embed_file)
      hparams.tgt_embed_file = tgt_embed_file

      utils.print_out(
          "For pretrained embeddings, set num_dec_emb_partitions to 1")
      hparams.num_dec_emb_partitions = 1
    else:
      utils.print_out("  tgt_embed_file %s doesn't exist" % tgt_embed_file)

  # Evaluation
  for metric in hparams.metrics:
    best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
    tf.gfile.MakeDirs(best_metric_dir)
    _add_argument(hparams, "best_" + metric, 0, update=False)
    _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

    if getattr(hparams, "avg_ckpts", None):
      best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
      tf.gfile.MakeDirs(best_metric_dir)
      _add_argument(hparams, "avg_best_" + metric, 0, update=False)
      _add_argument(hparams, "avg_best_" + metric + "_dir", best_metric_dir)

  return hparams