def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None,
                eos=None, unk=None):
  """Check that vocab_file exists and starts with the special tokens.

  If the special tokens are missing, a patched copy of the vocab is written
  to out_dir and its path is returned instead of the original file.
  """
  if tf.gfile.Exists(vocab_file):
    utils.print_out("# Vocab file %s exists" % vocab_file)
    vocab, vocab_size = load_vocab(vocab_file)
    if check_special_token:
      # Verify that the vocab starts with unk, sos, eos.
      # If not, prepend those tokens & generate a new vocab file.
      if not unk: unk = UNK
      if not sos: sos = SOS
      if not eos: eos = EOS
      assert len(vocab) >= 3
      if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
        utils.print_out("The first 3 vocab words [%s, %s, %s]"
                        " are not [%s, %s, %s]" %
                        (vocab[0], vocab[1], vocab[2], unk, sos, eos))
        vocab = [unk, sos, eos] + vocab
        vocab_size += 3
        new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
        with codecs.getwriter("utf-8")(
            tf.gfile.GFile(new_vocab_file, "wb")) as f:
          for word in vocab:
            f.write("%s\n" % word)
        vocab_file = new_vocab_file
  else:
    raise ValueError("vocab_file '%s' does not exist." % vocab_file)

  vocab_size = len(vocab)
  return vocab_size, vocab_file
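# Illustrative usage sketch (not part of the original file; the paths are
# hypothetical). check_vocab loads an existing vocab, prepends <unk>, <s>,
# </s> if they are missing, and returns the size plus the (possibly
# rewritten) vocab path under out_dir.
def _example_check_vocab():
  vocab_size, vocab_file = check_vocab(
      "/tmp/data/vocab.de", out_dir="/tmp/out", check_special_token=True)
  return vocab_size, vocab_file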
def extend_hparams(hparams):
  """Add new arguments to hparams."""
  # Sanity checks
  if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
    raise ValueError("subword option must be either spm or bpe")
  if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
    raise ValueError("beam_width must be greater than 0 when using the "
                     "beam_search decoder.")

  # Different numbers of encoder / decoder layers are not supported.
  assert hparams.num_encoder_layers == hparams.num_decoder_layers

  # The first unidirectional layer (after the bi-directional layer) in
  # the GNMT encoder can't have a residual connection because its input is
  # the concatenation of fw_cell's and bw_cell's outputs.
  num_encoder_residual_layers = hparams.num_encoder_layers - 2
  num_decoder_residual_layers = num_encoder_residual_layers
  _add_argument(hparams, "num_encoder_residual_layers",
                num_encoder_residual_layers)
  _add_argument(hparams, "num_decoder_residual_layers",
                num_decoder_residual_layers)

  ## Vocab
  # Get vocab file names first
  if hparams.vocab_prefix:
    src_vocab_file = hparams.vocab_prefix + "." + hparams.src
    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
  else:
    raise ValueError("hparams.vocab_prefix must be provided.")

  # Source vocab
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.out_dir,
      check_special_token=hparams.check_special_token,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK)

  # Target vocab (shared with the source vocab).
  utils.print_out(" using source vocab for target")
  tgt_vocab_file = src_vocab_file
  tgt_vocab_size = src_vocab_size
  _add_argument(hparams, "src_vocab_size", src_vocab_size)
  _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
  _add_argument(hparams, "src_vocab_file", src_vocab_file)
  _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

  # Num embedding partitions
  _add_argument(hparams, "num_enc_emb_partitions",
                hparams.num_embeddings_partitions)
  _add_argument(hparams, "num_dec_emb_partitions",
                hparams.num_embeddings_partitions)

  # Pretrained Embeddings
  _add_argument(hparams, "src_embed_file", "")
  _add_argument(hparams, "tgt_embed_file", "")

  return hparams
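# Worked example (illustrative, assuming the GNMT layout described in the
# comment above): with num_encoder_layers = num_decoder_layers = 4, the
# encoder is one bi-directional layer followed by three unidirectional
# layers, and only the top 4 - 2 = 2 unidirectional layers get residual
# connections; the decoder uses the same residual count.
def _example_residual_layer_count(num_layers=4):
  return num_layers - 2  # bi-directional layer + first uni layer excluded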
def _single_cell(num_units, forget_bias, dropout, mode,
                 residual_connection=False, residual_fn=None,
                 global_step=None, fast_reverse=False, seq_len=None):
  """Create an instance of a single RNN cell."""
  # dropout (= 1 - keep_prob) is set to 0 during eval and infer
  dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0

  # Cell Type
  utils.print_out(" LSTM, forget_bias=%g" % forget_bias, new_line=False)
  single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,
                                             forget_bias=forget_bias)

  # Dropout (= 1 - keep_prob)
  enabled = (
      mode == tf.contrib.learn.ModeKeys.TRAIN) or dropout > 0.0 or fast_reverse
  single_cell = CellWrapper(
      cell=single_cell,
      input_keep_prob=(1.0 - dropout),
      global_step=global_step,
      seq_len=seq_len,
      enabled=enabled)

  # Residual
  if residual_connection:
    single_cell = tf.contrib.rnn.ResidualWrapper(
        single_cell, residual_fn=residual_fn)
    utils.print_out(" %s" % type(single_cell).__name__, new_line=False)

  return single_cell
def _cell_list(num_units, num_layers, num_residual_layers, forget_bias,
               dropout, mode, single_cell_fn=None, residual_fn=None,
               global_step=None, fast_reverse=False, seq_len=None):
  """Create a list of RNN cells."""
  if not single_cell_fn:
    single_cell_fn = _single_cell

  # Multi-GPU
  cell_list = []
  for i in range(num_layers):
    utils.print_out(" cell %d" % i, new_line=False)
    single_cell = single_cell_fn(
        num_units=num_units,
        forget_bias=forget_bias,
        dropout=dropout,
        mode=mode,
        residual_connection=(i >= num_layers - num_residual_layers),
        residual_fn=residual_fn,
        global_step=global_step,
        fast_reverse=fast_reverse,
        seq_len=seq_len)
    utils.print_out("")
    cell_list.append(single_cell)

  return cell_list
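# Illustrative sketch (assumed usage, not in the original file): building a
# 4-layer cell stack with residual connections on the top 2 layers and
# wrapping it in a MultiRNNCell, which is how a list from _cell_list is
# typically consumed. The hyperparameter values are hypothetical.
def _example_build_cell_stack(mode, global_step):
  cells = _cell_list(
      num_units=1024,
      num_layers=4,
      num_residual_layers=2,
      forget_bias=1.0,
      dropout=0.2,
      mode=mode,
      global_step=global_step)
  return tf.contrib.rnn.MultiRNNCell(cells)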
def get_metric(hparams, predictions, current_step):
  """Run inference and compute metric."""
  predicted_ids = []
  for prediction in predictions:
    predicted_ids.append(prediction["predictions"])

  if hparams.examples_to_infer < len(predicted_ids):
    predicted_ids = predicted_ids[0:hparams.examples_to_infer]
  translations = _convert_ids_to_strings(hparams.tgt_vocab_file, predicted_ids)

  trans_file = os.path.join(
      hparams.out_dir, "newstest2014_out_{}.tok.de".format(current_step))
  trans_dir = os.path.dirname(trans_file)
  if not tf.gfile.Exists(trans_dir):
    tf.gfile.MakeDirs(trans_dir)
  tf.logging.info("Writing to file %s" % trans_file)
  with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file,
                                                mode="wb")) as trans_f:
    trans_f.write("")  # Write empty string to ensure file is created.
    for translation in translations:
      sentence = nmt_utils.get_translation(
          translation,
          tgt_eos=hparams.eos,
          subword_option=hparams.subword_option)
      trans_f.write((sentence + b"\n").decode("utf-8"))

  # Evaluation
  output_dir = os.path.join(hparams.out_dir, "eval")
  tf.gfile.MakeDirs(output_dir)
  summary_writer = tf.summary.FileWriter(output_dir)

  ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)

  metric = "bleu"
  if hparams.use_borg:
    score = evaluation_utils.evaluate(ref_file, trans_file, metric,
                                      hparams.subword_option)
  else:
    score = get_sacrebleu(trans_file, hparams.detokenizer_file)

  with tf.Graph().as_default():
    summaries = []
    summaries.append(tf.Summary.Value(tag=metric, simple_value=score))
    tf_summary = tf.Summary(value=list(summaries))
    summary_writer.add_summary(tf_summary, current_step)

  misc_utils.print_out(" %s: %.1f" % (metric, score))

  summary_writer.close()
  return score
def build_graph(self, hparams, source, max_seq_len):
  """Subclass must implement this method.

  Creates a sequence-to-sequence model with a dynamic RNN decoder API.

  Args:
    hparams: Hyperparameter configurations.
    source: The input source.
    max_seq_len: The maximum sequence length.

  Returns:
    A tuple of the form (logits, predicted_ids) for inference and
    (loss, None) for training, where:
      logits: float32 Tensor [batch_size x num_decoder_symbols]
      loss: float32 scalar
      predicted_ids: predicted ids from beam search.
  """
  utils.print_out("# Creating %s graph ..." % self.mode)

  source = tf.reshape(
      tf.slice(source, [0, 0], [self.batch_size, max_seq_len]),
      [self.batch_size, max_seq_len])

  with tf.variable_scope("dynamic_seq2seq", dtype=self.dtype,
                         reuse=self.reuse):
    if hparams.activation_dtype == "bfloat16":
      tf.get_variable_scope().set_dtype(tf.bfloat16)
    # Encoder
    encoder_outputs, encoder_states = self._build_encoder(hparams, source)

    ## Decoder
    with tf.variable_scope("decoder", reuse=self.reuse):
      with tf.variable_scope("output_projection", reuse=self.reuse):
        self.output_layer = tf.slice(
            tf.get_variable(
                "kernel",
                [self.num_units, 128 * (self.tgt_vocab_size // 128 + 1)]),
            [0, 0], [self.num_units, self.tgt_vocab_size])

    return self._build_decoder(encoder_outputs, encoder_states, hparams)[1]
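# Worked example (illustrative): the output projection above allocates its
# kernel with the vocab dimension rounded up to the next multiple of 128
# (presumably to keep the matmul shape TPU-friendly) and then slices it back
# to the true vocab size. E.g. for tgt_vocab_size = 32315 the variable is
# built with 128 * (32315 // 128 + 1) = 32384 columns, of which 32315 are
# used by the model.
def _example_padded_vocab_size(tgt_vocab_size=32315):
  return 128 * (tgt_vocab_size // 128 + 1)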
def load_embed_txt(embed_file):
  """Load embed_file into a python dictionary.

  Note: the embed_file should be a GloVe/word2vec formatted txt file. Here is
  an example assuming embed_size=5:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  For word2vec format, the first line will be: <num_words> <emb_size>.

  Args:
    embed_file: file path to the embedding file.
  Returns:
    a dictionary that maps word to vector, and the size of embedding dimensions.
  """
  emb_dict = dict()
  emb_size = None

  is_first_line = True
  with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
    for line in f:
      tokens = line.rstrip().split(" ")
      if is_first_line:
        is_first_line = False
        if len(tokens) == 2:  # header line
          emb_size = int(tokens[1])
          continue
      word = tokens[0]
      vec = list(map(float, tokens[1:]))
      emb_dict[word] = vec
      if emb_size:
        if emb_size != len(vec):
          utils.print_out(
              "Ignoring %s since embedding size is inconsistent." % word)
          del emb_dict[word]
      else:
        emb_size = len(vec)
  return emb_dict, emb_size
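# Illustrative sketch (not in the original file; the path is hypothetical):
# write a tiny GloVe-style file with embed_size=2 and read it back with
# load_embed_txt. The call returns ({"the": [0.1, 0.2], "to": [0.3, 0.4]}, 2).
def _example_load_embed_txt(path="/tmp/tiny_embed.txt"):
  with tf.gfile.GFile(path, "w") as f:
    f.write("the 0.1 0.2\n")
    f.write("to 0.3 0.4\n")
  return load_embed_txt(path)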
def _get_learning_rate_warmup(self, hparams):
  """Get learning rate warmup."""
  warmup_steps = hparams.warmup_steps
  warmup_scheme = hparams.warmup_scheme
  utils.print_out(" learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
                  (hparams.learning_rate, warmup_steps, warmup_scheme))

  # Apply inverse decay if global steps less than warmup steps.
  # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
  # When step < warmup_steps,
  #   learning_rate *= warmup_factor ** (warmup_steps - step)
  if warmup_scheme == "t2t":
    # 0.01^(1/warmup_steps): we start with a learning rate 100 times smaller.
    warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
    inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
  else:
    raise ValueError("Unknown warmup scheme %s" % warmup_scheme)

  return tf.cond(
      self.global_step < hparams.warmup_steps,
      lambda: inv_decay * self.learning_rate,
      lambda: self.learning_rate,
      name="learning_rate_warmup_cond")
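# Worked example (illustrative): with warmup_steps=200, the "t2t" scheme uses
# warmup_factor = exp(log(0.01) / 200) ~= 0.977, so at step 0 the learning
# rate is scaled by 0.977**200 = 0.01 (100 times smaller) and rises smoothly
# to the full learning_rate by step 200. The helper below mirrors the
# effective schedule with plain Python math.
def _example_warmup_scale(step, warmup_steps=200):
  import math
  warmup_factor = math.exp(math.log(0.01) / warmup_steps)
  return warmup_factor ** max(warmup_steps - step, 0)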
def _set_train_or_infer(self, res, hparams):
  """Set up training or inference."""
  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.predicted_ids = res[1]

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    loss = res[0]
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)

    # Optimizer
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(self.learning_rate)
    else:
      raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
    opt = tf.contrib.tpu.CrossShardOptimizer(opt)

    # Gradients
    gradients = tf.gradients(loss, params, colocate_gradients_with_ops=True)

    clipped_grads, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm = grad_norm

    self.update = opt.apply_gradients(
        zip(clipped_grads, params), global_step=self.global_step)

  # Print trainable variables
  utils.print_out("# Trainable variables")
  utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
  for param in params:
    utils.print_out(" %s, %s, %s" %
                    (param.name, str(param.get_shape()), param.op.device))
def _create_pretrained_emb_from_txt(vocab_file,
                                    embed_file,
                                    num_trainable_tokens=3,
                                    dtype=tf.float32,
                                    scope=None):
  """Load a pretrained embedding from embed_file and return an embedding matrix.

  Args:
    vocab_file: Path to vocab file.
    embed_file: Path to a GloVe formatted embedding txt file.
    num_trainable_tokens: Make the first n tokens in the vocab file trainable
      variables. Default is 3, which is "<unk>", "<s>" and "</s>".
    dtype: data type.
    scope: tf scope name.

  Returns:
    pretrained embedding table variable.
  """
  vocab, _ = vocab_utils.load_vocab(vocab_file)
  trainable_tokens = vocab[:num_trainable_tokens]

  utils.print_out("# Using pretrained embedding: %s." % embed_file)
  utils.print_out(" with trainable tokens: ")

  emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file)
  for token in trainable_tokens:
    utils.print_out(" %s" % token)
    if token not in emb_dict:
      emb_dict[token] = [0.0] * emb_size

  emb_mat = np.array(
      [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype())
  emb_mat = tf.constant(emb_mat)
  emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
  with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope:
    emb_mat_var = tf.get_variable("emb_mat_var",
                                  [num_trainable_tokens, emb_size])
  return tf.concat([emb_mat_var, emb_mat_const], 0)
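# Illustrative sketch (hypothetical paths, not in the original file): the
# returned table stacks a trainable [3, emb_size] variable for <unk>, <s>,
# </s> on top of the frozen pretrained rows for the rest of the vocab.
def _example_pretrained_emb():
  return _create_pretrained_emb_from_txt(
      vocab_file="/tmp/out/vocab.de",
      embed_file="/tmp/data/glove.de.300d.txt",
      num_trainable_tokens=3)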
def create_emb_for_encoder_and_decoder(src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrices for both encoder and decoder.

  Args:
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    scope: VariableScope for the created subgraph. Default to "embedding".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if source and target have different vocab sizes.
  """
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate the single-partition embedding
    # variable with the lookup ops, which may cause embedding variables to be
    # placed on worker jobs.
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    # See the note above on why num_partitions > 1 is needed.
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")

  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decoder "
        "embedding")

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope:
    if src_vocab_size != tgt_vocab_size:
      raise ValueError("Share embedding but different src/tgt vocab sizes"
                       " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
    assert src_embed_size == tgt_embed_size
    utils.print_out("# Use the same embedding for source and target")
    vocab_file = src_vocab_file or tgt_vocab_file
    embed_file = src_embed_file or tgt_embed_file

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
      embedding_encoder = _create_or_load_embed("embedding_encoder", vocab_file,
                                                embed_file, src_vocab_size,
                                                src_embed_size, dtype)
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
      embedding_decoder = _create_or_load_embed("embedding_decoder", vocab_file,
                                                embed_file, src_vocab_size,
                                                src_embed_size, dtype)

  return embedding_encoder, embedding_decoder
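# Illustrative sketch (assumed values): creating the shared encoder/decoder
# embeddings for a model whose hparams were filled in by extend_hparams
# above; 1024 is a hypothetical embedding size.
def _example_shared_embeddings(hparams):
  return create_emb_for_encoder_and_decoder(
      src_vocab_size=hparams.src_vocab_size,
      tgt_vocab_size=hparams.tgt_vocab_size,
      src_embed_size=1024,
      tgt_embed_size=1024,
      src_vocab_file=hparams.src_vocab_file,
      tgt_vocab_file=hparams.tgt_vocab_file,
      src_embed_file=hparams.src_embed_file,
      tgt_embed_file=hparams.tgt_embed_file)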