def check_vocab(vocab_file,
                out_dir,
                check_special_token=True,
                sos=None,
                eos=None,
                unk=None):
    """Check if vocab_file doesn't exist, create from corpus_file."""
    if tf.gfile.Exists(vocab_file):
        utils.print_out("# Vocab file %s exists" % vocab_file)
        vocab, vocab_size = load_vocab(vocab_file)
        if check_special_token:
            # Verify if the vocab starts with unk, sos, eos
            # If not, prepend those tokens & generate a new vocab file
            if not unk: unk = UNK
            if not sos: sos = SOS
            if not eos: eos = EOS
            assert len(vocab) >= 3
            if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
                utils.print_out("The first 3 vocab words [%s, %s, %s]"
                                " are not [%s, %s, %s]" %
                                (vocab[0], vocab[1], vocab[2], unk, sos, eos))
                vocab = [unk, sos, eos] + vocab
                vocab_size += 3
                new_vocab_file = os.path.join(out_dir,
                                              os.path.basename(vocab_file))
                with codecs.getwriter("utf-8")(tf.gfile.GFile(
                        new_vocab_file, "wb")) as f:
                    for word in vocab:
                        f.write("%s\n" % word)
                vocab_file = new_vocab_file
    else:
        raise ValueError("vocab_file '%s' does not exist." % vocab_file)

    vocab_size = len(vocab)
    return vocab_size, vocab_file
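
A minimal usage sketch of check_vocab; the file paths and token strings below are hypothetical placeholders:

# Hypothetical paths, purely for illustration: verify the vocab file and pick
# up the (possibly rewritten) copy placed under out_dir.
src_vocab_size, src_vocab_file = check_vocab(
    "/tmp/data/vocab.bpe.32000.en",
    out_dir="/tmp/nmt_model",
    check_special_token=True,
    sos="<s>",
    eos="</s>",
    unk="<unk>")
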
Example #2
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError(
            "beam_width must be greater than 0 when using the beam_search "
            "decoder.")

    # Encoder and decoder must use the same number of layers.
    assert hparams.num_encoder_layers == hparams.num_decoder_layers

    # The first unidirectional layer (after the bi-directional layer) in
    # the GNMT encoder can't have a residual connection because its input is
    # the concatenation of fw_cell and bw_cell's outputs.
    num_encoder_residual_layers = hparams.num_encoder_layers - 2
    num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    utils.print_out("  using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    _add_argument(hparams, "num_enc_emb_partitions",
                  hparams.num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions",
                  hparams.num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")

    return hparams
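
extend_hparams relies on an _add_argument helper that is not shown in this listing; a minimal sketch, assuming it simply registers the value on a tf.contrib.training.HParams object when the key is missing:

def _add_argument(hparams, key, value, update=True):
    """Add a new argument to hparams (sketch of the assumed helper)."""
    if hasattr(hparams, key):
        if update:
            setattr(hparams, key, value)
    else:
        hparams.add_hparam(key, value)
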
def _single_cell(num_units,
                 forget_bias,
                 dropout,
                 mode,
                 residual_connection=False,
                 residual_fn=None,
                 global_step=None,
                 fast_reverse=False,
                 seq_len=None):
  """Create an instance of a single RNN cell."""
  # dropout (= 1 - keep_prob) is set to 0 during eval and infer
  dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0

  # Cell Type
  utils.print_out("  LSTM, forget_bias=%g" % forget_bias, new_line=False)
  single_cell = tf.contrib.rnn.BasicLSTMCell(num_units, forget_bias=forget_bias)

  # Dropout (= 1 - keep_prob)
  enabled = (
      mode == tf.contrib.learn.ModeKeys.TRAIN) or dropout > 0.0 or fast_reverse
  single_cell = CellWrapper(
      cell=single_cell,
      input_keep_prob=(1.0 - dropout),
      global_step=global_step,
      seq_len=seq_len,
      enabled=enabled)

  # Residual
  if residual_connection:
    single_cell = tf.contrib.rnn.ResidualWrapper(
        single_cell, residual_fn=residual_fn)
    utils.print_out("  %s" % type(single_cell).__name__, new_line=False)

  return single_cell
def _cell_list(num_units,
               num_layers,
               num_residual_layers,
               forget_bias,
               dropout,
               mode,
               single_cell_fn=None,
               residual_fn=None,
               global_step=None,
               fast_reverse=False,
               seq_len=None):
  """Create a list of RNN cells."""
  if not single_cell_fn:
    single_cell_fn = _single_cell

  # Multi-GPU
  cell_list = []
  for i in range(num_layers):
    utils.print_out("  cell %d" % i, new_line=False)
    single_cell = single_cell_fn(
        num_units=num_units,
        forget_bias=forget_bias,
        dropout=dropout,
        mode=mode,
        residual_connection=(i >= num_layers - num_residual_layers),
        residual_fn=residual_fn,
        global_step=global_step,
        fast_reverse=fast_reverse,
        seq_len=seq_len)
    utils.print_out("")
    cell_list.append(single_cell)

  return cell_list
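
The list returned by _cell_list is typically stacked into a single multi-layer cell; a short sketch with purely illustrative hyperparameter values:

# Illustrative values: a 4-layer LSTM with residual connections on the top
# 2 layers. A single cell is used as-is; multiple cells are stacked.
cells = _cell_list(
    num_units=1024,
    num_layers=4,
    num_residual_layers=2,
    forget_bias=1.0,
    dropout=0.2,
    mode=tf.contrib.learn.ModeKeys.TRAIN)
cell = cells[0] if len(cells) == 1 else tf.contrib.rnn.MultiRNNCell(cells)
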
Example #5
def get_metric(hparams, predictions, current_step):
    """Run inference and compute metric."""
    predicted_ids = []
    for prediction in predictions:
        predicted_ids.append(prediction["predictions"])

    if hparams.examples_to_infer < len(predicted_ids):
        predicted_ids = predicted_ids[0:hparams.examples_to_infer]
    translations = _convert_ids_to_strings(hparams.tgt_vocab_file,
                                           predicted_ids)

    trans_file = os.path.join(
        hparams.out_dir, "newstest2014_out_{}.tok.de".format(current_step))
    trans_dir = os.path.dirname(trans_file)
    if not tf.gfile.Exists(trans_dir):
        tf.gfile.MakeDirs(trans_dir)
    tf.logging.info("Writing to file %s" % trans_file)
    with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file,
                                                  mode="wb")) as trans_f:
        trans_f.write("")  # Write empty string to ensure file is created.
        for translation in translations:
            sentence = nmt_utils.get_translation(
                translation,
                tgt_eos=hparams.eos,
                subword_option=hparams.subword_option)
            trans_f.write((sentence + b"\n").decode("utf-8"))

    # Evaluation
    output_dir = os.path.join(hparams.out_dir, "eval")
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)

    ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)

    metric = "bleu"
    if hparams.use_borg:
        score = evaluation_utils.evaluate(ref_file, trans_file, metric,
                                          hparams.subword_option)
    else:
        score = get_sacrebleu(trans_file, hparams.detokenizer_file)
    with tf.Graph().as_default():
        summaries = []
        summaries.append(tf.Summary.Value(tag=metric, simple_value=score))
    tf_summary = tf.Summary(value=list(summaries))
    summary_writer.add_summary(tf_summary, current_step)

    misc_utils.print_out("  %s: %.1f" % (metric, score))

    summary_writer.close()
    return score
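
get_sacrebleu is referenced above but not part of this listing. A minimal sketch using the sacrebleu Python package; the reference path is a hypothetical placeholder, and the detokenization step that real code would perform with detokenizer_file is omitted:

import sacrebleu  # assumed to be installed


def get_sacrebleu(trans_file, detokenizer_file):
    """Score trans_file with the sacrebleu package (illustrative sketch)."""
    del detokenizer_file  # detokenization omitted in this sketch
    ref_file = "/tmp/data/newstest2014.tok.de"  # hypothetical reference path
    with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as f:
        hypotheses = [line.rstrip("\n") for line in f]
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as f:
        references = [line.rstrip("\n") for line in f]
    return sacrebleu.corpus_bleu(hypotheses, [references]).score
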
Example #6
    def build_graph(self, hparams, source, max_seq_len):
        """Subclass must implement this method.

    Creates a sequence-to-sequence model with dynamic RNN decoder API.
    Args:
      hparams: Hyperparameter configurations.
      source: The input source.
      max_seq_len: The max sequence length.

    Returns:
      A tuple of the form (logits, predicted_ids) for inference and
      (loss, None) for training, where:
        logits: float32 Tensor [batch_size x num_decoder_symbols]
        loss: float32 scalar
        predicted_ids: predicted ids from beam search.
    """
        utils.print_out("# Creating %s graph ..." % self.mode)

        source = tf.reshape(
            tf.slice(source, [0, 0], [self.batch_size, max_seq_len]),
            [self.batch_size, max_seq_len])
        with tf.variable_scope("dynamic_seq2seq",
                               dtype=self.dtype,
                               reuse=self.reuse):
            if hparams.activation_dtype == "bfloat16":
                tf.get_variable_scope().set_dtype(tf.bfloat16)
            # Encoder
            encoder_outputs, encoder_states = self._build_encoder(
                hparams, source)

            ## Decoder
            with tf.variable_scope("decoder", reuse=self.reuse):
                with tf.variable_scope("output_projection", reuse=self.reuse):
                    self.output_layer = tf.slice(
                        tf.get_variable("kernel", [
                            self.num_units, 128 *
                            (self.tgt_vocab_size // 128 + 1)
                        ]), [0, 0], [self.num_units, self.tgt_vocab_size])

            return self._build_decoder(encoder_outputs, encoder_states,
                                       hparams)[1]
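
The output projection above is stored padded out to the next multiple of 128 columns and then sliced back to the true vocab size; a quick numeric illustration with a hypothetical vocab size:

# Hypothetical tgt_vocab_size = 32003:
#   stored kernel columns = 128 * (32003 // 128 + 1) = 128 * 251 = 32128
#   the tf.slice keeps only the first 32003 columns for the projection.
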
def load_embed_txt(embed_file):
    """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove/word2vec formatted txt file.
  Here is an example assuming embed_size=5:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  For word2vec format, the first line will be: <num_words> <emb_size>.

  Args:
    embed_file: file path to the embedding file.
  Returns:
    a dictionary that maps each word to its vector, and the embedding dimension.
  """
    emb_dict = dict()
    emb_size = None

    is_first_line = True
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
        for line in f:
            tokens = line.rstrip().split(" ")
            if is_first_line:
                is_first_line = False
                if len(tokens) == 2:  # header line
                    emb_size = int(tokens[1])
                    continue
            word = tokens[0]
            vec = list(map(float, tokens[1:]))
            emb_dict[word] = vec
            if emb_size:
                if emb_size != len(vec):
                    utils.print_out(
                        "Ignoring %s since embeding size is inconsistent." %
                        word)
                    del emb_dict[word]
            else:
                emb_size = len(vec)
    return emb_dict, emb_size
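
A quick usage sketch of load_embed_txt with a hypothetical path, matching the 5-dimensional example in the docstring:

# Hypothetical file containing the three example lines from the docstring.
emb_dict, emb_size = load_embed_txt("/tmp/data/embeddings.5d.txt")
assert emb_size == 5 and len(emb_dict["the"]) == 5
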
Example #8
    def _get_learning_rate_warmup(self, hparams):
        """Get learning rate warmup."""
        warmup_steps = hparams.warmup_steps
        warmup_scheme = hparams.warmup_scheme
        utils.print_out(
            "  learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
            (hparams.learning_rate, warmup_steps, warmup_scheme))

        # Apply inverse decay while the global step is less than warmup_steps.
        # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
        # When step < warmup_steps,
        #   learning_rate *= warmup_factor ** (warmup_steps - step)
        if warmup_scheme == "t2t":
            # 0.01^(1/warmup_steps): we start with an lr 100 times smaller than the base rate.
            warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
        else:
            raise ValueError("Unknown warmup scheme %s" % warmup_scheme)

        return tf.cond(self.global_step < hparams.warmup_steps,
                       lambda: inv_decay * self.learning_rate,
                       lambda: self.learning_rate,
                       name="learning_rate_warump_cond")
Example #9
    def _set_train_or_infer(self, res, hparams):
        """Set up training."""
        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.predicted_ids = res[1]

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            loss = res[0]
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                raise ValueError("Unknown optimizer type %s" %
                                 hparams.optimizer)

            opt = tf.contrib.tpu.CrossShardOptimizer(opt)
            # Gradients
            gradients = tf.gradients(loss,
                                     params,
                                     colocate_gradients_with_ops=True)

            clipped_grads, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
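
model_helper.gradient_clip is not included in this listing; a minimal sketch, assuming it is a thin wrapper around tf.clip_by_global_norm that returns the clipped gradients and the global norm:

def gradient_clip(gradients, max_gradient_norm):
  """Clip gradients by global norm (sketch of the assumed helper)."""
  clipped_gradients, gradient_norm = tf.clip_by_global_norm(
      gradients, max_gradient_norm)
  return clipped_gradients, gradient_norm
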
def _create_pretrained_emb_from_txt(
    vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32,
    scope=None):
  """Load pretrain embeding from embed_file, and return an embedding matrix.

  Args:
    vocab_file: Path to vocab file.
    embed_file: Path to a Glove formatted embedding txt file.
    num_trainable_tokens: Make the first n tokens in the vocab file trainable
      variables. Default is 3, i.e. "<unk>", "<s>" and "</s>".
    dtype: data type.
    scope: tf scope name.

  Returns:
    pretrained embedding table variable.
  """
  vocab, _ = vocab_utils.load_vocab(vocab_file)
  trainable_tokens = vocab[:num_trainable_tokens]

  utils.print_out("# Using pretrained embedding: %s." % embed_file)
  utils.print_out("  with trainable tokens: ")

  emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file)
  for token in trainable_tokens:
    utils.print_out("    %s" % token)
    if token not in emb_dict:
      emb_dict[token] = [0.0] * emb_size

  emb_mat = np.array(
      [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype())
  emb_mat = tf.constant(emb_mat)
  emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
  with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope:
    emb_mat_var = tf.get_variable(
        "emb_mat_var", [num_trainable_tokens, emb_size])
  return tf.concat([emb_mat_var, emb_mat_const], 0)
def create_emb_for_encoder_and_decoder(src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrix for both encoder and decoder.

  Args:
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    scope: VariableScope for the created subgraph. Default to "embedding".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if source and target have different vocab size.
  """
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with the lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with the lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")

  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decdoer "
        "embedding")

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope:
    if src_vocab_size != tgt_vocab_size:
      raise ValueError("Share embedding but different src/tgt vocab sizes"
                       " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
    assert src_embed_size == tgt_embed_size
    utils.print_out("# Use the same embedding for source and target")
    vocab_file = src_vocab_file or tgt_vocab_file
    embed_file = src_embed_file or tgt_embed_file

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
      embedding_encoder = _create_or_load_embed("embedding_encoder", vocab_file,
                                                embed_file, src_vocab_size,
                                                src_embed_size, dtype)
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
      embedding_decoder = _create_or_load_embed("embedding_decoder", vocab_file,
                                                embed_file, src_vocab_size,
                                                src_embed_size, dtype)

  return embedding_encoder, embedding_decoder
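
Both encoder and decoder branches above go through _create_or_load_embed, which is not shown in this listing; a minimal sketch, assuming it either builds the pretrained table via _create_pretrained_emb_from_txt or creates a fresh variable:

def _create_or_load_embed(embed_name, vocab_file, embed_file,
                          vocab_size, embed_size, dtype):
  """Create an embedding variable or load one from embed_file (sketch)."""
  if vocab_file and embed_file:
    embedding = _create_pretrained_emb_from_txt(vocab_file, embed_file)
  else:
    embedding = tf.get_variable(embed_name, [vocab_size, embed_size], dtype)
  return embedding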