Example #1
def _cell_list(unit_type,
               num_units,
               num_layers,
               num_residual_layers,
               forget_bias,
               dropout,
               mode,
               dtype=None,
               single_cell_fn=None,
               residual_fn=None,
               use_block_lstm=False):
    """Create a list of RNN cells."""
    if not single_cell_fn:
        single_cell_fn = _single_cell

    # Multi-GPU
    cell_list = []
    for i in range(num_layers):
        utils.print_out("  cell %d" % i, new_line=False)
        single_cell = single_cell_fn(
            unit_type=unit_type,
            num_units=num_units,
            forget_bias=forget_bias,
            dropout=dropout,
            mode=mode,
            dtype=dtype,
            residual_connection=(i >= num_layers - num_residual_layers),
            residual_fn=residual_fn,
            use_block_lstm=use_block_lstm)
        utils.print_out("")
        cell_list.append(single_cell)

    return cell_list
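
For reference, a quick standalone illustration (plain Python, made-up sizes) of which layers the residual_connection flag above selects:

# A hypothetical 4-layer stack with 2 residual layers.
num_layers, num_residual_layers = 4, 2
print([i >= num_layers - num_residual_layers for i in range(num_layers)])
# -> [False, False, True, True]: only the top num_residual_layers layers are residual.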
Example #2
    def _get_learning_rate_warmup(self, hparams):
        """Get learning rate warmup."""
        warmup_steps = hparams.warmup_steps
        warmup_scheme = hparams.warmup_scheme
        utils.print_out(
            "  learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
            (hparams.learning_rate, warmup_steps, warmup_scheme))
        if not warmup_scheme:
            return self.learning_rate

        # Apply inverse decay if global steps less than warmup steps.
        # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
        # When step < warmup_steps,
        #   learning_rate *= warmup_factor ** (warmup_steps - step)
        if warmup_scheme == "t2t":
            # 0.01^(1/warmup_steps): we start with an lr 100 times smaller
            warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
        else:
            raise ValueError("Unknown warmup scheme %s" % warmup_scheme)

        return tf.cond(self.global_step < hparams.warmup_steps,
                       lambda: inv_decay * self.learning_rate,
                       lambda: self.learning_rate,
                       name="learning_rate_warump_cond")
Example #3
    def build_graph_dist_strategy(self, features, labels, mode, params):
        """Model function."""
        del labels, params
        misc_utils.print_out("Running dist_strategy mode_fn")

        hparams = self.hparams

        # Create a GNMT model for training.
        # assert (hparams.encoder_type == "gnmt" or
        #        hparams.attention_architecture in ["gnmt", "gnmt_v2"])
        with mixed_precision_scope():
            model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
            if mode == tf.contrib.learn.ModeKeys.INFER:
                sample_ids = model.sample_id
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # Make sure outputs are of shape [batch_size, time] or
                # [beam_width, batch_size, time] when using beam search.
                if hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
            elif mode == tf.contrib.learn.ModeKeys.TRAIN:
                loss = model.train_loss
                train_op = model.update
                return loss, model.params, model.grads, None, train_op, None
            else:
                raise ValueError("Unknown mode in model_fn: %s" % mode)
Example #4
def tokenize(hparams, file, tokenized_file):
    """Tokenize `file` with the external tokenizer and write `tokenized_file`."""
    utils.print_out("tokenizing {} -> {}".format(file, tokenized_file))
    with open(file, 'rb') as input_file:
        with open(tokenized_file, 'wb') as output_file:
            subprocess.run([hparams.tokenizer_file, '-l', hparams.src],
                           stdin=input_file,
                           stdout=output_file)
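
A minimal usage sketch; the hparams fields and file names below are hypothetical and only illustrate the expected inputs:

from types import SimpleNamespace

hparams = SimpleNamespace(tokenizer_file="./tokenizer.perl", src="en")  # hypothetical values
# tokenize(hparams, "newstest2014.en", "newstest2014.tok.en")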
Example #5
def print_variables_in_ckpt(ckpt_path):
    """Print a list of variables in a checkpoint together with their shapes."""
    utils.print_out("# Variables in ckpt %s" % ckpt_path)
    reader = tf.train.NewCheckpointReader(ckpt_path)
    variable_map = reader.get_variable_to_shape_map()
    for key in sorted(variable_map.keys()):
        utils.print_out("  %s: %s" % (key, variable_map[key]))
Example #6
    def _compute_tower_grads(self,
                             tower_loss,
                             tower_params,
                             learning_rate,
                             use_fp16=False,
                             loss_scale=None,
                             colocate_gradients_with_ops=True):
        """docstring."""
        if use_fp16:
            assert loss_scale
            scaled_loss = tf.multiply(tower_loss,
                                      tf.convert_to_tensor(
                                          loss_scale, dtype=tower_loss.dtype),
                                      name="scaling_loss")
        else:
            scaled_loss = tower_loss

        opt = self.get_optimizer(self.hparams, learning_rate)
        grads_and_vars = opt.compute_gradients(
            scaled_loss,
            tower_params,
            colocate_gradients_with_ops=colocate_gradients_with_ops)
        grads = [x for (x, _) in grads_and_vars]
        assert grads
        for g in grads:
            assert g.dtype == tf.float32, "grad.dtype isn't fp32: %s" % g.name
        # Report variables that received no gradient.
        for var, grad in zip(tower_params, grads):
            if grad is None:
                misc_utils.print_out("%s gradient is None!" % var.name)

        if use_fp16:
            # Downscale grads to undo the loss scaling.
            grads = [grad * tf.reciprocal(loss_scale) for grad in grads]
        return tower_params, grads, opt
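
The fp16 loss-scaling arithmetic above, sketched with plain NumPy values (illustrative only): the loss is scaled up before backprop so small gradients survive fp16, then each gradient is multiplied by 1/loss_scale to recover its true magnitude.

import numpy as np

loss_scale = 128.0
true_grad = np.float32(3e-5)
scaled_grad = true_grad * loss_scale          # what backprop through the scaled loss yields
recovered = scaled_grad * (1.0 / loss_scale)  # the downscaling step above
print(recovered)  # ~3e-05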
Example #7
    def build_graph(self, hparams, scope=None):
        """Subclass must implement this method.

    Creates a sequence-to-sequence model with dynamic RNN decoder API.
    Args:
      hparams: Hyperparameter configurations.
      scope: VariableScope for the created subgraph; default "dynamic_seq2seq".

    Returns:
      A tuple of the form (logits, loss, sample_id), where:
        logits: float32 Tensor [batch_size x num_decoder_symbols].
        loss: the total loss / batch_size.
        sample_id: sampling indices.

    Raises:
      ValueError: if encoder_type differs from mono and bi, or
        attention_option is not (luong | scaled_luong |
        bahdanau | normed_bahdanau).
    """
        utils.print_out("# Creating %s graph ..." % self.mode)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = tf.layers.Dense(self.tgt_vocab_size,
                                                    use_bias=False,
                                                    name="output_projection",
                                                    dtype=self.dtype)

        with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype):
            # Encoder
            if hparams.language_model:  # no encoder for language modeling
                utils.print_out("  language modeling: no encoder")
                self.encoder_outputs = None
                encoder_state = None
            else:
                self.encoder_outputs, encoder_state = self._build_encoder(
                    hparams)

            ## Decoder
            logits, sample_id = (self._build_decoder(self.encoder_outputs,
                                                     encoder_state, hparams))

            ## Loss
            if self.mode != tf.contrib.learn.ModeKeys.INFER:
                loss = self._compute_loss(logits, hparams.label_smoothing)
            else:
                loss = tf.constant(0.0)

        return logits, loss, sample_id
Example #8
def train_fn(hparams):
    """Train function."""
    model_fn = make_model_fn(hparams)
    input_fn = make_input_fn(hparams, tf.contrib.learn.ModeKeys.TRAIN)

    log_step_count_steps = hparams.log_step_count_steps
    save_checkpoints_steps = hparams.save_checkpoints_steps
    if hparams.use_dist_strategy:
        distribution_strategy = get_distribution_strategy(hparams.num_gpus)
        config = tf.estimator.RunConfig(
            train_distribute=distribution_strategy,
            log_step_count_steps=log_step_count_steps,
            keep_checkpoint_max=None,
            save_checkpoints_steps=save_checkpoints_steps)
    else:
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        if hparams.use_autojit_xla:
            sess_config.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        if not hparams.use_pintohost_optimizer:
            sess_config.graph_options.rewrite_options.pin_to_host_optimization = (
                rewriter_config_pb2.RewriterConfig.OFF)
        config = tf.estimator.RunConfig(
            log_step_count_steps=log_step_count_steps,
            session_config=sess_config,
            keep_checkpoint_max=None,
            save_checkpoints_steps=save_checkpoints_steps)

    misc_utils.print_out("sess master is %s" % config.master)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=hparams.output_dir,
                                       config=config)

    benchmark_hook = BenchmarkHook(hparams.batch_size,
                                   hparams.warmup_steps + 5)
    train_hooks = [benchmark_hook]
    if hparams.profile:
        train_hooks.append(
            tf.train.ProfilerHook(output_dir=hparams.output_dir,
                                  save_steps=hparams.profile_save_steps,
                                  show_dataflow=True,
                                  show_memory=True))

    max_steps = hparams.debug_num_train_steps
    estimator.train(
        input_fn=input_fn,
        max_steps=max_steps,
        hooks=train_hooks,
    )

    return benchmark_hook.get_average_speed_and_latencies()
Example #9
  def get_post_init_ops(self):
    # Copy initialized values for variables on GPU 0 to other GPUs.
    global_vars = tf.global_variables()
    var_by_name = dict([(v.name, v) for v in global_vars])
    post_init_ops = []
    copy_froms = set()
    skipped_vars = []
    for v in global_vars:
      split_name = v.name.split('/')
      # TODO(b/62630508): use more specific prefix than v or v0.
      if split_name[0] == 'v0' or not v.name.startswith('v'):
        skipped_vars.append(v)
        continue
      # Only vars starting with "v[number]" are synced.
      split_name[0] = 'v0'
      copy_from = var_by_name['/'.join(split_name)]
      copy_froms.add(copy_from)
      post_init_ops.append(v.assign(copy_from.read_value()))
    post_init_ops += self._warmup_ops
    # If copy-froms is empty, then all vars are actually saved.
    misc_utils.print_out('All copy-from vars(%d): ' % len(copy_froms))
    for gv in copy_froms:
      misc_utils.print_out(gv.name)
    misc_utils.print_out('All skipped vars(%d): ' % len(skipped_vars))
    for gv in skipped_vars:
      misc_utils.print_out(gv.name)
    assert len(skipped_vars) >= len(copy_froms)

    return post_init_ops
Example #10
    def _get_infer_maximum_iterations(self, hparams, source_sequence_length):
        """Maximum decoding steps at inference time."""
        if hparams.tgt_max_len_infer:
            maximum_iterations = hparams.tgt_max_len_infer
            utils.print_out("  decoding maximum_iterations %d" %
                            maximum_iterations)
        else:
            # TODO(thangluong): add decoding_length_factor flag
            decoding_length_factor = 2.0
            max_encoder_length = tf.reduce_max(source_sequence_length)
            maximum_iterations = tf.to_int32(
                tf.round(
                    tf.to_float(max_encoder_length) * decoding_length_factor))
        return maximum_iterations
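
A plain-Python sketch of the same decode-length rule (illustrative only; the real code computes it with TF ops on the batch tensor):

def infer_max_iterations(source_sequence_lengths, tgt_max_len_infer=None,
                         decoding_length_factor=2.0):
    if tgt_max_len_infer:
        return tgt_max_len_infer
    return int(round(max(source_sequence_lengths) * decoding_length_factor))

print(infer_max_iterations([12, 30, 25]))      # 60: 2x the longest source
print(infer_max_iterations([12, 30, 25], 80))  # 80: the explicit cap wins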
Example #11
def create_or_load_model(model, model_dir, session, name):
    """Create translation model and initialize or load parameters in session."""
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if latest_ckpt:
        model = load_model(model, latest_ckpt, session, name)
    else:
        start_time = time.time()
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        utils.print_out(
            "  created %s model with fresh parameters, time %.2fs" %
            (name, time.time() - start_time))

    global_step = model.global_step.eval(session=session)
    return model, global_step
Example #12
    def _get_learning_rate_decay(self, hparams):
        """Get learning rate decay."""
        start_decay_step, decay_steps, decay_factor = self._get_decay_info(
            hparams)
        utils.print_out(
            "  decay_scheme=%s, start_decay_step=%d, decay_steps %d, "
            "decay_factor %g" % (hparams.decay_scheme, start_decay_step,
                                 decay_steps, decay_factor))

        return tf.cond(
            self.global_step < start_decay_step,
            lambda: self.learning_rate,
            lambda: tf.train.exponential_decay(  # pylint: disable=g-long-lambda
                self.learning_rate, (self.global_step - start_decay_step),
                decay_steps,
                decay_factor,
                staircase=True),
            name="learning_rate_decay_cond")
Example #13
    def _print_varinfo(self, var_params, tower_id):
        # Print trainable variables
        misc_utils.print_out("# Trainable variables for tower: %d" % tower_id)
        misc_utils.print_out(
            "Format: <name>, <shape>, <dtype>, <(soft) device placement>")
        for param in var_params:
            misc_utils.print_out(
                "  %s, %s, %s, %s" % (param.name, str(
                    param.get_shape()), param.dtype.name, param.op.device))
        misc_utils.print_out("Total params size: %.2f GB" % (4. * np.sum([
            p.get_shape().num_elements()
            for p in var_params if p.get_shape().is_fully_defined()
        ]) / 2**30))
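
The size estimate above assumes 4 bytes per parameter (fp32). A standalone version of the same arithmetic, with hypothetical variable shapes:

import numpy as np

param_shapes = [(1024, 1024), (36548, 1024), (4096,)]  # hypothetical variable shapes
total_params = sum(int(np.prod(s)) for s in param_shapes)
print("Total params size: %.2f GB" % (4. * total_params / 2**30))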
Example #14
def run_main(flags, default_hparams, estimator_fn):
    """Run main."""
    # Random
    random_seed = flags.random_seed
    if random_seed is not None and random_seed > 0:
        utils.print_out("# Set random seed to %d" % random_seed)
        random.seed(random_seed)
        np.random.seed(random_seed)
        tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = flags.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
        utils.print_out("# Creating output directory %s ..." % output_dir)
        tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    hparams = create_or_load_hparams(default_hparams, flags.hparams_path)

    # Train or Evaluation
    estimator_fn(hparams)
    return hparams
Example #15
def load_model(model, ckpt_path, session, name):
    """Load model from a checkpoint."""
    start_time = time.time()
    try:
        model.saver.restore(session, ckpt_path)
    except tf.errors.NotFoundError as e:
        utils.print_out("Can't load checkpoint")
        print_variables_in_ckpt(ckpt_path)
        utils.print_out("%s" % str(e))

    session.run(tf.tables_initializer())
    utils.print_out("  loaded %s model parameters from %s, time %.2fs" %
                    (name, ckpt_path, time.time() - start_time))
    return model
Example #16
  def _build_encoder(self, hparams):
    """Build a GNMT encoder."""
    assert hparams.encoder_type == "gnmt"

    # Build GNMT encoder.
    num_bi_layers = 1
    num_uni_layers = self.num_encoder_layers - num_bi_layers
    utils.print_out("# Build a GNMT encoder")
    utils.print_out("  num_bi_layers = %d" % num_bi_layers)
    utils.print_out("  num_uni_layers = %d" % num_uni_layers)

    # source is batch-majored
    source = self.features["source"]
    if self.time_major:
      # Later rnn would use time-majored inputs
      source = tf.transpose(source)

    with tf.variable_scope("encoder"):
      dtype = self.dtype

      encoder_emb_inp = tf.cast(
          self.encoder_emb_lookup_fn(self.embedding_encoder, source), dtype)

      # Build 1st bidi layer.
      bi_encoder_outputs, bi_encoder_state = self._build_encoder_layers_bidi(
          encoder_emb_inp, self.features["source_sequence_length"], hparams,
          dtype)

      # Build all the rest unidi layers
      encoder_state, encoder_outputs = self._build_encoder_layers_unidi(
          bi_encoder_outputs, self.features["source_sequence_length"],
          num_uni_layers, hparams, dtype)

      # Pass all encoder states to the decoder
      #   except the first bi-directional layer
      encoder_state = (bi_encoder_state[1],) + (
          (encoder_state,) if num_uni_layers == 1 else encoder_state)
    return encoder_outputs, encoder_state
Example #17
def _create_pretrained_emb_from_txt(vocab_file,
                                    embed_file,
                                    num_trainable_tokens=3,
                                    dtype=tf.float32,
                                    scope=None):
    """Load pretrain embeding from embed_file, and return an embedding matrix.

  Args:
    vocab_file: Path to vocab file.
    embed_file: Path to a GloVe formatted embedding txt file.
    num_trainable_tokens: Make the first n tokens in the vocab file trainable
      variables. Default is 3, which is "<unk>", "<s>" and "</s>".
    dtype: data type.
    scope: tf scope name.

  Returns:
    pretrained embedding table variable.
  """
    vocab, _ = vocab_utils.load_vocab(vocab_file)
    trainable_tokens = vocab[:num_trainable_tokens]

    utils.print_out("# Using pretrained embedding: %s." % embed_file)
    utils.print_out("  with trainable tokens: ")

    emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file)
    for token in trainable_tokens:
        utils.print_out("    %s" % token)
        if token not in emb_dict:
            emb_dict[token] = [0.0] * emb_size

    emb_mat = np.array([emb_dict[token] for token in vocab],
                       dtype=dtype.as_numpy_dtype())
    emb_mat = tf.constant(emb_mat)
    emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
    with tf.variable_scope(scope or "pretrain_embeddings",
                           dtype=dtype) as scope:
        emb_mat_var = tf.get_variable("emb_mat_var",
                                      [num_trainable_tokens, emb_size])
    return tf.concat([emb_mat_var, emb_mat_const], 0)
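
What the slice/concat above builds, sketched with NumPy (hypothetical 5 x 4 table): only the first num_trainable_tokens rows become a trainable variable; the remaining rows stay frozen pretrained constants.

import numpy as np

emb_mat = np.arange(20, dtype=np.float32).reshape(5, 4)  # hypothetical pretrained table
num_trainable_tokens = 3
emb_mat_var = emb_mat[:num_trainable_tokens]    # trainable rows (freshly initialized in the real code)
emb_mat_const = emb_mat[num_trainable_tokens:]  # frozen pretrained rows
full_table = np.concatenate([emb_mat_var, emb_mat_const], axis=0)
print(full_table.shape)  # (5, 4)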
Example #18
def avg_checkpoints(model_dir, num_last_checkpoints, global_step_name):
    """Average the last N checkpoints in the model_dir."""
    checkpoint_state = tf.train.get_checkpoint_state(model_dir)
    if not checkpoint_state:
        utils.print_out("# No checkpoint file found in directory: %s" %
                        model_dir)
        return None

    # Checkpoints are ordered from oldest to newest.
    checkpoints = (
        checkpoint_state.all_model_checkpoint_paths[-num_last_checkpoints:])

    if len(checkpoints) < num_last_checkpoints:
        utils.print_out(
            "# Skipping averaging checkpoints because not enough checkpoints "
            "are available.")
        return None

    avg_model_dir = os.path.join(model_dir, "avg_checkpoints")
    if not tf.gfile.Exists(avg_model_dir):
        utils.print_out(
            "# Creating new directory %s for saving averaged checkpoints." %
            avg_model_dir)
        tf.gfile.MakeDirs(avg_model_dir)

    utils.print_out("# Reading and averaging variables in checkpoints:")
    var_list = tf.contrib.framework.list_variables(checkpoints[0])
    var_values, var_dtypes = {}, {}
    for (name, shape) in var_list:
        if name != global_step_name:
            var_values[name] = np.zeros(shape)

    for checkpoint in checkpoints:
        utils.print_out("    %s" % checkpoint)
        reader = tf.contrib.framework.load_checkpoint(checkpoint)
        for name in var_values:
            tensor = reader.get_tensor(name)
            var_dtypes[name] = tensor.dtype
            var_values[name] += tensor

    for name in var_values:
        var_values[name] /= len(checkpoints)

    # Build a graph with same variables in the checkpoints, and save the averaged
    # variables into the avg_model_dir.
    with tf.Graph().as_default():
        tf_vars = [
            tf.get_variable(v,
                            shape=var_values[v].shape,
                            dtype=var_dtypes[v]) for v in var_values
        ]

        placeholders = [
            tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars
        ]
        assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]
        saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for p, assign_op, (name, value) in zip(placeholders, assign_ops,
                                                   six.iteritems(var_values)):
                sess.run(assign_op, {p: value})

            # Use the built saver to save the averaged checkpoint. Only keep 1
            # checkpoint and the best checkpoint will be moved to avg_best_metric_dir.
            saver.save(sess, os.path.join(avg_model_dir, "translate.ckpt"))

    return avg_model_dir
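
The averaging above is a per-variable arithmetic mean over the last N checkpoints; the same arithmetic for one variable in NumPy:

import numpy as np

ckpt_values = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
avg = np.zeros_like(ckpt_values[0])
for value in ckpt_values:
    avg += value
avg /= len(ckpt_values)
print(avg)  # [3. 4.]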
Example #19
    def _set_params_initializer(self,
                                hparams,
                                mode,
                                features,
                                scope,
                                extra_args=None):
        """Set various params for self and initialize."""
        self.mode = mode
        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.features = features
        self.time_major = hparams.time_major

        if hparams.use_char_encode:
            assert (not self.time_major), ("Can't use time major for"
                                           " char-level inputs.")

        self.dtype = tf.float16 if hparams.use_fp16 else tf.float32

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Set num units
        self.num_units = hparams.num_units
        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Set num residual layers
        if hasattr(hparams,
                   "num_residual_layers"):  # compatible common_test_utils
            self.num_encoder_residual_layers = hparams.num_residual_layers
            self.num_decoder_residual_layers = hparams.num_residual_layers
        else:
            self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
            self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

        # Batch size
        self.batch_size = tf.size(self.features["source_sequence_length"])

        # Global step
        global_step = tf.train.get_global_step()
        if global_step is not None:
            utils.print_out("global_step already created!")

        self.global_step = tf.train.get_or_create_global_step()
        utils.print_out("model.global_step.name: %s" % self.global_step.name)

        # Initializer
        self.random_seed = hparams.random_seed
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   self.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
        self.init_embeddings(hparams, scope)
Example #20
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if (hparams.attention_architecture in ["gnmt"]
            and hparams.num_encoder_layers < 2):
        raise ValueError("For gnmt attention architecture, "
                         "num_encoder_layers %d should be >= 2" %
                         hparams.num_encoder_layers)
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError(
            "beam_width must be greater than 0 when using beam_search "
            "decoder.")
    if hparams.mode == "translate" and not hparams.translate_file:
        raise ValueError(
            "--translate_file flag must be specified in translate mode")

    # Different number of encoder / decoder layers
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer"
            " %d, so set pass_hidden_state to False" %
            (hparams.num_encoder_layers, hparams.num_decoder_layers))

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection because its input
            # is the concatenation of fw_cell's and bw_cell's outputs.
            num_encoder_residual_layers = hparams.num_encoder_layers - 2

            # Compatible for GNMT models
            if hparams.num_encoder_layers == hparams.num_decoder_layers:
                num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    # Language modeling
    if hparams.language_model:
        hparams.attention = ""
        hparams.attention_architecture = ""
        hparams.pass_hidden_state = False
        hparams.share_vocab = True
        hparams.src = hparams.tgt
        utils.print_out(
            "For language modeling, we turn off attention and "
            "pass_hidden_state; turn on share_vocab; set src to tgt.")

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.output_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK,
        pad_vocab=True)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out("  using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.output_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    _add_argument(hparams, "num_enc_emb_partitions",
                  hparams.num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions",
                  hparams.num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            utils.print_out("  src_embed_file %s exist" % src_embed_file)
            hparams.src_embed_file = src_embed_file

            utils.print_out(
                "For pretrained embeddings, set num_enc_emb_partitions to 1")
            hparams.num_enc_emb_partitions = 1
        else:
            utils.print_out("  src_embed_file %s doesn't exist" %
                            src_embed_file)

        if tf.gfile.Exists(tgt_embed_file):
            utils.print_out("  tgt_embed_file %s exist" % tgt_embed_file)
            hparams.tgt_embed_file = tgt_embed_file

            utils.print_out(
                "For pretrained embeddings, set num_dec_emb_partitions to 1")
            hparams.num_dec_emb_partitions = 1
        else:
            utils.print_out("  tgt_embed_file %s doesn't exist" %
                            tgt_embed_file)

    # Evaluation
    metric = "bleu"
    best_metric_dir = os.path.join(hparams.output_dir, "best_" + metric)
    tf.gfile.MakeDirs(best_metric_dir)
    _add_argument(hparams, "best_" + metric, 0, update=False)
    _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

    return hparams
Example #21
def main(unused_argv):
    experiment_start = time.time()

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.use_fp16 and FLAGS.use_dist_strategy:
        raise ValueError("use_fp16 and use_dist_strategy aren't compatible")

    if FLAGS.use_fp16 + FLAGS.use_amp + FLAGS.use_fastmath > 1:
        raise ValueError(
            "Only one of use_fp16, use_amp, use_fastmath can be set")

    if FLAGS.use_amp:
        utils.print_out('Enabling TF-AMP')

        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    if FLAGS.use_fastmath:
        utils.print_out('Enabling FastMath')

        os.environ["TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32"] = '1'
        os.environ["TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32"] = '1'
        os.environ["TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32"] = '1'

    # Set up hacky envvars.
    # Hack that affects Defun in attention_wrapper.py
    active_xla_option_nums = np.sum(
        [FLAGS.use_xla, FLAGS.use_autojit_xla, FLAGS.xla_compile])
    if active_xla_option_nums > 1:
        raise ValueError(
            "Only one of use_xla, xla_compile, use_autojit_xla can be set")

    os.environ["use_xla"] = str(FLAGS.use_xla).lower()
    if FLAGS.use_xla:
        os.environ["use_defun"] = str(True).lower()
    else:
        os.environ["use_defun"] = str(FLAGS.use_defun).lower()
    utils.print_out("use_defun is %s for attention" % os.environ["use_defun"])

    # TODO(jamesqin): retire this config after Cuda9.1
    os.environ["use_fp32_batch_matmul"] = (
        "true" if FLAGS.use_fp32_batch_matmul else "false")
    os.environ["xla_compile"] = "true" if FLAGS.xla_compile else "false"
    os.environ["force_inputs_padding"] = ("true" if FLAGS.force_inputs_padding
                                          else "false")

    if FLAGS.mode == "train":
        utils.print_out("Running training mode.")
        default_hparams = create_hparams(FLAGS)
        run_main(FLAGS, default_hparams, estimator.train_fn)
    elif FLAGS.mode == "infer" or FLAGS.mode == "translate":
        if FLAGS.mode == "infer":
            utils.print_out("Running inference mode.")
            translate_mode = False
        else:
            utils.print_out("Running translate mode on file {}.".format(
                FLAGS.translate_file))
            translate_mode = True

        # Random
        random_seed = FLAGS.random_seed
        if random_seed is not None and random_seed > 0:
            utils.print_out("# Set random seed to %d" % random_seed)
            random.seed(random_seed)
            np.random.seed(random_seed)
            tf.set_random_seed(random_seed)

        # Model output directory
        output_dir = FLAGS.output_dir
        if output_dir and not tf.gfile.Exists(output_dir):
            utils.print_out("# Creating output directory %s ..." % output_dir)
            tf.gfile.MakeDirs(output_dir)

        # Load hparams.
        default_hparams = create_hparams(FLAGS)
        default_hparams.num_buckets = 1
        # The estimator model_fn is written in a way allowing train hparams to be
        # passed in infer mode.
        hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
        utils.print_out("infer_hparams:")
        utils.print_hparams(hparams)

        if translate_mode:
            tokenize(hparams, hparams.translate_file,
                     hparams.translate_file + ".tok")

        eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size(
            hparams, train=False)

        # Run evaluation when there's a new checkpoint
        tf.logging.info("Starting to evaluate...")
        eval_start = time.time()
        _, (eval_speed,
            eval_latencies), eval_output_tokens = estimator.eval_fn(
                hparams, hparams.ckpt, only_translate=translate_mode)
        eval_end = time.time()
        eval_delta = eval_end - eval_start
        utils.print_out(
            "eval time for ckpt: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
            (eval_delta / 60., eval_speed, eval_speed *
             (eval_src_tokens + eval_output_tokens) / eval_sentences),
            f=sys.stderr)
        for lat in sorted(eval_latencies):
            utils.print_out("eval latency_%s for ckpt: %.2f ms" %
                            (lat, eval_latencies[lat] * 1000))

        if translate_mode:
            detokenize(hparams, hparams.translate_file + ".trans.tok",
                       hparams.translate_file + ".trans")

    else:
        assert FLAGS.mode == "train_and_eval"
        utils.print_out("Running train and eval mode.")

        # Random
        random_seed = FLAGS.random_seed
        if random_seed is not None and random_seed > 0:
            utils.print_out("# Set random seed to %d" % random_seed)
            random.seed(random_seed)
            np.random.seed(random_seed)
            tf.set_random_seed(random_seed)

        # Model output directory
        output_dir = FLAGS.output_dir
        if output_dir and not tf.gfile.Exists(output_dir):
            utils.print_out("# Creating output directory %s ..." % output_dir)
            tf.gfile.MakeDirs(output_dir)

        # Load hparams.
        default_hparams = create_hparams(FLAGS)

        hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
        utils.print_out("training hparams:")
        utils.print_hparams(hparams)
        with tf.gfile.GFile(os.path.join(output_dir, "train_hparams.txt"),
                            "w") as f:
            f.write(utils.serialize_hparams(hparams) + "\n")

        # The estimator model_fn is written in a way allowing train hparams to be
        # passed in infer mode.
        infer_hparams = tf.contrib.training.HParams(**hparams.values())
        infer_hparams.num_buckets = 1
        utils.print_out("infer_hparams:")
        utils.print_hparams(infer_hparams)
        with tf.gfile.GFile(os.path.join(output_dir, "infer_hparams.txt"),
                            "w") as f:
            f.write(utils.serialize_hparams(infer_hparams) + "\n")

        epochs = 0
        should_stop = epochs >= FLAGS.max_train_epochs

        train_sentences, train_src_tokens, train_tgt_tokens = iterator_utils.get_effective_epoch_size(
            hparams)
        eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size(
            hparams, train=False)

        while not should_stop:
            utils.print_out("Starting epoch %d" % epochs)
            try:
                train_start = time.time()
                train_speed, _ = estimator.train_fn(hparams)
            except tf.errors.OutOfRangeError:
                utils.print_out("training hits OutOfRangeError", f=sys.stderr)

            train_end = time.time()
            train_delta = train_end - train_start
            utils.print_out(
                "training time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)"
                % (epochs + 1, train_delta / 60., train_speed, train_speed *
                   (train_src_tokens + train_tgt_tokens) / train_sentences),
                f=sys.stderr)

            # This is probably sub-optimal, doing eval per-epoch
            eval_start = time.time()
            bleu_score, (
                eval_speed, eval_latencies
            ), eval_output_tokens = estimator.eval_fn(infer_hparams)
            eval_end = time.time()
            eval_delta = eval_end - eval_start
            utils.print_out(
                "eval time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)"
                % (epochs + 1, eval_delta / 60., eval_speed, eval_speed *
                   (eval_src_tokens + eval_output_tokens) / eval_sentences),
                f=sys.stderr)
            for lat in sorted(eval_latencies):
                utils.print_out("eval latency_%s for epoch %d: %.2f ms" %
                                (lat, epochs + 1, eval_latencies[lat] * 1000))

            if FLAGS.debug or (FLAGS.target_bleu is not None
                               and bleu_score > FLAGS.target_bleu):
                should_stop = True
                utils.print_out(
                    "Stop job since target bleu is reached at epoch %d ." %
                    epochs,
                    f=sys.stderr)

            epochs += 1
            if epochs >= FLAGS.max_train_epochs:
                should_stop = True
                utils.print_out("Stop job since max_train_epochs is reached.",
                                f=sys.stderr)

    experiment_end = time.time()
    utils.print_out('Experiment took {:.2f} min'.format(
        (experiment_end - experiment_start) / 60))
Example #22
    def _set_train_or_infer(self, res, hparams):
        """Set up training."""
        loss = res[1]
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = loss
            self.word_count = tf.reduce_sum(
                self.features["source_sequence_length"]) + tf.reduce_sum(
                    self.features["target_sequence_length"])
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = loss
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits = res[0]
            self.infer_loss = loss
            self.sample_id = res[2]

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.features["target_sequence_length"])

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        # Only build bprop here when using dist_strategy in TRAIN mode; otherwise
        # the learning rate, grads and train_op are created in the estimator model
        # function.
        with tf.name_scope("learning_rate"):
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

        if (hparams.use_dist_strategy
                and self.mode == tf.contrib.learn.ModeKeys.TRAIN):
            # Gradients
            params = tf.trainable_variables()
            # Print trainable variables
            utils.print_out("# Trainable variables")
            utils.print_out(
                "Format: <name>, <shape>, <dtype>, <(soft) device placement>")
            for param in params:
                utils.print_out(
                    "  %s, %s, %s, %s" % (param.name, str(
                        param.get_shape()), param.dtype.name, param.op.device))
            utils.print_out("Total params size: %.2f GB" % (4. * np.sum([
                p.get_shape().num_elements()
                for p in params if p.shape.is_fully_defined()
            ]) / 2**30))

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                raise ValueError("Unknown optimizer type %s" %
                                 hparams.optimizer)
            assert opt is not None

            grads_and_vars = opt.compute_gradients(
                self.train_loss,
                params,
                colocate_gradients_with_ops=hparams.colocate_gradients_with_ops
            )
            gradients = [x for (x, _) in grads_and_vars]

            clipped_grads, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm
            self.params = params
            self.grads = clipped_grads

            self.update = opt.apply_gradients(list(zip(clipped_grads, params)),
                                              global_step=self.global_step)
        else:
            self.grad_norm = None
            self.update = None
            self.params = None
            self.grads = None
Example #23
def _single_cell(unit_type,
                 num_units,
                 forget_bias,
                 dropout,
                 mode,
                 dtype=None,
                 residual_connection=False,
                 residual_fn=None,
                 use_block_lstm=False):
    """Create an instance of a single RNN cell."""
    # dropout (= 1 - keep_prob) is set to 0 during eval and infer
    dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0

    # Cell Type
    if unit_type == "lstm":
        utils.print_out("  LSTM, forget_bias=%g" % forget_bias, new_line=False)
        if not use_block_lstm:
            single_cell = tf.nn.rnn_cell.LSTMCell(num_units,
                                                  dtype=dtype,
                                                  forget_bias=forget_bias)
        else:
            single_cell = tf.contrib.rnn.LSTMBlockCell(num_units,
                                                       forget_bias=forget_bias)
    elif unit_type == "gru":
        utils.print_out("  GRU", new_line=False)
        single_cell = tf.contrib.rnn.GRUCell(num_units)
    elif unit_type == "layer_norm_lstm":
        utils.print_out("  Layer Normalized LSTM, forget_bias=%g" %
                        forget_bias,
                        new_line=False)
        single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            num_units, forget_bias=forget_bias, layer_norm=True)
    elif unit_type == "nas":
        utils.print_out("  NASCell", new_line=False)
        single_cell = tf.contrib.rnn.NASCell(num_units)
    else:
        raise ValueError("Unknown unit type %s!" % unit_type)

    # Dropout (= 1 - keep_prob)
    if dropout > 0.0:
        single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell,
                                                    input_keep_prob=(1.0 -
                                                                     dropout))
        utils.print_out("  %s, dropout=%g " %
                        (type(single_cell).__name__, dropout),
                        new_line=False)

    # Residual
    if residual_connection:
        single_cell = tf.contrib.rnn.ResidualWrapper(single_cell,
                                                     residual_fn=residual_fn)
        utils.print_out("  %s" % type(single_cell).__name__, new_line=False)

    return single_cell
Example #24
def create_emb_for_encoder_and_decoder(share_vocab,
                                       src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       use_char_encode=False,
                                       scope=None):
    """Create embedding matrix for both encoder and decoder.

  Args:
    share_vocab: A boolean. Whether to share embedding matrix for both
      encoder and decoder.
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    use_char_encode: A boolean. If true, use char encoder.
    scope: VariableScope for the created subgraph. Default to "embedding".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if share_vocab is set but source and target have different
      vocab sizes.
  """
    if num_enc_partitions <= 1:
        enc_partitioner = None
    else:
        # Note: num_partitions > 1 is required for distributed training because
        # embedding_lookup tries to colocate a single-partition embedding variable
        # with the lookup ops, which may cause embedding variables to be placed
        # on worker jobs.
        enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

    if num_dec_partitions <= 1:
        dec_partitioner = None
    else:
        # Note: num_partitions > 1 is required for distributed training because
        # embedding_lookup tries to colocate a single-partition embedding variable
        # with the lookup ops, which may cause embedding variables to be placed
        # on worker jobs.
        dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

    if src_embed_file and enc_partitioner:
        raise ValueError(
            "Can't set num_enc_partitions > 1 when using pretrained encoder "
            "embedding")

    if tgt_embed_file and dec_partitioner:
        raise ValueError(
            "Can't set num_dec_partitions > 1 when using pretrained decdoer "
            "embedding")

    with tf.variable_scope(scope or "embeddings",
                           dtype=dtype,
                           partitioner=enc_partitioner) as scope:
        # Share embedding
        if share_vocab:
            if src_vocab_size != tgt_vocab_size:
                raise ValueError(
                    "Share embedding but different src/tgt vocab sizes"
                    " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
            assert src_embed_size == tgt_embed_size
            utils.print_out("# Use the same embedding for source and target")
            vocab_file = src_vocab_file or tgt_vocab_file
            embed_file = src_embed_file or tgt_embed_file

            embedding_encoder = _create_or_load_embed("embedding_share",
                                                      vocab_file, embed_file,
                                                      src_vocab_size,
                                                      src_embed_size, dtype)
            embedding_decoder = embedding_encoder
        else:
            if not use_char_encode:
                with tf.variable_scope("encoder", partitioner=enc_partitioner):
                    embedding_encoder = _create_or_load_embed(
                        "embedding_encoder", src_vocab_file, src_embed_file,
                        src_vocab_size, src_embed_size, dtype)
            else:
                embedding_encoder = None

            with tf.variable_scope("decoder", partitioner=dec_partitioner):
                embedding_decoder = _create_or_load_embed(
                    "embedding_decoder", tgt_vocab_file, tgt_embed_file,
                    tgt_vocab_size, tgt_embed_size, dtype)

    return embedding_encoder, embedding_decoder
Example #25
    def build_graph(self, features, labels, mode, params):
        """docstring."""
        del labels, params
        misc_utils.print_out("Running fast mode_fn")

        hparams = self.hparams

        # Create global_step
        tf.train.get_or_create_global_step()

        if mode == tf.contrib.learn.ModeKeys.INFER:
            # Doing inference only on one GPU
            inf_hparams = tf.contrib.training.HParams(**hparams.values())
            inf_hparams.set_hparam("num_gpus", 1)
            # Inference is done in fp32 and in the same way as that of dist_strategy.
            inf_hparams.set_hparam("use_fp16", False)

            misc_utils.print_out("inference hparmas:")
            misc_utils.print_hparams(inf_hparams)

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(inf_hparams)

            with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                    "tower_0"), var_mgr.create_outer_variable_scope(0):
                model = gnmt_model.GNMTModel(inf_hparams,
                                             mode=mode,
                                             features=features)
                sample_ids = model.sample_id
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # Make sure outputs are of shape [batch_size, time] or
                # [beam_width, batch_size, time] when using beam search.
                if inf_hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
        elif mode == tf.contrib.learn.ModeKeys.TRAIN:
            num_towers = hparams.num_gpus
            # Shard inputs
            tower_features = self._shard_inputs(features, num_towers)
            # Create loss scale vars if necessary
            loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars(
            )

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(hparams)

            # Build per-tower fprop and bprop
            devices = var_mgr.get_devices()
            tower_gradvars = []
            tower_scopes = []
            var_scopes = []
            train_losses = []
            learning_rates = []
            batch_sizes = []
            opts = []

            def fprop_and_bprop(tid):
                """docstring."""
                model = gnmt_model.GNMTModel(hparams,
                                             mode=mode,
                                             features=tower_features[tid])
                # sync training.
                assert model.learning_rate is not None
                # The following handles shouldn't be built here, since gradients
                # are computed and applied manually per tower in this model_fn.
                assert model.grad_norm is None
                assert model.update is None
                tower_loss = model.train_loss
                # Only check loss numerics if in fp16
                if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                    tower_loss = tf.check_numerics(
                        tower_loss, "tower_%d has Inf/NaN loss" % tid)
                # Cast to fp32, otherwise would easily overflow.
                tower_loss = tf.to_float(tower_loss)
                var_params, grads, opt = self._compute_tower_grads(
                    tower_loss,
                    var_mgr.trainable_variables_on_device(tid, tid),
                    model.learning_rate,
                    use_fp16=hparams.use_fp16,
                    loss_scale=loss_scale,
                    colocate_gradients_with_ops=hparams.
                    colocate_gradients_with_ops)
                self._print_varinfo(var_params, tid)
                res = [model.train_loss, model.learning_rate, model.batch_size]
                res.extend(grads)
                opts.append(opt)
                return res

            def unpack_fprop_and_bprop_output(output):
                train_loss = output[0]
                learning_rate = output[1]
                batch_size = output[2]
                grads = output[3:]
                return train_loss, learning_rate, batch_size, grads

            with mixed_precision_scope():
                for tid in range(num_towers):
                    with tf.device(devices[tid % len(devices)]), tf.name_scope(
                            "tower_%s" % tid) as scope:
                        tower_scopes.append(scope)
                        with var_mgr.create_outer_variable_scope(
                                tid) as var_scope:
                            var_scopes.append(var_scope)

                            outputs = maybe_xla_compile(
                                hparams, fprop_and_bprop, tid)
                            (train_loss, learning_rate, batch_size,
                             grads) = unpack_fprop_and_bprop_output(outputs)
                            train_losses.append(train_loss)
                            learning_rates.append(learning_rate)
                            batch_sizes.append(batch_size)
                            var_params = var_mgr.trainable_variables_on_device(
                                tid, tid)
                            tower_gradvars.append(list(zip(grads, var_params)))

            # Add summaries
            if hparams.show_metrics:
                tf.summary.scalar("learning_rate", learning_rates[0])
                if loss_scale:
                    tf.summary.scalar("loss_scale", loss_scale)
                    if hparams.enable_auto_loss_scale:
                        tf.summary.scalar("loss_scale_normal_steps",
                                          loss_scale_normal_steps)
            misc_utils.print_out("Finish building fprop and per-tower bprop.")
            # Aggregate gradients
            # The following compute the aggregated grads for each tower, stored in
            # opaque grad_states structure.
            apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
                tower_gradvars)
            master_grads = None
            master_params = None
            update_ops = []
            for i, device in enumerate(apply_grads_devices):
                with tf.device(device), tf.name_scope(tower_scopes[i]):
                    # Get per-tower grads.
                    with tf.name_scope("get_gradients_to_apply"):
                        avg_gradvars = var_mgr.get_gradients_to_apply(
                            i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]

                    # gradients post-processing
                    with tf.name_scope("clip_gradients"):
                        if hparams.clip_grads:
                            clipped_grads, grad_norm = model_helper.gradient_clip(
                                avg_grads,
                                max_gradient_norm=hparams.max_gradient_norm)
                            # Summarize the gradient norms on the first tower.
                            if i == 0 and hparams.show_metrics:
                                tf.summary.scalar("grad_norm", grad_norm)
                                tf.summary.scalar(
                                    "clipped_grad_norm",
                                    tf.global_norm(clipped_grads))
                        else:
                            clipped_grads = avg_grads
                        if i == 0:
                            master_grads = clipped_grads

                    # Build apply-gradients ops
                    clipped_gradvars = list(
                        zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                    if i == 0:
                        master_params = [gv[1] for gv in avg_gradvars]
                    with tf.name_scope("append_gradient_ops"):
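                        # Parameters controlling automatic (dynamic) loss scaling
                        # for fp16 training; only meaningful when
                        # hparams.enable_auto_loss_scale is set.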
                        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                            enable_auto_loss_scale=hparams.enable_auto_loss_scale,
                            loss_scale=loss_scale,
                            loss_scale_normal_steps=loss_scale_normal_steps,
                            inc_loss_scale_every_n=hparams.fp16_inc_loss_scale_every_n,
                            is_chief=True)
                        opt = opts[i]
                        var_mgr.append_apply_gradients_ops(
                            grad_states, opt, clipped_gradvars, update_ops,
                            loss_scale_params)
            misc_utils.print_out("Finish building grad aggregation.")

            assert len(update_ops) == num_towers
            train_op = tf.group(update_ops)
            with tf.control_dependencies([train_op]):
                global_step = tf.train.get_global_step()
                train_op = global_step.assign_add(1)

            # Compute loss on the first gpu
            # TODO(jamesqin): optimize it?
            with tf.device("gpu:0"):
                loss = misc_utils.weighted_avg(train_losses, batch_sizes)

            # Create local init_ops
            # TODO(jamesqin): handle resource variables!
            # At present, resource variables are only used with the mirror strategy.
            local_init_ops = []
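            # Post-init ops (typically broadcasting the master tower's variable
            # values to the other towers) must run after local variables are
            # initialized, hence the control dependency below.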
            local_init_op = tf.local_variables_initializer()
            with tf.control_dependencies([local_init_op]):
                local_init_ops.append(var_mgr.get_post_init_ops())
            local_init_ops.extend([local_init_op, tf.tables_initializer()])

            saveable_vars = var_mgr.savable_variables()
            # Add saveables for cudnn vars in master tower.
            saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
            saveable_objects = [x for x in saveable_objects if "v0" in x.name]

            misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
            for mv in saveable_vars:
                misc_utils.print_out(mv.name)

            misc_utils.print_out("All global trainable vars(%d): " %
                                 len(tf.trainable_variables()))
            for tv in tf.trainable_variables():
                misc_utils.print_out(tv.name)

            misc_utils.print_out("All global vars(%d): " %
                                 len(tf.global_variables()))
            for gv in tf.global_variables():
                misc_utils.print_out(gv.name)

            misc_utils.print_out("master backproped params(%d): " %
                                 len(master_params))
            for mp in master_params:
                misc_utils.print_out(mp.name)

            # Note: the cudnn vars are skipped in the init check. :(
            scaffold = tf.train.Scaffold(
                ready_op=tf.report_uninitialized_variables(saveable_vars),
                ready_for_local_init_op=tf.report_uninitialized_variables(
                    saveable_vars),
                local_init_op=tf.group(*local_init_ops),
                saver=tf.train.Saver(saveable_vars + saveable_objects,
                                     save_relative_paths=True))

            misc_utils.print_out("Finish building model_fn")
            # return loss, vars, grads, predictions, train_op, scaffold
            return loss, master_params, master_grads, None, train_op, scaffold
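
The per-tower losses above are combined with misc_utils.weighted_avg(train_losses, batch_sizes). That helper is not shown in this snippet; the following is only a minimal sketch of what a batch-size-weighted average could look like, not the repository's actual implementation:

def weighted_avg(values, weights):
    # Hypothetical sketch: average `values` weighted by `weights`, e.g.
    # per-tower losses weighted by per-tower batch sizes.
    weights = [tf.to_float(w) for w in weights]
    total = tf.add_n([v * w for v, w in zip(values, weights)])
    return total / tf.add_n(weights)
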
Example #26
def get_metrics(hparams, model_fn, ckpt=None, only_translate=False):
    """Run inference and compute metrics."""
    pred_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                            model_dir=hparams.output_dir)

    benchmark_hook = BenchmarkHook(hparams.infer_batch_size)

    predictions = pred_estimator.predict(
        make_input_fn(hparams, tf.contrib.learn.ModeKeys.INFER),
        checkpoint_path=ckpt,
        hooks=[benchmark_hook])
    translations = []
    output_tokens = []
    beam_id = 0
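    # predict() yields one element per beam for each batch; only beam 0 (the top
    # hypothesis) is decoded below, the remaining beams are skipped.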
    for prediction in predictions:
        # get the top translation.
        if beam_id == 0:
            for sent_id in range(hparams.infer_batch_size):
                if sent_id >= prediction["predictions"].shape[0]:
                    break
                trans, output_length = nmt_utils.get_translation(
                    prediction["predictions"],
                    sent_id=sent_id,
                    tgt_eos=hparams.eos,
                    subword_option=hparams.subword_option)
                translations.append(trans)
                output_tokens.append(output_length)
        beam_id += 1
        if beam_id == hparams.beam_width:
            beam_id = 0

    if only_translate:
        trans_file = hparams.translate_file + '.trans.tok'
    else:
        trans_file = os.path.join(
            hparams.output_dir, "newstest2014_out_{}.tok.de".format(
                pred_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)))
    trans_dir = os.path.dirname(trans_file)
    if not tf.gfile.Exists(trans_dir):
        tf.gfile.MakeDirs(trans_dir)
    tf.logging.info("Writing to file %s" % trans_file)
    with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file,
                                                  mode="wb")) as trans_f:
        trans_f.write("")  # Write empty string to ensure file is created.
        for translation in translations:
            trans_f.write((translation + b"\n").decode("utf-8"))

    if only_translate:
        return None, benchmark_hook.get_average_speed_and_latencies(), sum(
            output_tokens)

    # Evaluation
    output_dir = os.path.join(pred_estimator.model_dir, "eval")
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)

    ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)
    # Hardcoded.
    metric = "bleu"
    score = get_sacrebleu(trans_file, hparams.detokenizer_file)

    misc_utils.print_out("bleu is %.5f" % score)
    with tf.Graph().as_default():
        summaries = []
        summaries.append(tf.Summary.Value(tag=metric, simple_value=score))
    tf_summary = tf.Summary(value=list(summaries))
    summary_writer.add_summary(
        tf_summary,
        pred_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP))

    summary_writer.close()
    return score, benchmark_hook.get_average_speed_and_latencies(), sum(
        output_tokens)
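
get_sacrebleu() above is assumed to detokenize the tokenized hypotheses and score them with the sacrebleu tool; it is not part of this snippet. The sketch below is only a guess under that assumption: the Moses detokenizer invocation and the wmt14/full en-de test set are inferred from the file names above, not taken from the repository's actual code.

import subprocess

def get_sacrebleu(trans_file, detokenizer_file):
    # Hypothetical sketch. Detokenize the hypotheses with the Moses detokenizer
    # script, then score them with the sacrebleu CLI against the bundled
    # WMT14 English-German test set.
    detok_file = trans_file + ".detok"
    with open(trans_file, "rb") as fin, open(detok_file, "wb") as fout:
        subprocess.run(["perl", detokenizer_file, "-l", "de"],
                       stdin=fin, stdout=fout, check=True)
    with open(detok_file, "rb") as fin:
        out = subprocess.run(
            ["sacrebleu", "-t", "wmt14/full", "-l", "en-de", "-b"],
            stdin=fin, stdout=subprocess.PIPE, check=True)
    return float(out.stdout.decode("utf-8").strip())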