Example #1
    def generate_optimizer(self, loss, params, name, learning_rate,
                           max_gradient_norm):
        """generates optimizer."""
        if self.hparams.optimizer == "sgd":
            opt = tf.train.GradientDescentOptimizer(learning_rate,
                                                    name="SGD_self_play_" +
                                                    name)
        else:
            opt = tf.train.AdamOptimizer(learning_rate,
                                         name="ADAM_self_play_" + name)

        gradients = tf.gradients(loss,
                                 params,
                                 colocate_gradients_with_ops=self.hparams.
                                 colocate_gradients_with_ops,
                                 name="gradients_" + name)

        clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
            gradients, max_gradient_norm=max_gradient_norm)

        update = opt.apply_gradients(zip(clipped_gradients, params),
                                     global_step=self.global_step,
                                     name=name)

        return update, gradient_norm_summary
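The examples below unpack different numbers of return values from model_helper.gradient_clip. A minimal sketch of what the two-value variant used here is assumed to do (clip by global norm and collect norm summaries); this is inferred from the call sites, not the original helper:

import tensorflow as tf

def gradient_clip(gradients, max_gradient_norm):
    """Clips gradients by global norm; returns clipped grads plus norm summaries."""
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    gradient_norm_summary = [tf.summary.scalar("grad_norm", gradient_norm)]
    gradient_norm_summary.append(
        tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_gradients)))
    return clipped_gradients, gradient_norm_summary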
Example #2
    def _set_train_or_infer(self, res, reverse_target_vocab_table,
                            reverse_target_intent_vocab_table, hparams):
        """Set up training and inference."""
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.label_pred = res
            self.sample_intent = reverse_target_intent_vocab_table.lookup(
                tf.to_int64(self.label_pred))

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                raise ValueError("Unknown optimizer type %s" %
                                 hparams.optimizer)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm_summary = grad_norm_summary
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = self._get_train_summary()
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
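For reference, a minimal sketch of what a _get_learning_rate_warmup helper like the one called above typically does in these NMT-style models: an exponential warm-up factor applied through tf.cond until the warm-up window ends. The hparam name warmup_steps is an assumption, not taken from this example.

    def _get_learning_rate_warmup(self, hparams):
        """Ramps up self.learning_rate during the first warm-up steps (sketch)."""
        warmup_steps = hparams.warmup_steps  # assumed hparam name
        # Inverse-exponential factor that decays to 1.0 at warmup_steps.
        warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
        inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
        return tf.cond(self.global_step < warmup_steps,
                       lambda: inv_decay * self.learning_rate,
                       lambda: self.learning_rate,
                       name="learning_rate_warmup_cond")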
Example #3
  def _set_train_or_infer(self, res, hparams):
    """Set up training."""
    if self.mode == tf.contrib.learn.ModeKeys.INFER:
      self.predicted_ids = res[1]

    params = tf.trainable_variables()

    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
      loss = res[0]
      self.loss = loss

      mlperf_log.gnmt_print(key=mlperf_log.OPT_LR, value=hparams.learning_rate)

      if hparams.lottery_force_learning_rate is not None:
        self.learning_rate = lottery.get_lr_tensor(hparams.values())
      else:
        self.learning_rate = tf.constant(hparams.learning_rate)
        # warm-up
        self.learning_rate = self._get_learning_rate_warmup(hparams)
        # decay
        self.learning_rate = self._get_learning_rate_decay(hparams)

      # Optimizer
      mlperf_log.gnmt_print(key=mlperf_log.OPT_NAME, value=hparams.optimizer)
      if hparams.optimizer == "sgd":
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      elif hparams.optimizer == "adam":
        mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=0.9)
        mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=0.999)
        mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=1e-8)
        opt = tf.train.AdamOptimizer(self.learning_rate)
      else:
        raise ValueError("Unknown optimizer type %s" % hparams.optimizer)

      if hparams.use_tpu:
        opt = tf.contrib.tpu.CrossShardOptimizer(opt)
      # Gradients

      gradients = tf.gradients(
          loss,
          params,
          colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
      clipped_grads, grad_norm = model_helper.gradient_clip(gradients, max_gradient_norm=hparams.max_gradient_norm)
      self.update = opt.apply_gradients(zip(clipped_grads, params), global_step=self.global_step)

    # Print trainable variables
    utils.print_out("# Trainable variables")
    utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
    for param in params:
      utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()),
                                        param.op.device))
Example #4
def optimizer(hparams, loss, global_step):
    learning_rate = learning_rate_update(hparams, global_step)
    learning_rate = tf.maximum(tf.constant(0.00004), learning_rate)
    if hparams.opttype == 'SGD':
        opt = tf.train.GradientDescentOptimizer(learning_rate)
    elif hparams.opttype == 'Adam':
        opt = tf.train.AdamOptimizer(learning_rate)
    elif hparams.opttype == 'Nadam':
        opt = tf.contrib.opt.NadamOptimizer(learning_rate)
    elif hparams.opttype == 'Lazy':
        opt, learning_rate = get_lazy_opt(hparams)
    else:
        raise ValueError("Unknown optimizer type %s" % hparams.opttype)

    gradient, vars_param = zip(*opt.compute_gradients(loss))
    clip_gradient, _ = _mh.gradient_clip(gradient)
    apply_gradient_op = opt.apply_gradients(zip(clip_gradient, vars_param),
                                            global_step=global_step)

    variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op, learning_rate
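A hedged usage sketch for the optimizer helper above; build_model_loss, hparams, and num_steps are placeholders introduced for illustration, not names from the original code:

global_step = tf.train.get_or_create_global_step()
loss = build_model_loss()  # hypothetical: any scalar loss tensor
train_op, learning_rate = optimizer(hparams, loss, global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(num_steps):
        _, current_lr = sess.run([train_op, learning_rate])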
Example #5
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None,
                 scope=None,
                 extra_args=None):
        """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      source_vocab_table: Lookup table mapping source words to ids.
      target_vocab_table: Lookup table mapping target words to ids.
      reverse_target_vocab_table: Lookup table mapping ids to target words. Only
        required in INFER mode. Defaults to None.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    """
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Set num residual layers
        if hasattr(hparams,
                   "num_residual_layers"):  # compatible common_test_utils
            self.num_encoder_residual_layers = hparams.num_residual_layers
            self.num_decoder_residual_layers = hparams.num_residual_layers
        else:
            self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
            self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        ## Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=hparams.num_keep_ckpts)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
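Once the model above is built, its training handles are usually driven by a plain TF1 session loop. A minimal sketch, assuming a model instance, an open session, and a tf.summary.FileWriter:

def train_step(model, sess, summary_writer):
    """Runs one parameter update and logs the training summary (sketch)."""
    _, loss, summary, step = sess.run(
        [model.update, model.train_loss, model.train_summary, model.global_step])
    summary_writer.add_summary(summary, step)
    return loss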
Example #6
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None):

        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.time_major = hparams.time_major

        self.single_cell_fn = None

        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope("build_netword"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        ## Train graph
        res = self.build_graph(hparams)

        if self.mode == tf.estimator.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.estimator.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.estimator.ModeKeys.PREDICT:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.estimator.ModeKeys.PREDICT:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.estimator.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)

        if self.mode == tf.estimator.ModeKeys.PREDICT:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
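Example #6 checks tf.estimator.ModeKeys where the other examples use tf.contrib.learn.ModeKeys. In TF 1.x the underlying string values line up, which is why the two styles are interchangeable here; a small sketch of the mapping:

import tensorflow as tf

# TRAIN and EVAL share the same string values in both namespaces;
# tf.contrib.learn.ModeKeys.INFER corresponds to tf.estimator.ModeKeys.PREDICT.
assert tf.estimator.ModeKeys.TRAIN == tf.contrib.learn.ModeKeys.TRAIN    # "train"
assert tf.estimator.ModeKeys.EVAL == tf.contrib.learn.ModeKeys.EVAL      # "eval"
assert tf.estimator.ModeKeys.PREDICT == tf.contrib.learn.ModeKeys.INFER  # "infer"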
Example #7
    def __init__(self, hparams, mode, iterator, target_vocab_table, reverse_target_vocab_table=None, scope=None, single_cell_fn=None):

        """Create the model.

        Args:
          hparams: Hyperparameter configurations.
          mode: TRAIN | EVAL | INFER
          iterator: Dataset Iterator that feeds data.
          target_vocab_table: Lookup table mapping target words to ids.
          reverse_target_vocab_table: Lookup table mapping ids to target words. Only
            required in INFER mode. Defaults to None.
          scope: scope of the model.
          single_cell_fn: allow for adding customized cell. When not specified,
            we default to model_helper._single_cell
        """


        assert isinstance(iterator, iterator_utils.BatchedInput)

        self.iterator = iterator
        self.mode = mode
        self.tgt_vocab_table = target_vocab_table

        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        self.cnn_input = self.iterator.source
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.cnn = AlexNet(self.cnn_input, (1 - hparams.dropout), model_helper.get_device_str(hparams.base_gpu))
        else:
            self.cnn = AlexNet(self.cnn_input, 1, model_helper.get_device_str(hparams.base_gpu))

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed, hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size, use_bias=False, name="output_projection")

        # To make it flexible for external code to add other cell types
        # If not specified, we will later use model_helper._single_cell
        self.single_cell_fn = single_cell_fn

        ## Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

        ## Learning rate
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, decay_factor %g" % (hparams.start_decay_step, hparams.learning_rate, hparams.decay_steps, hparams.decay_factor))

        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            if hparams.optimizer == "sgd":
                self.learning_rate = tf.cond(self.global_step < hparams.start_decay_step,
                                             lambda: tf.constant(hparams.learning_rate),
                                             lambda: tf.train.exponential_decay(hparams.learning_rate,
                                                                                (self.global_step - hparams.start_decay_step),
                                                                                hparams.decay_steps,
                                                                                hparams.decay_factor,
                                                                                staircase=True),
                                             name="learning_rate")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)

            elif hparams.optimizer == "adam":
                assert float(hparams.learning_rate) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
                self.learning_rate = tf.constant(hparams.learning_rate)
                opt = tf.train.AdamOptimizer(self.learning_rate)

            gradients = tf.gradients(self.train_loss, params, colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([tf.summary.scalar("lr", self.learning_rate), tf.summary.scalar("train_loss", self.train_loss)] + gradient_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        if hparams.eval_on_fly:
            self.saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True)
        else:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None, save_relative_paths=True)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device))
Example #8
 def _deploy_exe_info(self, losses, info):
     with tf.name_scope("deploy_exe_info"):
         hp = self.hparams
         if self.trainable:  # Train
             self.train_loss = losses
             params = tf.trainable_variables()
             if hp.tunable:
                 learning_rate = hp.tune_rate
             else:
                 learning_rate = hp.learning_rate
             self.learning_rate = tf.constant(learning_rate, dtype=tf.float32)
             # Warm-up
             self.learning_rate = self._get_learning_rate_warmup()
             # Decay
             self.learning_rate = self._get_learning_rate_decay()
             # Optimizer
             opt = tf.train.MomentumOptimizer(self.learning_rate, hp.momentum_factor)
             # Gradient
             gradients = tf.gradients(self.train_loss, params)
             # Gradient clip
             clipped_grads, grad_norm_summaries, grad_norm = helper.gradient_clip(
                 gradients, max_gradient_norm=hp.max_grad_norm)
             # Gradient norm
             for summary in grad_norm_summaries:
                 self._add_to_summaries(summary)
             self.grad_norm = grad_norm
             # Apply update to params
             self.update = opt.apply_gradients(
                 zip(clipped_grads, params), global_step=self.global_step)
             # Trainable params summary
             print("# Trainable variables")
             print("Format: <name>, <shape>, <(soft) device placement>")
             for param in params:
                 self.histogram.update({param.name: param})
                 print("  %s, %s, %s" % (param.name, str(param.get_shape()),
                                         param.op.device))
             self.histogram.update(train_loss=self.train_loss,
                                   learning_rate=self.learning_rate)
             if hp.forward_rcnn:
                 self.class_predicts = self.reverse_cate_table.lookup(
                     tf.to_int64(info["class_predicts"]))
                 self.detected_images = tf.py_function(
                     misc.draw_boxes_on_image,
                     [self.images_data, info["bbox_labels"],
                      info["class_scores"], self.class_predicts,
                      self.im_info, hp.pixel_mean], Tout=tf.float32)
             self.train_summary = self._config_train_summary()
         elif self.predicable:  # Infer
             stddevs = tf.tile(tf.constant(hp.bbox_norm_stddevs), multiples=[hp.num_class])
             means = tf.tile(tf.constant(hp.bbox_norm_means), multiples=[hp.num_class])
             deltas = info["bbox_predicts"]
             # Restore bbox predicts
             deltas = tf.add(tf.multiply(deltas, stddevs), means)
             info["bbox_predicts"] = deltas
             rois = info["rois"]
             self.class_scores = info["class_scores"]
             self.class_predicts = self.reverse_cate_table.lookup(
                 tf.to_int64(info["class_predicts"]))
             # Get predicted ground-truth bbox
             self.bboxes = proposal_util.bboxes_regression(rois, deltas)
             self.detected_images = tf.py_function(
                 misc.draw_boxes_on_image,
                 [self.images_data, self.bboxes,
                  self.class_scores, self.class_predicts,
                  self.im_info, hp.pixel_mean], Tout=tf.float32)
             self.infer_summary = self._config_infer_summary()
         else:  # Eval
             rois = info["rois"]
             deltas = info["bbox_predicts"]
             self.eval_loss = losses
             bboxes = proposal_util.bboxes_regression(rois, deltas)
             self.accuracy = misc.mean_avg_overlap(
                 bboxes, self.bbox_labels)
             self.eval_summary = self._config_eval_summary()
Example #9
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 handle,
                 vocab_table,
                 reverse_vocab_table=None,
                 scope=None,
                 extra_args=None):
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.handle = handle
        self.mode = mode
        self.vocab_table = vocab_table
        self.vocab_size = hparams.vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.hparams = hparams
        self.single_cell_fn = None
        self.global_gpu_num = 0
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.shape(self.iterator.source)[0]

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer1 = layers_core.Dense(
                    hparams.vocab_size,
                    use_bias=False,
                    name="output_projection_1")
                self.output_layer2 = layers_core.Dense(
                    hparams.vocab_size,
                    use_bias=False,
                    name="output_projection_2")
                self.output_layer_action = layers_core.Dense(
                    hparams.vocab_size,
                    use_bias=False,
                    name="output_projection_action")
                self.vn_project11 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_11")
                self.vn_project12 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_12")
                self.vn_project21 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_21")
                self.vn_project22 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_22")

        ## Train graph
        sl_loss, sl_loss_arr, rl_loss_arr, sample_id_arr_train, sample_id_arr_infer = build_graph(
            self, hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = sl_loss
            self.all_train_loss = sl_loss_arr
            self.word_count = tf.reduce_sum(self.iterator.dialogue_len)
            self.sample_ids_arr = sample_id_arr_train
            self.sample_words_arr1 = []
            self.sample_words_arr2 = []
            source = self.iterator.source
            for i in range(len(self.sample_ids_arr)):
                element_infer = self.sample_ids_arr[i]
                element_src = source[0]
                # element_src=0
                src = reverse_vocab_table.lookup(tf.to_int64(element_src))
                infer = reverse_vocab_table.lookup(
                    tf.to_int64(element_infer)
                )[0]  # src only covers the first element, so take only the first inference
                if i == 0:
                    self.sample_words_arr1.append((tf.constant(i), src, infer))
                elif i == 1:
                    self.sample_words_arr2.append((tf.constant(i), src, infer))
            self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates

        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = sl_loss
            self.all_eval_loss = sl_loss_arr

        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.sample_ids_arr = sample_id_arr_infer
            self.sample_words_arr = []
            self.source = reverse_vocab_table.lookup(
                tf.to_int64(iterator.source))
            for element in self.sample_ids_arr:
                self.sample_words_arr.append(
                    reverse_vocab_table.lookup(tf.to_int64(element)))
        elif self.mode in dialogue_utils.self_play_modes:
            #### self play
            self.train_loss = sl_loss
            self.all_train_loss = sl_loss_arr
            self.selfplay_agent_1_utt = reverse_vocab_table.lookup(
                tf.to_int64(sample_id_arr_infer[0]))
            self.selfplay_agent_2_utt = reverse_vocab_table.lookup(
                tf.to_int64(sample_id_arr_infer[1]))
            self.selfplay_action = reverse_vocab_table.lookup(
                tf.to_int64(sample_id_arr_infer[2]))
            if self.mode == dialogue_utils.mode_self_play_mutable:
                self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(self.iterator.dialogue_len)

        ## Learning rate
        warmup_steps = hparams.learning_rate_warmup_steps
        warmup_factor = hparams.learning_rate_warmup_factor
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
              "decay_factor %g, learning_rate_warmup_steps=%d, "
              "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
              (hparams.start_decay_step, hparams.learning_rate,
               hparams.decay_steps, hparams.decay_factor, warmup_steps,
               warmup_factor,
               (hparams.learning_rate * warmup_factor**warmup_steps)))
        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
            self.learning_rate = tf.constant(hparams.learning_rate)

            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
            self.learning_rate = tf.cond(
                self.global_step < hparams.learning_rate_warmup_steps,
                lambda: inv_decay * self.learning_rate,
                lambda: self.learning_rate,
                name="learning_rate_decay_warump_cond")

            if hparams.optimizer == "sgd":
                self.learning_rate = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate,
                    lambda: tf.train.exponential_decay(self.learning_rate, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="sgd_learning_rate_supervised")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate,
                                                        name="SGD_supervised")
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                assert float(
                    hparams.learning_rate
                ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
                opt = tf.train.AdamOptimizer(self.learning_rate,
                                             name="Adam_supervised")

            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops,
                                     name="gradients_adam")

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step,
                                              name="adam_apply_gradients")

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)

        # second part of the learning rate
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
            self.learning_rate2 = tf.constant(hparams.learning_rate2)
            self.learning_rate3 = tf.constant(hparams.learning_rate3)
            if hparams.optimizer == "sgd":
                self.learning_rate2 = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate2,
                    lambda: tf.train.exponential_decay(self.learning_rate2, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="sgd_learning_rate_supervised2")
                self.learning_rate3 = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate3,
                    lambda: tf.train.exponential_decay(self.learning_rate3, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="sgd_learning_rate_supervised3")
                tf.summary.scalar("self_play_lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                assert float(
                    hparams.learning_rate2
                ) <= 0.001, "! High Adam learning rate2 %g" % hparams.learning_rate2
                assert float(
                    hparams.learning_rate3
                ) <= 0.001, "! High Adam learning rate3 %g" % hparams.learning_rate3

            # params=[]

            print("params=")
            for element in params:
                print(element.name)
            val1_params = self.patial_params(
                params, ["dynamic_seq2seq/value_network1"])
            val2_params = self.patial_params(
                params, ["dynamic_seq2seq/value_network2"])
            embedding_params = self.patial_params(params, ["embeddings"])
            main_dec_enc_params1 = self.patial_params(
                params,
                ["dynamic_seq2seq/encoder1/", "dynamic_seq2seq/decoder1/"])
            main_dec_enc_params2 = self.patial_params(
                params,
                ["dynamic_seq2seq/encoder2/", "dynamic_seq2seq/decoder2/"])
            action_params = self.patial_params(
                params, ["dynamic_seq2seq/decoder_action"])
            encoder_kb_params = self.patial_params(
                params, ["dynamic_seq2seq/encoder2_kb"])
            encoder_intent_params = self.patial_params(
                params, ["dynamic_seq2seq/encoder1_intent"])
            print("val1_params", "\n".join(map(lambda a: a.name, val1_params)))
            print("val2_params", "\n".join(map(lambda a: a.name, val2_params)))
            print("embedding_params",
                  "\n".join(map(lambda a: a.name, embedding_params)))
            print("main_dec_enc_params1",
                  "\n".join(map(lambda a: a.name, main_dec_enc_params1)))
            print("main_dec_enc_params2",
                  "\n".join(map(lambda a: a.name, main_dec_enc_params2)))
            print("action_params",
                  "\n".join(map(lambda a: a.name, action_params)))
            print("encoder_kb_params",
                  "\n".join(map(lambda a: a.name, encoder_kb_params)))
            print("encoder_intent_params",
                  "\n".join(map(lambda a: a.name, encoder_intent_params)))
            self.optimizer_vl1, self.v1_sum = self.generate_optimizer(
                self.vl1, params, "vl1", self.learning_rate2,
                self.hparams.max_gradient_norm2)
            self.optimizer_vl2, self.v2_sum = self.generate_optimizer(
                self.vl2, params, "vl2", self.learning_rate2,
                self.hparams.max_gradient_norm2)
            if hparams.self_play_variable_method == 0:
                rl_param1, rl_param2 = encoder_intent_params, encoder_kb_params + action_params
            elif hparams.self_play_variable_method == 1:
                rl_param1, rl_param2 = main_dec_enc_params1, main_dec_enc_params2
            elif hparams.self_play_variable_method == 2:
                rl_param1, rl_param2 = main_dec_enc_params1 + encoder_intent_params, main_dec_enc_params2 + encoder_kb_params + action_params
            elif hparams.self_play_variable_method == 3:
                rl_param1, rl_param2 = [main_dec_enc_params1[0]
                                        ] + encoder_intent_params, [
                                            main_dec_enc_params2[0]
                                        ] + encoder_kb_params
            elif hparams.self_play_variable_method == 4:
                rl_param1, rl_param2 = [main_dec_enc_params1[0]
                                        ], [main_dec_enc_params2[0]]
            elif hparams.self_play_variable_method == 5:
                rl_param1, rl_param2 = params, params
            self.optimizer_pl1, self.p1_sum = self.generate_optimizer(
                self.pl1, params, "pl1", self.learning_rate3,
                self.hparams.max_gradient_norm3)
            self.optimizer_pl2, self.p2_sum = self.generate_optimizer(
                self.pl2, params, "pl2", self.learning_rate3,
                self.hparams.max_gradient_norm3)
            print("self.learning", self.learning_rate, self.learning_rate2,
                  self.learning_rate3)
            ##############################
            ### supervised learning ######
            ##############################
        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
Example #10
    def _set_train_or_infer(self, res, hparams):
        """Set up training."""
        loss = res[1]
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = loss
            self.word_count = tf.reduce_sum(
                self.features["source_sequence_length"]) + tf.reduce_sum(
                    self.features["target_sequence_length"])
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = loss
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits = res[0]
            self.infer_loss = loss
            self.sample_id = res[2]

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.features["target_sequence_length"])

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        # Only build bprop if running on GPU and using dist_strategy, in which
        # case learning rate, grads and train_op are created in estimator model
        # function.
        with tf.name_scope("learning_rate"):
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

        if (hparams.use_dist_strategy
                and self.mode == tf.contrib.learn.ModeKeys.TRAIN):
            # Gradients
            params = tf.trainable_variables()
            # Print trainable variables
            utils.print_out("# Trainable variables")
            utils.print_out(
                "Format: <name>, <shape>, <dtype>, <(soft) device placement>")
            for param in params:
                utils.print_out(
                    "  %s, %s, %s, %s" % (param.name, str(
                        param.get_shape()), param.dtype.name, param.op.device))
            utils.print_out("Total params size: %.2f GB" % (4. * np.sum([
                p.get_shape().num_elements()
                for p in params if p.shape.is_fully_defined()
            ]) / 2**30))

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                raise ValueError("Unknown optimizer type %s" %
                                 hparams.optimizer)
            assert opt is not None

            grads_and_vars = opt.compute_gradients(
                self.train_loss,
                params,
                colocate_gradients_with_ops=hparams.colocate_gradients_with_ops
            )
            gradients = [x for (x, _) in grads_and_vars]

            clipped_grads, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm
            self.params = params
            self.grads = clipped_grads

            self.update = opt.apply_gradients(list(zip(clipped_grads, params)),
                                              global_step=self.global_step)
        else:
            self.grad_norm = None
            self.update = None
            self.params = None
            self.grads = None
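Example #10 obtains gradients through opt.compute_gradients rather than tf.gradients as in the other examples. A short sketch of the relationship, assuming an optimizer opt, a scalar loss tensor, and a list of trainable variables params:

# Optimizer-based form: returns (gradient, variable) pairs directly.
grads_and_vars = opt.compute_gradients(loss, params)
gradients = [g for g, _ in grads_and_vars]

# Explicit form used by most other examples; pairing is done manually.
gradients = tf.gradients(loss, params)
grads_and_vars = list(zip(gradients, params))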
Example #11
    def build_graph(self, features, labels, mode, params):
        """docstring."""
        del labels, params
        misc_utils.print_out("Running fast mode_fn")

        hparams = self.hparams

        # Create global_step
        tf.train.get_or_create_global_step()

        if mode == tf.contrib.learn.ModeKeys.INFER:
            # Doing inference only on one GPU
            inf_hparams = tf.contrib.training.HParams(**hparams.values())
            inf_hparams.set_hparam("num_gpus", 1)
            # Inference is done in fp32 and in the same way as that of dist_strategy.
            inf_hparams.set_hparam("use_fp16", False)

            misc_utils.print_out("inference hparmas:")
            misc_utils.print_hparams(inf_hparams)

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(inf_hparams)

            with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                    "tower_0"), var_mgr.create_outer_variable_scope(0):
                model = gnmt_model.GNMTModel(inf_hparams,
                                             mode=mode,
                                             features=features)
                sample_ids = model.sample_id
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # make sure outputs is of shape [batch_size, time] or [beam_width,
                # batch_size, time] when using beam search.
                if inf_hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
        elif mode == tf.contrib.learn.ModeKeys.TRAIN:
            num_towers = hparams.num_gpus
            # Shard inputs
            tower_features = self._shard_inputs(features, num_towers)
            # Create loss scale vars if necessary
            loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars(
            )

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(hparams)

            # Build per-tower fprop and bprop
            devices = var_mgr.get_devices()
            tower_gradvars = []
            tower_scopes = []
            var_scopes = []
            train_losses = []
            learning_rates = []
            batch_sizes = []
            opts = []

            def fprop_and_bprop(tid):
                """docstring."""
                model = gnmt_model.GNMTModel(hparams,
                                             mode=mode,
                                             features=tower_features[tid])
                # Sync training: the learning rate is built inside the model.
                assert model.learning_rate is not None
                # The following handles shouldn't be built in when doing manual
                # sync training.
                assert model.grad_norm is None
                assert model.update is None
                tower_loss = model.train_loss
                # Only check loss numerics if in fp16
                if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                    tower_loss = tf.check_numerics(
                        tower_loss, "tower_%d has Inf/NaN loss" % tid)
                # Cast to fp32, otherwise would easily overflow.
                tower_loss = tf.to_float(tower_loss)
                var_params, grads, opt = self._compute_tower_grads(
                    tower_loss,
                    var_mgr.trainable_variables_on_device(tid, tid),
                    model.learning_rate,
                    use_fp16=hparams.use_fp16,
                    loss_scale=loss_scale,
                    colocate_gradients_with_ops=hparams.
                    colocate_gradients_with_ops)
                self._print_varinfo(var_params, tid)
                res = [model.train_loss, model.learning_rate, model.batch_size]
                res.extend(grads)
                opts.append(opt)
                return res

            def unpack_fprop_and_bprop_output(output):
                train_loss = output[0]
                learning_rate = output[1]
                batch_size = output[2]
                grads = output[3:]
                return train_loss, learning_rate, batch_size, grads

            with mixed_precision_scope():
                for tid in range(num_towers):
                    with tf.device(devices[tid % len(devices)]), tf.name_scope(
                            "tower_%s" % tid) as scope:
                        tower_scopes.append(scope)
                        with var_mgr.create_outer_variable_scope(
                                tid) as var_scope:
                            var_scopes.append(var_scope)

                            outputs = maybe_xla_compile(
                                hparams, fprop_and_bprop, tid)
                            (train_loss, learning_rate, batch_size,
                             grads) = unpack_fprop_and_bprop_output(outputs)
                            train_losses.append(train_loss)
                            learning_rates.append(learning_rate)
                            batch_sizes.append(batch_size)
                            var_params = var_mgr.trainable_variables_on_device(
                                tid, tid)
                            tower_gradvars.append(list(zip(grads, var_params)))

            # Add summaries
            if hparams.show_metrics:
                tf.summary.scalar("learning_rate", learning_rates[0])
                if loss_scale:
                    tf.summary.scalar("loss_scale", loss_scale)
                    if hparams.enable_auto_loss_scale:
                        tf.summary.scalar("loss_scale_normal_steps",
                                          loss_scale_normal_steps)
            misc_utils.print_out("Finish building fprop and per-tower bprop.")
            # Aggregate gradients
            # The following compute the aggregated grads for each tower, stored in
            # opaque grad_states structure.
            apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
                tower_gradvars)
            master_grads = None
            master_params = None
            update_ops = []
            for i, device in enumerate(apply_grads_devices):
                with tf.device(device), tf.name_scope(tower_scopes[i]):
                    # Get per-tower grads.
                    with tf.name_scope("get_gradients_to_apply"):
                        avg_gradvars = var_mgr.get_gradients_to_apply(
                            i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]

                    # gradients post-processing
                    with tf.name_scope("clip_gradients"):
                        if hparams.clip_grads:
                            clipped_grads, grad_norm = model_helper.gradient_clip(
                                avg_grads,
                                max_gradient_norm=hparams.max_gradient_norm)
                            # summary the grad on the 1st tower
                            if i == 0 and hparams.show_metrics:
                                tf.summary.scalar("grad_norm", grad_norm)
                                tf.summary.scalar(
                                    "clipped_grad_norm",
                                    tf.global_norm(clipped_grads))
                        else:
                            clipped_grads = avg_grads
                        if i == 0:
                            master_grads = clipped_grads

                    # Build apply-gradients ops
                    clipped_gradvars = list(
                        zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                    if i == 0:
                        master_params = [gv[1] for gv in avg_gradvars]
                    with tf.name_scope("append_gradient_ops"):
                        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                            enable_auto_loss_scale=hparams.
                            enable_auto_loss_scale,
                            loss_scale=loss_scale,
                            loss_scale_normal_steps=loss_scale_normal_steps,
                            inc_loss_scale_every_n=hparams.
                            fp16_inc_loss_scale_every_n,
                            is_chief=True)
                        opt = opts[i]
                        var_mgr.append_apply_gradients_ops(
                            grad_states, opt, clipped_gradvars, update_ops,
                            loss_scale_params)
            misc_utils.print_out("Finish building grad aggregation.")

            assert len(update_ops) == num_towers
            train_op = tf.group(update_ops)
            with tf.control_dependencies([train_op]):
                global_step = tf.train.get_global_step()
                train_op = global_step.assign_add(1)

            # Compute loss on the first gpu
            # TODO(jamesqin): optimize it?
            with tf.device("gpu:0"):
                loss = misc_utils.weighted_avg(train_losses, batch_sizes)

            # Create local init_ops
            # TODO(jamesqin): handle resource variables!
            # At present if not using mirror strategy, not using resource vars.
            local_init_ops = []
            local_init_op = tf.local_variables_initializer()
            with tf.control_dependencies([local_init_op]):
                local_init_ops.append(var_mgr.get_post_init_ops())
            local_init_ops.extend([local_init_op, tf.tables_initializer()])

            saveable_vars = var_mgr.savable_variables()
            # Add saveables for cudnn vars in master tower.
            saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
            saveable_objects = [x for x in saveable_objects if "v0" in x.name]

            misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
            for mv in saveable_vars:
                misc_utils.print_out(mv.name)

            misc_utils.print_out("All global trainable vars(%d): " %
                                 len(tf.trainable_variables()))
            for tv in tf.trainable_variables():
                misc_utils.print_out(tv.name)

            misc_utils.print_out("All global vars(%d): " %
                                 len(tf.global_variables()))
            for gv in tf.global_variables():
                misc_utils.print_out(gv.name)

            misc_utils.print_out("master backproped params(%d): " %
                                 len(master_params))
            for mp in master_params:
                misc_utils.print_out(mp.name)

            # Note: the cudnn vars are excluded from the init check.
            scaffold = tf.train.Scaffold(
                ready_op=tf.report_uninitialized_variables(saveable_vars),
                ready_for_local_init_op=tf.report_uninitialized_variables(
                    saveable_vars),
                local_init_op=tf.group(*local_init_ops),
                saver=tf.train.Saver(saveable_vars + saveable_objects,
                                     save_relative_paths=True))

            misc_utils.print_out("Finish building model_fn")
            # return loss, vars, grads, predictions, train_op, scaffold
            return loss, master_params, master_grads, None, train_op, scaffold
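
The clipping step in the example above goes through model_helper.gradient_clip. As a rough, illustrative sketch (not the actual helper), such a function usually just wraps tf.clip_by_global_norm and collects a couple of norm summaries; the name gradient_clip_sketch and the returned triple are assumptions here:

import tensorflow as tf

def gradient_clip_sketch(gradients, max_gradient_norm):
    """Illustrative stand-in for model_helper.gradient_clip (not the real helper).

    Clips the gradient list by global norm and collects norm summaries; the real
    helper apparently differs between examples (some unpack two return values,
    others three).
    """
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    gradient_norm_summary = [
        tf.summary.scalar("grad_norm", gradient_norm),
        tf.summary.scalar("clipped_gradient_norm",
                          tf.global_norm(clipped_gradients)),
    ]
    return clipped_gradients, gradient_norm_summary, gradient_norm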
Example #12
0
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 vocab_table,
                 scope=None,
                 extra_args=None):
        """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      vocab_table: Lookup table mapping source words to ids.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    """
        self.iterator = iterator
        self.mode = mode
        self.vocab_table = vocab_table
        #self.vocab_size = len(vocab_table)
        self.time_major = hparams.time_major

        self.single_cell_fn = None
        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(2,
                                                      use_bias=False,
                                                      activation=tf.nn.sigmoid,
                                                      name="output_projection")

        ## Train graph
        loss, accuracy = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = loss
            self.train_accuracy = accuracy
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = loss

        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=True)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=hparams.num_keep_ckpts)

        # Print trainable variables
        print("# Trainable variables")
        for param in params:
            print("  %s, %s, %s" %
                  (param.name, str(param.get_shape()), param.op.device))
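
Several of these constructors call self._get_learning_rate_warmup and self._get_learning_rate_decay, which are not shown. A minimal sketch of the kind of schedule such helpers typically build (inverse-decay warm-up followed by staircase exponential decay, mirroring the inline variant in Example #16 below); the function name and the hparams fields used here are illustrative assumptions:

import tensorflow as tf

def warmup_then_decay_sketch(learning_rate, global_step, hparams):
    """Illustrative warm-up + decay schedule (not the actual _get_learning_rate_* helpers)."""
    # Inverse-decay warm-up: scale the rate by warmup_factor**(warmup_steps - step)
    # while step < warmup_steps, so it ramps up to the base rate.
    inv_decay = hparams.learning_rate_warmup_factor ** tf.to_float(
        hparams.learning_rate_warmup_steps - global_step)
    learning_rate = tf.cond(
        global_step < hparams.learning_rate_warmup_steps,
        lambda: inv_decay * learning_rate,
        lambda: learning_rate)
    # Decay: hold the rate until start_decay_step, then apply staircase exponential decay.
    return tf.cond(
        global_step < hparams.start_decay_step,
        lambda: learning_rate,
        lambda: tf.train.exponential_decay(
            learning_rate,
            global_step - hparams.start_decay_step,
            hparams.decay_steps,
            hparams.decay_factor,
            staircase=True),
        name="learning_rate")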
Example #13
0
    def __init__(self, hparams, mode, iterator, input_vocab_table=None):
        self.n_classes = hparams.n_classes
        self.vocab_size = hparams.vocab_size
        self.input_sequence_length = iterator.input_sequence_length
        self.mode = mode
        self.inputs = iterator.input
        self.targets = iterator.target
        self.input_vocab_table = input_vocab_table
        self.batch_size = iterator.batch_size

        # Initializer for all model parameters.
        initializer = tf.random_uniform_initializer(-hparams.init_weight,
                                                    hparams.init_weight,
                                                    seed=hparams.random_seed)
        tf.get_variable_scope().set_initializer(initializer)
        # Create embedding layer.
        (self.input_embedding, self.input_emb_init,
         self.input_emb_placeholder) = model_helper.create_embeddings(
             vocab_size=self.vocab_size,
             emb_size=hparams.input_emb_size,
             emb_trainable=hparams.input_emb_trainable,
             emb_pretrain=hparams.input_emb_pretrain)

        # Build the graph of the RNN model.
        # res = self.build_graph(hparams)
        # Compute the log likelihood using the tf CRF ops (see the sketch after
        # this example).
        log_likelihood, transition_params, logits = self.build_graph(hparams)

        self.transition_params = transition_params
        self.predictions = {
            "probabilities": self.compute_probabilities(logits),
            "labels": tf.cast(self.compute_labels(logits), tf.int32)
        }
        self.accuracy = self.compute_accuracy(self.predictions["labels"])
        # Computing the training loss
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = tf.reduce_mean(-log_likelihood)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = tf.reduce_mean(-log_likelihood)
        # Calculate accuracy metric.
        self.logits = logits

        ## Learning rate
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d,"
              " decay_factor %g" %
              (hparams.start_decay_step, hparams.learning_rate,
               hparams.decay_steps, hparams.decay_factor))
        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()
        # Gradients and sgd update operation for model training.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            # Optimizer
            if hparams.optimizer == "sgd":
                # perform SGD with a learning rate with exponential decay
                self.learning_rate = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: tf.constant(hparams.learning_rate),
                    lambda: tf.train.exponential_decay(hparams.learning_rate, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="learning_rate")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                self.learning_rate = tf.constant(hparams.learning_rate)
                opt = tf.train.AdamOptimizer(self.learning_rate)
            # Compute the gradients of train_loss w.r.t. the model's trainable parameters.
            # If colocate_gradients_with_ops is true, each gradient is computed on the
            # same GPU/CPU device as its original (forward-pass) op.
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)
            # clip gradients below a threshold to avoid explosion
            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm
            # Ask the optimizer to apply the processed gradients, passing a list of
            # (gradient, variable) pairs.
            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)
        # Saver. As argument, we give the variables that are going to be saved and
        # restored. The Saver op will save the variables of the graph within which it
        # is defined. All graphs (train/eval/predict) have a Saver operator.
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
        # Print trainable variables
        print("# Trainable variables")
        for param in params:
            print("  %s, %s" % (param.name, str(param.get_shape())))
        import numpy as np
        total_params = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ])
        print("Total number of parameters: %d" % total_params)
Example #14
0
    def __init__(self, iterator, hps, mode, vocab_table,
                 reverse_target_vocab_table=None, scope=None):
        self.init_iter = iterator.initializer
        self.hps = hps
        self.vocab_table = vocab_table
        self.reverse_target_vocab_table = reverse_target_vocab_table
        self.iterator = iterator
        self.use_test_set = False
        self.mode = mode
        self.single_cell_fn = None
        self.time_major = hps.time_major
        self.batch_size = hps.batch_size

        # self._output_layer = layers_core.Dense(
        #     self._vocab[1], use_bias=False, name="output_projection")
        # self.start_decoding = tf.cast(vocab_table.lookup(tf.constant(hps.START_DECODING)), tf.int32)
        # self.stop_decoding = tf.cast(vocab_table.lookup(tf.constant(hps.STOP_DECODING)), tf.int32)

        #init
        # self.rand_unif_init = tf.random_uniform_initializer(-hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
        #
        # self.trunc_norm_init = tf.truncated_normal_initializer(stddev=hps.trunc_norm_init_std)

        self.init_embeddings(hps, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(
                    hps.vocab_size, use_bias=False, name="output_projection")

            ## Train graph
            res = self.build_graph(hps, scope=scope)

            if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
                self.train_loss = res[1]
                self.word_count = tf.reduce_sum(
                    self.iterator.source_sequence_length) + tf.reduce_sum(
                        self.iterator.target_sequence_length)
                if len(res) > 4:
                    self.coverage_loss = res[4]
                else:
                    self.coverage_loss = tf.constant(0)
            elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
                self.eval_loss = res[1]
            elif self.mode == tf.contrib.learn.ModeKeys.INFER:
                if len(res) > 4:
                    self.infer_logits, _, self.final_context_state, self.sample_id, _ = res
                else:
                    self.infer_logits, _, self.final_context_state, self.sample_id = res

                self.sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(self.sample_id))

            if self.mode != tf.contrib.learn.ModeKeys.INFER:
                ## Count the number of predicted words to compute ppl.
                self.predict_count = tf.reduce_sum(
                    self.iterator.target_sequence_length)

            self.global_step = tf.Variable(0, trainable=False)
            params = tf.trainable_variables()

            # Gradients and SGD update operation for training the model.
            # Arrange for the embedding vars to appear at the beginning.
            if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
                self.learning_rate = tf.constant(hps.learning_rate)
                # warm-up
                self.learning_rate = self._get_learning_rate_warmup(hps)
                # decay
                self.learning_rate = self._get_learning_rate_decay(hps)

                # Optimizer
                if hps.optimizer == "sgd":
                    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                    tf.summary.scalar("lr", self.learning_rate)
                elif hps.optimizer == "adam":
                    opt = tf.train.AdamOptimizer(self.learning_rate)

                # Gradients
                gradients = tf.gradients(
                    self.train_loss,
                    params,
                    colocate_gradients_with_ops=hps.colocate_gradients_with_ops)

                clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                    gradients, max_gradient_norm=hps.max_gradient_norm)
                self.grad_norm = grad_norm

                self.update = opt.apply_gradients(
                    zip(clipped_grads, params), global_step=self.global_step)

                # Summary
                if self.coverage_loss is not None:
                    self.train_summary = tf.summary.merge([
                        tf.summary.scalar("lr", self.learning_rate),
                        tf.summary.scalar("train_loss", self.train_loss),
                        tf.summary.scalar("coverage_loss", self.coverage_loss),
                    ] + grad_norm_summary)
                else:
                    self.train_summary = tf.summary.merge([
                        tf.summary.scalar("lr", self.learning_rate),
                        tf.summary.scalar("train_loss", self.train_loss),
                    ] + grad_norm_summary)

            if self.mode == tf.contrib.learn.ModeKeys.INFER:
                self.infer_summary = self._get_infer_summary(hps)

            # Saver
            self.saver = tf.train.Saver(tf.global_variables())

            # Print trainable variables
            utils.print_out("# Trainable variables")
            for param in params:
                utils.print_out("  %s, %s, %s" %
                                (param.name, str(param.get_shape()), param.op.device))
Example #15
0
    def __init__(self, iterator, hparams, mode, scope=None):
        self.iterator = iterator
        self.hparams = hparams
        self.mode = mode
        self.scope = scope

        # Initializer
        initializer = model_helper.get_initializer(self.hparams.init_op, None,
                                                   self.hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        with tf.variable_scope(scope or 'embedding'):
            self.embedding = tf.get_variable(
                'embedding', [self.hparams.vocab_size, self.hparams.num_units],
                dtype=tf.float32)

        # Output Layer
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope('decoder/output_projection'):
                self.output_layer = tf.layers.Dense(self.hparams.vocab_size,
                                                    use_bias=False)

        # Batch Size
        self.batch_size = tf.size(self.iterator.src_seq)

        # Build Graph
        print("# Building graph for the model ...")
        res = self.build_graph(self.scope)

        if self.mode == TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                tf.reduce_sum(self.iterator.src_seq) +
                tf.reduce_sum(self.iterator.tar_seq))
        elif self.mode == EVAL:
            self.eval_loss = res[1]
        elif self.mode == PREDICT:
            self.infer_logits, _, self.final_state, self.sample_id = res

        if self.mode != PREDICT:
            # Count the number of predicted words for compute perplexity.
            self.predict_count = tf.reduce_sum(self.iterator.tar_seq)

        # Define variables
        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Optimizer
        if self.mode == TRAIN:
            self.learning_rate = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='learning_rate')

            # self.learning_rate = tf.train.exponential_decay(
            #     0.001, self.global_step, 1000, 0.9)
            opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradient
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=self.hparams.
                                     colocate_gradients_with_ops)
            clipped_gradients, gradient_norm_summary, _ = model_helper.gradient_clip(
                gradients, self.hparams.max_gradient_norm)
            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar('train_loss', self.train_loss),
                tf.summary.scalar('learning_rate', self.learning_rate)
            ] + gradient_norm_summary)
        else:
            self.infer_summary = tf.no_op()

        # Saver
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=self.hparams.max_to_keep)
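
Unlike the other examples, this model takes the learning rate as a placeholder, so the training loop has to feed it on every step. A hypothetical usage sketch (the session setup, num_train_steps, and the external learning-rate policy are assumptions, not part of the example):

import tensorflow as tf

# Hypothetical training loop around the model above.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    current_lr = 0.001
    for _ in range(num_train_steps):  # num_train_steps supplied by the caller
        # The placeholder allows any external schedule: recompute current_lr
        # here before feeding it.
        _, loss, summary = sess.run(
            [model.update, model.train_loss, model.train_summary],
            feed_dict={model.learning_rate: current_lr})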
Example #16
0
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None,
                 scope=None,
                 extra_args=None):
        """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      source_vocab_table: Lookup table mapping source words to ids.
      target_vocab_table: Lookup table mapping target words to ids.
      reverse_target_vocab_table: Lookup table mapping ids to target words. Only
        required in INFER mode. Defaults to None.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    """
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        # TODO(ebrevdo): Only do this if the mode is TRAIN?
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        ## Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words to compute ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        ## Learning rate
        warmup_steps = hparams.learning_rate_warmup_steps
        warmup_factor = hparams.learning_rate_warmup_factor
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
              "decay_factor %g, learning_rate_warmup_steps=%d, "
              "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
              (hparams.start_decay_step, hparams.learning_rate,
               hparams.decay_steps, hparams.decay_factor, warmup_steps,
               warmup_factor,
               (hparams.learning_rate * warmup_factor**warmup_steps)))
        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)

            # Apply inverse decay if global steps less than warmup steps.
            # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
            # When step < warmup_steps,
            #   learning_rate *= warmup_factor ** (warmup_steps - step)
            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
            self.learning_rate = tf.cond(
                self.global_step < hparams.learning_rate_warmup_steps,
                lambda: inv_decay * self.learning_rate,
                lambda: self.learning_rate,
                name="learning_rate_decay_warump_cond")

            if hparams.optimizer == "sgd":
                self.learning_rate = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate,
                    lambda: tf.train.exponential_decay(self.learning_rate, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="learning_rate")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                assert float(
                    hparams.learning_rate
                ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
                opt = tf.train.AdamOptimizer(self.learning_rate)

            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
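
The log line in this example reports starting_learning_rate = learning_rate * warmup_factor ** warmup_steps, i.e. the effective rate at step 0 under the inverse-decay warm-up. A quick sanity check with made-up numbers (the particular warmup_factor below is an illustrative choice, not from the example):

learning_rate = 1.0
warmup_steps = 200
warmup_factor = 0.01 ** (1.0 / warmup_steps)
starting_learning_rate = learning_rate * warmup_factor ** warmup_steps
print(starting_learning_rate)  # ~0.01: training starts 100x below the base rate and ramps up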