Example #1
 def _set_params(self, hparams, data_wrapper, reverse_cate_table, scope=None):
     # Put anything you want to print into 'print_pool'; it will be printed when the session runs
     self.print_pool = dict()
     # Store tensors that will be fed into histogram summaries
     self.histogram = dict()
     # Store summaries
     self.summaries = list()
     # Store specific layers' activations for visualization
     self.activations = list()
     self.restore_op = tf.no_op()
     self.reverse_cate_table = reverse_cate_table
     self.scope = scope
     self.feat_stride = [16]
     self.hparams = hparams
     self.ori_count = len(hparams.anchor_ratios) * len(hparams.anchor_scales)
     self.trainable = hparams.mode == "train"
     self.tunable = hparams.tunable
     self.predicable = hparams.mode == "infer"
     # Set data
     assert isinstance(data_wrapper, iterator_wrapper.DataWrapper)
     # !!!Make sure dataset batch size is 1
     self.im_info = data_wrapper.images_size
     self.images_data = data_wrapper.images_data
     self.bbox_labels = data_wrapper.bbox_locations
     # Initializer
     self.initializer = helper.get_initializer(
         hparams.init_op, hparams.ran_seed, hparams.init_weight)
     self.bbox_initializer = helper.get_initializer(
         hparams.bbox_init_op, hparams.bbox_ran_seed, hparams.bbox_init_weight)
     # Regularization
     # weights_decay = hparams.weight_decay_factor
     # if hparams.bias_decay:
     #     biases_decay = weights_decay
     # else:
     #     biases_decay = None
     # layers.fill_arg_scope(weights_decay=weights_decay, biases_decay=biases_decay)
     self.weights_regularizer = tf.contrib.layers.l2_regularizer(hparams.weight_decay_factor)
     if hparams.bias_decay:
         self.biases_regularizer = self.weights_regularizer
     else:
         self.biases_regularizer = None
     # tf.get_variable_scope().set_initializer(self.initializer)
     # Set up global step
     self._setup_gloabal_step()
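The regularizers and initializer set in Example #1 are typically consumed by the layer-building code. A minimal sketch of that pattern (not part of the original; `_build_rpn_head`, the filter counts and scope names are hypothetical, slim-style placeholders):

import tensorflow as tf
slim = tf.contrib.slim

def _build_rpn_head(self, net):
    # Hypothetical helper: every conv/fc created inside this arg_scope picks up
    # the regularizers, initializer and trainable flag set in _set_params.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        weights_regularizer=self.weights_regularizer,
                        biases_regularizer=self.biases_regularizer,
                        weights_initializer=self.initializer,
                        trainable=self.trainable):
        rpn = slim.conv2d(net, 512, [3, 3], scope="rpn_conv/3x3")
        rpn_cls_score = slim.conv2d(rpn, 2 * self.ori_count, [1, 1],
                                    activation_fn=None, scope="rpn_cls_score")
    return rpn_cls_score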
Example #2
    def _set_params_initializer(self,
                                hparams,
                                mode,
                                iterator,
                                source_vocab_table,
                                target_vocab_table,
                                label_vocab_table,
                                scope,
                                extra_args=None):
        """Set various params for self and initialize."""
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table
        self.lbl_vocab_table = label_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.lbl_vocab_size = hparams.lbl_vocab_size
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        self.dtype = tf.float32
        self.num_sampled_softmax = hparams.num_sampled_softmax

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Set num units
        self.num_units = hparams.num_units

        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Batch size
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Global step
        self.global_step = tf.Variable(0, trainable=False)

        # Initializer
        self.random_seed = hparams.random_seed
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   self.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
        self.init_embeddings(hparams, scope)
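For context, the `model_helper.get_initializer(init_op, seed, init_weight)` helper used throughout these examples looks roughly like the following in the upstream tensorflow/nmt code (a sketch; some forks, e.g. Example #3, use a different signature and a different set of `init_op` values):

import tensorflow as tf

def get_initializer(init_op, seed=None, init_weight=None):
    """Create an initializer. init_weight is only used for uniform init."""
    if init_op == "uniform":
        assert init_weight
        return tf.random_uniform_initializer(-init_weight, init_weight, seed=seed)
    elif init_op == "glorot_normal":
        return tf.keras.initializers.glorot_normal(seed=seed)
    elif init_op == "glorot_uniform":
        return tf.keras.initializers.glorot_uniform(seed=seed)
    else:
        raise ValueError("Unknown init_op %s" % init_op)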
Example #3
 def set_hparams_init(self, flags, mode):
     # Select mode for TRAIN | PREDICT(infer)
     self.mode = mode
     
     # Set environment 
     self.env = gym.make(flags.env)
     # [batch_size, image_height, image_width, num_channels]
     self.state = tf.placeholder(tf.float32, 
         [None, flags.img_height, flags.img_width, 1]) 
     # possible_action: [stop, left, right]
     self.action_size = 3 
     
     # Set image height and width
     self.img_height = flags.img_height
     self.img_width = flags.img_width
     
     # Global step
     self.global_step = tf.Variable(0, trainable=False)
     
     self.num_gpus = flags.num_gpus
     
     # Initializer for weights, biases
     self.random_seed = flags.random_seed
     self.w_init = model_helper.get_initializer(
         flags.w_init_op, self.random_seed, flags.mean, flags.stddev)
     self.b_init = model_helper.get_initializer(
         flags.b_init_op, self.random_seed, bias_start=flags.bias_start)
     
     # Convolution
     self.cv_num_outputs = flags.cv_num_outputs
     self.f_height = flags.f_height # filter height
     self.f_width = flags.f_width # filter width
     self.stride = flags.stride
     self.padding = flags.padding
     
     # Recurrent 
     self.rnn_num_layers = flags.rnn_num_layers
     self.cell_type = flags.cell_type
     self.num_units = flags.num_units
     self.dropout = flags.dropout
     self.residual_connect = flags.residual_connect
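A hedged usage sketch for Example #3 (not part of the original): converting a gym RGB frame into the [None, img_height, img_width, 1] layout expected by `self.state`. The `preprocess` helper below is hypothetical; the actual project may crop or resize differently.

import numpy as np

def preprocess(obs, img_height, img_width):
    # Hypothetical helper: grayscale the RGB frame and do a naive
    # nearest-neighbour resize to (img_height, img_width, 1), scaled to [0, 1].
    gray = obs.astype(np.float32).mean(axis=2) / 255.0
    ys = np.linspace(0, gray.shape[0] - 1, img_height).astype(int)
    xs = np.linspace(0, gray.shape[1] - 1, img_width).astype(int)
    return gray[ys][:, xs][..., None]

# frame = preprocess(model.env.reset(), model.img_height, model.img_width)
# sess.run(fetches, feed_dict={model.state: frame[None, ...]})  # add batch dim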
Example #4
    def _set_params_initializer(self, hparams, mode, features):
        """Set various params for self and initialize."""
        self.mode = mode
        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.features = features

        self.dtype = tf.as_dtype(hparams.activation_dtype)

        self.single_cell_fn = None

        # Set num units
        self.num_units = hparams.num_units
        self.eos_id = hparams.tgt_eos_id
        self.label_smoothing = hparams.label_smoothing

        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Batch size
        self.batch_size = tf.size(self.features["source_sequence_length"])

        # Global step
        # Use get_global_step instead of a user-defined global step. Otherwise the
        # num_train_steps in TPUEstimator.train has no effect (will train forever).
        # TPUEstimator only checks whether tf.train.get_global_step() < num_train_steps.
        self.global_step = tf.train.get_or_create_global_step()

        # Initializer
        self.random_seed = hparams.random_seed
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   self.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.encoder_emb_lookup_fn = (self._emb_lookup if self.mode
                                      == tf.contrib.learn.ModeKeys.TRAIN else
                                      tf.nn.embedding_lookup)
Example #5
File: model.py  Project: cltdevelop/nmt
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None,
                 scope=None,
                 extra_args=None):
        """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      source_vocab_table: Lookup table mapping source words to ids.
      target_vocab_table: Lookup table mapping target words to ids.
      reverse_target_vocab_table: Lookup table mapping ids to target words. Only
        required in INFER mode. Defaults to None.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    """
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Set num residual layers
        if hasattr(hparams,
                   "num_residual_layers"):  # compatible common_test_utils
            self.num_encoder_residual_layers = hparams.num_residual_layers
            self.num_decoder_residual_layers = hparams.num_residual_layers
        else:
            self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
            self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        ## Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words to compute ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=hparams.num_keep_ckpts)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
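Example #5 calls `_get_learning_rate_warmup` and `_get_learning_rate_decay` without showing them. Based on the upstream tensorflow/nmt model and the inlined variant in Example #13, the warmup step looks roughly like this (a sketch; the hyperparameter name `warmup_steps` follows upstream and may differ in forks):

def _get_learning_rate_warmup(self, hparams):
    # Exponential warmup: ramp the base learning rate up from about 1% of its
    # value to the full value over the first `warmup_steps` global steps.
    warmup_steps = hparams.warmup_steps
    warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
    inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
    return tf.cond(
        self.global_step < warmup_steps,
        lambda: inv_decay * self.learning_rate,
        lambda: self.learning_rate,
        name="learning_rate_warmup_cond")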
Example #6
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None):

        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.time_major = hparams.time_major

        self.single_cell_fn = None

        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope("build_netword"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        ## Train graph
        res = self.build_graph(hparams)

        if self.mode == tf.estimator.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.estimator.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.estimator.ModeKeys.PREDICT:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.estimator.ModeKeys.PREDICT:
            ## Count the number of predicted words to compute ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.estimator.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)

        if self.mode == tf.estimator.ModeKeys.PREDICT:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
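Both constructors above (and most of the examples below) delegate clipping to `model_helper.gradient_clip`. In the upstream tensorflow/nmt helpers it is roughly the three-value version sketched here; note that Example #8 unpacks only two return values, so its fork's helper differs:

import tensorflow as tf

def gradient_clip(gradients, max_gradient_norm):
    """Clip gradients by global norm and build summaries for the norms."""
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    gradient_norm_summary = [tf.summary.scalar("grad_norm", gradient_norm)]
    gradient_norm_summary.append(
        tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_gradients)))
    return clipped_gradients, gradient_norm_summary, gradient_norm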
Example #7
    def _set_params_initializer(self,
                                hparams,
                                mode,
                                features,
                                scope,
                                extra_args=None):
        """Set various params for self and initialize."""
        self.mode = mode
        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.features = features
        self.time_major = hparams.time_major

        if hparams.use_char_encode:
            assert (not self.time_major), ("Can't use time major for"
                                           " char-level inputs.")

        self.dtype = tf.float16 if hparams.use_fp16 else tf.float32

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Set num units
        self.num_units = hparams.num_units
        # Set num layers
        self.num_encoder_layers = hparams.num_encoder_layers
        self.num_decoder_layers = hparams.num_decoder_layers
        assert self.num_encoder_layers
        assert self.num_decoder_layers

        # Set num residual layers
        if hasattr(hparams,
                   "num_residual_layers"):  # compatible common_test_utils
            self.num_encoder_residual_layers = hparams.num_residual_layers
            self.num_decoder_residual_layers = hparams.num_residual_layers
        else:
            self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
            self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

        # Batch size
        self.batch_size = tf.size(self.features["source_sequence_length"])

        # Global step
        global_step = tf.train.get_global_step()
        if global_step is not None:
            utils.print_out("global_step already created!")

        self.global_step = tf.train.get_or_create_global_step()
        utils.print_out("model.global_step.name: %s" % self.global_step.name)

        # Initializer
        self.random_seed = hparams.random_seed
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   self.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
        self.init_embeddings(hparams, scope)
Example #8
File: model.py  Project: zhang197/nslt
    def __init__(self, hparams, mode, iterator, target_vocab_table, reverse_target_vocab_table=None, scope=None, single_cell_fn=None):

        """Create the model.

        Args:
          hparams: Hyperparameter configurations.
          mode: TRAIN | EVAL | INFER
          iterator: Dataset Iterator that feeds data.
          target_vocab_table: Lookup table mapping target words to ids.
          reverse_target_vocab_table: Lookup table mapping ids to target words. Only
            required in INFER mode. Defaults to None.
          scope: scope of the model.
          single_cell_fn: allow for adding customized cell. When not specified,
            we default to model_helper._single_cell
        """


        assert isinstance(iterator, iterator_utils.BatchedInput)

        self.iterator = iterator
        self.mode = mode
        self.tgt_vocab_table = target_vocab_table

        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        self.cnn_input = self.iterator.source
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.cnn = AlexNet(self.cnn_input, (1 - hparams.dropout), model_helper.get_device_str(hparams.base_gpu))
        else:
            self.cnn = AlexNet(self.cnn_input, 1, model_helper.get_device_str(hparams.base_gpu))

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed, hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size, use_bias=False, name="output_projection")

        # To make it flexible for external code to add other cell types
        # If not specified, we will later use model_helper._single_cell
        self.single_cell_fn = single_cell_fn

        ## Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words to compute ppl.
            self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

        ## Learning rate
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, decay_factor %g" % (hparams.start_decay_step, hparams.learning_rate, hparams.decay_steps, hparams.decay_factor))

        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            if hparams.optimizer == "sgd":
                self.learning_rate = tf.cond(self.global_step < hparams.start_decay_step,
                                             lambda: tf.constant(hparams.learning_rate),
                                             lambda: tf.train.exponential_decay(hparams.learning_rate,
                                                                                (self.global_step - hparams.start_decay_step),
                                                                                hparams.decay_steps,
                                                                                hparams.decay_factor,
                                                                                staircase=True),
                                             name="learning_rate")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)

            elif hparams.optimizer == "adam":
                assert float(hparams.learning_rate) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
                self.learning_rate = tf.constant(hparams.learning_rate)
                opt = tf.train.AdamOptimizer(self.learning_rate)

            gradients = tf.gradients(self.train_loss, params, colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([tf.summary.scalar("lr", self.learning_rate), tf.summary.scalar("train_loss", self.train_loss)] + gradient_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        if hparams.eval_on_fly:
            self.saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True)
        else:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None, save_relative_paths=True)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device))
Example #9
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 handle,
                 vocab_table,
                 reverse_vocab_table=None,
                 scope=None,
                 extra_args=None):
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.handle = handle
        self.mode = mode
        self.vocab_table = vocab_table
        self.vocab_size = hparams.vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.hparams = hparams
        self.single_cell_fn = None
        self.global_gpu_num = 0
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.shape(self.iterator.source)[0]

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer1 = layers_core.Dense(
                    hparams.vocab_size,
                    use_bias=False,
                    name="output_projection_1")
                self.output_layer2 = layers_core.Dense(
                    hparams.vocab_size,
                    use_bias=False,
                    name="output_projection_2")
                self.output_layer_action = layers_core.Dense(
                    hparams.vocab_size,
                    use_bias=False,
                    name="output_projection_action")
                self.vn_project11 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_11")
                self.vn_project12 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_12")
                self.vn_project21 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_21")
                self.vn_project22 = layers_core.Dense(
                    hparams.unit_value_network,
                    use_bias=False,
                    name="vn_project_22")

        ## Train graph
        sl_loss, sl_loss_arr, rl_loss_arr, sample_id_arr_train, sample_id_arr_infer = build_graph(
            self, hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = sl_loss
            self.all_train_loss = sl_loss_arr
            self.word_count = tf.reduce_sum(self.iterator.dialogue_len)
            self.sample_ids_arr = sample_id_arr_train
            self.sample_words_arr1 = []
            self.sample_words_arr2 = []
            source = self.iterator.source
            for i in range(len(self.sample_ids_arr)):
                element_infer = self.sample_ids_arr[i]
                element_src = source[0]
                # element_src=0
                src = reverse_vocab_table.lookup(tf.to_int64(element_src))
                infer = reverse_vocab_table.lookup(
                    tf.to_int64(element_infer)
                )[0]  # src only exposes the first entry, so take only the first inference
                if i == 0:
                    self.sample_words_arr1.append((tf.constant(i), src, infer))
                elif i == 1:
                    self.sample_words_arr2.append((tf.constant(i), src, infer))
            self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates

        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = sl_loss
            self.all_eval_loss = sl_loss_arr

        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.sample_ids_arr = sample_id_arr_infer
            self.sample_words_arr = []
            self.source = reverse_vocab_table.lookup(
                tf.to_int64(iterator.source))
            for element in self.sample_ids_arr:
                self.sample_words_arr.append(
                    reverse_vocab_table.lookup(tf.to_int64(element)))
        elif self.mode in dialogue_utils.self_play_modes:
            #### self play
            self.train_loss = sl_loss
            self.all_train_loss = sl_loss_arr
            self.selfplay_agent_1_utt = reverse_vocab_table.lookup(
                tf.to_int64(sample_id_arr_infer[0]))
            self.selfplay_agent_2_utt = reverse_vocab_table.lookup(
                tf.to_int64(sample_id_arr_infer[1]))
            self.selfplay_action = reverse_vocab_table.lookup(
                tf.to_int64(sample_id_arr_infer[2]))
            if self.mode == dialogue_utils.mode_self_play_mutable:
                self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words to compute ppl.
            self.predict_count = tf.reduce_sum(self.iterator.dialogue_len)

        ## Learning rate
        warmup_steps = hparams.learning_rate_warmup_steps
        warmup_factor = hparams.learning_rate_warmup_factor
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
              "decay_factor %g, learning_rate_warmup_steps=%d, "
              "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
              (hparams.start_decay_step, hparams.learning_rate,
               hparams.decay_steps, hparams.decay_factor, warmup_steps,
               warmup_factor,
               (hparams.learning_rate * warmup_factor**warmup_steps)))
        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
            self.learning_rate = tf.constant(hparams.learning_rate)

            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
            self.learning_rate = tf.cond(
                self.global_step < hparams.learning_rate_warmup_steps,
                lambda: inv_decay * self.learning_rate,
                lambda: self.learning_rate,
                name="learning_rate_decay_warump_cond")

            if hparams.optimizer == "sgd":
                self.learning_rate = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate,
                    lambda: tf.train.exponential_decay(self.learning_rate, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="sgd_learning_rate_supervised")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate,
                                                        name="SGD_supervised")
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                assert float(
                    hparams.learning_rate
                ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
                opt = tf.train.AdamOptimizer(self.learning_rate,
                                             name="Adam_supervised")

            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops,
                                     name="gradients_adam")

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step,
                                              name="adam_apply_gradients")

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)

        # second part of the learning rate
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
            self.learning_rate2 = tf.constant(hparams.learning_rate2)
            self.learning_rate3 = tf.constant(hparams.learning_rate3)
            if hparams.optimizer == "sgd":
                self.learning_rate2 = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate2,
                    lambda: tf.train.exponential_decay(self.learning_rate2, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="sgd_learning_rate_supervised2")
                self.learning_rate3 = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate3,
                    lambda: tf.train.exponential_decay(self.learning_rate3, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="sgd_learning_rate_supervised3")
                tf.summary.scalar("self_play_lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                assert float(
                    hparams.learning_rate2
                ) <= 0.001, "! High Adam learning rate2 %g" % hparams.learning_rate2
                assert float(
                    hparams.learning_rate3
                ) <= 0.001, "! High Adam learning rate3 %g" % hparams.learning_rate3

            # params=[]

            print("params=")
            for element in params:
                print(element.name)
            val1_params = self.patial_params(
                params, ["dynamic_seq2seq/value_network1"])
            val2_params = self.patial_params(
                params, ["dynamic_seq2seq/value_network2"])
            embedding_params = self.patial_params(params, ["embeddings"])
            main_dec_enc_params1 = self.patial_params(
                params,
                ["dynamic_seq2seq/encoder1/", "dynamic_seq2seq/decoder1/"])
            main_dec_enc_params2 = self.patial_params(
                params,
                ["dynamic_seq2seq/encoder2/", "dynamic_seq2seq/decoder2/"])
            action_params = self.patial_params(
                params, ["dynamic_seq2seq/decoder_action"])
            encoder_kb_params = self.patial_params(
                params, ["dynamic_seq2seq/encoder2_kb"])
            encoder_intent_params = self.patial_params(
                params, ["dynamic_seq2seq/encoder1_intent"])
            print("val1_params", "\n".join(map(lambda a: a.name, val1_params)))
            print("val2_params", "\n".join(map(lambda a: a.name, val2_params)))
            print("embedding_params",
                  "\n".join(map(lambda a: a.name, embedding_params)))
            print("main_dec_enc_params1",
                  "\n".join(map(lambda a: a.name, main_dec_enc_params1)))
            print("main_dec_enc_params2",
                  "\n".join(map(lambda a: a.name, main_dec_enc_params2)))
            print("action_params",
                  "\n".join(map(lambda a: a.name, action_params)))
            print("encoder_kb_params",
                  "\n".join(map(lambda a: a.name, encoder_kb_params)))
            print("encoder_intent_params",
                  "\n".join(map(lambda a: a.name, encoder_intent_params)))
            self.optimizer_vl1, self.v1_sum = self.generate_optimizer(
                self.vl1, params, "vl1", self.learning_rate2,
                self.hparams.max_gradient_norm2)
            self.optimizer_vl2, self.v2_sum = self.generate_optimizer(
                self.vl2, params, "vl2", self.learning_rate2,
                self.hparams.max_gradient_norm2)
            if hparams.self_play_variable_method == 0:
                rl_param1, rl_param2 = encoder_intent_params, encoder_kb_params + action_params
            elif hparams.self_play_variable_method == 1:
                rl_param1, rl_param2 = main_dec_enc_params1, main_dec_enc_params2
            elif hparams.self_play_variable_method == 2:
                rl_param1, rl_param2 = main_dec_enc_params1 + encoder_intent_params, main_dec_enc_params2 + encoder_kb_params + action_params
            elif hparams.self_play_variable_method == 3:
                rl_param1, rl_param2 = [main_dec_enc_params1[0]
                                        ] + encoder_intent_params, [
                                            main_dec_enc_params2[0]
                                        ] + encoder_kb_params
            elif hparams.self_play_variable_method == 4:
                rl_param1, rl_param2 = [main_dec_enc_params1[0]
                                        ], [main_dec_enc_params2[0]]
            elif hparams.self_play_variable_method == 5:
                rl_param1, rl_param2 = params, params
            self.optimizer_pl1, self.p1_sum = self.generate_optimizer(
                self.pl1, params, "pl1", self.learning_rate3,
                self.hparams.max_gradient_norm3)
            self.optimizer_pl2, self.p2_sum = self.generate_optimizer(
                self.pl2, params, "pl2", self.learning_rate3,
                self.hparams.max_gradient_norm3)
            print("self.learning", self.learning_rate, self.learning_rate2,
                  self.learning_rate3)
            ##############################
            ### supervised learning ######
            ##############################
        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
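The `patial_params` helper called above is not shown in the snippet (the spelling follows the call sites). A minimal reconstruction of what the calls imply, i.e. selecting trainable variables whose names start with a given scope prefix (an assumption, not the project's actual code):

def patial_params(self, params, prefixes):
    # Hypothetical sketch: keep the variables whose names begin with any of
    # the given scope prefixes, e.g. "dynamic_seq2seq/encoder1/".
    return [p for p in params
            if any(p.name.startswith(prefix) for prefix in prefixes)]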
Example #10
  def _set_params_initializer(self,
                              hparams,
                              mode,
                              features,
                              scope,
                              extra_args=None):
    """Set various params for self and initialize."""
    self.mode = mode
    self.src_vocab_size = hparams.src_vocab_size
    self.tgt_vocab_size = hparams.tgt_vocab_size
    self.features = features
    self.time_major = hparams.time_major
    if self.time_major:
      mlperf_log.gnmt_print(key=mlperf_log.INPUT_ORDER, value="time_major")
    else:
      mlperf_log.gnmt_print(key=mlperf_log.INPUT_ORDER, value="batch_major")

    if hparams.use_char_encode:
      assert (not self.time_major), ("Can't use time major for"
                                     " char-level inputs.")

    self.dtype = tf.as_dtype(hparams.activation_dtype)

    # extra_args: to make it flexible for adding external customizable code
    self.single_cell_fn = None
    if extra_args:
      self.single_cell_fn = extra_args.single_cell_fn

    # Set num units
    mlperf_log.gnmt_print(key=mlperf_log.MODEL_HP_HIDDEN_SIZE,
                          value=hparams.num_units)
    self.num_units = hparams.num_units
    self.eos_id = hparams.tgt_eos_id
    self.label_smoothing = hparams.label_smoothing

    # Set num layers
    mlperf_log.gnmt_print(key=mlperf_log.MODEL_HP_NUM_LAYERS,
                          value={"encoder": hparams.num_encoder_layers,
                                 "decoder": hparams.num_decoder_layers})
    self.num_encoder_layers = hparams.num_encoder_layers
    self.num_decoder_layers = hparams.num_decoder_layers
    assert self.num_encoder_layers
    assert self.num_decoder_layers

    # Set num residual layers
    if hasattr(hparams, "num_residual_layers"):  # compatible common_test_utils
      self.num_encoder_residual_layers = hparams.num_residual_layers
      self.num_decoder_residual_layers = hparams.num_residual_layers
    else:
      self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
      self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

    # Batch size
    self.batch_size = tf.size(self.features["source_sequence_length"])

    # Global step
    # Use get_global_step instead of a user-defined global step. Otherwise the
    # num_train_steps in TPUEstimator.train has no effect (will train forever).
    # TPUEstimator only checks whether tf.train.get_global_step() < num_train_steps.
    self.global_step = tf.train.get_or_create_global_step()

    # Initializer
    mlperf_log.gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                          value=hparams.random_seed)
    self.random_seed = hparams.random_seed
    initializer = model_helper.get_initializer(
        hparams.init_op, self.random_seed, hparams.init_weight)
    tf.get_variable_scope().set_initializer(initializer)

    # Embeddings
    self.encoder_emb_lookup_fn = (
        self._emb_lookup if self.mode == tf.contrib.learn.ModeKeys.TRAIN else
        tf.nn.embedding_lookup)
    self.init_embeddings(hparams, scope, self.dtype)
Example #11
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 vocab_table,
                 scope=None,
                 extra_args=None):
        """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      vocab_table: Lookup table mapping source words to ids.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    """
        self.iterator = iterator
        self.mode = mode
        self.vocab_table = vocab_table
        #self.vocab_size = len(vocab_table)
        self.time_major = hparams.time_major

        self.single_cell_fn = None
        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(2,
                                                      use_bias=False,
                                                      activation=tf.nn.sigmoid,
                                                      name="output_projection")

        ## Train graph
        loss, accuracy = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = loss
            self.train_accuracy = accuracy
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = loss

        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=True)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + grad_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=hparams.num_keep_ckpts)

        # Print trainable variables
        print("# Trainable variables")
        for param in params:
            print("  %s, %s, %s" %
                  (param.name, str(param.get_shape()), param.op.device))
Example #12
    def __init__(self, iterator, hparams, mode, scope=None):
        self.iterator = iterator
        self.hparams = hparams
        self.mode = mode
        self.scope = scope

        # Initializer
        initializer = model_helper.get_initializer(self.hparams.init_op, None,
                                                   self.hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        with tf.variable_scope(scope or 'embedding'):
            self.embedding = tf.get_variable(
                'embedding', [self.hparams.vocab_size, self.hparams.num_units],
                dtype=tf.float32)

        # Output Layer
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope('decoder/output_projection'):
                self.output_layer = tf.layers.Dense(self.hparams.vocab_size,
                                                    use_bias=False)

        # Batch Size
        self.batch_size = tf.size(self.iterator.src_seq)

        # Build Graph
        print("# Building graph for the model ...")
        res = self.build_graph(self.scope)

        if self.mode == TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                tf.reduce_sum(self.iterator.src_seq) +
                tf.reduce_sum(self.iterator.tar_seq))
        elif self.mode == EVAL:
            self.eval_loss = res[1]
        elif self.mode == PREDICT:
            self.infer_logits, _, self.final_state, self.sample_id = res

        if self.mode != PREDICT:
            # Count the number of predicted words to compute perplexity.
            self.predict_count = tf.reduce_sum(self.iterator.tar_seq)

        # Define variables
        self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Optimizer
        if self.mode == TRAIN:
            self.learning_rate = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='learning_rate')

            # self.learning_rate = tf.train.exponential_decay(
            #     0.001, self.global_step, 1000, 0.9)
            opt = tf.train.AdamOptimizer(self.learning_rate)

            # Gradient
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=self.hparams.
                                     colocate_gradients_with_ops)
            clipped_gradients, gradient_norm_summary, _ = model_helper.gradient_clip(
                gradients, self.hparams.max_gradient_norm)
            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar('train_loss', self.train_loss),
                tf.summary.scalar('learning_rate', self.learning_rate)
            ] + gradient_norm_summary)
        else:
            self.infer_summary = tf.no_op()

        # Saver
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=self.hparams.max_to_keep)
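Unlike the other examples, Example #12 leaves the learning rate as a placeholder, so any schedule is applied on the host side at run time. A hedged usage sketch (assuming `model` and an initialized session `sess` exist):

lr = 1e-3  # any host-side schedule can be substituted here
_, loss, summary = sess.run(
    [model.update, model.train_loss, model.train_summary],
    feed_dict={model.learning_rate: lr})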
Example #13
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None,
                 scope=None,
                 extra_args=None):
        """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      source_vocab_table: Lookup table mapping source words to ids.
      target_vocab_table: Lookup table mapping target words to ids.
      reverse_target_vocab_table: Lookup table mapping ids to target words. Only
        required in INFER mode. Defaults to None.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    """
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        # extra_args: to make it flexible for adding external customizable code
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        # TODO(ebrevdo): Only do this if the mode is TRAIN?
        self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        ## Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words to compute ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        ## Learning rate
        warmup_steps = hparams.learning_rate_warmup_steps
        warmup_factor = hparams.learning_rate_warmup_factor
        print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
              "decay_factor %g, learning_rate_warmup_steps=%d, "
              "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
              (hparams.start_decay_step, hparams.learning_rate,
               hparams.decay_steps, hparams.decay_factor, warmup_steps,
               warmup_factor,
               (hparams.learning_rate * warmup_factor**warmup_steps)))
        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)

            # Apply inverse decay if global steps less than warmup steps.
            # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
            # When step < warmup_steps,
            #   learning_rate *= warmup_factor ** (warmup_steps - step)
            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
            self.learning_rate = tf.cond(
                self.global_step < hparams.learning_rate_warmup_steps,
                lambda: inv_decay * self.learning_rate,
                lambda: self.learning_rate,
                name="learning_rate_decay_warump_cond")

            if hparams.optimizer == "sgd":
                self.learning_rate = tf.cond(
                    self.global_step < hparams.start_decay_step,
                    lambda: self.learning_rate,
                    lambda: tf.train.exponential_decay(self.learning_rate, (
                        self.global_step - hparams.start_decay_step),
                                                       hparams.decay_steps,
                                                       hparams.decay_factor,
                                                       staircase=True),
                    name="learning_rate")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                assert float(
                    hparams.learning_rate
                ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
                opt = tf.train.AdamOptimizer(self.learning_rate)

            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
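For intuition on the warmup in Example #13: the inverse decay multiplies the base learning rate by warmup_factor ** (warmup_steps - step) while step < warmup_steps. For example, with warmup_factor = 0.5 and warmup_steps = 4, steps 0 through 3 use 1/16, 1/8, 1/4 and 1/2 of the base rate, and step 4 onward uses the full rate until the start_decay_step / exponential_decay branch takes over (for the SGD optimizer).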