Exemplo n.º 1
0
    def apply_gradients(self, grads_tvars, global_step=None, name=None):
        """Apply gradients after tuning the optimizer's hyper-parameters.

        Pipeline: collect curvature statistics (`before_apply`) ->
        update hyper-parameters (`update_hyper_param`) -> apply the
        (possibly clipped) gradients through the wrapped optimizer ->
        increment the global step and refresh the adaptive clip
        thresholds.

        Args:
            grads_tvars: iterable of (gradient, variable) pairs; pairs
                whose gradient is None are dropped.
            global_step: optional step variable forwarded to the inner
                optimizer's `apply_gradients`.
            name: optional name forwarded to the inner apply op.

        Returns:
            A grouped op running the statistics update, hyper-parameter
            update, gradient application, threshold refreshes, and the
            global-step increment.
        """
        # Drop pairs with no gradient; keep grads and vars as parallel tuples.
        self._grads, self._tvars = zip(*[(g, t) for g, t in grads_tvars
                                         if g is not None])

        # for manual gradient clipping
        if self._clip_thresh_var is not None:
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, self._clip_thresh_var)

        # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
        if self._use_adapt_grad_clip:
            # While tuning, clip at sqrt(protect_factor * thresh^2);
            # otherwise use a huge threshold so clipping is a no-op.
            thresh = tf.cond(
                self._do_tune, lambda: tf.sqrt(self._stat_protect_fac * self.
                                               _adapt_grad_clip_thresh**2),
                lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, thresh)

        with tf.variable_scope("before_apply"):
            before_apply_op = self.before_apply()

        with tf.variable_scope("update_hyper"):
            # Hyper-parameter updates must see the refreshed statistics.
            with tf.control_dependencies([before_apply_op]):
                update_hyper_op = self.update_hyper_param()

        with tf.variable_scope("apply_updates"):
            with tf.control_dependencies([update_hyper_op]):

                # clip exploding gradient according to h_max
                if self._use_adapt_grad_clip:
                    # Only clip (to the target value) when the global norm
                    # actually exceeds the adaptive threshold.
                    thresh = tf.cond(
                        tf.greater(tf.global_norm(self._grads),
                                   self._adapt_grad_clip_thresh),
                        lambda: self._adapt_grad_clip_target_val,
                        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
                    self._grads, self._grads_norm = tf.clip_by_global_norm(
                        self._grads, thresh)

                apply_grad_op = self._optimizer.apply_gradients(
                    zip(self._grads, self._tvars), global_step, name)

        # Bookkeeping runs only after the gradients have been applied.
        with tf.control_dependencies([apply_grad_op]):
            self._increment_global_step_op = tf.assign(self._global_step,
                                                       self._global_step + 1)

            # Refresh adaptive clip threshold/target from the current
            # curvature estimate (sqrt of max curvature h_max).
            self._adapt_grad_clip_thresh_op = \
              tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max) )
            self._adapt_grad_clip_target_val_op = \
              tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max) )
            # self._adapt_grad_clip_target_val_op = \
            #   tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

        return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                        self._adapt_grad_clip_thresh_op,
                        self._adapt_grad_clip_target_val_op,
                        self._increment_global_step_op)
Exemplo n.º 2
0
  def get_scaffold(self, mode, global_step=None, iter_initializer=None):
    """Build a tf.train.Scaffold for a session.

    Args:
      mode: run mode; accepted for interface compatibility (unused here).
      global_step: optional step passed through to `get_saver`.
      iter_initializer: optional iterator init op; when given it is
        grouped with the tables initializer as the local init op.

    Returns:
      A tf.train.Scaffold wiring saver, init_op and local_init_op.
    """
    init_op = tf.global_variables_initializer()
    local_init_op = (tf.tables_initializer() if iter_initializer is None
                     else tf.group(tf.tables_initializer(), iter_initializer))
    saver = self.get_saver(global_step)
    return tf.train.Scaffold(
        saver=saver, init_op=init_op, local_init_op=local_init_op)
Exemplo n.º 3
0
  def get_train_op(self, loss, global_step=None):
    """Build the training operator for `loss`.

    Gradients are applied first; the ops collected in UPDATE_OPS
    (presumably including the model-average ops registered by
    `var_avg` — confirm against that helper) run afterwards.
    """
    grad_op = self.get_apply_gradients_op(loss, global_step)

    # Register model-average bookkeeping before collecting UPDATE_OPS.
    self.var_avg(global_step)

    # Run all pending update ops strictly after the gradient step.
    with tf.control_dependencies([grad_op]):
      pending = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      train_op = tf.group(*pending)

    utils.log_vars('moving vars', tf.moving_average_variables())
    return train_op
Exemplo n.º 4
0
    def before_apply(self):
        """Collect the gradient statistics needed for hyper-parameter tuning.

        Computes per-variable squared gradients and squared norms, keeps
        exponential moving averages of the squared norms (decay
        `self._beta`), and builds the curvature-range, gradient-variance
        and distance-to-optimum estimator ops.

        Requires `self._grads`/`self._tvars` to be set (by
        `apply_gradients`).

        Returns:
            A grouped op running all the statistics updates.
        """
        self._moving_averager = tf.train.ExponentialMovingAverage(
            decay=self._beta, zero_debias=self._zero_debias)
        assert self._grads is not None and len(self._grads) > 0
        before_apply_ops = []

        # get per var g**2 and norm**2
        self._grad_squared = []
        self._grad_norm_squared = []
        for v, g in zip(self._tvars, self._grads):
            # None gradients were already filtered in apply_gradients;
            # this guard is defensive.
            if g is None:
                continue
            # Place the squaring op on the same device as the variable.
            with ops.colocate_with(v):
                self._grad_squared.append(tf.square(g))
        self._grad_norm_squared = [
            tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
        ]

        if self._sparsity_debias:
            avg_op_sparsity = self.grad_sparsity()
            before_apply_ops.append(avg_op_sparsity)

        # the following running average on squared norm of gradient is shared
        # by `grad_variance` and `dist_to_opt`
        avg_op = self._moving_averager.apply(self._grad_norm_squared)
        with tf.control_dependencies([avg_op]):
            # Read the averages only after the moving-average update,
            # then collapse the per-variable values into scalars.
            self._grad_norm_squared_avg = [
                self._moving_averager.average(val)
                for val in self._grad_norm_squared
            ]
            self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
            self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
        before_apply_ops.append(avg_op)

        # Estimator ops depend on the refreshed norm averages.
        with tf.control_dependencies([avg_op]):
            curv_range_ops = self.curvature_range()
            before_apply_ops += curv_range_ops
            grad_var_ops = self.grad_variance()
            before_apply_ops += grad_var_ops
            dist_to_opt_ops = self.dist_to_opt()
            before_apply_ops += dist_to_opt_ops
        return tf.group(*before_apply_ops)
Exemplo n.º 5
0
  def get_train_op(self, loss, multitask, global_step=None):
    """Build the training operator, optionally rewriting the graph for
    quantization-aware training first."""
    # quantization-aware training rewrite, gated by solver config
    quantconf = self.config['solver']['quantization']
    if quantconf['enable']:
      quant_delay = quantconf['quant_delay']
      logging.info('Quantization training with {} delay'.format(quant_delay))
      tf.contrib.quantize.create_training_graph(quant_delay=quant_delay)

    grad_op = self.get_apply_gradients_op(loss, multitask, global_step)

    # Register model-average bookkeeping before collecting UPDATE_OPS.
    self.var_avg(global_step)

    # Run all pending update ops strictly after the gradient step.
    with tf.control_dependencies([grad_op]):
      train_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS))

    utils.log_vars('moving vars', tf.moving_average_variables())
    return train_op
Exemplo n.º 6
0
    def update_hyper_param(self):
        """Update the momentum (mu) and learning-rate variables.

        When tuning (`self._do_tune`), fresh values come from
        `get_mu_tensor`/`get_lr_tensor`; otherwise the current variable
        values are reused. Unless `self._use_unsmoothed_lr_mu` is set,
        new values are smoothed against the old ones with decay
        `self._beta` before assignment.

        Returns:
            A grouped op performing the assignments.
        """
        assign_hyper_ops = []
        self._mu = tf.identity(
            tf.cond(self._do_tune, lambda: self.get_mu_tensor(),
                    lambda: self._mu_var))
        # lr is computed only after mu is materialized (lr may depend on it).
        with tf.control_dependencies([self._mu]):
            self._lr = tf.identity(
                tf.cond(self._do_tune, lambda: self.get_lr_tensor(),
                        lambda: self._lr_var))

        with tf.control_dependencies([self._mu, self._lr]):
            if self._use_unsmoothed_lr_mu:
                # Assign the raw tuned values directly.
                assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
                assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
            else:
                # Exponential smoothing: beta * old + (1 - beta) * new.
                self._mu = self._beta * self._mu_var + (1 -
                                                        self._beta) * self._mu
                self._lr = self._beta * self._lr_var + (1 -
                                                        self._beta) * self._lr
                # Compute the smoothed values before overwriting the vars
                # they read from.
                with tf.control_dependencies([self._mu, self._lr]):
                    assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
                    assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
        assign_hyper_op = tf.group(*assign_hyper_ops)
        return assign_hyper_op