def apply_gradients(self, grads_tvars, global_step=None, name=None):
    """Apply gradients with adaptive clipping and hyper-parameter tuning.

    Pipeline, each stage gated on the previous via control dependencies:
      1. optional manual clipping (``self._clip_thresh_var``) and loosely
         adaptive clipping to protect the gradient statistics,
      2. ``before_apply()`` — update running gradient statistics,
      3. ``update_hyper_param()`` — retune lr / momentum,
      4. re-clip against the adaptive threshold, then delegate to the
         wrapped ``self._optimizer.apply_gradients``,
      5. bookkeeping assigns (step counter, adaptive clip thresholds).

    Args:
      grads_tvars: iterable of ``(gradient, variable)`` pairs; pairs whose
        gradient is ``None`` are dropped.
      global_step: optional step tensor, forwarded to the wrapped optimizer.
      name: optional op name, forwarded to the wrapped optimizer.

    Returns:
      A ``tf.group`` op that runs the whole pipeline.
    """
    # Drop variables that received no gradient.
    self._grads, self._tvars = zip(
        *[(g, t) for g, t in grads_tvars if g is not None])

    # for manual gradient clipping
    if self._clip_thresh_var is not None:
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, self._clip_thresh_var)

    # loosely adaptive clipping of gradient in case exploding gradient
    # ruins statistics
    if self._use_adapt_grad_clip:
        # While tuning, clip to a slack factor of the adaptive threshold;
        # otherwise use a huge threshold so clipping is effectively off.
        thresh = tf.cond(
            self._do_tune,
            lambda: tf.sqrt(self._stat_protect_fac *
                            self._adapt_grad_clip_thresh**2),
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, thresh)

    with tf.variable_scope("before_apply"):
        before_apply_op = self.before_apply()

    with tf.variable_scope("update_hyper"):
        # Hyper-parameter update must observe the fresh statistics.
        with tf.control_dependencies([before_apply_op]):
            update_hyper_op = self.update_hyper_param()

    with tf.variable_scope("apply_updates"):
        with tf.control_dependencies([update_hyper_op]):
            # clip exploding gradient according to h_max
            if self._use_adapt_grad_clip:
                thresh = tf.cond(
                    tf.greater(tf.global_norm(self._grads),
                               self._adapt_grad_clip_thresh),
                    lambda: self._adapt_grad_clip_target_val,
                    lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
                self._grads, self._grads_norm = tf.clip_by_global_norm(
                    self._grads, thresh)
            apply_grad_op = self._optimizer.apply_gradients(
                zip(self._grads, self._tvars), global_step, name)

    # Bookkeeping only runs after the actual parameter update.
    with tf.control_dependencies([apply_grad_op]):
        self._increment_global_step_op = tf.assign(
            self._global_step, self._global_step + 1)
        # NOTE(review): both thresholds track sqrt(_h_max) — presumably the
        # running max-curvature estimate maintained by curvature_range();
        # confirm against that method.
        self._adapt_grad_clip_thresh_op = \
            tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
        self._adapt_grad_clip_target_val_op = \
            tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))
        # self._adapt_grad_clip_target_val_op = \
        #     tf.assign(self._adapt_grad_clip_target_val,
        #               tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

    return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                    self._adapt_grad_clip_thresh_op,
                    self._adapt_grad_clip_target_val_op,
                    self._increment_global_step_op)
def get_scaffold(self, mode, global_step=None, iter_initializer=None):
    """Assemble a ``tf.train.Scaffold`` for a training session.

    Args:
      mode: run mode (unused here, kept for interface compatibility).
      global_step: optional step tensor handed to ``get_saver``.
      iter_initializer: optional iterator-init op grouped into the
        local initializers when provided.

    Returns:
      A ``tf.train.Scaffold`` wiring the saver plus global/local init ops.
    """
    init_op = tf.global_variables_initializer()
    tables_op = tf.tables_initializer()
    # Fold the iterator initializer into the local init op when present.
    local_init_op = (tables_op if iter_initializer is None
                     else tf.group(tables_op, iter_initializer))
    return tf.train.Scaffold(
        saver=self.get_saver(global_step),
        init_op=init_op,
        local_init_op=local_init_op)
def get_train_op(self, loss, global_step=None):
    """Build the training operator for ``loss``.

    Applies gradients, sets up model averaging, and groups any pending
    UPDATE_OPS behind the gradient application.

    Args:
      loss: scalar loss tensor.
      global_step: optional global step tensor.

    Returns:
      The grouped train op.
    """
    grad_apply_op = self.get_apply_gradients_op(loss, global_step)

    # Model averaging is registered after the apply-gradients op exists.
    self.var_avg(global_step)

    # Pending update ops (e.g. batch-norm stats) run only after the
    # gradient application.
    with tf.control_dependencies([grad_apply_op]):
        train_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS))

    utils.log_vars('moving vars', tf.moving_average_variables())
    return train_op
def before_apply(self):
    """Build ops that refresh the running gradient statistics.

    Computes per-variable squared gradients and their norms, maintains
    exponential moving averages of the squared norms, then chains in the
    curvature-range, gradient-variance and distance-to-optimum estimators.

    Returns:
      A ``tf.group`` of all statistic-update ops.
    """
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)
    assert self._grads is not None and len(self._grads) > 0
    before_apply_ops = []

    # get per var g**2 and norm**2
    self._grad_squared = []
    self._grad_norm_squared = []
    for v, g in zip(self._tvars, self._grads):
        if g is None:
            continue
        # Keep the squared gradient on the same device as its variable.
        with ops.colocate_with(v):
            self._grad_squared.append(tf.square(g))
    self._grad_norm_squared = [
        tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
    ]

    if self._sparsity_debias:
        avg_op_sparsity = self.grad_sparsity()
        before_apply_ops.append(avg_op_sparsity)

    # the following running average on squared norm of gradient is shared
    # by `grad_variance` and `dist_to_opt`
    avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([avg_op]):
        self._grad_norm_squared_avg = [
            self._moving_averager.average(val)
            for val in self._grad_norm_squared
        ]
        # NOTE: both attributes are rebound here from lists of per-variable
        # tensors to single scalar totals.
        self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
        self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
    before_apply_ops.append(avg_op)

    # The downstream estimators must see the updated moving averages.
    with tf.control_dependencies([avg_op]):
        curv_range_ops = self.curvature_range()
        before_apply_ops += curv_range_ops
        grad_var_ops = self.grad_variance()
        before_apply_ops += grad_var_ops
        dist_to_opt_ops = self.dist_to_opt()
        before_apply_ops += dist_to_opt_ops
    return tf.group(*before_apply_ops)
def get_train_op(self, loss, multitask, global_step=None):
    """Build the (optionally quantization-aware) training operator.

    When quantization is enabled in the solver config, rewrites the graph
    for quantization-aware training before applying gradients.

    Args:
      loss: loss tensor(s).
      multitask: multitask flag forwarded to ``get_apply_gradients_op``.
      global_step: optional global step tensor.

    Returns:
      The grouped train op.
    """
    quant_cfg = self.config['solver']['quantization']
    if quant_cfg['enable']:
        delay = quant_cfg['quant_delay']
        logging.info('Quantization training with {} delay'.format(delay))
        # Insert fake-quant nodes; quantization kicks in after `delay` steps.
        tf.contrib.quantize.create_training_graph(quant_delay=delay)

    grad_apply_op = self.get_apply_gradients_op(loss, multitask, global_step)

    # Model averaging is registered after the apply-gradients op exists.
    self.var_avg(global_step)

    # Pending update ops (e.g. batch-norm stats) run only after the
    # gradient application.
    with tf.control_dependencies([grad_apply_op]):
        train_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS))

    utils.log_vars('moving vars', tf.moving_average_variables())
    return train_op
def update_hyper_param(self):
    """Refresh the momentum and learning-rate variables.

    While tuning (``self._do_tune``) fresh values come from
    ``get_mu_tensor()`` / ``get_lr_tensor()``; otherwise the stored
    variable values are reused.  Unless ``self._use_unsmoothed_lr_mu`` is
    set, the new values are blended with the old ones via an exponential
    moving average with coefficient ``self._beta`` before being written
    back.

    Returns:
      A grouped op performing both variable assignments.
    """
    hyper_assigns = []

    # Momentum: freshly tuned value, or the currently stored one.
    self._mu = tf.identity(
        tf.cond(self._do_tune,
                lambda: self.get_mu_tensor(),
                lambda: self._mu_var))

    # Learning rate is computed after (and gated on) the momentum choice.
    with tf.control_dependencies([self._mu]):
        self._lr = tf.identity(
            tf.cond(self._do_tune,
                    lambda: self.get_lr_tensor(),
                    lambda: self._lr_var))

    with tf.control_dependencies([self._mu, self._lr]):
        if self._use_unsmoothed_lr_mu:
            # Write the raw tuned values straight back.
            hyper_assigns.append(tf.assign(self._mu_var, self._mu))
            hyper_assigns.append(tf.assign(self._lr_var, self._lr))
        else:
            # Smooth with an EMA before writing back.
            self._mu = self._beta * self._mu_var + (1 - self._beta) * self._mu
            self._lr = self._beta * self._lr_var + (1 - self._beta) * self._lr
            with tf.control_dependencies([self._mu, self._lr]):
                hyper_assigns.append(tf.assign(self._mu_var, self._mu))
                hyper_assigns.append(tf.assign(self._lr_var, self._lr))
    return tf.group(*hyper_assigns)