def _apply_dense(self, grad, var):
    var_name = var.name.replace(':', '_')
    with tf.variable_scope('apply_dense/{}'.format(var_name)):
        gradient = self._gradient[var.name]
        gradient.update_slots(self, var)
        grad_flat = tf.reshape(grad, [-1])
        gradient_ops = gradient.update_statistics(grad_flat)
        grad_apply, assign_ops = gradient.compute_apply(gradient_ops, {})
        grad_apply = tf.reshape(grad_apply, grad.get_shape())
        if assign_ops:
            with tf.control_dependencies(assign_ops):
                update_ops = training_ops.apply_gradient_descent(
                    var, math_ops.cast(1.0, var.dtype.base_dtype), grad_apply,
                    use_locking=self._use_locking).op
        else:
            update_ops = training_ops.apply_gradient_descent(
                var, math_ops.cast(1.0, var.dtype.base_dtype), grad_apply,
                use_locking=self._use_locking).op
        return update_ops
def _apply_dense(self, grad, var):
    grad = self._fft_solver(grad, var.name)
    return training_ops.apply_gradient_descent(
        var,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    lr = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
    update_op = training_ops.apply_gradient_descent(
        var, lr, grad, use_locking=self._use_locking).op
    if self._flow.edgename_map.get(var.op.name):
        with ops.control_dependencies([update_op]):
            key = self._flow.edgename_map[var.op.name]
            if self._flow.flow:
                threshold = math_ops.cast(
                    ops.convert_to_tensor(
                        self._flow.flow[key] * self._learning_rate_tensor *
                        self._group_lasso_strength_tensor),
                    var.dtype.base_dtype)
            else:
                threshold = math_ops.cast(
                    ops.convert_to_tensor(
                        self._learning_rate_tensor *
                        self._group_lasso_strength_tensor),
                    var.dtype.base_dtype)
            # Soft-threshold the updated variable: scale it towards zero and
            # zero it out entirely once its magnitude falls below the threshold.
            norm = math_ops.maximum(math_ops.abs(var), 1E-16)
            mask = math_ops.maximum(1.0 - (threshold / norm), 0.)
            new_var = math_ops.multiply(var, mask)
            shrinkage = state_ops.assign(var, new_var)
            return shrinkage
    else:
        return update_op
def _apply_dense(self, grad, var):
    return training_ops.apply_gradient_descent(
        var,
        math_ops.cast(self._get_hyper("learning_rate"), var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    lr = self._learning_rate_tensor
    # Clip the gradient so that var - lr * grad_clipped cannot drop below zero
    # (assuming lr > 0 and var >= 0).
    grad_clipped = tf.minimum(var / lr, grad)
    return training_ops.apply_gradient_descent(
        var,
        self._learning_rate_tensor,
        grad_clipped,
        use_locking=self._use_locking).op
def _apply_sparse(self, grad, var):
    rms = self.get_slot(var, 'rms')
    new_grad = self._apply_noisy_update(rms, grad, var)
    return training_ops.apply_gradient_descent(
        var,
        tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        new_grad,
        use_locking=self._use_locking).op
def _testTypes(self, x, alpha, delta, use_gpu=None):
    self.setUp()
    with self.session(use_gpu=use_gpu):
        var = variables.VariableV1(x)
        variables.global_variables_initializer().run()
        self.assertAllCloseAccordingToType(x, self.evaluate(var))
        apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
        out = self.evaluate(apply_sgd)
        self.assertShapeEqual(out, apply_sgd)
        self.assertAllCloseAccordingToType(x - alpha * delta, out)
def _testTypes(self, x, alpha, delta, use_gpu=None):
    self.setUp()
    with self.test_session(use_gpu=use_gpu):
        var = variables.Variable(x)
        variables.initialize_all_variables().run()
        self.assertAllEqual(x, var.eval())
        apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
        out = apply_sgd.eval()
        self.assertShapeEqual(out, apply_sgd)
        self.assertAllEqual(x - alpha * delta, out)
def _testTypes(self, x, alpha, delta, use_gpu=None):
    self.setUp()
    with self.test_session(use_gpu=use_gpu):
        var = variables.Variable(x)
        variables.global_variables_initializer().run()
        self.assertAllCloseAccordingToType(x, var.eval())
        apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
        out = apply_sgd.eval()
        self.assertShapeEqual(out, apply_sgd)
        self.assertAllCloseAccordingToType(x - alpha * delta, out)
def _apply_sparse(self, grad, var):
    rms = self.get_slot(var, 'rms')
    with ops.control_dependencies([
            self._update_momentum(
                rms, grad,
                math_ops.cast(self._decay_tensor, var.dtype.base_dtype))]):
        new_grad = self._apply_noisy_update(rms, grad)
        return training_ops.apply_gradient_descent(
            var,
            math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
            new_grad,
            use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    previous_grad = self.get_slot(var, "previous_grad")
    lr = self.get_slot(var, "learning_rate")
    # Per-element step size is multiplied by scale^sign(grad * previous_grad):
    # for a scale factor > 1 it grows while the gradient keeps its sign and
    # shrinks when the sign flips (Rprop-style adaptation).
    scale_factor = tf.pow(self._scale_tensor, tf.sign(grad * previous_grad))
    lr_update = lr.assign(lr * scale_factor)
    with tf.control_dependencies([lr_update]):
        previous_grad_update = previous_grad.assign(grad)
        with tf.control_dependencies([previous_grad_update]):
            apply_grad_op = training_ops.apply_gradient_descent(
                var, 1.0, lr * grad, use_locking=self._use_locking).op
    return apply_grad_op
def _apply_dense(self, grad, var):
    if self.quantizer is None:
        return training_ops.apply_gradient_descent(
            var,
            math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
            grad,
            use_locking=self._use_locking).op
    else:
        lr = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        # Quantize the gradient, the learning rate, their product, and the
        # updated variable before writing it back.
        delta = self.quantizer.quantize(
            self.quantizer.quantize(grad) * self.quantizer.quantize(lr))
        new_var = self.quantizer.quantize(var - delta)
        return var.assign(new_var).op
def _apply_dense(self, grad, var):
    max_learning_rate = tf.where(self._counter < self._burnin,
                                 self._burnin_max_learning_rate,
                                 self._max_learning_rate)
    learn_rates = tf.clip_by_value(
        self._get_coordinatewise_learning_rate(grad, var), 0.,
        tf.cast(max_learning_rate, var.dtype.base_dtype))
    newgrad = grad * learn_rates
    return training_ops.apply_gradient_descent(
        var,
        tf.cast(1., var.dtype),
        newgrad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var, state):
    if self._use_momentum:
        mom = state.get_slot(var, "momentum")
        return training_ops.apply_momentum(
            var,
            mom,
            state.get_hyper("learning_rate", var.dtype.base_dtype),
            grad,
            state.get_hyper("momentum", var.dtype.base_dtype),
            use_locking=self._use_locking,
            use_nesterov=self._use_nesterov).op
    else:
        return training_ops.apply_gradient_descent(
            var,
            state.get_hyper("learning_rate", var.dtype.base_dtype),
            grad,
            use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    previous_grad = self.get_slot(var, "previous_grad")
    lr = self.get_slot(var, "learning_rate")
    scale_factor = tf.pow(self._scale_tensor, tf.sign(grad * previous_grad))
    lr_update = lr.assign(lr * scale_factor)
    # streaming_lr_mean, streaming_lr_update = tf.contrib.metrics.streaming_mean(lr_update)
    # streaming_lr_scalar = tf.summary.scalar('lr_cost', streaming_lr_update)
    lr_scalar = tf.summary.scalar("learning rate/{}".format(var),
                                  tf.reduce_mean(lr * scale_factor))
    with tf.control_dependencies([lr_update]):
        previous_grad_update = previous_grad.assign(grad)
        with tf.control_dependencies([previous_grad_update]):
            apply_grad_op = training_ops.apply_gradient_descent(
                var, 1.0, lr * grad, use_locking=self._use_locking).op
    return apply_grad_op
def _apply_dense(self, grad, var):
    step = self.get_slot(var, "step")
    step_t = step.assign(step + 1)
    mu = self.get_slot(var, "mu")
    ax = self.get_slot(var, "ax")
    var_t = training_ops.apply_gradient_descent(
        var,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)
    var_T = var_t.op
    if mu != 1:
        ax_t = ax.assign(ax + (var_t - ax) * mu)
    else:
        ax_t = ax.assign(var_t)
    mu_t = mu.assign(
        1 / tf.maximum(math_ops.cast(1, step_t.dtype), step_t - self._t0))
    return control_flow_ops.group(*[var_T, step_t, ax_t, mu_t])
def _apply_dense(self, grad, var):
    momentum_buffer = self.get_slot(var, "momentum")
    learning_rate = math_ops.cast(self._learning_rate_tensor,
                                  var.dtype.base_dtype)
    momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
    nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)
    # Together the two kernels subtract
    #   learning_rate * ((1 - nu) * grad + nu * (1 - momentum) * new_accum)
    # from var, a quasi-hyperbolic-momentum style step.
    momentum_op = training_ops.apply_momentum(
        var,
        momentum_buffer,
        nu * (1.0 - momentum) * learning_rate,
        grad,
        momentum,
        use_locking=self._use_locking,
        use_nesterov=False,
    ).op
    with ops.control_dependencies([momentum_op]):
        gd_op = training_ops.apply_gradient_descent(
            var,
            (1.0 - nu) * learning_rate,
            grad,
            use_locking=self._use_locking,
        ).op
    return control_flow_ops.group(momentum_op, gd_op)
def _apply_dense(self, grad, var, state):
    return training_ops.apply_gradient_descent(
        var,
        state.get_hyper("learning_rate", var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    return training_ops.apply_gradient_descent(
        var,
        self._learning_rate_tensor,
        grad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    return training_ops.apply_gradient_descent(
        var,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    return training_ops.apply_gradient_descent(
        var,
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var):
    rms = self.get_slot(var, "rms")
    mom = self.get_slot(var, "momentum")
    eps = self.get_slot(var, 'eps')
    tf.summary.scalar('grad_norm', tf.norm(grad))
    # debug_here()
    if 'orthogonal_stiefel' in var.name and 'bias' not in var.name:
        with tf.variable_scope("orthogonal_update"):
            print('Applying an orthogonality preserving step to', var.name)
            # Apply the RMS update rule.
            new_rms = self._decay_tensor * rms \
                + (1. - self._decay_tensor) * tf.square(grad)
            rms_assign_op = tf.assign(rms, new_rms)
            # Scale the gradient.
            if self._nat_grad_normalization:
                grad = grad / (tf.sqrt(rms) + eps)
            # The update should preserve orthogonality.
            grad_shape = tf.Tensor.get_shape(grad).as_list()
            # W_new_lst = []
            eye = tf.eye(grad_shape[0], dtype=tf.float32)
            G = grad
            W = var
            # Reunitarize after n steps.
            if self._qr_steps is not None:
                W = tf.cond(
                    tf.equal(tf.mod(self._global_step_tensor, self._qr_steps), 0),
                    lambda: self.re_unitarize(W), lambda: W)
            # A = tf.matmul(tf.transpose(G), W) - tf.matmul(tf.transpose(W), G)
            A = tf.matmul(G, tf.transpose(W)) - tf.matmul(W, tf.transpose(G))
            cayleyDenom = eye + (self._learning_rate_tensor / 2.0) * A
            cayleyNumer = eye - (self._learning_rate_tensor / 2.0) * A
            C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
            W_new = tf.matmul(C, W)
            if self._debug:
                # self._summary_A(A)
                self._summary_C(C)
                self._summary_W(W)
            var_update_op = tf.assign(var, W_new)
            return tf.group(*[var_update_op, rms_assign_op])
    elif 'unitary_stiefel' in var.name and 'bias' not in var.name:
        with tf.variable_scope("unitary_update"):
            print('Applying a unitarity preserving step to', var.name)
            # Apply the RMS update rule.
            new_rms = self._decay_tensor * rms \
                + (1. - self._decay_tensor) * tf.square(grad)
            rms_assign_op = tf.assign(rms, new_rms)
            # Scale the gradient.
            if self._nat_grad_normalization:
                grad = grad / (tf.sqrt(new_rms) + eps)
            # Do an update step which preserves the unitary structure.
            # Check shapes: the matrix must be square.
            grad_shape = tf.Tensor.get_shape(grad).as_list()
            assert grad_shape[0] == grad_shape[1]
            eye = tf.eye(grad_shape[0], dtype=tf.complex64)
            G = tf.complex(grad[:, :, 0], grad[:, :, 1])
            W = tf.complex(var[:, :, 0], var[:, :, 1])
            # Reunitarize after n steps.
            if self._qr_steps is not None:
                W = tf.cond(
                    tf.equal(tf.mod(self._global_step_tensor, self._qr_steps), 0),
                    lambda: self.re_unitarize(W), lambda: W)
            A = tf.matmul(G, tf.conj(tf.transpose(W))) \
                - tf.matmul(W, tf.conj(tf.transpose(G)))  # A must be skew symmetric.
            learning_rate_scale = tf.complex(
                self._learning_rate_tensor / 2.0,
                tf.zeros_like(self._learning_rate_tensor))
            cayleyDenom = eye + learning_rate_scale * A
            cayleyNumer = eye - learning_rate_scale * A
            C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
            W_new = tf.matmul(C, W)
            if self._debug:
                # self._summary_A(A)
                self._summary_C(C)
                self._summary_W(W)
            # debug_here()
            W_new_re = tf.real(W_new)
            W_new_img = tf.imag(W_new)
            W_array = tf.stack([W_new_re, W_new_img], -1)
            var_update_op = tf.assign(var, W_array)
            return tf.group(*[var_update_op, rms_assign_op])
    else:
        # Do the usual RMSprop update.
        # `rms = False` disables the RMSprop branches below, so plain gradient
        # descent is applied to non-Stiefel variables.
        rms = False
        if rms:
            if 1:
                # tensorflow default.
                print('Applying standard rmsprop to', var.name)
                return training_ops.apply_rms_prop(
                    var, rms, mom,
                    tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
                    tf.cast(self._decay_tensor, var.dtype.base_dtype),
                    tf.cast(self._momentum_tensor, var.dtype.base_dtype),
                    tf.cast(self._epsilon_tensor, var.dtype.base_dtype),
                    grad,
                    use_locking=False).op
            else:
                # My rmsprop implementation.
                new_rms = self._decay_tensor * rms \
                    + (1. - self._decay_tensor) * tf.square(grad)
                rms_assign_op = tf.assign(rms, new_rms)
                W_new = var - self._learning_rate_tensor * grad \
                    / (tf.sqrt(new_rms) + eps)
                var_update_op = tf.assign(var, W_new)
                return tf.group(*[var_update_op, rms_assign_op])
        else:
            print('Applying default gradient descent to', var.name)
            return training_ops.apply_gradient_descent(
                var,
                tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
                grad,
                use_locking=False).op
def _apply_dense(self, grad, var):
    return training_ops.apply_gradient_descent(
        var,
        self._learning_rate_tensor,
        grad,
        use_locking=self._use_locking).op
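# For context: every snippet above overrides a hook of the TF1-style
# `tf.compat.v1.train.Optimizer` base class. The following is a minimal,
# self-contained sketch (hypothetical class name `PlainSGD`, not taken from any
# of the sources above) showing where `_prepare` and `_apply_dense` sit and how
# the fused `apply_gradient_descent` kernel is typically invoked.
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops


class PlainSGD(optimizer.Optimizer):
    """Plain SGD whose dense update is a single apply_gradient_descent op."""

    def __init__(self, learning_rate=0.01, use_locking=False, name="PlainSGD"):
        super(PlainSGD, self).__init__(use_locking, name)
        self._learning_rate = learning_rate
        self._learning_rate_tensor = None

    def _prepare(self):
        # Convert the Python-number hyperparameter to a tensor once per apply().
        self._learning_rate_tensor = ops.convert_to_tensor(
            self._learning_rate, name="learning_rate")

    def _apply_dense(self, grad, var):
        # Same pattern as the snippets above: cast the learning rate to the
        # variable's dtype and delegate to the fused kernel.
        return training_ops.apply_gradient_descent(
            var,
            math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
            grad,
            use_locking=self._use_locking).op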