def testVarChange(self):
    """Subtract 0.1 from a variable ten times, checking the value each step."""
    with imperative_mode.ImperativeMode(self._target) as mode:
        var = variables.Variable(constant_op.constant(1.0))
        for completed in range(1, 11):
            with mode.new_step() as step:
                step.run(state_ops.assign_sub(var, 0.1))
                # After `completed` subtractions the variable holds
                # 1.0 - completed * 0.1.
                expected = 1.0 - completed * 0.1
                self.assertAllClose(array_ops.identity(var).value, expected)
def assign_moving_average(variable, value, decay, name=None):
    """Build an op that folds `value` into the moving average in `variable`.

    The updated average is:
      variable * decay + value * (1 - decay)

    It is applied in place through an 'AssignSub' op:
      variable -= (1 - decay) * (variable - value)

    Args:
      variable: A Variable.
      value: A tensor with the same shape as 'variable'.
      decay: A float Tensor or float value. The moving average decay.
      name: Optional name of the returned operation.

    Returns:
      An Operation that updates 'variable' with the newly computed moving
      average.
    """
    scope_values = [variable, value, decay]
    with ops.op_scope(scope_values, name, "AssignMovingAvg") as scope:
        with ops.device(variable.device):
            # Despite the op name "decay", this tensor holds `1 - decay`.
            one_minus_decay = ops.convert_to_tensor(1.0 - decay, name="decay")
            target_dtype = variable.dtype.base_dtype
            if one_minus_decay.dtype != target_dtype:
                one_minus_decay = math_ops.cast(one_minus_decay, target_dtype)
            delta = (variable - value) * one_minus_decay
            return state_ops.assign_sub(variable, delta, name=scope)
def _apply_dense(self, grad, var):
    """Build the dense update for `var`, tracking a running maximum of `v`.

    Updates the "m", "v" and "vhat" slots of `var`, then subtracts
    `lr * m_t / (sqrt(vhat_t) + epsilon)` from `var`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the three slot updates.
    """
    dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(self._beta1_power, dtype)
    beta2_power = math_ops.cast(self._beta2_power, dtype)
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)

    step_size = lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)
    locking = self._use_locking

    # First moment: m_t = beta1 * m + (1 - beta1) * g_t
    first_moment = self.get_slot(var, "m")
    grad_part = grad * (1 - beta1_t)
    m_t = state_ops.assign(
        first_moment, beta1_t * first_moment + grad_part, use_locking=locking)

    # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    second_moment = self.get_slot(var, "v")
    squared_grad_part = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(
        second_moment,
        beta2_t * second_moment + squared_grad_part,
        use_locking=locking)

    # amsgrad: keep the element-wise maximum of every v_t seen so far.
    running_max = self.get_slot(var, "vhat")
    vhat_t = state_ops.assign(running_max, math_ops.maximum(v_t, running_max))
    denominator_root = math_ops.sqrt(vhat_t)

    var_update = state_ops.assign_sub(
        var,
        step_size * m_t / (denominator_root + epsilon_t),
        use_locking=locking)
    return control_flow_ops.group(var_update, m_t, v_t, vhat_t)
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Build the sparse update for `var` using the supplied scatter function.

    Both moment slots are first decayed as a whole, then the gradient
    contribution is scattered into the rows selected by `indices`.

    Args:
      grad: Gradient values for the rows selected by `indices`.
      var: The variable being updated.
      indices: Indices of the rows of `var` that `grad` applies to.
      scatter_add: Callable `(ref, indices, updates)` used to add the
        per-row contributions into a slot.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(self._beta1_power, dtype)
    beta2_power = math_ops.cast(self._beta2_power, dtype)
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)

    step_size = lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)
    locking = self._use_locking

    # m_t = beta1 * m + (1 - beta1) * g_t
    first_moment = self.get_slot(var, "m")
    grad_part = grad * (1 - beta1_t)
    decayed_m = state_ops.assign(
        first_moment, first_moment * beta1_t, use_locking=locking)
    # The scatter must observe the decayed slot, hence the dependency.
    with ops.control_dependencies([decayed_m]):
        m_t = scatter_add(first_moment, indices, grad_part)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    second_moment = self.get_slot(var, "v")
    squared_grad_part = (grad * grad) * (1 - beta2_t)
    decayed_v = state_ops.assign(
        second_moment, second_moment * beta2_t, use_locking=locking)
    with ops.control_dependencies([decayed_v]):
        v_t = scatter_add(second_moment, indices, squared_grad_part)

    root_v = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, step_size * m_t / (root_v + epsilon_t), use_locking=locking)
    return control_flow_ops.group(var_update, m_t, v_t)
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
    """Fold `value` into the moving average stored in `variable`.

    The updated average is:
      variable * decay + value * (1 - decay)

    It is applied in place through an 'AssignSub' op:
      variable -= (1 - decay) * (variable - value)

    A variable that starts at `0` produces an average biased towards `0`.
    When `zero_debias` is true the update is scaled by the debiasing factor
      1 - decay ** num_updates
    See Section 3 of `ADAM: A Method for Stochastic Optimization`
    (https://arxiv.org/abs/1412.6980) for details.

    By default, the names of the debias shadow variables include both the
    scope they were created in and the scope of the variable they debias,
    followed by a uniquifying suffix. E.g.:

    ```
      with tf.variable_scope('scope1'):
        with tf.variable_scope('scope2'):
          var = tf.get_variable('foo')
          tf.assign_moving_average(var, 0.0, 1.0)
          tf.assign_moving_average(var, 0.0, 0.9)

      # var.name: 'scope1/scope2/foo'
      # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
      #                   'scope1/scope2/scope1/scope2/foo/biased_1'
    ```

    Args:
      variable: A Variable.
      value: A tensor with the same shape as 'variable'.
      decay: A float Tensor or float value. The moving average decay.
      zero_debias: A python bool. If true, assume the variable is
        0-initialized and unbias it, as in https://arxiv.org/abs/1412.6980.
        See the docstring of `_zero_debias` for more details.
      name: Optional name of the returned operation.

    Returns:
      A reference to the input 'variable' tensor with the newly computed
      moving average.
    """
    scope_values = [variable, value, decay]
    with ops.name_scope(name, "AssignMovingAvg", scope_values) as scope:
        with ops.colocate_with(variable):
            # Despite the op name "decay", this tensor holds `1 - decay`.
            one_minus_decay = ops.convert_to_tensor(1.0 - decay, name="decay")
            target_dtype = variable.dtype.base_dtype
            if one_minus_decay.dtype != target_dtype:
                one_minus_decay = math_ops.cast(one_minus_decay, target_dtype)
            if not zero_debias:
                update_delta = (variable - value) * one_minus_decay
            else:
                update_delta = _zero_debias(variable, value, one_minus_decay)
            return state_ops.assign_sub(variable, update_delta, name=scope)
def _Update_global_variables():
    """Move local variables and global center variables towards each other.

    For every variable that has a gradient, the local center copy is first
    refreshed from the global center. The difference between the local
    variable and that copy, scaled by `self._moving_rate`, is then subtracted
    from the local variable and added to the global center. If `global_step`
    is set it is incremented as well.

    Returns:
      An op grouping all of the update ops.
    """
    trained_vars = [v for g, v in grads_and_vars if g is not None]
    global_centers = [self._global_map[v] for v in trained_vars]
    local_centers = [self._local_map[v] for v in trained_vars]

    # Refresh every local center copy from its global counterpart.
    refresh_local_centers = [
        local_center.assign(global_center)
        for local_center, global_center in zip(local_centers, global_centers)
    ]

    update_ops = []
    differences = []
    with ops.control_dependencies(refresh_local_centers):
        for trained, local_center in zip(trained_vars, local_centers):
            with ops.device(trained.device):
                differences.append(math_ops.subtract(trained, local_center))

        for trained, diff in zip(trained_vars, differences):
            with ops.device(trained.device):
                pull = math_ops.multiply(self._moving_rate, diff)
                update_ops.append(state_ops.assign_sub(trained, pull))

        for global_center, diff in zip(global_centers, differences):
            with ops.device(global_center.device):
                push = math_ops.multiply(self._moving_rate, diff)
                update_ops.append(state_ops.assign_add(global_center, push))

        if global_step:
            with ops.colocate_with(global_step):
                update_ops.append(state_ops.assign_add(global_step, 1))

    return control_flow_ops.group(*update_ops)
def testReadWrite(self):
    """Tests initialization, reading, and writing a resource variable."""
    for dtype in self.numeric_types:
        with self.test_session() as session:
            with self.test_scope():
                with variable_scope.variable_scope("ascope", use_resource=True):
                    x = variable_scope.get_variable(
                        "x",
                        shape=[],
                        dtype=dtype,
                        initializer=init_ops.constant_initializer(2))
                    # Chain read -> assign -> read -> add -> sub -> read via
                    # control dependencies so each step sees the previous one.
                    first_read = x.read_value()
                    with ops.control_dependencies([first_read]):
                        assigned = state_ops.assign(x, dtype(47))
                    with ops.control_dependencies([assigned]):
                        second_read = x.read_value()
                    with ops.control_dependencies([second_read]):
                        added = state_ops.assign_add(
                            x, np.array(6 + 2j).astype(dtype))
                    with ops.control_dependencies([added]):
                        subtracted = state_ops.assign_sub(x, dtype(3))
                    with ops.control_dependencies([subtracted]):
                        final_read = x.read_value()

            session.run(variables.global_variables_initializer())
            initial, after_assign, after_updates = session.run(
                [first_read, second_read, final_read])
            self.assertAllClose(dtype(2), initial)
            self.assertAllClose(dtype(47), after_assign)
            self.assertAllClose(
                np.array(50 + 2j).astype(dtype), after_updates)
def _apply_sparse(self, grad, var):
    """Build the sparse update for `var` with a max-norm second accumulator.

    The "m" slot is decayed and the scaled gradient values are scattered
    into the rows given by `grad.indices`. The "v" slot is decayed and then
    raised, row by row, to at least the absolute gradient values.

    Args:
      grad: Sparse gradient exposing `values` and `indices`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(self._beta1_power, dtype)
    beta2_power = math_ops.cast(self._beta2_power, dtype)
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                                use_locking=self._use_locking)

    # u_t = max(beta_2 * u_{t-1}, L1(g_t))
    # theta_t = theta_{t-1} - alpha/(1-beta_1).m_t/u_t
    v = self.get_slot(var, "v")
    # The previous code referenced undefined names (`g_t`, `beta_2`,
    # `beta_1`) and called `state_ops.assign_max` with scatter-style
    # arguments; the row-wise maximum is a scatter_max over the decayed slot.
    g_abs_values = math_ops.abs(grad.values)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    v_t = state_ops.scatter_max(v_t, grad.indices, g_abs_values,
                                use_locking=self._use_locking)

    # NOTE(review): no epsilon is added to the denominator, so rows where
    # `v_t` is zero divide by zero -- confirm whether `self._epsilon_t`
    # should be added here.
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (v_t * (1 - beta1_t)),
                                      use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t, v_t)
def _resource_apply_sparse(self, grad, var, indices):
    """Build the sparse update for the resource variable `var`.

    Both moment slots are decayed as a whole, then the gradient contribution
    is scatter-added into the rows selected by `indices`.

    Args:
      grad: Gradient values for the rows selected by `indices`.
      var: The variable being updated.
      indices: Indices of the rows of `var` that `grad` applies to.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    # Powers are recomputed from the iteration counter (1-based step).
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    epsilon_t = self._get_hyper('epsilon', var_dtype)

    step_size = lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)
    locking = self._use_locking

    # m_t = beta1 * m + (1 - beta1) * g_t
    first_moment = self.get_slot(var, 'm')
    grad_part = grad * (1 - beta_1_t)
    decayed_m = state_ops.assign(
        first_moment, first_moment * beta_1_t, use_locking=locking)
    with ops.control_dependencies([decayed_m]):
        m_t = self._resource_scatter_add(first_moment, indices, grad_part)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    second_moment = self.get_slot(var, 'v')
    squared_grad_part = (grad * grad) * (1 - beta_2_t)
    decayed_v = state_ops.assign(
        second_moment, second_moment * beta_2_t, use_locking=locking)
    with ops.control_dependencies([decayed_v]):
        v_t = self._resource_scatter_add(
            second_moment, indices, squared_grad_part)

    root_v = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, step_size * m_t / (root_v + epsilon_t), use_locking=locking)
    return control_flow_ops.group(var_update, m_t, v_t)
def assign_moving_average(variable, value, decay, name=None):
    """Return an op applying `variable -= (1 - decay) * (variable - value)`.

    Args:
      variable: The variable holding the moving average.
      value: The new value to fold into the average.
      decay: The moving average decay, a float or float tensor.
      name: Optional name for the returned op.

    Returns:
      The result of `state_ops.assign_sub` on `variable`.
    """
    inputs = [variable, value, decay]
    with ops.op_scope(inputs, name, "AssignMovingAvg") as scope_name:
        with ops.device(variable.device):
            # The tensor named "decay" actually holds `1 - decay`.
            blend = ops.convert_to_tensor(1.0 - decay, name="decay")
            wanted_dtype = variable.dtype.base_dtype
            if blend.dtype != wanted_dtype:
                blend = math_ops.cast(blend, wanted_dtype)
            correction = (variable - value) * blend
            return state_ops.assign_sub(variable, correction, name=scope_name)
def _initAssignSubFetch(self, x, y, use_gpu=False):
    """Initialize a param to `x`, compute param -= y, and fetch both values.

    Returns:
      A pair `(param_value, sub_result)`: the variable's value read after the
      subtraction ran, and the value the `assign_sub` op itself evaluated to.
    """
    with self.test_session(use_gpu=use_gpu):
        param = variables.Variable(x)
        subtract_op = state_ops.assign_sub(param, y)
        param.initializer.run()
        # Evaluate the subtraction before reading the variable back.
        sub_result = subtract_op.eval()
        return param.eval(), sub_result
def _assign_moving_average(self, variable, value, momentum):
    """Return an op applying `variable -= (1 - momentum) * (variable - value)`.

    Args:
      variable: The variable holding the moving average.
      value: The new value to fold into the average.
      momentum: The moving average momentum.

    Returns:
      The result of `state_ops.assign_sub` on `variable`.
    """
    scope_inputs = [variable, value, momentum]
    with ops.name_scope(None, 'AssignMovingAvg', scope_inputs) as scope:
        blend = ops.convert_to_tensor(1.0 - momentum, name='decay')
        wanted_dtype = variable.dtype.base_dtype
        if blend.dtype != wanted_dtype:
            blend = math_ops.cast(blend, wanted_dtype)
        return state_ops.assign_sub(
            variable, (variable - value) * blend, name=scope)
def update_fn(v, value, decay=decay):
    """Return an op folding `value` into the moving average held in `v`.

    Uses `zero_debias` and `scope` from the enclosing function.
    """
    # The tensor named "decay" actually holds `1 - decay`.
    blend = ops.convert_to_tensor(1.0 - decay, name="decay")
    wanted_dtype = v.dtype.base_dtype
    if blend.dtype != wanted_dtype:
        blend = math_ops.cast(blend, wanted_dtype)
    if not zero_debias:
        update_delta = (v - value) * blend
    else:
        update_delta = _zero_debias(v, value, blend)
    return state_ops.assign_sub(v, update_delta, name=scope)
def _apply_dense(self, grad, var):
    """Build the dense update for `var`, including the regularization step.

    Updates the "m" and "v" slots of `var`, subtracts
    `lr * m_t / (v_t ** pow + epsilon)` from `var`, and then subtracts
    `self._dense_regularization * var`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    lr = (self._lr_t * math_ops.sqrt(1 - self._beta2_power) /
          (1 - self._beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    # The new moment must be written back to the slot; previously it was
    # only computed as a tensor, so the "m" slot never changed between steps
    # (the sparse path already assigns it).
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - self._beta1_t)
    m_t = state_ops.assign(m, m * self._beta1_t + m_scaled_g_values,
                           use_locking=self._use_locking)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    # Same write-back fix for the "v" slot.
    v = self.get_slot(var, "v")
    v_scaled_g_values = tf.pow(grad, 2) * (1 - self._beta2_t)
    v_t = state_ops.assign(v, v * self._beta2_t + v_scaled_g_values,
                           use_locking=self._use_locking)

    v_sqrt = tf.pow(v_t, self._pow_t)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (v_sqrt + self._epsilon_t),
                                      use_locking=self._use_locking)

    # regularization
    var_update = state_ops.assign_sub(var_update,
                                      self._dense_regularization * var,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t, v_t)
def update_fn(v, value, biased_var, local_step):
    """Update the shadow variables and assign the debiased average to `v`.

    `decay` comes from the enclosing function and holds `1 - decay`.
    """
    biased_delta = (biased_var - value) * decay
    update_biased = state_ops.assign_sub(biased_var, biased_delta)
    update_local_step = local_step.assign_add(1)
    # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
    bias_factor = 1 - math_ops.pow(1.0 - decay, update_local_step)
    debiased = update_biased / bias_factor
    op_name = ops.get_name_scope() + "/"
    return state_ops.assign(v, debiased, name=op_name)
def _apply_sparse(self, grad, var):
    """Build the sparse update for `var`, including the regularization step.

    Both moment slots are decayed as a whole, then the gradient contribution
    is scatter-added into the rows given by `grad.indices`. The variable is
    updated with `lr * m_t / (v_t ** pow + epsilon)` and then with
    `self._sparse_regularization * var`.

    Args:
      grad: Sparse gradient exposing `values` and `indices`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    locking = self._use_locking
    step_size = (self._lr_t * math_ops.sqrt(1 - self._beta2_power) /
                 (1 - self._beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    first_moment = self.get_slot(var, "m")
    grad_part = grad.values * (1 - self._beta1_t)
    m_t = state_ops.assign(
        first_moment, first_moment * self._beta1_t, use_locking=locking)
    m_t = state_ops.scatter_add(
        m_t, grad.indices, grad_part, use_locking=locking)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    second_moment = self.get_slot(var, "v")
    squared_grad_part = (grad.values * grad.values) * (1 - self._beta2_t)
    v_t = state_ops.assign(
        second_moment, second_moment * self._beta2_t, use_locking=locking)
    v_t = state_ops.scatter_add(
        v_t, grad.indices, squared_grad_part, use_locking=locking)

    powered_v = tf.pow(v_t, self._pow_t)
    var_update = state_ops.assign_sub(
        var, step_size * m_t / (powered_v + self._epsilon_t),
        use_locking=locking)

    # regularization
    var_update = state_ops.assign_sub(
        var_update, self._sparse_regularization * var, use_locking=locking)
    return control_flow_ops.group(var_update, m_t, v_t)
def _resource_apply_dense(self, grad, var):
    """Accumulate the squared gradient and apply the scaled step to `var`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      The result of subtracting `lr * grad / (sqrt(accumulator) + epsilon)`
      from `var`.
    """
    dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(dtype)
    epsilon = self._get_hyper('epsilon', dtype)

    accumulator = self.get_slot(var, 'accumulator')
    squared_grad = math_ops.square(grad)
    acc_t = state_ops.assign_add(
        accumulator, squared_grad, use_locking=self._use_locking)

    step = lr_t * grad / (math_ops.sqrt(acc_t) + epsilon)
    return state_ops.assign_sub(var, step)
def _assign_moving_average(self, variable, value, one_minus_decay):
    """Return an op applying `variable -= (variable - value) * one_minus_decay`.

    Args:
      variable: The variable holding the moving average.
      value: The new value to fold into the average.
      one_minus_decay: The blend factor, i.e. `1 - decay`.

    Returns:
      The op performing the subtraction on `variable`.
    """
    scope_inputs = [variable, value, one_minus_decay]
    with ops.name_scope(None, 'AssignMovingAvg', scope_inputs) as scope:
        with ops.colocate_with(variable):
            correction = (variable.read_value() - value) * one_minus_decay
            if not isinstance(variable,
                              resource_variable_ops.ResourceVariable):
                return state_ops.assign_sub(variable, correction, name=scope)
            # state_ops.assign_sub does an extra read_variable_op after the
            # assign. We avoid that here.
            return gen_resource_variable_ops.assign_sub_variable_op(
                variable.handle, correction, name=scope)
def _apply_dense(self, grad, var):
    """Build the dense update for `var` using the sign of grad and of `m`.

    The "m" slot is set to `max(beta * m + eps, |grad|)`, and `var` is
    decreased by `lr * grad * (1 + alpha * sign(grad) * sign(m_t))`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the slot update.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta_t = math_ops.cast(self._beta_t, dtype)
    alpha_t = math_ops.cast(self._alpha_t, dtype)

    eps = 1e-7  # cap for moving average

    moving_max = self.get_slot(var, "m")
    m_t = moving_max.assign(
        tf.maximum(beta_t * moving_max + eps, tf.abs(grad)))

    scaled_grad = lr_t * grad
    sign_agreement = alpha_t * tf.sign(grad) * tf.sign(m_t)
    var_update = state_ops.assign_sub(
        var, scaled_grad * (1.0 + sign_agreement))

    # The grouped op finishes only after both updates have finished.
    return control_flow_ops.group(var_update, m_t)
def assign_sub(self, delta, use_locking=False):
    """Subtracts `delta` from this variable.

    Shortcut for `assign_sub(self, delta)`.

    Args:
      delta: A `Tensor`. The value to subtract from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after the
      subtraction has completed.
    """
    target = self._variable
    return state_ops.assign_sub(target, delta, use_locking=use_locking)
def _apply_dense(self, grad, var):
    """Build the dense update for `var` from the "m" slot and the gradient.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the slot update.
    """
    locking = self._use_locking

    # m_t = mu * m + (1 - mu) * g_t
    momentum_slot = self.get_slot(var, "m")
    grad_part = grad * (1 - self._mu_t)
    m_t = state_ops.assign(
        momentum_slot, momentum_slot * self._mu_t, use_locking=locking)
    m_t = state_ops.assign_add(m_t, grad_part, use_locking=locking)
    corrected_m = m_t / (1 - self._mu2_t * self._mu_power)

    # m_bar = mu * m_t + (1 - mu) * g_t
    m_bar = self._mu2_t * corrected_m + grad_part / (1 - self._mu_power)

    var_update = state_ops.assign_sub(
        var, self._lr_t * m_bar, use_locking=locking)
    return control_flow_ops.group(var_update, m_t)
def _apply_dense(self, grad, var):
    """Build the dense update for `var` from the "v" and "m" slots.

    "v" is an exponential average of the gradient; "m" is set to
    `max(beta2 * m + eps, |grad|)`. The variable is decreased by
    `lr * v_t / m_t`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)

    # Can't use 1e-8 for float16 due to underflow -- not sure if it makes a
    # big difference.
    eps = 1e-7 if var.dtype.base_dtype == tf.float16 else 1e-8

    grad_average = self.get_slot(var, "v")
    v_t = grad_average.assign(
        beta1_t * grad_average + (1. - beta1_t) * grad)

    moving_max = self.get_slot(var, "m")
    m_t = moving_max.assign(
        tf.maximum(beta2_t * moving_max + eps, tf.abs(grad)))

    normalized = v_t / m_t
    var_update = state_ops.assign_sub(var, lr_t * normalized)
    return control_flow_ops.group(var_update, m_t, v_t)
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
    """Fold `value` into the moving average stored in `variable`.

    The updated average is:
      variable * decay + value * (1 - decay)

    It is applied in place through an 'AssignSub' op:
      variable -= (1 - decay) * (variable - value)

    A variable that starts at `0` produces an average biased towards `0`.
    When `zero_debias` is true the update is scaled by the debiasing factor
      1 - decay ** num_updates
    See Section 3 of `ADAM: A Method for Stochastic Optimization`
    (https://arxiv.org/abs/1412.6980) for details.

    Args:
      variable: A Variable.
      value: A tensor with the same shape as 'variable'.
      decay: A float Tensor or float value. The moving average decay.
      zero_debias: A python bool. If true, assume the variable is
        0-initialized and unbias it, as in https://arxiv.org/abs/1412.6980.
        See the docstring of `_zero_debias` for more details.
      name: Optional name of the returned operation.

    Returns:
      An Operation that updates 'variable' with the newly computed moving
      average.
    """
    inputs = [variable, value, decay]
    with ops.name_scope(name, "AssignMovingAvg", inputs) as scope:
        with ops.colocate_with(variable):
            # The tensor named "decay" actually holds `1 - decay`.
            blend = ops.convert_to_tensor(1.0 - decay, name="decay")
            wanted_dtype = variable.dtype.base_dtype
            if blend.dtype != wanted_dtype:
                blend = math_ops.cast(blend, wanted_dtype)
            if not zero_debias:
                update_delta = (variable - value) * blend
            else:
                update_delta = _zero_debias(variable, value, blend)
            return state_ops.assign_sub(variable, update_delta, name=scope)
def _apply_sparse(self, grad, var):
    """Build the sparse-gradient update ops for `var`, printing shapes.

    Reads the "rms", "momentum" and "sparse_grad" slots of `var`, updates
    them from `grad.values` / `grad.indices`, and subtracts the resulting
    momentum term from `var`. The shape of each intermediate tensor is
    printed to stdout while the ops are being built.

    Args:
      grad: Sparse gradient exposing `values` and `indices`.
      var: The variable being updated.

    Returns:
      An op grouping `var_update`, `ms_t` and `mom_t`.
    """
    # ms_t = decay * ms + (1 - decay) * (g_t * g_t)
    ms = self.get_slot(var, "rms")  # should not be named rms when it's ms
    print('---SPARSE TIME---')
    print('lr: ' + str(self._learning_rate_tensor.get_shape()))
    print('decay: ' + str(self._decay_tensor.get_shape()))
    print('momentum: ' + str(self._momentum_tensor.get_shape()))
    print('epsilon: ' + str(self._epsilon_tensor.get_shape()))
    print('ms: ' + str(ms.get_shape()))
    print('grad.values: ' + str(grad.values.get_shape()))
    ms_scaled_g_values = (grad.values * grad.values) * \
        (1 - self._decay_tensor)
    print('ms_scaled_g_values:' + str(ms_scaled_g_values.get_shape()))
    # Decay the whole slot, then scatter-add the new squared-gradient
    # contribution into the rows named by grad.indices.
    ms_t = state_ops.assign(ms, ms * self._decay_tensor,
                            use_locking=self._use_locking)
    print('ms_t: ' + str(ms_t.get_shape()))
    ms_t = state_ops.scatter_add(ms_t, grad.indices, ms_scaled_g_values,
                                 use_locking=self._use_locking)
    print('ms_t: ' + str(ms_t.get_shape()))
    rms = math_ops.sqrt(ms_t)
    print('rms: ' + str(rms.get_shape()))
    rms += self._epsilon_tensor
    print('rms: ' + str(rms.get_shape()))
    mom = self.get_slot(var, "momentum")
    print('mom: ' + str(mom.get_shape()))
    sparse_grad = self.get_slot(var, "sparse_grad")
    # NOTE(review): this assigns the slot to itself and its result is
    # overwritten on the next statement, so "sparse_grad" is never reset
    # between steps -- confirm whether a zeroing assign was intended.
    sparse_grad_t = state_ops.assign(sparse_grad, sparse_grad,
                                     use_locking=self._use_locking)
    # NOTE(review): uses self._learning_rate here, while the shape print
    # above uses self._learning_rate_tensor -- confirm which is intended.
    sparse_grad_t = state_ops.scatter_add(sparse_grad, grad.indices,
                                          grad.values*self._learning_rate,
                                          use_locking=self._use_locking)
    mom_scaled_g_values = sparse_grad_t / rms
    # NOTE(review): the label says mom_scaled_g_values but the shape printed
    # is that of `mom`.
    print('mom_scaled_g_values: ' + str(mom.get_shape()))
    mom_t = state_ops.assign(mom, mom * self._momentum_tensor,
                             use_locking=self._use_locking)
    print('mom_t: ' + str(mom_t.get_shape()))
    # NOTE(review): presumably this rebinds mom_t to a new tensor rather
    # than writing the sum back into the "momentum" slot -- verify.
    mom_t += mom_scaled_g_values
    # mom_t = state_ops.scatter_add(mom_t, grad.indices, mom_scaled_g_values,
    #                               use_locking=self._use_locking)
    print('mom_t: ' + str(mom_t.get_shape()))
    var_update = state_ops.assign_sub(var, mom_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, ms_t, mom_t])
def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
                                cluster_centers_updated, total_counts):
    """Build the op that periodically syncs the mini-batch cluster centers.

    When mini-batch mode is on and `self._mini_batch_steps_per_iteration` is
    greater than 1, the returned op decrements `update_in_steps` by one; once
    that counter is at or below zero it instead resets the counter, copies
    `cluster_centers_updated` into `cluster_centers_var` (L2-normalized along
    dim 1 for cosine distance) and zeroes `total_counts`. Otherwise a no-op
    is returned.

    Args:
      update_in_steps: Counter variable; must not be None when syncing is
        enabled.
      cluster_centers_var: Variable receiving the synced cluster centers.
      cluster_centers_updated: Source of the updated cluster centers.
      total_counts: Variable reset to zeros on every sync.

    Returns:
      The result of the `cond` described above, or a no-op.
    """
    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
        assert update_in_steps is not None
        with ops.colocate_with(update_in_steps):

            def _f():
                # Note that there is a race condition here, so we do a best
                # effort updates here. We reset update_in_steps first so that
                # other workers don't duplicate the updates. Also we update
                # cluster_center_vars before resetting total_counts to avoid
                # large updates to cluster_centers_updated based on partially
                # updated cluster_center_vars.
                with ops.control_dependencies([
                    state_ops.assign(update_in_steps,
                                     self._mini_batch_steps_per_iteration - 1)
                ]):
                    with ops.colocate_with(
                        cluster_centers_updated, ignore_existing=True):
                        if self._distance_metric == COSINE_DISTANCE:
                            cluster_centers = nn_impl.l2_normalize(
                                cluster_centers_updated, dim=1)
                        else:
                            cluster_centers = cluster_centers_updated
                    with ops.colocate_with(cluster_centers_var):
                        with ops.control_dependencies(
                            [state_ops.assign(cluster_centers_var,
                                              cluster_centers)]):
                            with ops.colocate_with(
                                cluster_centers_var, ignore_existing=True):
                                with ops.control_dependencies([
                                    state_ops.assign(
                                        total_counts,
                                        array_ops.zeros_like(total_counts))
                                ]):
                                    return array_ops.identity(update_in_steps)

            # Sync when the counter has run out, otherwise just count down.
            return control_flow_ops.cond(
                update_in_steps <= 0, _f,
                lambda: state_ops.assign_sub(update_in_steps, 1))
    else:
        return control_flow_ops.no_op()
def _apply_dense(self, grad, var):
    """Build the dense update for `var`, tracking a running maximum of `v`.

    Updates the "m", "v" and "v_prime" slots of `var`, then subtracts
    `lr * m_t / (sqrt(v_t_prime) + epsilon)` from `var`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the three slot updates.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)
    locking = self._use_locking

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    first_moment = self.get_slot(var, "m")
    m_t = state_ops.assign(
        first_moment,
        beta1_t * first_moment + (1. - beta1_t) * grad,
        use_locking=locking)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    second_moment = self.get_slot(var, "v")
    v_t = state_ops.assign(
        second_moment,
        beta2_t * second_moment + (1. - beta2_t) * tf.square(grad),
        use_locking=locking)

    # Element-wise maximum of the stored value and the fresh v_t.
    running_max = self.get_slot(var, "v_prime")
    v_t_prime = state_ops.assign(running_max, tf.maximum(running_max, v_t))

    step = lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t)
    var_update = state_ops.assign_sub(var, step, use_locking=locking)
    return control_flow_ops.group(var_update, m_t, v_t, v_t_prime)
def _apply_dense(self, grad, var):
    """Build the dense update for `var` using the current and stored step.

    Updates the "v", "m" and "g" slots of `var` and subtracts
    `2 * lr * g_t - lr * g_t_1` from `var`, where `g_t` is the result of
    assigning the fresh `m_t / (sqrt(v_t) + eps)` into the "g" slot and
    `g_t_1` is that slot.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the three slot updates.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)

    # Can't use 1e-8 for float16 due to underflow -- not sure if it makes a
    # big difference.
    eps = 1e-7 if var.dtype.base_dtype == tf.float16 else 1e-8

    second_moment = self.get_slot(var, "v")
    v_t = second_moment.assign(
        beta2_t * second_moment + (1. - beta2_t) * tf.square(grad))
    first_moment = self.get_slot(var, "m")
    m_t = first_moment.assign(beta1_t * first_moment + (1. - beta1_t) * grad)

    # These two values are computed but not used by the update below.
    v_t_hat = tf.div(v_t, 1. - beta2_t)
    m_t_hat = tf.div(m_t, 1. - beta1_t)

    fresh_step = tf.div(m_t, tf.sqrt(v_t) + eps)
    stored_step = self.get_slot(var, "g")
    current_step = stored_step.assign(fresh_step)

    # Adam would be lr_t * g_t
    var_update = state_ops.assign_sub(
        var, 2. * lr_t * current_step - lr_t * stored_step)
    return control_flow_ops.group(var_update, m_t, v_t, current_step)
def testAssignUpdateNoVarShape(self):
    """assign_add/assign_sub report shape [1, 2] when the var has no shape."""
    var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
    updates = (
        (state_ops.assign_add, [[2.0, 3.0]]),
        (state_ops.assign_sub, [[12.0, 13.0]]),
    )
    for apply_update, delta in updates:
        result = apply_update(var, delta)
        self.assertEqual([1, 2], result.get_shape())
def _resource_apply_dense(self, grad, var):
    """Build the dense update for the resource variable `var`.

    Optionally applies a warmup/decay schedule to the learning rate and
    norm-clips the gradient, updates the "v" and "m" slots, scales the
    corrected first moment by `r_t` where `sma_t >= 5`, optionally adds
    weight-decay and L1 terms, and subtracts `lr_t * var_t` from `var`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    step, beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

    # Learning-rate schedule: scale by step / warmup_steps during warmup,
    # then move towards min_lr at decay_rate per step, capped at decay_steps.
    if self._initial_total_steps > 0:
        total_steps = math_ops.cast(self._total_steps_t, var.dtype.base_dtype)
        warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                          var.dtype.base_dtype)
        min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
        warmup_steps = total_steps * warmup_proportion
        decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
        decay_rate = (min_lr - lr_t) / decay_steps
        # NOTE(review): `step` is used here without a cast -- presumably it
        # already has var's dtype; verify against _get_beta_accumulators.
        lr_t = tf.where(
            step <= warmup_steps,
            lr_t * (step / warmup_steps),
            lr_t + decay_rate * math_ops.minimum(step - warmup_steps,
                                                 decay_steps),
        )

    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    v = self.get_slot(var, "v")
    # Optional clipping: the threshold is derived from the current "v" slot.
    if self.clip_gradients:
        clipVal = math_ops.sqrt(
            tf.reduce_sum(v) / (1.0 - beta2_power)
        ) * self.clip_multiplier_t + self.clip_epsilon_t
        grad = clip_ops.clip_by_norm(grad, clipVal)

    sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

    m = self.get_slot(var, "m")
    v_t = state_ops.assign(v,
                           beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad),
                           use_locking=self._use_locking)
    v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t
    # The first moment accumulates the gradient already divided by the
    # corrected second-moment root, not the raw gradient.
    grad_corr = grad / v_corr_t

    m_t = state_ops.assign(m,
                           beta1_t * m + (1.0 - beta1_t) * grad_corr,
                           use_locking=self._use_locking)
    m_corr_t = m_t / (1.0 - beta1_power)

    r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                        (sma_t - 2.0) / (sma_inf - 2.0) *
                        sma_inf / sma_t)

    # Apply the r_t factor only where sma_t >= 5.
    var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t, m_corr_t)

    # NOTE(review): the L1 branch is assumed to sit inside the
    # `var in self.reg_vars` check -- confirm the intended nesting.
    if var in self.reg_vars:
        if self._initial_weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t,
                                   var.dtype.base_dtype) * var
        if self._L1_decay > 0.0:
            var_t += math_ops.cast(
                self._L1_decay, var.dtype.base_dtype) * math_ops.sign(var)

    with tf.control_dependencies([var_t]):
        var_update = state_ops.assign_sub(var, lr_t * var_t,
                                          use_locking=self._use_locking)

    updates = [var_update, m_t, v_t]
    return control_flow_ops.group(*updates)
def _apply_dense(self, grad, var):
    """Subtract `lr * grad` from `var`.

    Args:
      grad: Gradient tensor for `var`.
      var: The variable being updated.

    Returns:
      An op grouping the single variable update.
    """
    learning_rate = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    step = learning_rate * grad
    return control_flow_ops.group(state_ops.assign_sub(var, step))
def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    # Parameters below are evaluated for the step about to be taken.
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
        # Note - the sparse update is not easily implemented, since the
        # algorithm needs all indices of gbar to be updated
        # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
        # One way to make mat_gbar_decay = 1 is by rescaling.
        # If we want the update:
        #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
        # define:
        #         r_{t+1} = a_{t+1} * r_t
        #         h_t = G_t / r_t
        # Then:
        #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
        # So we get the mat_gbar_decay = 1 as desired.
        # We can implement this in a future version as needed.
        # However we still need gbar_decay = 0, otherwise all indices
        # of the variable will need to be updated.
        if self._gbar_decay != 0.0:
            tf_logging.warning("Not applying momentum for variable: %s" % var.name)
        # Sparse path: the raw gradient is used in place of the average.
        gbar_updated = grad
    else:
        gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                              gbar_decay_t,
                                              gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    # One "Gbar_<i>" statistics slot per dimension of the variable.
    mat_g_list = []
    for i in range(len(shape)):
        mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    neg_alpha = -GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
        # axes is the list of indices to reduce - everything but the current i.
        axes = list(range(i)) + list(range(i + 1, v_rank))
        if shape[i] <= self._max_matrix_size:
            # If the tensor size is sufficiently small perform full Shampoo
            # update. Note if precond_update_interval > 1 and
            # mat_gbar_decay_t != 1, this is not strictly correct. However we
            # will use it for now, and fix if needed.
            # (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

            # pylint: disable=g-long-lambda,cell-var-from-loop
            mat_g_updated = control_flow_ops.cond(
                math_ops.mod(global_step, precond_update_interval) < 1,
                lambda: self._update_mat_g(
                    mat_g, grad, axes, mat_gbar_decay_t,
                    mat_gbar_weight_t * precond_update_interval, i),
                lambda: mat_g)

            mat_g_updated = mat_g_updated / float(shape[i].value)

            # Recompute the matrix power every step when the interval is 1;
            # otherwise only on matching steps, reusing the "H_<i>" slot in
            # between.
            if self._svd_interval == 1:
                mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
            else:
                mat_h = control_flow_ops.cond(
                    math_ops.mod(global_step, svd_interval) < 1,
                    lambda: self._compute_power(var, mat_g_updated, shape[
                        i], neg_alpha, "H_" + str(i)),
                    lambda: self.get_slot(var, "H_" + str(i)))

            # mat_h is a square matrix of size d_i x d_i
            # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
            # After contraction with a d_i x d_i tensor
            # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
            # (the first dimension is contracted out, and the second dimension of
            # mat_h is appended).  After going through all the indices, it becomes
            # a d_0 x ... x d_n tensor again.
            preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                     axes=([0], [0]),
                                                     name="precond_" + str(i))
        else:
            # Tensor size is too large -- perform diagonal Shampoo update
            # Only normalize non-vector cases.
            if axes:
                normalizer = 1.0 if indices is not None else float(
                    shape[i].value)
                grad_outer = math_ops.reduce_sum(grad * grad, axis=axes) / normalizer
            else:
                grad_outer = grad * grad

            if i == 0 and indices is not None:
                # The sparse scatter update below requires a decay of 1.
                assert self._mat_gbar_decay == 1.0
                mat_g_updated = state_ops.scatter_add(
                    mat_g, indices, mat_gbar_weight_t * grad_outer)
                mat_h = math_ops.pow(
                    array_ops.gather(mat_g_updated, indices) + self._epsilon,
                    neg_alpha)
            else:
                mat_g_updated = self._weighted_average(
                    mat_g, self._mat_gbar_decay, mat_gbar_decay_t,
                    mat_gbar_weight_t * grad_outer)
                mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)

            # Need to do the transpose to ensure that the tensor becomes
            # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
            preconditioned_grad = array_ops.transpose(
                preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
        var_updated = state_ops.scatter_add(
            var, indices, -learning_rate_t * preconditioned_grad)
    else:
        var_updated = state_ops.assign_sub(
            var, learning_rate_t * preconditioned_grad)
    return var_updated
def _zero_debias(unbiased_var, value, decay):
    """Compute the delta required for a debiased Variable.

    All exponential moving averages initialized with Tensors are initialized
    to 0, and therefore are biased to 0. Variables initialized to 0 and used
    as EMAs are similarly biased. This function creates the debias updated
    amount according to a scale factor, as in https://arxiv.org/abs/1412.6980.

    To demonstrate the bias that results from 0-initialization, take an EMA
    that was initialized to `0` with decay `b`. After `t` timesteps of seeing
    the constant `c`, the variable has the following value:

    ```
      EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
          = c*(1 - b^t)
    ```

    To have the true value `c`, we would divide by the scale factor `1 - b^t`.

    In order to perform debiasing, we use two shadow variables. One keeps
    track of the biased estimate, and the other keeps track of the number of
    updates that have occurred.

    Args:
      unbiased_var: A Variable representing the current value of the unbiased
        EMA.
      value: A Tensor representing the most recent value.
      decay: A Tensor representing `1-decay` for the EMA.

    Returns:
      The amount that the unbiased variable should be updated. Computing this
      tensor will also update the shadow variables appropriately.
    """
    with variable_scope.variable_scope(
            unbiased_var.op.name,
            values=[unbiased_var, value, decay]) as scope:
        with ops.colocate_with(unbiased_var):
            with ops.control_dependencies(None):
                biased_initializer = init_ops.zeros_initializer(
                    dtype=unbiased_var.dtype)(unbiased_var.get_shape())
                # The step counter is incremented *before* it is read below,
                # so it must start at 0 for the first update to use t = 1
                # (scale factor `1 - b`). Starting at 1 would make every
                # correction use `1 - b^(t+1)` and under-correct the bias.
                local_step_initializer = init_ops.zeros_initializer()
            biased_var = variable_scope.get_variable(
                "biased", initializer=biased_initializer, trainable=False)
            local_step = variable_scope.get_variable(
                "local_step",
                shape=[],
                dtype=unbiased_var.dtype,
                initializer=local_step_initializer,
                trainable=False)

            # Get an update ops for both shadow variables.
            update_biased = state_ops.assign_sub(
                biased_var, (biased_var - value) * decay, name=scope.name)
            update_local_step = local_step.assign_add(1)

            # Compute the value of the delta to update the unbiased EMA. Make
            # sure to use the new values of the biased variable and the local
            # step.
            with ops.control_dependencies([update_biased, update_local_step]):
                # This function gets `1 - decay`, so use `1.0 - decay` in the
                # exponent.
                unbiased_ema_delta = (
                    unbiased_var - biased_var.read_value() /
                    (1 - math_ops.pow(1.0 - decay, local_step.read_value())))

            return unbiased_ema_delta
def testInitRequiredAssignSub(self):
    """Running assign_sub on an uninitialized variable raises an op error."""
    dims = [1024, 1024]
    with self.cached_session():
        uninitialized = variables.VariableV1(
            array_ops.fill(dims, 1), dtypes.int32)
        subtract_op = state_ops.assign_sub(
            uninitialized, array_ops.fill(dims, 0))
        # The variable's initializer is never run, so executing the update
        # is expected to fail.
        with self.assertRaisesOpError("use uninitialized"):
            subtract_op.op.run()
def _finish(self, update_ops, name_scope):
    """Apply the cached steps to the variables and refresh saved state.

    Each element of `update_ops` is a list whose first item is a mutable
    per-variable cache; the remaining items are grouped together with that
    variable's update. As unpacked below, a dense cache arrives as
    `[s_t, x_tm1, g_t]` and a sparse cache as `[s_t_, x_tm1, idxs, g_t_]`
    (step, variable, optional indices, and presumably the gradient, since it
    is what gets stored in the 'g' slot). The updated variable `x_t` is
    appended to every cache here, which is why later sections distinguish
    dense from sparse by `len(cache) == 4`.

    Args:
      update_ops: list of `[cache, op, ...]` lists, one per variable.
      name_scope: name given to the returned grouped op.

    Returns:
      An op grouping the variable updates and every follow-up slot update.
    """
    caches = [update_op[0] for update_op in update_ops]
    update_ops = [update_op[1:] for update_op in update_ops]

    # Optionally perturb every step with Gaussian noise.
    if self._noise is not None:
        for cache in caches:
            s_t, x_tm1 = cache[:2]
            s_t += random_ops.random_normal(
                x_tm1.initialized_value().get_shape(), stddev=self._noise)
            cache[0] = s_t

    # Optionally clip all steps jointly by their global norm.
    if self._clip is not None:
        S_t = [cache[0] for cache in caches]
        S_t, _ = clip_ops.clip_by_global_norm(S_t, self._clip)
        for cache, s_t in zip(caches, S_t):
            cache[0] = s_t

    # Subtract the (possibly noised / clipped) step from each variable.
    new_update_ops = []
    for cache, update_op in zip(caches, update_ops):
        if len(cache) == 3:
            s_t, x_tm1 = cache[:2]
            with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                    x_tm1.device):
                x_t = state_ops.assign_sub(
                    x_tm1, s_t, use_locking=self._use_locking)
                cache.append(x_t)
        else:
            s_t_, x_tm1, idxs = cache[:3]
            with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                    x_tm1.device):
                x_t = state_ops.scatter_sub(
                    x_tm1, idxs, s_t_, use_locking=self._use_locking)
                cache.append(x_t)
        new_update_ops.append(control_flow_ops.group(*([x_t] + update_op)))

    # Everything below must run only after the variables have been updated.
    with ops.control_dependencies(new_update_ops):
        more_update_ops = []

        # Save the negated step into the 's' slot.
        if self._save_step:
            for cache in caches:
                if len(cache) == 4:
                    s_t, x_tm1 = cache[:2]
                    s_tm1 = self.get_slot(x_tm1, 's')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        s_t = state_ops.assign(
                            s_tm1, -s_t, use_locking=self._use_locking)
                else:
                    s_t_, x_tm1, idxs = cache[:3]
                    s_tm1 = self.get_slot(x_tm1, 's')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        s_t = state_ops.scatter_update(
                            s_tm1, idxs, -s_t_, use_locking=self._use_locking)
                more_update_ops.append(s_t)

        # Save the third / fourth cache entry into the 'g' slot.
        if self._save_grad:
            for cache in caches:
                if len(cache) == 4:
                    x_tm1, g_t = cache[1:3]
                    g_tm1 = self.get_slot(x_tm1, 'g')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        g_t = state_ops.assign(
                            g_tm1, g_t, use_locking=self._use_locking)
                else:
                    x_tm1, idxs, g_t_ = cache[1:4]
                    g_tm1 = self.get_slot(x_tm1, 'g')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        g_t = state_ops.scatter_update(
                            g_tm1, idxs, g_t_, use_locking=self._use_locking)
                more_update_ops.append(g_t)

        # Maintain a moving average of the updated variables under 'x'.
        if self._chi > 0:
            for cache in caches:
                if len(cache) == 4:
                    _, x_tm1, _, x_t = cache
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        x_and_t = self._dense_moving_average(
                            x_tm1, x_t, 'x', self._chi)
                        more_update_ops.append(
                            control_flow_ops.group(*x_and_t))
                else:
                    _, x_tm1, idxs, _, x_t = cache
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        x_t_ = array_ops.gather(x_t, idxs)
                        x_and_t = self._sparse_moving_average(
                            x_tm1, idxs, x_t_, 'x', self._chi)
                        more_update_ops.append(
                            control_flow_ops.group(*x_and_t))

    return control_flow_ops.group(
        *(new_update_ops + more_update_ops), name=name_scope)
def _apply_dense(self, grad, var):
    """Build the dense update for `var` from a window of stored gradients.

    The slots "g0" .. "g<keep_num>" hold past gradients. The second moment
    `v` is updated from the oldest one (`g[0]`), optionally reduced to its
    max or mean, before the window is shifted and `grad` is stored in the
    newest slot. The step direction is a weighted sum of stored gradients
    using `self.s`.

    Args:
      grad: gradient tensor for `var`.
      var: the variable to update.

    Returns:
      An op that applies the update to `var`.

    Raises:
      ValueError: if `self._pred_g_op` is not 'none', 'max' or 'mean'.
    """
    # Validate with a real exception: an `assert` disappears under `-O`,
    # which would leave `v_t` unbound further down.
    if self._pred_g_op not in ('none', 'max', 'mean'):
        raise ValueError(
            "Unsupported pred_g_op %r; expected 'none', 'max' or 'mean'."
            % (self._pred_g_op,))

    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    g = [self.get_slot(var, "g%d" % i) for i in range(self._keep_num + 1)]
    v = self.get_slot(var, "v")
    z = self.get_slot(var, "z")
    b2p = self.get_slot(var, "b2p")

    # Second-moment estimate from the oldest stored gradient.
    g_sq = tf.square(g[0])
    if self._pred_g_op == 'max':
        g_sq = tf.reduce_max(g_sq)
    elif self._pred_g_op == 'mean':
        g_sq = tf.reduce_mean(g_sq)
    v_t = state_ops.assign(v, v * beta2_t + g_sq * (1 - beta2_t),
                           use_locking=self._use_locking)

    # Shift the gradient window only after `v` has consumed the old g[0];
    # each assign is chained on the previous one to fix the order.
    with ops.control_dependencies([v_t]):
        g_t = state_ops.assign(g[-1], grad, use_locking=self._use_locking)
        for i in range(self._keep_num):
            with ops.control_dependencies([g_t]):
                g_t = state_ops.assign(g[i], g[i + 1],
                                       use_locking=self._use_locking)

    # Weighted sum of stored gradients, read after the shift.
    with ops.control_dependencies([g_t]):
        m_t = tf.reduce_sum(
            [g[-i - 2] * self.s[-i - 1] for i in range(self._mov_num)],
            axis=0)

    # z is 1 for entries whose v has ever been positive, else 0.
    with ops.control_dependencies([v_t]):
        z_t = state_ops.assign(
            z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32))

    # Running beta2 power; it only starts decaying once z is active.
    b2p_t = state_ops.assign(
        b2p, b2p * beta2_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)),
        use_locking=self._use_locking)
    b2_fix = tf.maximum(1e-8, 1.0 - b2p_t)

    step_t = z_t * m_t / (math_ops.sqrt(v_t / b2_fix) + epsilon_t)
    var_update = state_ops.assign_sub(var, lr_t * step_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(var_update)
def run_and_check():
    """Exercise assign / assign_add / assign_sub on the enclosing `x`.

    Relies on `self`, `x`, `v1` and `v2` from the enclosing scope. The
    checks run in a fixed order because each one depends on the value the
    previous update left in `x`.
    """
    check = self.assertAllClose
    run = self.evaluate
    dtype_error = (
        'conversion requested dtype float32 for Tensor with dtype float16')

    # float32 values are accepted.
    check(3.14, run(x.assign(v1)))
    check(3.14 * 2, run(x.assign_add(v1)))
    check(3.14, run(x.assign_sub(v1)))

    # float16 values are rejected by every update method.
    for update in (x.assign, x.assign_add, x.assign_sub):
        with self.assertRaisesRegexp(ValueError, dtype_error):
            run(update(v2))

    # Plain Python floats are accepted.
    check(0., run(x.assign(0.)))
    check(3.14, run(x.assign(3.14)))
    check(3.14 * 2, run(x.assign_add(3.14)))
    check(3.14, run(x.assign_sub(3.14)))

    # Updates can be chained on the result of a previous update.
    first_assign = x.assign(1.)
    check(1., run(first_assign))
    check(0., run(first_assign.assign(0.)))
    first_add = x.assign_add(3.14)
    check(3.14, run(first_add))
    check(3.14 * 3, run(x.assign_add(3.14).assign_add(3.14)))
    check(3.14 * 3, x)
    first_sub = x.assign_sub(3.14)
    check(3.14 * 2, run(first_sub))
    check(0., run(x.assign_sub(3.14).assign_sub(3.14)))

    # With read_value=False the update evaluates to None.
    self.assertIsNone(run(x.assign(1., read_value=False)))
    check(1., run(x))
    self.assertIsNone(run(x.assign_add(2., read_value=False)))
    check(3., run(x))
    self.assertIsNone(run(x.assign_sub(3., read_value=False)))
    check(0., run(x))

    # The module-level state_ops functions behave like the methods.
    check(0., run(state_ops.assign(x, 0.)))
    check(3.14, run(state_ops.assign(x, 3.14)))
    check(3.14 * 2, run(state_ops.assign_add(x, 3.14)))
    check(3.14, run(state_ops.assign_sub(x, 3.14)))
def update_fn(v, value):
    """Move `v` toward `value` by the fraction `decay` (from the closure)."""
    correction = (v - value) * decay
    return state_ops.assign_sub(v, correction, name=scope)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    """Build the sparse update op for `var`.

    Decays the 'm' and 'v' slots everywhere, scatter-adds the new gradient
    contribution at `indices`, optionally projects the step via
    `self._projection`, applies weight decay, and subtracts the step from
    `var`.

    Args:
      grad: gradient values for the rows selected by `indices`
        (presumably the values of an IndexedSlices; confirm with caller).
      var: the variable to update.
      indices: indices into `var` that `grad` applies to.
      apply_state: optional dict keyed by `(device, dtype)` holding
        precomputed coefficients; falls back to
        `self._fallback_apply_state` when missing.

    Returns:
      An op grouping the variable update with the `m` and `v` slot updates.
    """
    var_device, var_dtype = var.device, var.dtype.base_dtype
    # Use the cached coefficients for this (device, dtype) when available.
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    """ Adam """
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m_t = state_ops.assign(m, m * coefficients['beta_1_t'],
                           use_locking=self._use_locking)
    # The scatter-add must see the decayed slot, hence the dependency.
    with ops.control_dependencies([m_t]):
        m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v_t = state_ops.assign(v, v * coefficients['beta_2_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
        v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    # Bias-corrected denominator and step size.
    denorm = (math_ops.sqrt(v_t) / math_ops.sqrt(
        coefficients['bias_correction2'])) + coefficients['epsilon']
    step_size = coefficients['lr'] / coefficients['bias_correction1']

    if self.nesterov:
        p_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
        perturb = m_t * coefficients['beta_1_t']
        # NOTE(review): `perturb` here is the result of a multiplication,
        # not a slot variable like `m` / `v` above; confirm that
        # `_resource_scatter_add` accepts it.
        perturb = self._resource_scatter_add(perturb, indices,
                                             p_scaled_g_values) / denorm
    else:
        perturb = m_t / denorm

    # Projection
    wd_ratio = 1
    # Only variables with more than one dimension are projected.
    if len(var.shape) > 1:
        perturb, wd_ratio = self._projection(var, grad, perturb,
                                             coefficients['delta'],
                                             coefficients['wd_ratio'],
                                             coefficients['epsilon'])

    # Weight decay
    if self.weight_decay > 0:
        # `var` is rebound to the result of the decay assign, so the
        # subtraction below is built on top of it.
        var = state_ops.assign(
            var,
            var * (1 - coefficients['lr'] * coefficients['weight_decay'] *
                   wd_ratio),
            use_locking=self._use_locking)

    var_update = state_ops.assign_sub(var, step_size * perturb,
                                      use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
def _apply_dense(self, grad, var):
    """Build the dense update for `var` using the previous step's gradient.

    The first moment `m` is updated from the current `grad`, while the
    second moment `v` is updated from the gradient stored in slot "g"
    (optionally reduced to its max or mean). `grad` is written to "g" only
    after `v` has been updated.

    Args:
      grad: gradient tensor for `var`.
      var: the variable to update.

    Returns:
      An op grouping the variable update and the stored-gradient update.

    Raises:
      ValueError: if `self._pred_g_op` is not 'none', 'max' or 'mean'.
    """
    # Validate with a real exception: an `assert` disappears under `-O`,
    # which would leave `v_t` unbound further down.
    if self._pred_g_op not in ('none', 'max', 'mean'):
        raise ValueError(
            "Unsupported pred_g_op %r; expected 'none', 'max' or 'mean'."
            % (self._pred_g_op,))

    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    g = self.get_slot(var, "g")
    z = self.get_slot(var, "z")
    b1p = self.get_slot(var, "b1p")
    b2p = self.get_slot(var, "b2p")

    m_t = state_ops.assign(m, beta1_t * m + grad * (1 - beta1_t),
                           use_locking=self._use_locking)

    # Second-moment estimate from the stored (previous) gradient.
    g_sq = tf.square(g)
    if self._pred_g_op == 'max':
        g_sq = tf.reduce_max(g_sq)
    elif self._pred_g_op == 'mean':
        g_sq = tf.reduce_mean(g_sq)
    v_t = state_ops.assign(v, v * beta2_t + g_sq * (1 - beta2_t),
                           use_locking=self._use_locking)

    with ops.control_dependencies([v_t]):
        # z is 1 for entries whose v has ever been positive, else 0.
        z_t = state_ops.assign(
            z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32))
        # Overwrite the stored gradient only after `v` has consumed it.
        g_t = state_ops.assign(g, grad, use_locking=self._use_locking)

    # Running beta powers; they only start decaying once z is active.
    b1p_t = state_ops.assign(
        b1p, b1p * beta1_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)),
        use_locking=self._use_locking)
    b2p_t = state_ops.assign(
        b2p, b2p * beta2_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)),
        use_locking=self._use_locking)
    b1_fix = tf.maximum(1e-8, 1.0 - b1p_t)
    b2_fix = tf.maximum(1e-8, 1.0 - b2p_t)

    step_t = z_t * (m_t / b1_fix) / (
        math_ops.sqrt(v_t / b2_fix) + epsilon_t)
    var_update = state_ops.assign_sub(var, lr_t * step_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, g_t])
def _resource_apply_dense(self, grad, var):
    """Build the dense update op for `var`.

    Updates the 'm' and 'v' slots (and 'vhat' when `self._amsgrad` is set),
    applies the variance-rectification term once `sma_t > 5`, optionally
    adds weight decay, and subtracts the scaled step from `var`. When
    `self._initial_total_steps > 0` the learning rate is first warmed up and
    then decayed towards `min_lr`.

    Args:
      grad: gradient tensor for `var`.
      var: the variable to update.

    Returns:
      An op grouping the variable update with all slot updates.
    """
    dtype = var.dtype.base_dtype
    step, beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, dtype)
    beta2_power = math_ops.cast(beta2_power, dtype)
    lr_t = math_ops.cast(self._lr_t, dtype)

    # Optional warm-up / decay schedule for the learning rate.
    if self._initial_total_steps > 0:
        total_steps = math_ops.cast(self._total_steps_t, dtype)
        warmup_proportion = math_ops.cast(self._warmup_proportion_t, dtype)
        min_lr = math_ops.cast(self._min_lr_t, dtype)
        warmup_steps = total_steps * warmup_proportion
        in_warmup = step <= warmup_steps
        warmup_lr = lr_t * (step / warmup_steps)
        decayed_lr = min_lr + (lr_t - min_lr) * (
            1.0 - math_ops.minimum(step, total_steps) / total_steps)
        lr_t = tf.where(in_warmup, warmup_lr, decayed_lr)

    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)

    # Length of the approximated simple moving average.
    sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

    # First moment and its bias correction.
    m = self.get_slot(var, "m")
    m_t = state_ops.assign(m, beta1_t * m + (1.0 - beta1_t) * grad,
                           use_locking=self._use_locking)
    m_corr_t = m_t / (1.0 - beta1_power)

    # Second moment; with amsgrad the running maximum is used instead.
    v = self.get_slot(var, "v")
    v_t = state_ops.assign(
        v, beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad),
        use_locking=self._use_locking)
    slot_updates = [m_t, v_t]
    second_moment = v_t
    if self._amsgrad:
        vhat = self.get_slot(var, 'vhat')
        vhat_t = state_ops.assign(vhat, math_ops.maximum(vhat, v_t),
                                  use_locking=self._use_locking)
        slot_updates.append(vhat_t)
        second_moment = vhat_t
    v_corr_t = math_ops.sqrt(second_moment / (1.0 - beta2_power) + epsilon_t)

    # Rectification term; only used when sma_t exceeds 5.
    r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                        (sma_t - 2.0) / (sma_inf - 2.0) *
                        sma_inf / sma_t)
    var_t = tf.where(sma_t > 5.0, r_t * m_corr_t / v_corr_t, m_corr_t)

    if self._initial_weight_decay > 0.0:
        var_t = var_t + math_ops.cast(self._weight_decay_t, dtype) * var

    var_update = state_ops.assign_sub(var, lr_t * var_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*([var_update] + slot_updates))
def sub(ref1, value1):
    """Return the op that subtracts `value1` from `ref1` in place."""
    subtract_op = state_ops.assign_sub(ref1, value1)
    return subtract_op
def testAssignUpdateNoValueShape(self):
    """With a shapeless value, the update result takes the variable's shape."""
    var = state_ops.variable_op([1, 2], dtypes.float32)
    for update in (state_ops.assign_add, state_ops.assign_sub):
        result = update(var, self._NewShapelessTensor())
        self.assertEqual([1, 2], result.get_shape())
def testAssignUpdateNoShape(self):
    """With neither shape known, the update result has an unknown shape."""
    var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
    for update in (state_ops.assign_add, state_ops.assign_sub):
        result = update(var, self._NewShapelessTensor())
        self.assertEqual(tensor_shape.unknown_shape(), result.get_shape())
def _apply_dense(self, grad, var):
    """Build the dense update for `var` with masked moment updates.

    `m` is only updated where `grad` is non-zero, and `v` / `b2p` only where
    the second-moment input `gs` is non-zero. `gs` is the element-wise max
    of the stored gradient's square and the mean of that square. `grad` is
    written to the "g" slot after `v` has been updated.

    Args:
      grad: gradient tensor for `var`.
      var: the variable to update.

    Returns:
      An op grouping the variable update and the stored-gradient update.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)

    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    g = self.get_slot(var, "g")
    b2p = self.get_slot(var, "b2p")

    # 1 where the gradient is non-zero, 0 elsewhere.
    grad_mask = tf.abs(tf.sign(grad))
    m_blend = beta1_t * m + grad * (1.0 - beta1_t)
    m_t = state_ops.assign(m, m_blend * grad_mask + m * (1.0 - grad_mask),
                           use_locking=self._use_locking)

    # Second-moment input, floored by the mean squared stored gradient.
    gs = tf.maximum(tf.reduce_mean(tf.square(g)), tf.square(g))
    gs_mask = tf.abs(tf.sign(gs))
    v_blend = v * beta2_t + gs * (1.0 - beta2_t)
    v_t = state_ops.assign(v, v_blend * gs_mask + v * (1.0 - gs_mask),
                           use_locking=self._use_locking)
    b2p_t = state_ops.assign(b2p, b2p * beta2_t * gs_mask + (1.0 - gs_mask),
                             use_locking=self._use_locking)
    b2_fix = tf.maximum(1.0 - self._beta2, 1.0 - b2p_t)

    # Overwrite the stored gradient only after `v` has consumed it.
    with ops.control_dependencies([v_t]):
        g_t = state_ops.assign(g, grad, use_locking=self._use_locking)

    step_t = (m_t / (math_ops.sqrt(v_t / b2_fix) + epsilon_t)
              * gs_mask * grad_mask)

    # Debug printing for one specifically named variable.
    if 'discriminator67345715' in var.name:
        probes = (
            (' cond1:', lambda _: grad_mask[0]),
            (' cond2:', lambda _: gs_mask[0]),
            (' b2_fix:', lambda _: b2_fix[0]),
            (' grad:', lambda _: grad[0]),
            (' m_t:', lambda _: m_t[0]),
            (' v_t_fix:', lambda _: math_ops.sqrt(v_t / b2_fix)[0]),
            (' step_t:', lambda current: current[0]),
            (' max_step_t:', lambda current: tf.reduce_max(current)),
        )
        # Each Print wraps the previous one, so the last two probes read
        # the already-wrapped step tensor.
        for label, probe in probes:
            step_t = tf.Print(step_t, [probe(step_t)], var.name + label)

    var_update = state_ops.assign_sub(var, lr_t * step_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(var_update, g_t)
def _finish(self, state): var_dtype = self._variables[0].dtype.base_dtype # Update global step. global_step = self._get_global_step(state) update_global_step = state_ops.assign_add(global_step, 1.) # Update the first moment estimate. beta1 = state.get_hyper("beta1", dtype=var_dtype) moment1 = self._get_moment1(state) flat_grad = self._get_flat_grad(state) # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t update_moment1 = moment1.assign(beta1 * moment1 + (1. - beta1) * flat_grad) # Update the gradient buffer. window = state.get_hyper("window") grad_buffer = self._get_grad_buffer(state) next_grad_index = math_ops.floormod( math_ops.to_int32(update_global_step - 1.), window) # grad_buffer[(t-1) % window] := moment1_t update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index, update_moment1) # Compute the update step. eps = state.get_hyper("eps", dtype=var_dtype) svd_eps = state.get_hyper("svd_eps", dtype=var_dtype) sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype) lr = state.get_hyper("lr", dtype=var_dtype) denom = math_ops.sqrt( math_ops.minimum( ops.convert_to_tensor(update_global_step), ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype)))) moment1_2d = array_ops.expand_dims(update_moment1, -1) # m = grad_buffer^T / sqrt(min(t, window)) # m has shape [model dimension, window], where model dimension is the sum # of the dimensions of the flattened variables. m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom)) # sigma, u, _ = SVD(m^Tm + I * svd_eps) mm = math_ops.matmul(m, m, transpose_a=True) damping = math_ops.cast(linalg_ops.eye(window), dtype=var_dtype) * svd_eps sigma, u, _ = linalg_ops.svd(mm + damping) sigma_sqrt = math_ops.sqrt(sigma) sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt) # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3 # We add sigma_eps to alleviate numerical instability. # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T. 
sigma_sqrt_inv = math_ops.divide( math_ops.cast(1.0, dtype=var_dtype), math_ops.pow(sigma_sqrt + sigma_eps, 3)) # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the # inversion of a model dimension by model dimension matrix is needed. To # speed up this computation we calculate the following instead: # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1. new_step = array_ops.expand_dims( array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1) head = math_ops.matmul( m, math_ops.matmul( u, math_ops.matmul( array_ops.diag(sigma_sqrt_inv), math_ops.matmul(u, math_ops.matmul(m, moment1_2d, transpose_a=True), transpose_a=True)))) # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using # Woodbury's identity. # For full derivation please see paper at # https://arxiv.org/pdf/1806.02958.pdf tail = moment1_2d - math_ops.matmul( m, math_ops.matmul( u, math_ops.matmul( array_ops.diag( math_ops.divide(math_ops.cast(1.0, dtype=var_dtype), sigma)), math_ops.matmul(u, math_ops.matmul( m, moment1_2d, transpose_a=True), transpose_a=True)))) scaled_tail = math_ops.divide(tail, sigma_sqrt_min) update_new_step = control_flow_ops.cond( sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail), lambda: math_ops.add(new_step, head)) # Update each variable. update_step = [] for var in self._variables: dim = self.shape_dict[var.name] start_index = self.index_dict[var.name] end_index = start_index + dim var_update_correct_shape = array_ops.reshape( update_new_step[start_index:end_index], var.get_shape()) var_updated = state_ops.assign_sub(var, lr * var_update_correct_shape) update_step.append(var_updated) return control_flow_ops.group(update_step)
def update_loss_scale(self, if_finite_grads):
    """Return an op that adjusts the loss scale by one.

    Args:
      if_finite_grads: boolean predicate tensor passed to `cond`.

    Returns:
      An op that adds 1 to `self._loss_scale` when the predicate is true
      and subtracts 1 otherwise.
    """

    def _increase():
        return state_ops.assign_add(self._loss_scale, 1)

    def _decrease():
        return state_ops.assign_sub(self._loss_scale, 1)

    return control_flow_ops.cond(if_finite_grads, _increase, _decrease)
class INNAOptimizer(optimizer.Optimizer):
    """Optimizer that implements the INNA algorithm.

    See [Castera et al., 2019](https://arxiv.org/abs/1905.12278).
    """

    def __init__(self, lr=0.01, alpha=0.5, beta=0.1, decay=1.,
                 decaypower=0.5, speed_ini=1.0, epsilon=1e-8,
                 use_locking=False, name="INNA"):
        """Store the hyper-parameters; tensors are built in `_prepare()`.

        Args:
          lr: base step size.
          alpha: the `alpha` coefficient of the update rule.
          beta: the `beta` coefficient of the update rule.
          decay: multiplier of the step size.
          decaypower: exponent of the step-size decay; the effective step
            is `lr * decay / (num_iter() + 1) ** decaypower`.
          speed_ini: scale of the initial speed used on the first iteration.
          epsilon: small constant (converted to a tensor in `_prepare`).
          use_locking: forwarded to the base optimizer.
          name: forwarded to the base optimizer.
        """
        super(INNAOptimizer, self).__init__(use_locking, name)
        self._iterations = 0
        self._lr = lr
        self._alpha = alpha
        self._beta = beta
        self._epsilon = epsilon
        self._decay = decay
        self._decaypower = decaypower
        self._speed_ini = speed_ini

        # Tensor versions of the constructor arguments, created in _prepare().
        self._lr_t = None
        self._alpha_t = None
        self._beta_t = None
        self._epsilon_t = None
        self._decay_t = None
        self._decaypower_t = None
        self._speed_ini_t = None

    def _create_slots(self, var_list):
        """Create the zero-initialized auxiliary slot "v1" for each variable."""
        for v in var_list:
            self._zeros_slot(v, "v1", self._name)

    def _prepare(self):
        """Convert the hyper-parameters to tensors.

        Callable hyper-parameters are resolved first, and the *resolved*
        values are converted. Converting the raw attributes would hand a
        callable to `convert_to_tensor`.
        """
        lr = self._call_if_callable(self._lr)
        alpha = self._call_if_callable(self._alpha)
        beta = self._call_if_callable(self._beta)
        epsilon = self._call_if_callable(self._epsilon)
        decay = self._call_if_callable(self._decay)
        decaypower = self._call_if_callable(self._decaypower)
        speed_ini = self._call_if_callable(self._speed_ini)

        self._lr_t = ops.convert_to_tensor(lr, name="lr")
        self._alpha_t = ops.convert_to_tensor(alpha, name="alpha")
        self._beta_t = ops.convert_to_tensor(beta, name="beta")
        self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
        self._decay_t = ops.convert_to_tensor(decay, name="decay")
        self._decaypower_t = ops.convert_to_tensor(
            decaypower, name="decaypower")
        self._speed_ini_t = ops.convert_to_tensor(speed_ini, name="speed_ini")

    def _apply_dense(self, grad, var):
        """Build the update ops for the auxiliary slot and for `var`."""
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
        beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
        decay_t = math_ops.cast(self._decay_t, var.dtype.base_dtype)
        decaypower_t = math_ops.cast(self._decaypower_t, var.dtype.base_dtype)
        speed_ini_t = math_ops.cast(self._speed_ini_t, var.dtype.base_dtype)

        v = self.get_slot(var, "v1")

        # Initialise v such that the initial speed is in the direction of
        # -grad. Equivalent eager logic:
        #   if k == 0:
        #       v_temp = (1 - alpha*beta) * var - beta**2 * grad
        #                + beta * speed_ini * grad
        #   else:
        #       v_temp = v
        v_temp = cond(
            equal(num_iter(), 0),
            lambda: ((1. - alpha_t * beta_t) * var - beta_t ** 2 * grad
                     + beta_t * speed_ini_t * grad),
            lambda: v)

        # psi_{k+1} = psi_k + gamma_k ((1/beta - alpha) theta_k - 1/beta psi_k)
        #           = psi_k - gamma_k ((alpha - 1/beta) theta_k + 1/beta psi_k)
        v_t = v.assign(
            v_temp
            - (lr_t * decay_t / math_ops.pow(
                math_ops.cast(num_iter() + 1, var.dtype.base_dtype),
                decaypower_t))
            * ((alpha_t - 1. / beta_t) * var + 1. / beta_t * v_temp))

        # Update 'ref' by subtracting 'value'.
        var_update = state_ops.assign_sub(
            var,
            (lr_t * decay_t / math_ops.pow(
                math_ops.cast(num_iter() + 1, var.dtype.base_dtype),
                decaypower_t))
            * ((alpha_t - 1. / beta_t) * var + 1. / beta_t * v_temp
               + beta_t * grad))
        # NOTE(review): the visible source is cut off at this point, before
        # any return statement. Restore the remainder of this method (it
        # should hand `var_update` / `v_t` back to the caller) from the
        # original file.
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Build the dense update op for `var`.

    Pre-processes the gradient (clipping, normalization), updates the
    'prev_m', 'm', 'v' and 'vhat' slots, forms the update from the combined
    first-moment term and the running-max second moment, adds the scaled
    weight-decay term, subtracts the result from `var`, and finally chains
    the look-ahead op after the training op.

    Args:
      grad: gradient tensor for `var`.
      var: the variable to update.
      apply_state: optional dict keyed by `(device, dtype)` holding
        precomputed coefficients; falls back to
        `self._fallback_apply_state` when missing.

    Returns:
      An op grouping the training op and the look-ahead op.
    """
    var_device, var_dtype = var.device, var.dtype.base_dtype
    # Use the cached coefficients for this (device, dtype) when available.
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    # Pre-bind the epsilon / softplus settings of the zero-guard helper.
    non_zero = partial(self._non_zero,
                       epsilon=coefficients['epsilon'],
                       use_softplus=self.use_softplus,
                       beta_softplus=coefficients['beta_softplus'])

    # prepares gradient
    grad = self._gradient_clipping(grad, var, non_zero,
                                   coefficients['eps_clipping'],
                                   coefficients['threshold_clipping'])
    grad = self._gradient_normalization(grad, non_zero,
                                        self.centralize_gradients,
                                        self.normalize_gradients)

    # first moment estimation
    # using positive-negative momentum and bias correction
    prev_m = self.get_slot(var, 'prev_m')
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta1_squared_t']
    prev_m_values = coefficients['beta1_squared'] * prev_m
    # prev_m takes the current m, while the new m is built from the old
    # prev_m: the two slots swap roles every step.
    # NOTE(review): `prev_m_values` and `m_beta` read the slots without
    # control dependencies on the assigns around them; confirm the intended
    # read-before-write order is guaranteed.
    prev_m_t = state_ops.assign(prev_m, m, use_locking=self._use_locking)
    m_beta = coefficients['beta_3_t'] * m
    m_t = state_ops.assign(m,
                           prev_m_values + m_scaled_g_values,
                           use_locking=self._use_locking)
    m_ema = coefficients['one_plus_beta_3_t'] * m_t - m_beta
    m_ema_corr = m_ema / coefficients['one_minus_beta_1_power']

    # second moment estimation
    # using positive-negative momentum and bias correction
    v = self.get_slot(var, 'v')
    v_scaled_g_values = math_ops.square(
        grad) * coefficients['one_minus_beta_2_t']
    v_t = state_ops.assign(v,
                           v * coefficients['beta_2_t'] + v_scaled_g_values,
                           use_locking=self._use_locking)
    # Keep the element-wise running maximum of v in the 'vhat' slot.
    v_hat = self.get_slot(var, 'vhat')
    v_hat_t = math_ops.maximum(v_hat, v_t)
    with ops.control_dependencies([v_hat_t]):
        v_hat_t = state_ops.assign(v_hat,
                                   v_hat_t,
                                   use_locking=self._use_locking)
    v_ema_hat_corr = v_hat_t / coefficients['one_minus_beta_2_power']

    # update vector
    # takes positive negative momentum into account
    denom = coefficients['pnm_noise_amplitude'] * math_ops.sqrt(
        v_ema_hat_corr)
    update = m_ema_corr / non_zero(denom)

    # weight decay
    # combining norm-loss and stable weight decay
    euclidian_norm = self._axis_aware_euclidian_norm(
        var)  # for norm-loss regularization
    effective_stepsize_inv = math_ops.sqrt(
        math_ops.reduce_mean(v_ema_hat_corr))  # for stable weight decay
    scaled_weight_decay = coefficients['weight_decay'] * (
        euclidian_norm - 1.) / non_zero(
            euclidian_norm * effective_stepsize_inv)
    update += scaled_weight_decay * var

    # applies update
    var_update = state_ops.assign_sub(
        var,
        update * coefficients['scheduled_learning_rate'],
        use_locking=self._use_locking)

    updates = [prev_m_t, m_t, v_t, v_hat_t, var_update]
    train_op = control_flow_ops.group(*updates)
    # The look-ahead step receives the training op it has to follow.
    look_ahead_op = self._look_ahead(coefficients, train_op, var)
    return control_flow_ops.group(train_op, look_ahead_op)
def assign_moving_mean_variance(
        mean_var, variance_var, value, decay, name=None):
    """Update an exponentially weighted moving mean and variance in place.

    Given a new `value`, the two variables are updated according to:

    ```python
    variance_var = decay * (variance_var + (1-decay) * (value - mean_var)**2)
    mean_var = decay * mean_var + (1 - decay) * value
    ```

    Note: the variance update uses the lag-1 mean, i.e. the deviation
    `value - mean_var` is computed before `mean_var` is modified.

    For the derivation see equation 143 of:
      T. Finch, Feb 2009. "Incremental calculation of weighted mean and
      variance".
      http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf

    Args:
      mean_var: `float`-like `Variable` holding the moving mean. Same shape
        as `variance_var` and `value`.
      variance_var: `float`-like `Variable` holding the moving variance.
        Same shape as `mean_var` and `value`.
      value: `float`-like `Tensor`. Same shape as `mean_var` and
        `variance_var`.
      decay: A `float`-like `Tensor`. The moving mean decay. Typically close
        to `1.`, e.g., `0.999`.
      name: Optional name of the returned operation.

    Returns:
      mean_var: `Variable` holding the `value`-updated moving mean.
      variance_var: `Variable` holding the `value`-updated moving variance.

    Raises:
      TypeError: if `mean_var` does not have float type `dtype`.
      TypeError: if `mean_var`, `variance_var`, `value`, `decay` have
        different `base_dtype`.
    """
    with ops.name_scope(name, "assign_moving_mean_variance",
                        [variance_var, mean_var, value, decay]):
        with ops.colocate_with(variance_var), ops.colocate_with(mean_var):
            mean_dtype = mean_var.dtype.base_dtype
            if not mean_dtype.is_floating:
                raise TypeError(
                    "mean_var.base_dtype({}) does not have float type "
                    "`dtype`.".format(mean_dtype.name))
            variance_dtype = variance_var.dtype.base_dtype
            if mean_dtype != variance_dtype:
                raise TypeError(
                    "mean_var.base_dtype({}) != variance_var.base_dtype({})"
                    .format(mean_dtype.name, variance_dtype.name))

            new_value = ops.convert_to_tensor(
                value, dtype=mean_dtype, name="value")
            rate = ops.convert_to_tensor(
                decay, dtype=mean_dtype, name="decay")

            # Deviation from the *old* mean; both updates wait for it.
            delta = new_value - mean_var
            with ops.control_dependencies([delta]):
                updated_mean = state_ops.assign_add(
                    mean_var, (1. - rate) * delta)
                updated_variance = state_ops.assign_sub(
                    variance_var,
                    (1. - rate) * (variance_var
                                   - rate * math_ops.square(delta)))
            return updated_mean, updated_variance
use_locking=self._use_locking) # gn_c = ((d/dy) * dLdy) * dydvar ** 2 # gn_t = beta2 * gn + (1 - beta2) * (gn_c) dLdy = tf.gradients(self.loss_t, self.pred_t) sec_loss = tf.gradients(dLdy, self.pred_t) dydvar = tf.gradients(self.pred_t, var) sec_loss_t = math_ops.cast(sec_loss, var.dtype.base_dtype) dydvar_t = tf.gradients(dydvar, var) gn_c = sec_loss * dydvar * dydvar gn = self.get_slot(var, "gn") gn_scaled_g_values = (gn_c) * (1 - beta2_t) gn_t = state_ops.assign(gn, gn * beta2_t, use_locking=self._use_locking) gn_t = state_ops.scatter_add(gn_t, grad.indices, gn_scaled_g_values, use_locking=self._use_locking) var_update = state_ops.assign_sub(var, lr * m_t / (gn_t + epsilon_t), use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t])
def _zero_debias(unbiased_var, value, decay):
    """Build the update delta for a zero-debiased exponential moving average.

    An exponential moving average (EMA) that starts at 0 is biased towards 0.
    This function produces the amount by which the unbiased variable has to
    change, using a scale factor as in https://arxiv.org/abs/1412.6980.

    To see the bias, take an EMA initialized to `0` with decay `b`. After `t`
    steps of observing the constant `c`, it holds:

    ```
      EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
          = c*(1 - b^t)
    ```

    Dividing by the scale factor `1 - b^t` recovers the true value `c`.

    Two shadow variables are used for this: `biased` tracks the biased
    estimate and `local_step` counts how many updates have happened.

    Args:
      unbiased_var: A Variable representing the current value of the unbiased
        EMA.
      value: A Tensor representing the most recent value.
      decay: A Tensor representing `1-decay` for the EMA.

    Returns:
      The amount that the unbiased variable should be updated. Computing this
      tensor will also update the shadow variables appropriately.
    """
    scope_inputs = [unbiased_var, value, decay]
    with variable_scope.variable_scope(
            unbiased_var.op.name, values=scope_inputs) as scope:
        with ops.colocate_with(unbiased_var):
            with ops.init_scope():
                biased_initializer = init_ops.zeros_initializer(
                    dtype=unbiased_var.dtype)(unbiased_var.get_shape())
                local_step_initializer = init_ops.zeros_initializer()

            def _maybe_get_unique(name):
                """Get name for a unique variable, if not `reuse=True`."""
                current_scope = variable_scope.get_variable_scope()
                if current_scope.reuse:
                    return name
                taken = {
                    var.op.name for var in current_scope.global_variables()
                }
                full_name = current_scope.name + "/" + name
                if full_name not in taken:
                    return name
                # Probe `<name>_1`, `<name>_2`, ... until one is free.
                suffix = 1
                while "%s_%d" % (full_name, suffix) in taken:
                    suffix += 1
                return "%s_%d" % (name, suffix)

            biased_var = variable_scope.get_variable(
                _maybe_get_unique("biased"),
                initializer=biased_initializer,
                trainable=False)
            local_step = variable_scope.get_variable(
                _maybe_get_unique("local_step"),
                shape=[],
                dtype=unbiased_var.dtype,
                initializer=local_step_initializer,
                trainable=False)

            # Update ops for the two shadow variables.
            biased_step = (biased_var - value) * decay
            update_biased = state_ops.assign_sub(
                biased_var, biased_step, name=scope.name)
            update_local_step = local_step.assign_add(1)

            # The delta must be computed from the freshly updated shadow
            # variables, hence the control dependencies and explicit reads.
            with ops.control_dependencies(
                    [update_biased, update_local_step]):
                biased_now = biased_var.read_value()
                # `decay` here is already `1 - decay`, so the exponent base is
                # `1.0 - decay`.
                scale_factor = 1 - math_ops.pow(
                    1.0 - decay, local_step.read_value())
                unbiased_ema_delta = unbiased_var - biased_now / scale_factor

            return unbiased_ema_delta
def run_and_check():
    """Run assign / assign_add / assign_sub on `x` and check every result.

    Uses `self`, `x`, `v1` and `v2` from the enclosing scope. The checks are
    stateful and must run in this exact order.
    """
    dtype_mismatch = (
        'conversion requested dtype float32 for Tensor with dtype float16')

    def expect_close(expected, op):
        self.assertAllClose(expected, self.evaluate(op))

    def expect_none(op):
        self.assertIsNone(self.evaluate(op))

    # Assign float32 values
    expect_close(3., x.assign(v1))
    expect_close(3. * 2, x.assign_add(v1))
    expect_close(3., x.assign_sub(v1))

    # Attempt to assign float16 values: each variant must be rejected.
    for assign_fn in (x.assign, x.assign_add, x.assign_sub):
        with self.assertRaisesRegex(ValueError, dtype_mismatch):
            self.evaluate(assign_fn(v2))

    # Assign Python floats
    expect_close(0., x.assign(0.))
    expect_close(3., x.assign(3.))
    expect_close(3. * 2, x.assign_add(3.))
    expect_close(3., x.assign_sub(3.))

    # Assign multiple times
    # This currently only works if no strategy is used
    if not ds_context.has_strategy():
        assign = x.assign(1.)
        expect_close(1., assign)
        expect_close(0., assign.assign(0.))

        assign_add = x.assign_add(3.)
        expect_close(3., assign_add)
        expect_close(3. * 3, x.assign_add(3.).assign_add(3.))
        # `x` is compared directly here, without going through evaluate().
        self.assertAllClose(3. * 3, x)

        assign_sub = x.assign_sub(3.)
        expect_close(3. * 2, assign_sub)
        expect_close(0., x.assign_sub(3.).assign_sub(3.))

    # Assign with read_value=False
    expect_none(x.assign(1., read_value=False))
    expect_close(1., x)
    expect_none(x.assign_add(2., read_value=False))
    expect_close(3., x)
    expect_none(x.assign_sub(3., read_value=False))
    expect_close(0., x)

    # Use the tf.assign functions instead of the var.assign methods.
    expect_close(0., state_ops.assign(x, 0.))
    expect_close(3., state_ops.assign(x, 3.))
    expect_close(3. * 2, state_ops.assign_add(x, 3.))
    expect_close(3., state_ops.assign_sub(x, 3.))
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Build the sparse update ops for `var` and its optimizer slots.

    The first and second moment slots (`m`, `v`) are decayed in full and then
    receive the scaled gradient only at `indices` through `scatter_add`. The
    step applied to `var` is either the rectified, variance-normalized moment
    (when `sma_t >= 5.0`) or the bias-corrected first moment alone.

    Args:
      grad: Gradient values for the rows selected by `indices`.
      var: The variable to update.
      indices: Indices into the first dimension of `var` that `grad` refers to.
      scatter_add: Callable invoked as `scatter_add(slot, indices, values)` to
        add `values` into `slot` at `indices`.

    Returns:
      A grouped op covering the variable update and the slot updates (including
      `vhat` when `self._amsgrad` is set).
    """
    step, beta1_power, beta2_power = self._get_beta_accumulators()
    dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(beta1_power, dtype)
    beta2_power = math_ops.cast(beta2_power, dtype)
    lr_t = math_ops.cast(self._lr_t, dtype)

    # Optional warmup followed by decay of the learning rate towards min_lr.
    if self._initial_total_steps > 0:
        total_steps = math_ops.cast(self._total_steps_t, dtype)
        warmup_proportion = math_ops.cast(self._warmup_proportion_t, dtype)
        min_lr = math_ops.cast(self._min_lr_t, dtype)
        warmup_steps = total_steps * warmup_proportion
        decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
        decay_rate = (min_lr - lr_t) / decay_steps
        in_warmup = step <= warmup_steps
        warmup_lr = lr_t * (step / warmup_steps)
        decayed_lr = lr_t + decay_rate * math_ops.minimum(
            step - warmup_steps, decay_steps)
        lr_t = tf.where(in_warmup, warmup_lr, decayed_lr)

    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)

    sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

    # First moment: decay every entry, then add the new gradient at `indices`.
    m_slot = self.get_slot(var, "m")
    m_increment = grad * (1 - beta1_t)
    m_t = state_ops.assign(
        m_slot, m_slot * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
        m_t = scatter_add(m_slot, indices, m_increment)
    m_corr_t = m_t / (1.0 - beta1_power)

    # Second moment: same scheme with the squared gradient.
    v_slot = self.get_slot(var, "v")
    v_increment = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(
        v_slot, v_slot * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
        v_t = scatter_add(v_slot, indices, v_increment)

    if self._amsgrad:
        # Keep the running element-wise maximum of the second moment.
        vhat_slot = self.get_slot(var, 'vhat')
        vhat_t = state_ops.assign(
            vhat_slot,
            math_ops.maximum(vhat_slot, v_t),
            use_locking=self._use_locking)
        v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power))
    else:
        v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power))

    rectification = (
        (sma_t - 4.0) / (sma_inf - 4.0)
        * (sma_t - 2.0) / (sma_inf - 2.0)
        * sma_inf / sma_t)
    r_t = math_ops.sqrt(rectification)

    use_rectified = sma_t >= 5.0
    rectified_step = r_t * m_corr_t / (v_corr_t + epsilon_t)
    var_t = tf.where(use_rectified, rectified_step, m_corr_t)

    if self._initial_weight_decay > 0.0:
        var_t += math_ops.cast(self._weight_decay_t, dtype) * var

    var_update = state_ops.assign_sub(
        var, lr_t * var_t, use_locking=self._use_locking)

    updates = [var_update, m_t, v_t] + ([vhat_t] if self._amsgrad else [])
    return control_flow_ops.group(*updates)
def update_fn(v, update_delta):
    """Subtract `update_delta` from `v`, naming the op after the outer `scope`."""
    subtract_op = state_ops.assign_sub(v, update_delta, name=scope)
    return subtract_op
def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    Builds the ops that (1) obtain the gradient term `gbar_updated`, (2) for
    every dimension of `var` update the preconditioner statistics stored in
    the `Gbar_<i>` slots and apply the resulting preconditioner to the
    gradient, and (3) apply the preconditioned gradient to `var`.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update. `None` selects the
        dense path.

    Returns:
      Updated variable
        var = var - learning_rate * preconditioner * grad
      This is the result of `state_ops.scatter_add` (sparse update) or
      `state_ops.assign_sub` (dense update).

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    # NOTE(review): GetParam is defined elsewhere; presumably it resolves a
    # possibly step-dependent hyperparameter at `global_step` -- confirm.
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
        # Note - the sparse update is not easily implemented, since the
        # algorithm needs all indices of gbar to be updated
        # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
        # One way to make mat_gbar_decay = 1 is by rescaling.
        # If we want the update:
        #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
        # define:
        #         r_{t+1} = a_{t+1} * r_t
        #         h_t = G_t / r_t
        # Then:
        #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
        # So we get the mat_gbar_decay = 1 as desired.
        # We can implement this in a future version as needed.
        # However we still need gbar_decay = 0, otherwise all indices
        # of the variable will need to be updated.
        if self._gbar_decay != 0.0:
            tf_logging.warning("Not applying momentum for variable: %s" % var.name)
        # Sparse path: the raw gradient is used; the `gbar` slot is not touched.
        gbar_updated = grad
    else:
        gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                              gbar_decay_t,
                                              gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    # One statistics slot per dimension of `var`.
    mat_g_list = []
    for i in range(len(shape)):
        mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    # Exponent applied to each per-dimension statistic: -alpha / rank.
    neg_alpha = - GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
        # axes is the list of indices to reduce - everything but the current i.
        axes = list(range(i)) + list(range(i+1, v_rank))
        if shape[i] <= self._max_matrix_size:
            # If the tensor size is sufficiently small perform full Shampoo update
            # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
            # is not strictly correct. However we will use it for now, and
            # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

            # The lambdas below close over the loop variables `mat_g`, `axes`
            # and `i` (hence the cell-var-from-loop suppression).
            # pylint: disable=g-long-lambda,cell-var-from-loop
            mat_g_updated = control_flow_ops.cond(
                math_ops.mod(global_step, precond_update_interval) < 1,
                lambda: self._update_mat_g(
                    mat_g, grad, axes, mat_gbar_decay_t,
                    mat_gbar_weight_t * precond_update_interval, i),
                lambda: mat_g)

            if self._svd_interval == 1:
                # Power is recomputed on every step; no slot name is passed.
                mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
            else:
                # Recompute only when global_step mod svd_interval is < 1;
                # otherwise read the "H_<i>" slot.
                mat_h = control_flow_ops.cond(
                    math_ops.mod(global_step, svd_interval) < 1,
                    lambda: self._compute_power(var, mat_g_updated, shape[i],
                                                neg_alpha, "H_" + str(i)),
                    lambda: self.get_slot(var, "H_" + str(i)))

            # mat_h is a square matrix of size d_i x d_i
            # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
            # After contraction with a d_i x d_i tensor
            # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
            # (the first dimension is contracted out, and the second dimension of
            # mat_h is appended).  After going through all the indices, it becomes
            # a d_0 x ... x d_n tensor again.
            preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                     axes=([0], [0]),
                                                     name="precond_" + str(i))
        else:
            # Tensor size is too large -- perform diagonal Shampoo update
            grad_outer = math_ops.reduce_sum(grad * grad, axis=axes)
            if i == 0 and indices is not None:
                # Sparse update of the first dimension: only the rows at
                # `indices` are accumulated, which requires a decay of exactly 1.
                assert self._mat_gbar_decay == 1.0
                mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                      mat_gbar_weight_t * grad_outer)
                mat_h = math_ops.pow(
                    array_ops.gather(mat_g_updated, indices) + self._epsilon,
                    neg_alpha)
            else:
                mat_g_updated = self._weighted_average(mat_g,
                                                       self._mat_gbar_decay,
                                                       mat_gbar_decay_t,
                                                       mat_gbar_weight_t * grad_outer)
                mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)

            # Need to do the transpose to ensure that the tensor becomes
            # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
            preconditioned_grad = array_ops.transpose(
                preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
        # Sparse: add the negated step only at `indices`.
        var_updated = state_ops.scatter_add(
            var, indices, -learning_rate_t * preconditioned_grad)
    else:
        var_updated = state_ops.assign_sub(var,
                                           learning_rate_t * preconditioned_grad)
    return var_updated
def assign_sub(self, delta, use_locking=False):
    """Subtract `delta` from the wrapped `self._variable`.

    Args:
      delta: The value to subtract.
      use_locking: Forwarded unchanged to `state_ops.assign_sub`.

    Returns:
      Whatever `state_ops.assign_sub` returns for the wrapped variable.
    """
    wrapped = self._variable
    return state_ops.assign_sub(wrapped, delta, use_locking=use_locking)