def _resource_apply(self, grad, var, indices=None): # 准备变量 var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') beta_1_t = self._get_hyper('beta_1', var_dtype) beta_2_t = self._get_hyper('beta_2', var_dtype) epsilon_t = K.cast(self.epsilon, var_dtype) local_step = K.cast(self.iterations + 1, var_dtype) beta_1_t_power = K.pow(beta_1_t, local_step) beta_2_t_power = K.pow(beta_2_t, local_step) # 更新公式 if indices is None: m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad) v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2) else: mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)] with tf.control_dependencies(mv_ops): m_t = self._resource_scatter_add(m, indices, (1 - beta_1_t) * grad) v_t = self._resource_scatter_add(v, indices, (1 - beta_2_t) * grad**2) # 返回算子 with tf.control_dependencies([m_t, v_t]): if self.bias_correction: m_t = m_t / (1.0 - beta_1_t_power) v_t = v_t / (1.0 - beta_2_t_power) var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) return K.update(var, var_t)
def _resource_apply(self, grad, var, indices=None): # 更新判据 cond = K.equal(self.iterations % self.grad_accum_steps, 0) # 获取梯度 ag = self.get_slot(var, 'ag') old_update = K.update def new_update(x, new_x): new_x = K.switch(cond, new_x, x) return old_update(x, new_x) K.update = new_update ag_t = ag / self.grad_accum_steps op = super(NewOptimizer, self)._resource_apply(ag_t, var) K.update = old_update # 累积梯度 with tf.control_dependencies([op]): ag_t = K.switch(cond, K.zeros_like(ag), ag) with tf.control_dependencies([K.update(ag, ag_t)]): if indices is None: ag_t = K.update(ag, ag + grad) else: ag_t = self._resource_scatter_add(ag, indices, grad) return ag_t
def get_updates(self, loss, params): grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] self.weights = [self.iterations] lr = self.learning_rate for i, (p, g) in enumerate(zip(params, grads)): g2 = K.square(g) + self.epsilon1 shape, dtype = K.int_shape(p), K.dtype(p) factored_shape = self.factored_shape(shape) if factored_shape is None: # 定义参数 v = K.zeros(shape, dtype=dtype, name='v_' + str(i)) self.weights.append(v) # 定义更新 v_t = self.beta2 * v + (1.0 - self.beta2) * g2 self.updates.append(K.update(v, v_t)) else: # 定义参数 shape1, axis1, shape2, axis2 = factored_shape vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i)) vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i)) self.weights.extend([vr, vc]) # 定义更新 vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True) vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True) self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)]) # 合成矩阵 v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) # 增量主体 u = g / K.sqrt(v_t) # 增量裁剪 if self.clipping_threshold is not None: u_rms = K.mean(K.sum(K.square(u))) d = self.clipping_threshold u = u / K.maximum(1.0, u_rms / d) # 增量滑动 if self.beta1 > 0.0: # 定义参数 m = K.zeros(shape, dtype=dtype, name='m_' + str(i)) self.weights.append(m) # 定义更新 m_t = self.beta1 * m + (1.0 - self.beta1) * u self.updates.append(K.update(m, m_t)) u = m_t # 增量调整 if self.multiply_by_parameter_scale: u = u * K.maximum(K.mean(K.sum(K.square(p))), self.epsilon2) # 更新参数 self.updates.append(K.update(p, p - lr * u)) return self.updates
def _resource_apply(self, grad, var, indices=None): op = super(NewOptimizer, self)._resource_apply(grad, var, indices) k, alpha = self.steps_per_slow_update, self.slow_step_size cond = K.equal(self.iterations % k, 0) slow_var = self.get_slot(var, 'slow_var') slow_var_t = slow_var + alpha * (var - slow_var) with tf.control_dependencies([op]): slow_update = K.update(slow_var, K.switch(cond, slow_var_t, slow_var)) with tf.control_dependencies([slow_update]): copy_update = K.update(var, K.switch(cond, slow_var, var)) return copy_update
def get_updates(self, loss, params): # 更新判据 cond = K.equal(self.iterations % self.grad_accum_steps, 0) cond = K.cast(cond, K.floatx()) # 获取梯度 grads = self.get_gradients(loss, params) self.accum_grads = [ K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i) for i, p in enumerate(params) ] old_update = K.update def new_update(x, new_x): new_x = cond * new_x + (1 - cond) * x return old_update(x, new_x) K.update = new_update updates = super(NewOptimizer, self).get_updates(loss, params) K.update = old_update # 累积梯度 with tf.control_dependencies(updates): accum_updates = [ K.update(ag, g + (1 - cond) * ag) for g, ag in zip(grads, self.accum_grads) ] return accum_updates
def _resource_apply_dense(self, grad, var): op = super(NewOptimizer, self)._resource_apply_dense(grad, var) ema = self.get_slot(var, 'ema') ema_momentum = self.ema_momentum with tf.control_dependencies([op]): return K.update( ema, ema * ema_momentum + var * (1.0 - ema_momentum))
def get_updates(self, loss, params): updates = super(NewOptimizer, self).get_updates(loss, params) self.model_weights = params self.ema_weights = [K.zeros(K.shape(w)) for w in params] self.old_weights = K.batch_get_value(params) ema_updates, ema_momentum = [], self.ema_momentum with tf.control_dependencies(updates): for w1, w2 in zip(self.ema_weights, params): new_w = ema_momentum * w1 + (1 - ema_momentum) * w2 ema_updates.append(K.update(w1, new_w)) return ema_updates
def _resource_apply(self, grad, var, indices=None): lr = self.learning_rate g2 = K.square(grad) + self.epsilon1 shape = K.int_shape(var) factored_shape = self.factored_shape(shape) if factored_shape is None: v = self.get_slot(var, 'v') # 定义更新 v_t = self.beta2 * v + (1.0 - self.beta2) * g2 v_t = K.update(v, v_t) else: shape1, axis1, shape2, axis2 = factored_shape vr = self.get_slot(var, 'vr') vc = self.get_slot(var, 'vc') # 定义更新 vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True) vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True) vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t) # 合成矩阵 v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) # 增量主体 u = grad / K.sqrt(v_t) # 增量裁剪 if self.clipping_threshold is not None: u_rms = K.mean(K.sum(K.square(u))) d = self.clipping_threshold u = u / K.maximum(1.0, u_rms / d) # 增量滑动 if self.beta1 > 0.0: m = self.get_slot(var, 'm') # 定义更新 m_t = self.beta1 * m + (1.0 - self.beta1) * u u = K.update(m, m_t) # 增量调整 if self.multiply_by_parameter_scale: u = u * K.maximum(K.mean(K.sum(K.square(var))), self.epsilon2) # 更新参数 return K.update(var, var - lr * u)
def get_updates(self, loss, params): updates = super(NewOptimizer, self).get_updates(loss, params) k, alpha = self.steps_per_slow_update, self.slow_step_size cond = K.equal(self.iterations % k, 0) slow_vars = [ K.zeros(K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i) for i, p in enumerate(params) ] with tf.control_dependencies(updates): slow_updates = [ K.update(q, K.switch(cond, q + alpha * (p - q), q)) for p, q in zip(params, slow_vars) ] with tf.control_dependencies(slow_updates): copy_updates = [ K.update(p, K.switch(cond, q, p)) for p, q in zip(params, slow_vars) ] return copy_updates