def _step(self, epoch):
    """Apply one bounded adaptive-moment update to every parameter.

    The per-element step ``step_size / (sqrt(v) + epsilon)`` is clipped
    between a lower and an upper bound derived from ``self.final_lr`` and
    ``self.gamma`` before being multiplied by the first moment. This looks
    like the AdaBound scheme.

    Args:
        epoch: Step counter; converted to a tensor and used as the exponent
            ``t`` in the bias-correction terms and in the bounds.
            NOTE(review): ``epoch == 0`` makes ``1 - beta1 ** t`` and
            ``gamma * t`` zero, and both are used as divisors -- presumably
            callers start counting at 1; confirm.

    Returns:
        None. Parameters and the stored moments are updated as a side effect.
    """
    t = fn.to_tensor(epoch)
    step_size = self.lr * (
        fn.sqrt(1 - fn.power(self.beta2, t)) / (1 - fn.power(self.beta1, t))
    )
    # The clipping bounds depend only on the step counter, so the scalar
    # values are extracted once instead of once per parameter.
    lower_bound = (self.final_lr * (1.0 - 1.0 / (self.gamma * t + 1))).item()
    upper_bound = (self.final_lr * (1.0 + 1.0 / (self.gamma * t))).item()

    # Gradients are collected up front, before any parameter is modified.
    grads = [p.grad for p in self.parameters]
    for i, (p, g, m, v) in enumerate(zip(self.parameters, grads, self.ms, self.vs)):
        m = self.beta1 * m + (1 - self.beta1) * g
        v = self.beta2 * v + (1 - self.beta2) * fn.square(g)
        # Persist the new moments: assigning to the loop variables alone only
        # rebinds local names and would discard the running averages.
        # Assumes self.ms / self.vs are mutable sequences (e.g. lists).
        self.ms[i] = m
        self.vs[i] = v
        denom = fn.sqrt(v) + self.epsilon
        # Relies on the parameter type implementing in-place subtraction.
        p -= m * fn.clip(step_size / denom, lower_bound, upper_bound)
def _step(self, epoch):
    """Apply one adaptive-moment update using a running maximum of ``v``.

    The denominator uses ``vhat``, the element-wise maximum of all second
    moments seen so far, instead of the current second moment. This looks
    like the AMSGrad variant; no bias correction is applied here.

    Args:
        epoch: Step counter. Unused by this update rule; kept so the
            signature matches the other ``_step`` implementations.

    Returns:
        None. Parameters and the stored moments are updated as a side effect.
    """
    # Gradients are collected up front, before any parameter is modified.
    grads = [p.grad for p in self.parameters]
    state = zip(self.parameters, grads, self.ms, self.vs, self.vhats)
    for i, (p, g, m, v, vhat) in enumerate(state):
        m = self.beta1 * m + (1 - self.beta1) * g
        v = self.beta2 * v + (1 - self.beta2) * fn.square(g)
        vhat = fn.maximum(vhat, v)
        # Persist the new state: assigning to the loop variables alone only
        # rebinds local names, so the running maximum would never accumulate.
        # Assumes self.ms / self.vs / self.vhats are mutable sequences.
        self.ms[i] = m
        self.vs[i] = v
        self.vhats[i] = vhat
        # Relies on the parameter type implementing in-place subtraction.
        p -= self.lr * m / (fn.sqrt(vhat) + self.epsilon)
def _step(self, epoch):
    """Apply one bias-corrected adaptive-moment (Adam-style) update.

    Args:
        epoch: Step counter; converted to a tensor and used as the exponent
            ``t`` in the bias-correction terms ``1 - beta ** t``.
            NOTE(review): ``epoch == 0`` makes both correction terms zero,
            and they are used as divisors -- presumably callers start
            counting at 1; confirm.

    Returns:
        None. Parameters and the stored moments are updated as a side effect.
    """
    t = fn.to_tensor(epoch)
    # The bias-correction denominators are identical for every parameter,
    # so they are computed once per step.
    m_correction = 1 - fn.power(self.beta1, t)
    v_correction = 1 - fn.power(self.beta2, t)

    # Gradients are collected up front, before any parameter is modified.
    grads = [p.grad for p in self.parameters]
    for i, (p, g, m, v) in enumerate(zip(self.parameters, grads, self.ms, self.vs)):
        m = self.beta1 * m + (1 - self.beta1) * g
        v = self.beta2 * v + (1 - self.beta2) * fn.square(g)
        # Persist the new moments: assigning to the loop variables alone only
        # rebinds local names and would discard the running averages.
        # Assumes self.ms / self.vs are mutable sequences (e.g. lists).
        self.ms[i] = m
        self.vs[i] = v
        m_hat = m / m_correction
        v_hat = v / v_correction
        # Relies on the parameter type implementing in-place subtraction.
        p -= self.lr * m_hat / (fn.sqrt(v_hat) + self.epsilon)
def forward(self, x):
    """Normalize ``x`` and apply the learned scale and shift.

    When ``self.train_mode`` is truthy, the statistics come from ``x``
    itself and the running averages ``self.u_avg`` / ``self.std_avg`` are
    refreshed through their ``.data``. Otherwise the stored running
    averages are used directly.

    Args:
        x: Input tensor.

    Returns:
        ``fn.mul(normalized, self.gamma) + self.beta``.
    """
    if not self.train_mode:
        # Inference: reuse the statistics accumulated during training.
        mean, variance = self.u_avg, self.std_avg
    else:
        mean = fn.mean(x)
        # Mean squared deviation of x; the square root is taken below,
        # after epsilon is added.
        variance = fn.mean(fn.square(x - mean))

        # Exponential moving average: `keep` weights the old value and
        # (1 - keep) the fresh statistic.
        keep = self.momentum.data
        self.u_avg.data = keep * self.u_avg.data + (1 - keep) * mean.data
        self.std_avg.data = keep * self.std_avg.data + (1 - keep) * variance.data

    normalized = (x - mean) / fn.sqrt(variance + self.epsilon)
    return fn.mul(normalized, self.gamma) + self.beta
def _step(self, epoch):
    """Apply one update scaled by a running average of squared gradients.

    Each gradient is divided by ``sqrt(e) + epsilon``, where ``e`` is an
    exponential moving average of the squared gradient (RMSprop-style).

    Args:
        epoch: Step counter. Unused by this update rule; kept so the
            signature matches the other ``_step`` implementations.

    Returns:
        None. Parameters and ``self.E`` are updated as a side effect.
    """
    # Gradients are collected up front, before any parameter is modified.
    grads = [p.grad for p in self.parameters]
    for i, (p, g, e) in enumerate(zip(self.parameters, grads, self.E)):
        e = self.beta * e + (1 - self.beta) * fn.square(g)
        # Persist the running average: assigning to the loop variable alone
        # only rebinds a local name, so every step would restart from the
        # initial state. Assumes self.E is a mutable sequence (e.g. a list).
        self.E[i] = e
        # Relies on the parameter type implementing in-place subtraction.
        p -= self.lr * g / (fn.sqrt(e) + self.epsilon)
def _step(self, epoch):
    """Apply one update scaled by the accumulated squared gradients.

    Each gradient is divided by the square root of the sum of all squared
    gradients seen so far plus a small constant (Adagrad-style).

    Args:
        epoch: Step counter. Unused by this update rule.

    Returns:
        None. Parameters and the accumulators in ``self.G`` are modified
        through augmented assignment.
    """
    # Gradients are collected up front, before any parameter is modified.
    gradients = [param.grad for param in self.parameters]
    for param, grad, accum in zip(self.parameters, gradients, self.G):
        # Augmented assignment: relies on the accumulator type updating in
        # place so that the entry held in self.G sees the new total.
        accum += fn.square(grad)
        scaled_grad = self.lr * grad
        # NOTE(review): `epison` looks like a misspelling of `epsilon`; it is
        # kept as-is because the attribute is defined outside this method.
        param -= scaled_grad / fn.sqrt(accum + self.epison)