def asgd(params: List[Tensor],
         grads: List[Tensor],
         states: List[Dict],
         lambd: float,
         lr: float,
         t0: float,
         alpha: float,
         weight_decay: float):
    r"""Functional API that performs ASGD algorithm computation.

    See :class:`~torch.optim.ASGD` for details.
    """
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # decay term
    eta = states[0]['eta']
    torch._foreach_mul_(params, 1 - lambd * eta)

    # update parameter
    torch._foreach_add_(params, grads, alpha=-eta)

    # averaging
    for i in range(len(states)):
        if states[i]['mu'] != 1:
            states[i]['ax'].add_(params[i].sub(states[i]['ax']).mul(states[i]['mu']))
        else:
            states[i]['ax'].copy_(params[i])

    # update eta and mu
    for state in states:
        state['eta'] = (lr /
                        math.pow((1 + lambd * lr * state['step']), alpha))
        state['mu'] = 1 / max(1, state['step'] - t0)
def _update(self):
    # _foreach_* ops are several times faster than a Python for loop
    o_p = [
        p.data for p in self._original_model.parameters()
        if isinstance(p, torch.Tensor)
    ]
    e_p = [
        p.data for p in self._ema_model.parameters()
        if isinstance(p, torch.Tensor)
    ]
    torch._foreach_mul_(e_p, self.momentum)
    torch._foreach_add_(e_p, o_p, alpha=1 - self.momentum)

    # some buffers are integer tensors used for counting etc.
    o_b = [
        b for b in self._original_model.buffers()
        if isinstance(b, torch.Tensor) and torch.is_floating_point(b)
    ]
    if len(o_b) > 0:
        e_b = [
            b for b in self._ema_model.buffers()
            if isinstance(b, torch.Tensor) and torch.is_floating_point(b)
        ]
        torch._foreach_mul_(e_b, self.momentum)
        torch._foreach_add_(e_b, o_b, alpha=1 - self.momentum)
def adamax(params: List[Tensor],
           grads: List[Tensor],
           exp_avgs: List[Tensor],
           exp_infs: List[Tensor],
           states: List[Dict],
           *,
           beta1: float,
           beta2: float,
           lr: float,
           weight_decay: float,
           eps: float):
    r"""Functional API that performs Adamax algorithm computation.

    See :class:`~torch.optim.Adamax` for details.
    """
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Update biased first moment estimate.
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    # Update the exponentially weighted infinity norm.
    torch._foreach_mul_(exp_infs, beta2)

    for exp_inf, grad in zip(exp_infs, grads):
        norm_buf = torch.cat([
            exp_inf.unsqueeze(0),
            grad.abs().add_(eps).unsqueeze_(0)
        ], 0)
        torch.max(norm_buf, 0, keepdim=False,
                  out=(exp_inf, exp_inf.new().long()))

    bias_corrections = [1 - beta1 ** state['step'] for state in states]
    clr = [-1 * (lr / bias_correction) for bias_correction in bias_corrections]

    torch._foreach_addcdiv_(params, exp_avgs, exp_infs, clr)
def _multi_tensor_rprop(params: List[Tensor],
                        grads: List[Tensor],
                        prevs: List[Tensor],
                        step_sizes: List[Tensor],
                        *,
                        step_size_min: float,
                        step_size_max: float,
                        etaminus: float,
                        etaplus: float):
    if len(params) == 0:
        return

    signs = torch._foreach_mul(grads, prevs)
    signs = [s.sign() for s in signs]
    for sign in signs:
        sign[sign.gt(0)] = etaplus
        sign[sign.lt(0)] = etaminus
        sign[sign.eq(0)] = 1

    # update stepsizes with step size updates
    torch._foreach_mul_(step_sizes, signs)
    for step_size in step_sizes:
        step_size.clamp_(step_size_min, step_size_max)

    # for dir<0, dfdx=0
    # for dir>=0 dfdx=dfdx
    for i in range(len(grads)):
        grads[i] = grads[i].clone(memory_format=torch.preserve_format)
        grads[i][signs[i].eq(etaminus)] = 0

    # update parameters
    grad_signs = [grad.sign() for grad in grads]
    torch._foreach_addcmul_(params, grad_signs, step_sizes, value=-1)

    for i in range(len(prevs)):
        prevs[i].copy_(grads[i])
def _multi_tensor_sgd(params: List[Tensor],
                      grads: List[Tensor],
                      momentum_buffer_list: List[Optional[Tensor]],
                      *,
                      weight_decay: float,
                      momentum: float,
                      lr: float,
                      dampening: float,
                      nesterov: bool,
                      maximize: bool,
                      has_sparse_grad: bool):
    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    if weight_decay != 0:
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    if momentum != 0:
        bufs = []

        all_states_with_momentum_buffer = True
        for i in range(len(momentum_buffer_list)):
            if momentum_buffer_list[i] is None:
                all_states_with_momentum_buffer = False
                break
            else:
                bufs.append(momentum_buffer_list[i])

        if all_states_with_momentum_buffer:
            torch._foreach_mul_(bufs, momentum)
            torch._foreach_add_(bufs, grads, alpha=1 - dampening)
        else:
            bufs = []
            for i in range(len(momentum_buffer_list)):
                if momentum_buffer_list[i] is None:
                    buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
                else:
                    buf = momentum_buffer_list[i]
                    buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)
                bufs.append(buf)

        if nesterov:
            torch._foreach_add_(grads, bufs, alpha=momentum)
        else:
            grads = bufs

    if not has_sparse_grad:
        torch._foreach_add_(params, grads, alpha=-lr)
    else:
        # foreach APIs don't support sparse
        for i in range(len(params)):
            params[i].add_(grads[i], alpha=-lr)
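# A minimal smoke test for the multi-tensor SGD kernel above (a sketch only:
# shapes and hyperparameter values are arbitrary, and _multi_tensor_sgd is
# assumed to be importable from the module that defines it).
import torch

params = [torch.zeros(4), torch.zeros(2, 3)]
grads = [torch.ones(4), torch.ones(2, 3)]
momentum_buffers = [None, None]  # buffers are created in place on the first step

_multi_tensor_sgd(params, grads, momentum_buffers,
                  weight_decay=0.0, momentum=0.9, lr=0.1,
                  dampening=0.0, nesterov=False,
                  maximize=False, has_sparse_grad=False)

# On this first step each parameter moves by -lr * grad, and momentum_buffers
# now holds clones of the gradients.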
def _multi_tensor_adamax(params: List[Tensor],
                         grads: List[Tensor],
                         exp_avgs: List[Tensor],
                         exp_infs: List[Tensor],
                         state_steps: List[Tensor],
                         *,
                         beta1: float,
                         beta2: float,
                         lr: float,
                         weight_decay: float,
                         eps: float):
    if len(params) == 0:
        return

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Update biased first moment estimate.
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    # Update the exponentially weighted infinity norm.
    torch._foreach_mul_(exp_infs, beta2)

    for exp_inf, grad in zip(exp_infs, grads):
        norm_buf = torch.cat([
            exp_inf.unsqueeze(0),
            grad.abs().add_(eps).unsqueeze_(0)
        ], 0)
        torch.max(norm_buf, 0, keepdim=False,
                  out=(exp_inf, exp_inf.new().long()))

    bias_corrections = [1 - beta1 ** step.item() for step in state_steps]
    clr = [-1 * (lr / bias_correction) for bias_correction in bias_corrections]

    torch._foreach_addcdiv_(params, exp_avgs, exp_infs, clr)
def post_step(self):
    if self.post_op:
        with torch.no_grad():
            torch._foreach_mul_(list(self.parameters()), 1. - self.value)
        if self.log:
            logging.debug(
                'L2 penalty of %s was applied post optimization step',
                self.value)
def _multi_tensor_radam(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor],
                        *,
                        beta1: float,
                        beta2: float,
                        lr: float,
                        weight_decay: float,
                        eps: float):
    if len(params) == 0:
        return

    # Update steps
    torch._foreach_add_(state_steps, 1)

    # maximum length of the approximated SMA
    rho_inf = 2 / (1 - beta2) - 1
    # compute the length of the approximated SMA
    rho_t_list = [
        rho_inf - 2 * step.item() * (beta2 ** step.item()) /
        (1 - beta2 ** step.item()) for step in state_steps
    ]
    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    rect = [
        math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf /
                  ((rho_inf - 4) * (rho_inf - 2) * rho_t))
        if rho_t > 5 else 0 for rho_t in rho_t_list
    ]
    unrectified = [0 if rect > 0 else 1. for rect in rect]

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    denom = torch._foreach_div(exp_avg_sq_sqrt, bias_correction_sqrt)
    step_size = [(lr * rect / bc) * -1
                 for rect, bc in zip(rect, bias_correction1)]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)

    denom = [
        torch.ones_like(exp_av, memory_format=torch.preserve_format)
        for exp_av in exp_avgs
    ]
    step_size = [(lr * rect / bc) * -1
                 for rect, bc in zip(unrectified, bias_correction1)]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
def _multi_tensor_adam(params: List[Tensor],
                       grads: List[Tensor],
                       exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor],
                       max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       amsgrad: bool,
                       beta1: float,
                       beta2: float,
                       lr: float,
                       weight_decay: float,
                       eps: float,
                       maximize: bool):
    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if amsgrad:
        # Maintains the maximum of all 2nd moment running avg. till now
        max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]

        # Use the max. for normalizing running avg. of gradient
        max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
    else:
        exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size = [(lr / bc) * -1 for bc in bias_correction1]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
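# A hedged usage sketch for the multi-tensor Adam kernel above. The state lists
# are zero-initialized by hand, roughly as torch.optim.Adam would do before the
# first step; shapes and hyperparameters are arbitrary.
import torch

params = [torch.zeros(3)]
grads = [torch.full((3,), 0.5)]
exp_avgs = [torch.zeros(3)]
exp_avg_sqs = [torch.zeros(3)]
max_exp_avg_sqs = [torch.zeros(3)]
state_steps = [torch.tensor(0.)]

_multi_tensor_adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
                   state_steps,
                   amsgrad=False, beta1=0.9, beta2=0.999, lr=1e-3,
                   weight_decay=0.0, eps=1e-8, maximize=False)

# After one call, state_steps[0] == 1 and each parameter has taken a step of
# roughly -lr against the gradient direction, as expected for Adam's first step.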
def _multi_tensor_rprop(params: List[Tensor],
                        grads: List[Tensor],
                        prevs: List[Tensor],
                        step_sizes: List[Tensor],
                        *,
                        step_size_min: float,
                        step_size_max: float,
                        etaminus: float,
                        etaplus: float,
                        maximize: bool):
    if len(params) == 0:
        return

    # Handle complex params
    def _view_complex_as_real(tensor_list):
        return [
            torch.view_as_real(t) if torch.is_complex(t) else t
            for t in tensor_list
        ]

    grads = _view_complex_as_real(grads)
    prevs = _view_complex_as_real(prevs)
    params = _view_complex_as_real(params)
    step_sizes = _view_complex_as_real(step_sizes)

    if maximize:
        grads = torch._foreach_neg(grads)

    signs = torch._foreach_mul(grads, prevs)
    signs = [s.sign() for s in signs]
    for sign in signs:
        sign[sign.gt(0)] = etaplus
        sign[sign.lt(0)] = etaminus
        sign[sign.eq(0)] = 1

    # update stepsizes with step size updates
    torch._foreach_mul_(step_sizes, signs)
    for step_size in step_sizes:
        step_size.clamp_(step_size_min, step_size_max)

    # for dir<0, dfdx=0
    # for dir>=0 dfdx=dfdx
    grads = list(grads)
    for i in range(len(grads)):
        grads[i] = grads[i].clone(memory_format=torch.preserve_format)
        grads[i][signs[i].eq(etaminus)] = 0

    # update parameters
    grad_signs = [grad.sign() for grad in grads]
    torch._foreach_addcmul_(params, grad_signs, step_sizes, value=-1)

    for i in range(len(prevs)):
        prevs[i].copy_(grads[i])
def radam(params: List[Tensor],
          grads: List[Tensor],
          exp_avg: List[Tensor],
          exp_avg_sq: List[Tensor],
          states: List[Dict],
          *,
          beta1: float,
          beta2: float,
          lr: float,
          weight_decay: float,
          eps: float):
    r"""Functional API that performs RAdam algorithm computation.

    See :class:`~torch.optim.RAdam` for details.
    """
    # maximum length of the approximated SMA
    rho_inf = 2 / (1 - beta2) - 1
    # compute the length of the approximated SMA
    rho_t_list = [
        rho_inf - 2 * state['step'] * (beta2 ** state['step']) /
        (1 - beta2 ** state['step']) for state in states
    ]
    bias_correction1 = [1 - beta1 ** state['step'] for state in states]
    bias_correction2 = [1 - beta2 ** state['step'] for state in states]

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avg, beta1)
    torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sq, beta2)
    torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

    rect = [
        math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf /
                  ((rho_inf - 4) * (rho_inf - 2) * rho_t))
        if rho_t > 5 else 0 for rho_t in rho_t_list
    ]
    unrectified = [0 if rect > 0 else 1. for rect in rect]

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    denom = torch._foreach_div(exp_avg_sq_sqrt, bias_correction_sqrt)
    step_size = [(lr * rect / bc) * -1
                 for rect, bc in zip(rect, bias_correction1)]
    torch._foreach_addcdiv_(params, exp_avg, denom, step_size)

    denom = [
        torch.ones_like(exp_av, memory_format=torch.preserve_format)
        for exp_av in exp_avg
    ]
    step_size = [(lr * rect / bc) * -1
                 for rect, bc in zip(unrectified, bias_correction1)]
    torch._foreach_addcdiv_(params, exp_avg, denom, step_size)
def update(self, model):
    x = []
    y = []
    needs_module = hasattr(model, 'module') and not self.ema_has_module
    with torch.no_grad():
        for ema_v, model_v in zip(self.ema.state_dict().values(),
                                  model.state_dict().values()):
            x.append(ema_v.type(torch.float32))
            if self.device:
                model_v = model_v.detach().to(device=self.device)
            y.append(model_v.type(torch.float32))
        torch._foreach_mul_(x, self.decay)
        torch._foreach_add_(x, y, alpha=1. - self.decay)
        for ind, ema_v in enumerate(self.ema.state_dict().values()):
            ema_v.copy_(x[ind])
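# The exponential-moving-average update used by the EMA snippets above, shown
# in isolation on made-up tensor lists (a sketch; `decay` and the lists below
# are placeholders, not names taken from the surrounding code).
import torch

decay = 0.999
ema_tensors = [torch.zeros(3), torch.zeros(5)]
model_tensors = [torch.ones(3), torch.ones(5)]

with torch.no_grad():
    # ema <- decay * ema + (1 - decay) * model, fused over the whole list
    torch._foreach_mul_(ema_tensors, decay)
    torch._foreach_add_(ema_tensors, model_tensors, alpha=1 - decay)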
def clip_grad_norm_2(parameters: _tensor_or_tensors, max_norm: float):
    dummy_overflow_buf = torch.cuda.IntTensor([0])

    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    grads = [p.grad for p in parameters]

    max_norm = float(max_norm)
    if len(parameters) == 0:
        return torch.tensor(0.)

    device = parameters[0].grad.device
    total_norm, _ = multi_tensor_applier(multi_tensor_l2norm,
                                         dummy_overflow_buf, [grads], False)

    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        torch._foreach_mul_(grads, clip_coef.item())
    return total_norm
def on_step(self, task) -> None:
    if not task.train:
        return

    with torch.no_grad():
        if self.use_optimization(task):
            torch._foreach_mul_(self.ema_model_state_list, self.decay)
            torch._foreach_add_(self.ema_model_state_list,
                                self.param_list,
                                alpha=(1 - self.decay))
        else:
            for name, param in self.get_model_state_iterator(task.base_model):
                self.state.ema_model_state[name] = (
                    self.decay * self.state.ema_model_state[name] +
                    (1 - self.decay) * param.to(device=self.device))
def nadam(params: List[Tensor],
          grads: List[Tensor],
          exp_avg: List[Tensor],
          exp_avg_sq: List[Tensor],
          mu_products: List[Tensor],
          states: List[Dict],
          *,
          beta1: float,
          beta2: float,
          lr: float,
          weight_decay: float,
          momentum_decay: float,
          eps: float):
    r"""Functional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    """
    bias_correction1 = [1 - beta1 ** state['step'] for state in states]
    bias_correction2 = [1 - beta2 ** state['step'] for state in states]
    mus = [beta1 * (1. - 0.5 * (0.96 ** (state['step'] * momentum_decay)))
           for state in states]
    mu_nexts = [beta1 * (1. - 0.5 * (0.96 ** ((state['step'] + 1) * momentum_decay)))
                for state in states]

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avg, beta1)
    torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sq, beta2)
    torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
    denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size_grads = [(lr * (1. - mu) / (1. - mu_product)) * -1
                       for mu_product, mu in zip(mu_products, mus)]
    step_size_expavg = [(lr * mu_next / (1. - mu_product * mu_next)) * -1
                        for mu_product, mu_next in zip(mu_products, mu_nexts)]

    torch._foreach_addcdiv_(params, grads, denom, step_size_grads)
    torch._foreach_addcdiv_(params, exp_avg, denom, step_size_expavg)
def step(self):
    weight_decays = []
    for group in self.optim.param_groups:
        # absorb weight decay control from optimizer
        weight_decay = group['weight_decay'] if 'weight_decay' in group else 0
        weight_decays.append(weight_decay)
        group['weight_decay'] = 0

        params = []
        grads = []
        lrs = []
        for p in group['params']:
            if p.grad is None:
                continue

            param_norm = torch.norm(p.data)
            grad_norm = torch.norm(p.grad.data)

            if param_norm != 0 and grad_norm != 0:
                # calculate adaptive lr + weight decay
                # .item() may be sub-optimal, but required because _foreach_*
                # don't support broadcasting at the moment
                adaptive_lr = (self.trust_coefficient * param_norm /
                               (grad_norm + param_norm * weight_decay + self.eps)).item()

                # clip learning rate for LARC
                if self.clip:
                    # calculation of adaptive_lr so that when multiplied by lr
                    # it equals `min(adaptive_lr, lr)`
                    adaptive_lr = min(adaptive_lr / group['lr'], 1.0)

                params.append(p.data)
                grads.append(p.grad.data)
                lrs.append(adaptive_lr)

        # p.grad.data += weight_decay * p.data
        # p.grad.data *= adaptive_lr
        torch._foreach_add_(grads, params, alpha=weight_decay)
        torch._foreach_mul_(grads, lrs)

    self.optim.step()

    # return weight decay control to optimizer
    for i, group in enumerate(self.optim.param_groups):
        group['weight_decay'] = weight_decays[i]
def step(self, closure) -> torch.Tensor:
    """
    Args:
        closure: A closure that reevaluates the model and returns the loss.

    Returns:
        the loss value evaluated on the original point
    """
    closure = torch.enable_grad()(closure)
    loss = closure().detach()

    for group in self.param_groups:
        grads = []
        params_with_grads = []

        rho = group['rho']
        # update internal_optim's learning rate

        for p in group['params']:
            if p.grad is not None:
                # without clone().detach(), p.grad will be zeroed by closure()
                grads.append(p.grad.clone().detach())
                params_with_grads.append(p)
        device = grads[0].device

        # compute \hat{\epsilon} = \rho * g / ||g||
        grad_norm = torch.stack([g.detach().norm(2).to(device)
                                 for g in grads]).norm(2)
        epsilon = grads  # alias for readability
        torch._foreach_mul_(epsilon, rho / grad_norm)

        # virtual step toward \epsilon
        torch._foreach_add_(params_with_grads, epsilon)
        # compute g = \nabla_w L_B(w)|_{w + \hat{\epsilon}}
        closure()
        # virtual step back to the original point
        torch._foreach_sub_(params_with_grads, epsilon)

    super().step()
    return loss
def _multi_tensor_asgd(params: List[Tensor],
                       grads: List[Tensor],
                       axs: List[Tensor],
                       mus: List[Tensor],
                       etas: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       lambd: float,
                       lr: float,
                       t0: float,
                       alpha: float,
                       weight_decay: float):
    if len(params) == 0:
        return

    # update step
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # decay term
    eta = etas[0].item()
    torch._foreach_mul_(params, 1 - lambd * eta)

    # update parameter
    torch._foreach_add_(params, grads, alpha=-eta)

    # averaging
    for i in range(len(axs)):
        if mus[i].item() != 1:
            axs[i].add_(params[i].sub(axs[i]).mul(mus[i]))
        else:
            axs[i].copy_(params[i])

    # update eta and mu
    for i in range(len(mus)):
        new_eta = torch.tensor(
            lr / math.pow((1 + lambd * lr * state_steps[i].item()), alpha))
        etas[i].copy_(new_eta)
        new_mu = torch.tensor(1 / max(1, state_steps[i].item() - t0))
        mus[i].copy_(new_mu)
def _update(self):
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # _foreach_* ops are several times faster than a Python for loop
    o_p = [
        p.data for p in self._original_model.parameters()
        if isinstance(p, torch.Tensor)
    ]
    e_p = [
        p.data for p in self._ema_model.parameters()
        if isinstance(p, torch.Tensor)
    ]
    torch._foreach_mul_(e_p, self.momentum)
    torch._foreach_add_(e_p, o_p, alpha=1 - self.momentum)

    # some buffers are integer tensors used for counting etc.
    alpha = 0 if self.copy_buffer else self.momentum
    o_b = [
        b for b in self._original_model.buffers()
        if isinstance(b, torch.Tensor) and torch.is_floating_point(b)
    ]
    if len(o_b) > 0:
        e_b = [
            b for b in self._ema_model.buffers()
            if isinstance(b, torch.Tensor) and torch.is_floating_point(b)
        ]
        torch._foreach_mul_(e_b, alpha)
        torch._foreach_add_(e_b, o_b, alpha=1 - alpha)

    # integers
    o_b = [
        b for b in self._original_model.buffers()
        if isinstance(b, torch.Tensor) and not torch.is_floating_point(b)
    ]
    if len(o_b) > 0:
        e_b = [
            b for b in self._ema_model.buffers()
            if isinstance(b, torch.Tensor) and not torch.is_floating_point(b)
        ]
        for o, e in zip(o_b, e_b):
            e.copy_(o)
def _multi_tensor_rmsprop(params: List[Tensor],
                          grads: List[Tensor],
                          square_avgs: List[Tensor],
                          grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor],
                          *,
                          lr: float,
                          alpha: float,
                          eps: float,
                          weight_decay: float,
                          momentum: float,
                          centered: bool):
    if len(params) == 0:
        return

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
def adadelta(params: List[Tensor],
             grads: List[Tensor],
             square_avgs: List[Tensor],
             acc_deltas: List[Tensor],
             *,
             lr: float,
             weight_decay: float,
             rho: float,
             eps: float):
    r"""Functional API that performs Adadelta algorithm computation.

    See :class:`~torch.optim.Adadelta` for details.
    """
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
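# A hedged usage sketch for the adadelta functional above: the running-average
# state tensors are simply zero-initialized here, roughly as torch.optim.Adadelta
# would do before the first step (all values are arbitrary).
import torch

params = [torch.randn(10), torch.randn(4, 4)]
grads = [torch.randn_like(p) for p in params]
square_avgs = [torch.zeros_like(p) for p in params]
acc_deltas = [torch.zeros_like(p) for p in params]

adadelta(params, grads, square_avgs, acc_deltas,
         lr=1.0, weight_decay=0.0, rho=0.9, eps=1e-6)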
def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):
    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
def _multi_tensor_rmsprop(params: List[Tensor],
                          grads: List[Tensor],
                          square_avgs: List[Tensor],
                          grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor],
                          *,
                          lr: float,
                          alpha: float,
                          eps: float,
                          weight_decay: float,
                          momentum: float,
                          centered: bool,
                          maximize: bool,
                          differentiable: bool):
    if len(params) == 0:
        return

    assert not differentiable, "_foreach ops don't support autograd"

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    def _view_complex_as_real(tensor_list):
        return [
            torch.view_as_real(t) if torch.is_complex(t) else t
            for t in tensor_list
        ]

    grads = _view_complex_as_real(grads)
    params = _view_complex_as_real(params)
    square_avgs = _view_complex_as_real(square_avgs)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        grad_avgs = _view_complex_as_real(grad_avgs)
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        momentum_buffer_list = _view_complex_as_real(momentum_buffer_list)
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
def _multi_tensor_nadam(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor],
                        mu_products: List[Tensor],
                        state_steps: List[Tensor],
                        *,
                        beta1: float,
                        beta2: float,
                        lr: float,
                        weight_decay: float,
                        momentum_decay: float,
                        eps: float):
    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]
    mus = [
        beta1 * (1. - 0.5 * (0.96 ** (step.item() * momentum_decay)))
        for step in state_steps
    ]
    mu_nexts = [
        beta1 * (1. - 0.5 * (0.96 ** ((step.item() + 1) * momentum_decay)))
        for step in state_steps
    ]

    # update mu_products
    torch._foreach_mul_(mu_products, mus)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
    denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size_grads = [(lr * (1. - mu) / (1. - mu_product.item())) * -1
                       for mu_product, mu in zip(mu_products, mus)]
    step_size_expavg = [
        (lr * mu_next / (1. - mu_product.item() * mu_next)) * -1
        for mu_product, mu_next in zip(mu_products, mu_nexts)
    ]

    torch._foreach_addcdiv_(params, grads, denom, step_size_grads)
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size_expavg)
def step(self, closure=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        grads = []
        params_with_grad = []
        states = []
        alpha = group['alpha']
        square_avg = []

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'RMSprop does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    if group['momentum'] > 0:
                        state['momentum_buffer'] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)

                state['step'] += 1
                states.append(state)
                square_avg.append(state['square_avg'])

        if group['weight_decay'] != 0:
            torch._foreach_add_(grads, params_with_grad,
                                alpha=group['weight_decay'])

        torch._foreach_mul_(square_avg, alpha)
        torch._foreach_addcmul_(square_avg, grads, grads, value=1 - alpha)

        if group['centered']:
            grad_avgs = [s['grad_avg'] for s in states]
            torch._foreach_mul_(grad_avgs, alpha)
            torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
            avg = torch._foreach_addcmul(square_avg, grad_avgs, grad_avgs,
                                         value=-1)
            torch._foreach_sqrt_(avg)
            torch._foreach_add_(avg, group['eps'])
        else:
            avg = torch._foreach_sqrt(square_avg)
            torch._foreach_add_(avg, group['eps'])

        if group['momentum'] > 0:
            buf = [s['momentum_buffer'] for s in states]
            torch._foreach_mul_(buf, group['momentum'])
            torch._foreach_addcdiv_(buf, grads, avg)
            torch._foreach_add_(params_with_grad, buf, alpha=-group['lr'])
        else:
            torch._foreach_addcdiv_(params_with_grad, grads, avg,
                                    value=-group['lr'])

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    grads = []
    states = []
    params_with_grad = []
    step_sizes = []

    for group in self.param_groups:
        for p in group['params']:
            etaminus, etaplus = group['etas']
            step_size_min, step_size_max = group['step_sizes']

            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Rprop does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['prev'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    state['step_size'] = p.grad.new().resize_as_(
                        p.grad).fill_(group['lr'])

                state['step'] += 1
                states.append(state)
                step_sizes.append(state['step_size'])

    signs = torch._foreach_mul(grads, [s['prev'] for s in states])
    signs = [s.sign() for s in signs]
    for sign in signs:
        sign[sign.gt(0)] = etaplus
        sign[sign.lt(0)] = etaminus
        sign[sign.eq(0)] = 1

    # update stepsizes with step size updates
    torch._foreach_mul_(step_sizes, signs)
    for step_size in step_sizes:
        step_size.clamp_(step_size_min, step_size_max)

    # for dir<0, dfdx=0
    # for dir>=0 dfdx=dfdx
    for i in range(len(grads)):
        grads[i] = grads[i].clone(memory_format=torch.preserve_format)
        grads[i][signs[i].eq(etaminus)] = 0

    # update parameters
    grad_signs = [grad.sign() for grad in grads]
    torch._foreach_addcmul_(params_with_grad, grad_signs, step_sizes, value=-1)

    for i in range(len(states)):
        states[i]['prev'].copy_(grads[i])

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        amsgrad = group['amsgrad']

        grads = []
        states = []
        exp_avg = []
        exp_avg_sq = []
        max_exp_avg_sq = []
        params_with_grad = []

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Adam does not support sparse gradients, please consider SparseAdam instead'
                    )

                params_with_grad.append(p)
                grads.append(p.grad)

        for p in params_with_grad:
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

            exp_avg.append(state['exp_avg'])
            exp_avg_sq.append(state['exp_avg_sq'])

            if amsgrad:
                max_exp_avg_sq.append(state['max_exp_avg_sq'])

            state['step'] += 1
            states.append(state)

        beta1, beta2 = group['betas']

        bias_correction1 = [1 - beta1 ** state['step'] for state in states]
        bias_correction2 = [1 - beta2 ** state['step'] for state in states]

        if group['weight_decay'] != 0:
            grads = torch._foreach_add(grads, params_with_grad,
                                       alpha=group['weight_decay'])

        #
        # Decay the first and second moment running average coefficient
        #
        torch._foreach_mul_(exp_avg, beta1)
        torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

        torch._foreach_mul_(exp_avg_sq, beta2)
        torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            max_exp_avg_sq = torch._foreach_maximum(max_exp_avg_sq, exp_avg_sq)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sq)
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, group['eps'])
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, group['eps'])

        step_size = [(group['lr'] / bc) * -1 for bc in bias_correction1]
        torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    grads = []
    params_with_grad = []
    states = []

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'ASGD does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['eta'] = group['lr']
                    state['mu'] = 1
                    state['ax'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

                state['step'] += 1
                states.append(state)

        if group['weight_decay'] != 0:
            torch._foreach_add_(grads, params_with_grad,
                                alpha=group['weight_decay'])

        # decay term
        torch._foreach_mul_(params_with_grad,
                            1 - group['lambd'] * state['eta'])

        # update parameter
        torch._foreach_add_(params_with_grad, grads, alpha=-state['eta'])

        # averaging
        for i in range(len(states)):
            if states[i]['mu'] != 1:
                states[i]['ax'].add_(
                    params_with_grad[i].sub(states[i]['ax']).mul(states[i]['mu']))
            else:
                states[i]['ax'].copy_(params_with_grad[i])

        # update eta and mu
        for state in states:
            state['eta'] = (group['lr'] / math.pow(
                (1 + group['lambd'] * group['lr'] * state['step']),
                group['alpha']))
            state['mu'] = 1 / max(1, state['step'] - group['t0'])

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        grads = []
        params_with_grad = []
        states = []
        exp_avgs = []
        exp_infs = []

        beta1, beta2 = group['betas']
        eps = group['eps']

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError('Adamax does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    state['exp_inf'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

                exp_avgs.append(state['exp_avg'])
                exp_infs.append(state['exp_inf'])

                state['step'] += 1
                states.append(state)

        if group['weight_decay'] != 0:
            torch._foreach_add_(grads, params_with_grad,
                                alpha=group['weight_decay'])

        # Update biased first moment estimate.
        torch._foreach_mul_(exp_avgs, beta1)
        torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

        # Update the exponentially weighted infinity norm.
        torch._foreach_mul_(exp_infs, beta2)

        for exp_inf, grad in zip(exp_infs, grads):
            norm_buf = torch.cat([
                exp_inf.unsqueeze(0),
                grad.abs().add_(eps).unsqueeze_(0)
            ], 0)
            torch.max(norm_buf, 0, keepdim=False,
                      out=(exp_inf, exp_inf.new().long()))

        bias_corrections = [1 - beta1 ** state['step'] for state in states]
        clr = [group['lr'] / bias_correction
               for bias_correction in bias_corrections]

        for i in range(len(params_with_grad)):
            params_with_grad[i].addcdiv_(exp_avgs[i], exp_infs[i], value=-clr[i])

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        grads = []
        params_with_grad = []
        states = []
        has_sparse_grad = False

        for p in group['params']:
            if p.grad is not None:
                grads.append(p.grad)
                params_with_grad.append(p)
                states.append(self.state[p])

                if p.grad.is_sparse:
                    has_sparse_grad = True

                    if momentum != 0:
                        raise RuntimeError(
                            'SGD does not support momentum for sparse gradients')

        if grads == []:
            return loss

        if weight_decay != 0:
            grads = torch._foreach_add(grads, params_with_grad,
                                       alpha=weight_decay)

        if momentum != 0:
            bufs = []

            all_states_with_momentum_buffer = True
            for i in range(len(states)):
                if 'momentum_buffer' not in states[i]:
                    all_states_with_momentum_buffer = False
                    break
                else:
                    bufs.append(states[i]['momentum_buffer'])

            if all_states_with_momentum_buffer:
                torch._foreach_mul_(bufs, momentum)
                torch._foreach_add_(bufs, grads, alpha=1 - dampening)
            else:
                bufs = []
                for i in range(len(states)):
                    if 'momentum_buffer' not in states[i]:
                        buf = states[i]['momentum_buffer'] = torch.clone(
                            grads[i]).detach()
                    else:
                        buf = states[i]['momentum_buffer']
                        buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                    bufs.append(buf)

            if nesterov:
                torch._foreach_add_(grads, bufs, alpha=momentum)
            else:
                grads = bufs

        if not has_sparse_grad:
            torch._foreach_add_(params_with_grad, grads, alpha=-group['lr'])
        else:
            # foreach APIs don't support sparse
            for i in range(len(params_with_grad)):
                params_with_grad[i].add_(grads[i], alpha=-group['lr'])

    return loss