import math
from typing import List, Optional

import torch
from torch import Tensor


def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)

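# A minimal usage sketch (not part of the original source) showing the calling
# convention these private foreach kernels share: the caller owns the optimizer
# state and passes flat, same-length tensor lists. The helper name and values
# below are made up for illustration; the private torch._foreach_* APIs may
# change between releases.
def _demo_adadelta_kernel():
    params = [torch.randn(3) for _ in range(2)]
    grads = [torch.randn_like(p) for p in params]
    square_avgs = [torch.zeros_like(p) for p in params]
    acc_deltas = [torch.zeros_like(p) for p in params]
    _multi_tensor_adadelta(params, grads, square_avgs, acc_deltas,
                           lr=1.0, weight_decay=0.0, rho=0.9, eps=1e-6,
                           maximize=False)
    return params
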
def _multi_tensor_sgd(params: List[Tensor],
                      grads: List[Tensor],
                      momentum_buffer_list: List[Optional[Tensor]],
                      *,
                      weight_decay: float,
                      momentum: float,
                      lr: float,
                      dampening: float,
                      nesterov: bool,
                      maximize: bool,
                      has_sparse_grad: bool):

    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    if weight_decay != 0:
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    if momentum != 0:
        bufs = []

        all_states_with_momentum_buffer = True
        for i in range(len(momentum_buffer_list)):
            if momentum_buffer_list[i] is None:
                all_states_with_momentum_buffer = False
                break
            else:
                bufs.append(momentum_buffer_list[i])

        if all_states_with_momentum_buffer:
            torch._foreach_mul_(bufs, momentum)
            torch._foreach_add_(bufs, grads, alpha=1 - dampening)
        else:
            bufs = []
            for i in range(len(momentum_buffer_list)):
                if momentum_buffer_list[i] is None:
                    buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
                else:
                    buf = momentum_buffer_list[i]
                    buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                bufs.append(buf)

        if nesterov:
            torch._foreach_add_(grads, bufs, alpha=momentum)
        else:
            grads = bufs

    if not has_sparse_grad:
        torch._foreach_add_(params, grads, alpha=-lr)
    else:
        # foreach APIs don't support sparse tensors, so fall back to a per-parameter loop
        for i in range(len(params)):
            params[i].add_(grads[i], alpha=-lr)

def _multi_tensor_adam(params: List[Tensor],
                       grads: List[Tensor],
                       exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor],
                       max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       amsgrad: bool,
                       beta1: float,
                       beta2: float,
                       lr: float,
                       weight_decay: float,
                       eps: float,
                       maximize: bool):

    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficients
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if amsgrad:
        # Maintains the maximum of all 2nd moment running avg. till now
        max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]

        # Use the max. for normalizing running avg. of gradient
        max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
    else:
        exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size = [(lr / bc) * -1 for bc in bias_correction1]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)

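# An illustrative cross-check (hypothetical helper, not in the original source):
# for a single parameter at step t = 1, the foreach Adam kernel above should
# match the textbook update
#   m_t = beta1*m + (1-beta1)*g,  v_t = beta2*v + (1-beta2)*g^2
#   p_t = p - lr * (m_t / (1-beta1^t)) / (sqrt(v_t) / sqrt(1-beta2^t) + eps)
def _demo_adam_matches_textbook():
    torch.manual_seed(0)
    p = torch.randn(4)
    g = torch.randn(4)
    beta1, beta2, lr, eps = 0.9, 0.999, 1e-3, 1e-8
    # reference single-tensor math, starting from zero state
    m = (1 - beta1) * g
    v = (1 - beta2) * g * g
    ref = p - lr * (m / (1 - beta1)) / (v.sqrt() / math.sqrt(1 - beta2) + eps)
    # foreach kernel on one-element lists
    params, grads = [p.clone()], [g.clone()]
    exp_avgs, exp_avg_sqs = [torch.zeros(4)], [torch.zeros(4)]
    max_exp_avg_sqs, state_steps = [torch.zeros(4)], [torch.zeros(1)]
    _multi_tensor_adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
                       state_steps, amsgrad=False, beta1=beta1, beta2=beta2,
                       lr=lr, weight_decay=0.0, eps=eps, maximize=False)
    assert torch.allclose(params[0], ref, atol=1e-6)
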
def _multi_tensor_rmsprop(params: List[Tensor],
                          grads: List[Tensor],
                          square_avgs: List[Tensor],
                          grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor],
                          *,
                          lr: float,
                          alpha: float,
                          eps: float,
                          weight_decay: float,
                          momentum: float,
                          centered: bool,
                          maximize: bool,
                          differentiable: bool):

    if len(params) == 0:
        return

    assert not differentiable, "_foreach ops don't support autograd"

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    def _view_complex_as_real(tensor_list):
        return [torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list]

    grads = _view_complex_as_real(grads)
    params = _view_complex_as_real(params)
    square_avgs = _view_complex_as_real(square_avgs)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        grad_avgs = _view_complex_as_real(grad_avgs)
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        momentum_buffer_list = _view_complex_as_real(momentum_buffer_list)
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)

def _multi_tensor_rprop(params: List[Tensor],
                        grads: List[Tensor],
                        prevs: List[Tensor],
                        step_sizes: List[Tensor],
                        *,
                        step_size_min: float,
                        step_size_max: float,
                        etaminus: float,
                        etaplus: float,
                        maximize: bool):

    if len(params) == 0:
        return

    # Handle complex params as views over their real components
    def _view_complex_as_real(tensor_list):
        return [torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list]

    grads = _view_complex_as_real(grads)
    prevs = _view_complex_as_real(prevs)
    params = _view_complex_as_real(params)
    step_sizes = _view_complex_as_real(step_sizes)

    if maximize:
        grads = torch._foreach_neg(grads)

    signs = torch._foreach_mul(grads, prevs)
    signs = [s.sign() for s in signs]
    for sign in signs:
        sign[sign.gt(0)] = etaplus
        sign[sign.lt(0)] = etaminus
        sign[sign.eq(0)] = 1

    # update step sizes with step size updates
    torch._foreach_mul_(step_sizes, signs)
    for step_size in step_sizes:
        step_size.clamp_(step_size_min, step_size_max)

    # for dir < 0, dfdx = 0
    # for dir >= 0, dfdx = dfdx
    grads = list(grads)
    for i in range(len(grads)):
        grads[i] = grads[i].clone(memory_format=torch.preserve_format)
        grads[i][signs[i].eq(etaminus)] = 0

    # update parameters
    grad_signs = [grad.sign() for grad in grads]
    torch._foreach_addcmul_(params, grad_signs, step_sizes, value=-1)

    for i in range(len(prevs)):
        prevs[i].copy_(grads[i])

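# A small illustration (hypothetical helper, not in the original source) of the
# Rprop sign trick used above: the elementwise sign of grad * prev_grad is
# remapped in place to the step-size multipliers etaplus (sign agreement),
# etaminus (sign change), or 1 (either factor is zero).
def _demo_rprop_sign_mapping():
    etaplus, etaminus = 1.2, 0.5
    grad = torch.tensor([0.3, -0.7, 0.0, 0.2])
    prev = torch.tensor([0.1, 0.4, 0.5, -0.6])
    sign = (grad * prev).sign()
    sign[sign.gt(0)] = etaplus
    sign[sign.lt(0)] = etaminus
    sign[sign.eq(0)] = 1
    assert torch.equal(sign, torch.tensor([1.2, 0.5, 1.0, 0.5]))
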
def _multi_tensor_adamax(params: List[Tensor],
                         grads: List[Tensor],
                         exp_avgs: List[Tensor],
                         exp_infs: List[Tensor],
                         state_steps: List[Tensor],
                         *,
                         beta1: float,
                         beta2: float,
                         lr: float,
                         weight_decay: float,
                         eps: float,
                         maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in params]
    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs]
    exp_infs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_infs]

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Update biased first moment estimate.
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    # Update the exponentially weighted infinity norm.
    torch._foreach_mul_(exp_infs, beta2)

    for exp_inf, grad in zip(exp_infs, grads):
        norm_buf = torch.cat([
            exp_inf.unsqueeze(0),
            grad.abs().add_(eps).unsqueeze_(0)
        ], 0)
        torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))

    bias_corrections = [1 - beta1 ** step.item() for step in state_steps]
    clr = [-1 * (lr / bias_correction) for bias_correction in bias_corrections]
    torch._foreach_addcdiv_(params, exp_avgs, exp_infs, clr)

def _multi_tensor_asgd(params: List[Tensor],
                       grads: List[Tensor],
                       axs: List[Tensor],
                       mus: List[Tensor],
                       etas: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       lambd: float,
                       lr: float,
                       t0: float,
                       alpha: float,
                       weight_decay: float,
                       maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    # update step
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # decay term
    eta = etas[0].item()
    torch._foreach_mul_(params, 1 - lambd * eta)

    # update parameter
    torch._foreach_add_(params, grads, alpha=-eta)

    # averaging
    for i in range(len(axs)):
        if mus[i].item() != 1:
            axs[i].add_(params[i].sub(axs[i]).mul(mus[i]))
        else:
            axs[i].copy_(params[i])

    # update eta and mu
    for i in range(len(mus)):
        new_eta = torch.tensor(lr / math.pow((1 + lambd * lr * state_steps[i].item()), alpha))
        etas[i].copy_(new_eta)
        new_mu = torch.tensor(1 / max(1, state_steps[i].item() - t0))
        mus[i].copy_(new_mu)

def _multi_tensor_adagrad(
    params: List[Tensor],
    grads: List[Tensor],
    state_sums: List[Tensor],
    state_steps: List[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
):
    # Foreach functions will throw errors if given empty lists
    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if has_sparse_grad:
        return _single_tensor_adagrad(
            params,
            grads,
            state_sums,
            state_steps,
            lr=lr,
            weight_decay=weight_decay,
            lr_decay=lr_decay,
            eps=eps,
            has_sparse_grad=has_sparse_grad,
            maximize=False,
        )

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    state_sums = [torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums]

    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
    toAdd = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
    toAdd = [
        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
        for i, x in enumerate(toAdd)
    ]
    torch._foreach_add_(params, toAdd)
    state_sums = [
        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
        for i, x in enumerate(state_sums)
    ]

def _multi_tensor_adamw(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor],
                        max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor],
                        *,
                        amsgrad: bool,
                        beta1: float,
                        beta2: float,
                        lr: float,
                        weight_decay: float,
                        eps: float,
                        maximize: bool,
                        capturable: bool):

    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs]
    exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avg_sqs]
    params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in params]

    # update steps
    torch._foreach_add_(state_steps, 1)

    # Perform stepweight decay
    torch._foreach_mul_(params, 1 - lr * weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if capturable:
        # TODO: use foreach_pow if/when foreach_pow is added
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)

            # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
            # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
            torch._foreach_div_(max_exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
        bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]

        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)

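# An illustrative sanity check (hypothetical helper, not part of the original
# source) for the "capturable" algebra in _multi_tensor_adamw: with
# step_size = -lr / bias_correction1 kept as a (possibly CUDA) tensor, folding
# it into the denominator lets the final addcdiv_ run with the default value=1
# while still producing step_size * m / (sqrt(v) / sqrt(bias_correction2) + eps).
def _demo_adamw_capturable_denominator():
    lr, eps, bc1, bc2 = 1e-3, 1e-8, 0.1, 0.001999
    m = torch.randn(5)           # stand-in for exp_avg
    v = torch.rand(5) + 1e-3     # stand-in for exp_avg_sq
    step_size = -lr / bc1
    reference = step_size * m / (v.sqrt() / math.sqrt(bc2) + eps)
    folded_denom = v.sqrt() / (math.sqrt(bc2) * step_size) + eps / step_size
    assert torch.allclose(reference, m / folded_denom)
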
def step(self, closure=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        amsgrad = group['amsgrad']

        grads = []
        states = []
        exp_avg = []
        exp_avg_sq = []
        max_exp_avg_sq = []
        params_with_grad = []

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients')

                # Perform stepweight decay
                p.mul_(1 - group['lr'] * group['weight_decay'])

                params_with_grad.append(p)
                grads.append(p.grad)

        if group['maximize']:
            grads = torch._foreach_neg(tuple(grads))

        for p in params_with_grad:
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

            exp_avg.append(state['exp_avg'])
            exp_avg_sq.append(state['exp_avg_sq'])

            if amsgrad:
                max_exp_avg_sq.append(state['max_exp_avg_sq'])

            state['step'] += 1
            states.append(state)

        beta1, beta2 = group['betas']

        bias_correction1 = [1 - beta1 ** state['step'] for state in states]
        bias_correction2 = [1 - beta2 ** state['step'] for state in states]

        # Decay the first and second moment running average coefficient
        torch._foreach_mul_(exp_avg, beta1)
        torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

        torch._foreach_mul_(exp_avg_sq, beta2)
        torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            max_exp_avg_sq = torch._foreach_maximum(max_exp_avg_sq, exp_avg_sq)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sq)
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, group['eps'])
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, group['eps'])

        step_size = [-1 * (group['lr'] / bc) for bc in bias_correction1]
        torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

    return loss

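# A hedged usage sketch (not part of the original source): the step(closure)
# protocol above is the same one the public torch.optim optimizers expose, so a
# training loop drives it as below. torch.optim.AdamW is used as a stand-in for
# the multi-tensor optimizer class this method belongs to, which is not shown here.
def _demo_step_with_closure():
    model = torch.nn.Linear(4, 1)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
    x, y = torch.randn(8, 4), torch.randn(8, 1)

    def closure():
        # re-evaluate the model and return the loss, as step() expects
        opt.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        return loss

    return opt.step(closure)
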