def test_compare_cpu_and_gpu(dtype):
    param_norm = torch.tensor(1., dtype=dtype)
    grad_norm = torch.tensor(1., dtype=dtype)
    adaptive_lr_cpu = torch.tensor(0., dtype=dtype)
    weight_decay = 1.
    eps = 2.
    trust_coef = 1.

    adaptive_lr_cpu = compute_adaptive_lr(
        param_norm, grad_norm, weight_decay, eps, trust_coef, adaptive_lr_cpu)

    param_norm = torch.tensor(1., dtype=dtype, device='cuda')
    grad_norm = torch.tensor(1., dtype=dtype, device='cuda')
    adaptive_lr_gpu = torch.tensor(0., dtype=dtype, device='cuda')
    weight_decay = 1.
    eps = 2.
    trust_coef = 1.

    adaptive_lr_gpu = compute_adaptive_lr(
        param_norm, grad_norm, weight_decay, eps, trust_coef, adaptive_lr_gpu)

    assert torch.allclose(adaptive_lr_cpu, adaptive_lr_gpu.cpu())
def test_specific_case(dtype):
    param_norm = torch.tensor(1.234, dtype=dtype)
    grad_norm = torch.tensor(5.678, dtype=dtype)
    adaptive_lr = torch.tensor(0., dtype=dtype)
    weight_decay = 1e-4
    eps = 1e-8
    trust_coef = 0.001

    adaptive_lr = compute_adaptive_lr(
        param_norm, grad_norm, weight_decay, eps, trust_coef, adaptive_lr)

    assert torch.allclose(adaptive_lr, torch.tensor(0.000217325, dtype=dtype))
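# The expected value above is the LARS trust ratio worked out by hand (see the
# reference sketch below):
# 0.001 * 1.234 / (5.678 + 1e-4 * 1.234 + 1e-8) ~= 0.000217325.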
def test_when_grad_norm_is_zero_with_half():
    param_norm = torch.tensor(1., dtype=torch.half, device='cuda')
    grad_norm = torch.tensor(0., dtype=torch.half, device='cuda')
    adaptive_lr = torch.tensor(0., dtype=torch.half, device='cuda')
    weight_decay = 1.
    eps = 1.
    trust_coef = 1.

    adaptive_lr = compute_adaptive_lr(
        param_norm, grad_norm, weight_decay, eps, trust_coef, adaptive_lr)

    assert adaptive_lr == torch.tensor(1., dtype=torch.half, device='cuda')
def test_when_grad_norm_is_zero(dtype):
    param_norm = torch.tensor(1., dtype=dtype)
    grad_norm = torch.tensor(0., dtype=dtype)
    adaptive_lr = torch.tensor(0., dtype=dtype)
    weight_decay = 1.
    eps = 1.
    trust_coef = 1.

    adaptive_lr = compute_adaptive_lr(
        param_norm, grad_norm, weight_decay, eps, trust_coef, adaptive_lr)

    assert adaptive_lr == torch.tensor(1., dtype=dtype)
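# A pure-Python reference for the semantics the tests above assume. This is an
# illustrative sketch (the name is hypothetical), not the compute_adaptive_lr
# under test: when either norm is zero the adaptive lr falls back to 1 so the
# update degenerates to a plain gradient step; otherwise it is the LARS trust
# ratio.
def reference_compute_adaptive_lr(param_norm, grad_norm, weight_decay, eps,
                                  trust_coef, adaptive_lr):
    if param_norm > 0 and grad_norm > 0:
        # Trust ratio: trust_coef * ||p|| / (||g|| + weight_decay * ||p|| + eps)
        adaptive_lr.copy_(
            trust_coef * param_norm / (grad_norm + weight_decay * param_norm + eps))
    else:
        adaptive_lr.fill_(1.)
    return adaptive_lr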
def apply_adaptive_lrs(self, weight_decays):
    with torch.no_grad():
        for group, weight_decay in zip(self.optim.param_groups, weight_decays):
            if weight_decay is None:
                weight_decay = 0.0

            for p in group['params']:
                if p.grad is None:
                    continue

                if group.get('lars_adaptation', True):
                    param_norm = p.norm()
                    grad_norm = p.grad.norm()

                    # The Optimizer interface provides no way to change the
                    # dtype or device of inner tensors such as `adaptive_lr`,
                    # and LARS follows that interface, so it cannot change
                    # them explicitly either. Instead, LARS adjusts its member
                    # tensor implicitly: it compares the (device, dtype) spec
                    # of `adaptive_lr` with that of the parameter's norm and
                    # reallocates `adaptive_lr` to match when they differ.
                    param_norm_spec = (param_norm.is_cuda, param_norm.type())
                    adaptive_lr_spec = (self.adaptive_lr.is_cuda, self.adaptive_lr.type())

                    if param_norm_spec != adaptive_lr_spec:
                        self.adaptive_lr = torch.ones_like(param_norm)

                    # Calculate adaptive lr & weight decay.
                    adaptive_lr = compute_adaptive_lr(
                        param_norm,
                        grad_norm,
                        weight_decay,
                        self.eps,
                        self.trust_coef,
                        self.adaptive_lr)
                else:
                    adaptive_lr = group['lr']

                p.grad.add_(p.data, alpha=weight_decay)
                p.grad.mul_(adaptive_lr)
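# Usage sketch (hypothetical names; assumes a LARS class that wraps a base
# optimizer and calls apply_adaptive_lrs before delegating to its step()):
#
#     base = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
#     optimizer = LARS(base, eps=1e-8, trust_coef=0.001)
#     loss.backward()
#     optimizer.step()  # grads are rescaled by the adaptive lr, then applied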