import torch

# Trainer-method excerpts: `self` is the trainer instance. Project-internal
# helpers (Stage, timing, precision, cuda) are assumed to be in scope.


def backprop(self, state, loss):
    if state.stage != Stage.TRAIN:
        return

    with timing.time("loss.backward"):
        precision.backward(state.optimizer, loss)

    state.scheduler.step_batch()

    if self.config.max_clip_norm is not None:
        # clip against state.optimizer to match the backward()/step() calls
        grad_norm = precision.clip_grad_norm(
            state.model, state.optimizer, self.config.max_clip_norm
        )
    else:
        grad_norm = None

    with timing.time("optimizer.step"):
        state.optimizer.step()

    # grad_norm could be used to check grads sync in distributed training
    return grad_norm
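
# Hedged usage sketch (not from the source): how backprop() might slot into a
# training step. run_train_step, batch, and state.model.get_loss are
# hypothetical names for illustration only.
def run_train_step(trainer, state, batch):
    state.optimizer.zero_grad()
    loss = state.model.get_loss(batch)  # hypothetical loss API
    grad_norm = trainer.backprop(state, loss)
    return loss.item(), grad_norm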

# Closure variant: `model` and `world_size` are captured from the enclosing
# training method (hence `self` is available even though the signature takes
# only `loss`).
def training_backprop(loss):
    with timing.time("loss.backward"):
        precision.backward(self.optimizer, loss)
        if world_size > 1:
            # DDP fix when some parameters don't receive grads
            for p in model.parameters():
                if p.requires_grad and p.grad is None:
                    p.backward(torch.zeros_like(p.data))

    if self.lr_scheduler:
        self.lr_scheduler.step_batch()

    if self.config.max_clip_norm is not None:
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), self.config.max_clip_norm
        )
    else:
        grad_norm = None

    with timing.time("optimizer.step"):
        self.optimizer.step()

    # grad_norm could be used to check grads sync in distributed training
    return grad_norm
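
# Minimal, self-contained sketch (not from the source) of the DDP fix above:
# parameters unused in the forward pass are left with p.grad is None, which
# can stall DistributedDataParallel's gradient all-reduce. Calling
# p.backward(torch.zeros_like(p.data)) accumulates a zero gradient into
# p.grad, so every rank reduces the same set of tensors.
import torch

used = torch.nn.Linear(4, 4)
unused = torch.nn.Linear(4, 4)  # never touched by the forward pass

used(torch.randn(2, 4)).sum().backward()

for p in unused.parameters():
    if p.requires_grad and p.grad is None:
        p.backward(torch.zeros_like(p.data))  # zero-fill the missing grad

assert all(p.grad is not None for p in unused.parameters())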

# State-based variant combining the Stage check with the DDP fix.
def backprop(self, state, loss):
    if state.stage != Stage.TRAIN:
        return

    with timing.time("loss.backward"):
        precision.backward(state.optimizer, loss)
        if cuda.DISTRIBUTED_WORLD_SIZE > 1:
            # DDP fix when some parameters don't receive grads
            for p in state.model.parameters():
                if p.requires_grad and p.grad is None:
                    p.backward(torch.zeros_like(p.data))

    state.scheduler.step_batch()

    if self.config.max_clip_norm is not None:
        # clip against state.optimizer to match the backward()/step() calls
        grad_norm = precision.clip_grad_norm(
            state.model, state.optimizer, self.config.max_clip_norm
        )
    else:
        grad_norm = None

    with timing.time("optimizer.step"):
        state.optimizer.step()

    # grad_norm could be used to check grads sync in distributed training
    return grad_norm
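
# Plain-PyTorch sketch (not from the source) of what gradient-norm clipping
# does; precision.clip_grad_norm above is presumably a mixed-precision-aware
# wrapper around the same torch utility.
import torch

model = torch.nn.Linear(8, 1)
model(torch.randn(16, 8)).sum().backward()

# Returns the total gradient norm *before* clipping; gradients are rescaled
# in place so their total norm does not exceed max_norm.
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)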