Example #1
    def optimizer_step(self, state):
        if state.stage != Stage.TRAIN:
            return

        # advance the per-batch learning rate scheduler before stepping the optimizer
        state.scheduler.step_batch()

        # optionally clip the gradient norm before the optimizer step
        if self.config.max_clip_norm is not None:
            grad_norm = precision.clip_grad_norm(state.model, state.optimizer,
                                                 self.config.max_clip_norm)
        else:
            grad_norm = None

        with timing.time("optimizer.step"):
            state.optimizer.step()
        # grad_norm can be used to verify that gradients stay in sync across
        # workers in distributed training
        return grad_norm
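
For context, here is a minimal sketch of how a method like optimizer_step is typically driven from a training loop. The trainer argument is assumed to expose the backprop and optimizer_step methods shown in these examples, and state to carry the model and optimizer as above; all other names are illustrative assumptions, not part of the example itself.

    import torch.nn.functional as F

    def train_one_batch(trainer, state, inputs, targets):
        # Hypothetical driver for one training step (names are illustrative).
        state.optimizer.zero_grad()
        logits = state.model(inputs)
        loss = F.cross_entropy(logits, targets)      # assumed classification loss
        trainer.backprop(state, loss)                # backward pass (see Example #2)
        grad_norm = trainer.optimizer_step(state)    # optional clip + optimizer step
        return loss.item(), grad_norm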
Example #2
    def backprop(self, state, loss):
        if state.stage != Stage.TRAIN:
            return

        with timing.time("loss.backward"):
            precision.backward(state.optimizer, loss)
            if cuda.DISTRIBUTED_WORLD_SIZE > 1:
                # DDP fix: give parameters that received no gradient a zero
                # gradient so the distributed gradient all-reduce does not hang
                for p in state.model.parameters():
                    if p.requires_grad and p.grad is None:
                        p.backward(torch.zeros_like(p.data))

        # advance the per-batch learning rate scheduler before stepping the optimizer
        state.scheduler.step_batch()

        # optionally clip the gradient norm before the optimizer step
        if self.config.max_clip_norm is not None:
            grad_norm = precision.clip_grad_norm(state.model, state.optimizer,
                                                 self.config.max_clip_norm)
        else:
            grad_norm = None

        with timing.time("optimizer.step"):
            state.optimizer.step()
        # grad_norm can be used to verify that gradients stay in sync across
        # workers in distributed training
        return grad_norm
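
As a point of reference, the clip-then-step portion of both examples can also be written against plain PyTorch without the precision and timing helpers. The sketch below is a minimal equivalent under that assumption, not the library's own implementation, and it does not handle mixed-precision gradient unscaling.

    import torch

    def clip_and_step(model, optimizer, max_clip_norm=None):
        # Clip the gradient norm (if requested) and take one optimizer step.
        grad_norm = None
        if max_clip_norm is not None:
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_clip_norm)
        optimizer.step()
        return grad_norm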