Example #1
File: trainer.py  Project: oasis100/pytext
    def backprop(self, state, loss):
        if state.stage != Stage.TRAIN:
            return

        with timing.time("loss.backward"):
            precision.backward(state.optimizer, loss)

        state.scheduler.step_batch()

        if self.config.max_clip_norm is not None:
            grad_norm = precision.clip_grad_norm(state.model, self.optimizer,
                                                 self.config.max_clip_norm)
        else:
            grad_norm = None

        with timing.time("optimizer.step"):
            state.optimizer.step()
        # grad_norm could be used to check grads sync in distributed training
        return grad_norm
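
A minimal, self-contained sketch of the clipping step above, using plain torch.nn.utils.clip_grad_norm_ instead of the pytext precision wrapper (the Linear model and the threshold of 1.0 are illustrative assumptions, not pytext code): clip_grad_norm_ rescales the gradients in place and returns the total norm measured before clipping, which is the grad_norm value that backprop() hands back to its caller.

import torch
import torch.nn as nn

# Toy model standing in for state.model; any module with gradients works.
model = nn.Linear(4, 2)
model(torch.randn(8, 4)).sum().backward()

max_clip_norm = 1.0  # plays the role of config.max_clip_norm
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_clip_norm)
print(float(grad_norm))  # total gradient norm before clipping was applied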
Example #2
File: trainer.py  Project: haydenliu/pytext
        def training_backprop(loss):
            with timing.time("loss.backward"):
                precision.backward(self.optimizer, loss)
                if world_size > 1:
                    # DDP fix when some parameters don't receive grads
                    for p in model.parameters():
                        if p.requires_grad and p.grad is None:
                            p.backward(torch.zeros_like(p.data))

            if self.lr_scheduler:
                self.lr_scheduler.step_batch()

            if self.config.max_clip_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.config.max_clip_norm)
            else:
                grad_norm = None

            with timing.time("optimizer.step"):
                self.optimizer.step()
            # grad_norm could be used to check grads sync in distributed training
            return grad_norm
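
The world_size > 1 branch above works around DDP stalling when some parameters never receive a gradient: calling p.backward(torch.zeros_like(p.data)) on a leaf parameter simply accumulates a zero gradient into p.grad, so every rank reduces the same set of tensors. Below is a minimal single-process sketch of that effect, with a deliberately unused submodule (ToyModel is an illustrative stand-in, not pytext code).

import torch
import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.used = nn.Linear(4, 2)
        self.unused = nn.Linear(4, 2)  # never touched in forward()

    def forward(self, x):
        return self.used(x)

model = ToyModel()
model(torch.randn(8, 4)).sum().backward()

# The parameters of self.unused still have p.grad is None here.
for p in model.parameters():
    if p.requires_grad and p.grad is None:
        # Accumulates a zero gradient into p.grad without touching the graph.
        p.backward(torch.zeros_like(p.data))

assert all(p.grad is not None for p in model.parameters())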
Example #3
    def backprop(self, state, loss):
        if state.stage != Stage.TRAIN:
            return

        with timing.time("loss.backward"):
            precision.backward(state.optimizer, loss)
            if cuda.DISTRIBUTED_WORLD_SIZE > 1:
                # DDP fix when some parameters don't receive grads
                for p in state.model.parameters():
                    if p.requires_grad and p.grad is None:
                        p.backward(torch.zeros_like(p.data))

        state.scheduler.step_batch()

        if self.config.max_clip_norm is not None:
            grad_norm = precision.clip_grad_norm(state.model, self.optimizer,
                                                 self.config.max_clip_norm)
        else:
            grad_norm = None

        with timing.time("optimizer.step"):
            state.optimizer.step()
        # grad_norm could be used to check grads sync in distributed training
        return grad_norm
Example #4
    def backprop(self, state, loss):
        if state.stage != Stage.TRAIN:
            return

        with timing.time("loss.backward"):
            precision.backward(state.optimizer, loss)