Example No. 1
    def _run_optimization(
        self,
        split_batch: Any,
        batch_idx: int,
        optimizer: torch.optim.Optimizer,
        opt_idx: int,
    ) -> Optional[ClosureResult]:
        """Runs closure (train step + backward) together with optimization if necessary.

        Args:
            split_batch: the current tbptt split of the whole batch
            batch_idx: the index of the current batch
            optimizer: the current optimizer
            opt_idx: the index of the current optimizer
        """
        # toggle model params
        self._run_optimization_start(opt_idx, optimizer)

        closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer, self._hiddens)

        if self.trainer.fit_loop.should_accumulate():
            # For gradient accumulation

            # -------------------
            # calculate loss (train step + train step end)
            # -------------------
            # automatic_optimization=True: perform ddp sync only when performing optimizer_step
            with _block_parallel_sync_behavior(self.trainer, block=True):
                closure()

        # ------------------------------
        # BACKWARD PASS
        # ------------------------------
        # gradient update with accumulated gradients
        else:
            self._optimizer_step(optimizer, opt_idx, batch_idx, closure)

        result = closure.get_result()

        if result:
            # if no result, user decided to skip optimization
            # otherwise update running loss + reset accumulated loss
            # TODO: find proper way to handle updating running loss
            assert self.trainer.fit_loop is not None
            assert self.trainer.fit_loop.epoch_loop is not None
            assert self.trainer.fit_loop.epoch_loop.batch_loop is not None
            assert result.loss is not None
            self.trainer.fit_loop.epoch_loop.batch_loop._update_running_loss(result.loss)

        # untoggle model params
        self._run_optimization_end(opt_idx)
        return result
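
The pattern above mirrors plain PyTorch: a closure bundles the forward and backward pass, and the optimizer step receives it so that optimizers which re-evaluate the loss (such as LBFGS) can call it themselves. A minimal standalone sketch, with a made-up toy model and data, looks like this:

import torch

# toy setup (assumed for illustration): a tiny model, an optimizer and one batch
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.LBFGS(model.parameters(), lr=0.1)
x, y = torch.randn(8, 4), torch.randn(8, 1)

def closure():
    # "train step + backward", the same work Lightning wraps into its closure object
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

# the closure is handed to the optimizer step, just as `_optimizer_step` does above
optimizer.step(closure)
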
Example No. 2
    def _run_optimization(
        self,
        batch_idx: int,
        split_batch: Any,
        opt_idx: Optional[int] = None,
        optimizer: Optional[torch.optim.Optimizer] = None,
    ) -> Optional[ClosureResult]:
        """Runs closure (train step + backward) together with optimization if necessary.

        Args:
            batch_idx: the index of the current batch
            split_batch: the current tbptt split of the whole batch
            opt_idx: the index of the current optimizer or `None` in case of manual optimization
            optimizer: the current optimizer or `None` in case of manual optimization
        """
        # toggle model params
        self._run_optimization_start(opt_idx, optimizer)

        closure = self._make_closure(split_batch, batch_idx, opt_idx,
                                     optimizer, self._hiddens)

        if self.trainer.fit_loop.should_accumulate():
            # For gradient accumulation

            # -------------------
            # calculate loss (train step + train step end)
            # -------------------
            # automatic_optimization: perform ddp sync only when performing optimizer_step
            with _block_parallel_sync_behavior(self._trainer):
                closure()

        # ------------------------------
        # BACKWARD PASS
        # ------------------------------
        # gradient update with accumulated gradients
        else:
            if self.trainer.lightning_module.automatic_optimization:
                self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
            else:
                closure()

        result = closure.get_result()

        if result:
            # if no result, user decided to skip optimization
            # otherwise update running loss + reset accumulated loss
            self._update_running_loss(result.loss)

        # untoggle model params
        self._run_optimization_end(opt_idx)
        return result
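
This version adds the manual-optimization branch: when `automatic_optimization` is disabled, the loop only runs the closure and the user's `training_step` drives the optimizer itself. A minimal sketch of such a module, with made-up layer sizes and loss, could look like this:

import torch
import pytorch_lightning as pl

class ManualOptModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # tells the loop above to take the `closure()` branch instead of `_optimizer_step`
        self.automatic_optimization = False
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        x, y = batch
        loss = torch.nn.functional.cross_entropy(self.layer(x), y)
        self.manual_backward(loss)
        opt.step()
        opt.zero_grad()

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
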
Example No. 3
    def _run_optimization(self, kwargs: OrderedDict, optimizer: torch.optim.Optimizer) -> ClosureResult:
        """Runs closure (train step + backward) together with optimization if necessary.

        Args:
            kwargs: the kwargs passed down to the hooks.
            optimizer: the current optimizer
        """
        opt_idx = kwargs.get("optimizer_idx", 0)

        # toggle model params
        self._run_optimization_start(opt_idx, optimizer)

        closure = self._make_closure(kwargs, optimizer)

        if (
            # when the strategy handles accumulation, we want to always call the optimizer step
            not self.trainer.strategy.handles_gradient_accumulation
            and self.trainer.fit_loop._should_accumulate()
        ):
            # For gradient accumulation

            # -------------------
            # calculate loss (train step + train step end)
            # -------------------
            # automatic_optimization=True: perform ddp sync only when performing optimizer_step
            with _block_parallel_sync_behavior(self.trainer.strategy, block=True):
                closure()

        # ------------------------------
        # BACKWARD PASS
        # ------------------------------
        # gradient update with accumulated gradients
        else:
            # the `batch_idx` is optional with inter-batch parallelism
            self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)

        result = closure.consume_result()

        if result.loss is not None:
            # if no result, user decided to skip optimization
            # otherwise update running loss + reset accumulated loss
            # TODO: find proper way to handle updating running loss
            self.trainer.fit_loop.epoch_loop.batch_loop._update_running_loss(result.loss)

        # untoggle model params
        self._run_optimization_end(opt_idx)
        return result
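
Both the accumulation check (`_should_accumulate`, driven by `Trainer(accumulate_grad_batches=...)`) and `_block_parallel_sync_behavior` serve the same goal: only synchronize DDP gradients on the step that actually calls the optimizer. Lightning's helper operates on the strategy, but the underlying idea can be sketched with DDP's `no_sync()` context manager (a rough, hypothetical stand-in named `block_ddp_sync`, not the actual implementation):

from contextlib import contextmanager

import torch
from torch.nn.parallel import DistributedDataParallel

@contextmanager
def block_ddp_sync(module: torch.nn.Module, block: bool = True):
    # skip the gradient all-reduce on accumulation steps;
    # the sync then happens on the step that calls the optimizer
    if block and isinstance(module, DistributedDataParallel):
        with module.no_sync():
            yield
    else:
        yield
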
Example No. 4
    @contextmanager  # requires `from contextlib import contextmanager` at module level
    def toggle_model(self, sync_grad: bool = True):
        """This function is just a helper for advanced users.

        Consider the current optimizer as A and all other optimizers as B.
        Toggling means that all parameters from B exclusive to A will have ``requires_grad`` set to False.

        When performing gradient accumulation, there is no need to perform grad synchronization
        during the accumulation phase.
        Setting ``sync_grad`` to ``False`` will block this synchronization and improve performance.
        """
        # local import here to avoid circular import
        from pytorch_lightning.loops.utilities import _block_parallel_sync_behavior

        with _block_parallel_sync_behavior(self._trainer, block=(not sync_grad)):
            self._toggle_model()
            yield
            self._untoggle_model()
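
Since `toggle_model` is a context manager on `LightningOptimizer`, it is typically used from `training_step` under manual optimization, for example when alternating between two optimizers while accumulating gradients. A sketch with made-up submodules and a placeholder loss:

import torch
import pytorch_lightning as pl

class TwoOptimizerModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False
        self.gen = torch.nn.Linear(16, 16)
        self.disc = torch.nn.Linear(16, 1)

    def training_step(self, batch, batch_idx):
        opt_g, opt_d = self.optimizers()

        # accumulate over pairs of batches: skip grad sync while accumulating
        accumulating = batch_idx % 2 == 0
        with opt_g.toggle_model(sync_grad=not accumulating):
            loss_g = self.gen(batch).pow(2).mean()  # placeholder loss
            self.manual_backward(loss_g)
            if not accumulating:
                opt_g.step()
                opt_g.zero_grad()

    def configure_optimizers(self):
        return (
            torch.optim.Adam(self.gen.parameters(), lr=1e-3),
            torch.optim.Adam(self.disc.parameters(), lr=1e-3),
        )

Inside the `with` block only the parameters of `gen` keep `requires_grad=True`, so no gradients are produced for `disc` while `opt_g` is the active optimizer, and the sync-blocking behavior follows the `sync_grad` flag.
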