def _run_optimization(
    self,
    split_batch: Any,
    batch_idx: int,
    optimizer: torch.optim.Optimizer,
    opt_idx: int,
) -> Optional[ClosureResult]:
    """Runs closure (train step + backward) together with optimization if necessary.

    Args:
        split_batch: the current tbptt split of the whole batch
        batch_idx: the index of the current batch
        optimizer: the current optimizer
        opt_idx: the index of the current optimizer
    """
    # toggle model params
    self._run_optimization_start(opt_idx, optimizer)

    closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer, self._hiddens)

    if self.trainer.fit_loop.should_accumulate():
        # For gradient accumulation

        # -------------------
        # calculate loss (train step + train step end)
        # -------------------
        # automatic_optimization=True: perform ddp sync only when performing optimizer_step
        with _block_parallel_sync_behavior(self.trainer, block=True):
            closure()

    # ------------------------------
    # BACKWARD PASS
    # ------------------------------
    # gradient update with accumulated gradients
    else:
        self._optimizer_step(optimizer, opt_idx, batch_idx, closure)

    result = closure.get_result()

    if result:
        # if no result, user decided to skip optimization
        # otherwise update running loss + reset accumulated loss
        # TODO: find proper way to handle updating running loss
        assert self.trainer.fit_loop is not None
        assert self.trainer.fit_loop.epoch_loop is not None
        assert self.trainer.fit_loop.epoch_loop.batch_loop is not None
        assert result.loss is not None
        self.trainer.fit_loop.epoch_loop.batch_loop._update_running_loss(result.loss)

    # untoggle model params
    self._run_optimization_end(opt_idx)
    return result
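# The method above bundles the training step and backward pass into a closure so that the
# optimizer (and precision plugin) can decide when to run it. Below is a minimal,
# framework-free sketch of the same closure pattern in plain PyTorch; the names `model`,
# `batch`, `accumulate`, and the `training_step` method are illustrative assumptions, not
# Lightning's API.
import torch


def run_optimization_sketch(model, batch, optimizer: torch.optim.Optimizer, accumulate: bool):
    """Sketch: train step + backward wrapped in a closure, stepped only when not accumulating."""

    def closure():
        # forward + loss computation ("train step")
        loss = model.training_step(batch)
        # backward pass; gradients keep accumulating until zero_grad() is called
        loss.backward()
        return loss

    if accumulate:
        # accumulation phase: only build up gradients, no parameter update
        loss = closure()
    else:
        # update phase: the optimizer runs the closure itself and then steps
        loss = optimizer.step(closure)
        optimizer.zero_grad()
    return loss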
def _run_optimization(
    self,
    batch_idx: int,
    split_batch: Any,
    opt_idx: Optional[int] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
) -> Optional[ClosureResult]:
    """Runs closure (train step + backward) together with optimization if necessary.

    Args:
        batch_idx: the index of the current batch
        split_batch: the current tbptt split of the whole batch
        opt_idx: the index of the current optimizer or `None` in case of manual optimization
        optimizer: the current optimizer or `None` in case of manual optimization
    """
    # toggle model params
    self._run_optimization_start(opt_idx, optimizer)

    closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer, self._hiddens)

    if self.trainer.fit_loop.should_accumulate():
        # For gradient accumulation

        # -------------------
        # calculate loss (train step + train step end)
        # -------------------
        # automatic_optimization: perform ddp sync only when performing optimizer_step
        with _block_parallel_sync_behavior(self._trainer):
            closure()

    # ------------------------------
    # BACKWARD PASS
    # ------------------------------
    # gradient update with accumulated gradients
    else:
        if self.trainer.lightning_module.automatic_optimization:
            self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
        else:
            closure()

    result = closure.get_result()

    if result:
        # if no result, user decided to skip optimization
        # otherwise update running loss + reset accumulated loss
        self._update_running_loss(result.loss)

    # untoggle model params
    self._run_optimization_end(opt_idx)
    return result
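# This variant also covers manual optimization: when `automatic_optimization` is False the
# closure is simply called, and the user's `training_step` performs backward and stepping
# itself. Below is a hedged sketch of such a `training_step` using Lightning's documented
# manual-optimization API (`self.optimizers()`, `self.manual_backward()`); the model and loss
# are illustrative.
import pytorch_lightning as pl
import torch
import torch.nn.functional as F


class ManualOptimizationModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # tell Lightning not to call backward/optimizer.step() on our behalf
        self.automatic_optimization = False
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()  # LightningOptimizer wrapper around the configured optimizer
        x, y = batch
        loss = F.cross_entropy(self.layer(x), y)
        opt.zero_grad()
        self.manual_backward(loss)  # replaces loss.backward()
        opt.step()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)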
def _run_optimization(self, kwargs: OrderedDict, optimizer: torch.optim.Optimizer) -> ClosureResult:
    """Runs closure (train step + backward) together with optimization if necessary.

    Args:
        kwargs: the kwargs passed down to the hooks.
        optimizer: the current optimizer
    """
    opt_idx = kwargs.get("optimizer_idx", 0)

    # toggle model params
    self._run_optimization_start(opt_idx, optimizer)

    closure = self._make_closure(kwargs, optimizer)

    if (
        # when the strategy handles accumulation, we want to always call the optimizer step
        not self.trainer.strategy.handles_gradient_accumulation
        and self.trainer.fit_loop._should_accumulate()
    ):
        # For gradient accumulation

        # -------------------
        # calculate loss (train step + train step end)
        # -------------------
        # automatic_optimization=True: perform ddp sync only when performing optimizer_step
        with _block_parallel_sync_behavior(self.trainer.strategy, block=True):
            closure()

    # ------------------------------
    # BACKWARD PASS
    # ------------------------------
    # gradient update with accumulated gradients
    else:
        # the `batch_idx` is optional with inter-batch parallelism
        self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)

    result = closure.consume_result()

    if result.loss is not None:
        # if no result, user decided to skip optimization
        # otherwise update running loss + reset accumulated loss
        # TODO: find proper way to handle updating running loss
        self.trainer.fit_loop.epoch_loop.batch_loop._update_running_loss(result.loss)

    # untoggle model params
    self._run_optimization_end(opt_idx)
    return result
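# In every variant above, the accumulation branch wraps the closure in
# `_block_parallel_sync_behavior` so DDP does not all-reduce gradients on steps that only
# accumulate. That helper is internal to Lightning; the sketch below shows the standard
# PyTorch mechanism such a helper typically relies on, `DistributedDataParallel.no_sync()`.
# The helper name and signature here are hypothetical.
from contextlib import contextmanager

import torch
from torch.nn.parallel import DistributedDataParallel


@contextmanager
def block_sync_if(module: torch.nn.Module, block: bool):
    """Skip the DDP gradient all-reduce while accumulating; sync again on the real update step."""
    if block and isinstance(module, DistributedDataParallel):
        # no_sync() defers gradient synchronization until the first backward outside the context
        with module.no_sync():
            yield
    else:
        yield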
@contextmanager  # assumes `from contextlib import contextmanager`; the bare `yield` below only works as a context manager
def toggle_model(self, sync_grad: bool = True):
    """This function is just a helper for advanced users.

    Considering the current optimizer as A and all other optimizers as B.
    Toggling means all parameters from B exclusive to A will have ``requires_grad`` set to False.

    When performing gradient accumulation, there is no need to perform grad synchronization
    during the accumulation phase.
    Setting `sync_grad` to False will block this synchronization and improve performance.
    """
    # local import here to avoid circular import
    from pytorch_lightning.loops.utilities import _block_parallel_sync_behavior

    with _block_parallel_sync_behavior(self._trainer, block=(not sync_grad)):
        self._toggle_model()
        yield
        self._untoggle_model()
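# In manual optimization with multiple optimizers (e.g. a GAN), `toggle_model` is used as a
# context manager around each optimizer's own backward/step so only its parameters receive
# gradients; passing `sync_grad=False` additionally skips DDP sync on accumulation-only steps.
# A hedged usage sketch; the `_generator_loss`/`_discriminator_loss` helpers are illustrative.
def training_step(self, batch, batch_idx):
    # assumes a LightningModule with `self.automatic_optimization = False`
    opt_g, opt_d = self.optimizers()

    # generator update: parameters exclusive to the discriminator get requires_grad=False
    with opt_g.toggle_model():
        g_loss = self._generator_loss(batch)
        opt_g.zero_grad()
        self.manual_backward(g_loss)
        opt_g.step()

    # discriminator update: parameters exclusive to the generator get requires_grad=False
    with opt_d.toggle_model():
        d_loss = self._discriminator_loss(batch)
        opt_d.zero_grad()
        self.manual_backward(d_loss)
        opt_d.step()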