Example #1
    def setup(self, trainer: "pl.Trainer") -> None:
        self._rank_0_will_call_children_scripts = self.broadcast(
            self._rank_0_will_call_children_scripts)
        if self._should_run_deadlock_detection():
            self._share_information_to_prevent_deadlock()

        self.accelerator.setup(trainer)

        # move the model to the correct device
        self.model_to_device()

        trainer_fn = trainer.state.fn

        if trainer_fn == TrainerFn.FITTING:
            if self._layer_sync and self.model:
                self.model = self._layer_sync.apply(self.model)

        self.setup_precision_plugin()

        if trainer_fn == TrainerFn.FITTING:
            # set up optimizers after the module has been moved to the device
            # but before the module has been wrapped
            self.setup_optimizers(trainer)
            optimizers_to_device(self.optimizers, self.root_device)

            # skip wrapping the model if we are not fitting as no gradients need to be exchanged
            self._configure_bagua_model(trainer)
Example #2
    def teardown(self) -> None:
        """This method is called to teardown the training process.

        It is the right place to release memory and free other resources.
        """
        optimizers_to_device(self.optimizers, torch.device("cpu"))
        self.precision_plugin.teardown()
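
A note on the teardown above: it moves the optimizer state back to the CPU so the accelerator memory it occupied can be released. As a rough sketch of what such a move involves (not Lightning's actual implementation; the helper name move_optimizer_state is hypothetical), every tensor held in each optimizer's state dict is relocated to the target device:

import torch

def move_optimizer_state(optimizer: torch.optim.Optimizer, device: torch.device) -> None:
    # Optimizer state (e.g. Adam's exp_avg / exp_avg_sq buffers) is keyed by parameter.
    # Move each tensor entry to the target device; non-tensor entries are left as-is.
    for param_state in optimizer.state.values():
        for key, value in param_state.items():
            if isinstance(value, torch.Tensor):
                param_state[key] = value.to(device)

Called as move_optimizer_state(optimizer, torch.device("cpu")), this moves the per-parameter buffers off the accelerator in the same spirit as the optimizers_to_device(self.optimizers, torch.device("cpu")) call above.
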
Example #3
    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)
        self.setup_optimizers(trainer)
        self.setup_precision_plugin()
        optimizers_to_device(self.optimizers, self.root_device)
        self.init_deepspeed()
        self.barrier()
Example #4
    def setup(self, trainer: "pl.Trainer") -> None:
        # share ddp pids to all processes
        self._rank_0_will_call_children_scripts = self.broadcast(
            self._rank_0_will_call_children_scripts)
        if self._should_run_deadlock_detection():
            self._share_information_to_prevent_deadlock()

        self.accelerator.setup(trainer)

        # move the model to the correct device
        self.model_to_device()

        # skip wrapping the model if we are not fitting as no gradients need to be exchanged
        trainer_fn = trainer.state.fn

        if trainer_fn == TrainerFn.FITTING:
            if self._layer_sync:
                self.model = self._layer_sync.apply(self.model)

        self.setup_precision_plugin()

        if trainer_fn == TrainerFn.FITTING:
            self.configure_ddp()

            # set up optimizers after the wrapped module has been moved to the device
            self.setup_optimizers(trainer)
            optimizers_to_device(self.optimizers, self.root_device)

        if _TORCH_GREATER_EQUAL_1_10 and trainer_fn == TrainerFn.FITTING:
            import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD

            if isinstance(self._ddp_comm_state,
                          post_localSGD.PostLocalSGDState):
                self._enable_model_averaging()
Example #5
    def configure_ddp(self) -> None:
        # set up optimizers after the wrapped module has been moved to the device
        self.setup_optimizers(self.lightning_module.trainer)
        self.model, self.optimizers = self._setup_model_and_optimizers(
            model=LightningShardedDataParallel(self.model),
            optimizers=self.optimizers)
        optimizers_to_device(self.optimizers, self.root_device)
Example #6
    def configure_ddp(self) -> None:
        self._set_ddp_kwargs()
        self.setup_optimizers(self.model.trainer)
        self.model, self.optimizers = self._setup_model_and_optimizers(
            model=LightningShardedDataParallel(self.model),
            optimizers=self.optimizers,
        )
        optimizers_to_device(self.optimizers, self.root_device)
Example #7
    def configure_ddp(self) -> None:
        self.pre_configure_ddp()
        self.model = self._setup_model(LightningDistributedModule(self.model))
        self._register_ddp_hooks()

        # set up optimizers after the wrapped module has been moved to the device
        self.setup_optimizers(self.lightning_module.trainer)
        optimizers_to_device(self.optimizers, self.root_device)
Example #8
    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)
        # we set the device so that optimizers can be created with distributed comms.
        self.lightning_module._device = self.root_device
        self.setup_optimizers(trainer)
        self.setup_precision_plugin()
        optimizers_to_device(self.optimizers, self.root_device)
        self.init_deepspeed()
        self.barrier()
Example #9
    def setup(self, trainer: "pl.Trainer") -> None:
        """Setup plugins for the trainer fit and creates optimizers.

        Args:
            trainer: the trainer instance
        """
        self.accelerator.setup(trainer)
        self.setup_optimizers(trainer)
        self.setup_precision_plugin()
        optimizers_to_device(self.optimizers, self.root_device)
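
Example #9 shows the simplest ordering used across these snippets: set up the accelerator, create the optimizers, initialize the precision plugin, and finally move the optimizer state to the root device. A stripped-down sketch of that sequence in plain PyTorch (hypothetical names; it reuses the move_optimizer_state helper sketched after Example #2 and is not how Lightning itself is implemented):

import torch

def setup_training(model: torch.nn.Module, device: torch.device) -> torch.optim.Optimizer:
    model.to(device)  # put the model parameters on the training device first
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # cf. setup_optimizers(trainer)
    move_optimizer_state(optimizer, device)  # cf. optimizers_to_device(self.optimizers, self.root_device)
    return optimizer
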
Example #10
    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)

        if trainer.state.fn == TrainerFn.FITTING and self._layer_sync:
            self.model = self._layer_sync.apply(self.model)

        self.configure_ddp()
        self.barrier()
        self.setup_optimizers(trainer)
        optimizers_to_device(self.optimizers, self.root_device)
        self.setup_precision_plugin()
Example #11
    def teardown(self) -> None:
        """This method is called to teardown the training process.

        It is the right place to release memory and free other resources.
        """
        optimizers_to_device(self.optimizers, torch.device("cpu"))

        if self.lightning_module is not None:
            log.detail(f"{self.__class__.__name__}: moving model to CPU")
            self.lightning_module.cpu()
        self.precision_plugin.teardown()
        self.accelerator.teardown()
Example #12
    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)

        if trainer.state.fn == TrainerFn.FITTING and self._layer_sync:
            assert self.model is not None
            self.model = self._layer_sync.apply(self.model)

        if not self.cpu_offload:
            self.model_to_device()

        self.barrier()
        self.setup_optimizers(trainer)
        optimizers_to_device(self.optimizers, self.root_device)
        self.setup_precision_plugin()
Example #13
    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)

        if self.debug:
            os.environ["PT_XLA_DEBUG"] = str(1)

        shared_params = find_shared_parameters(self.model)
        self.model_to_device()
        set_shared_parameters(self.model.module, shared_params)
        self.setup_precision_plugin()

        if trainer.state.fn == TrainerFn.FITTING:
            self.setup_optimizers(trainer)
            optimizers_to_device(self.optimizers, self.root_device)
Example #14
    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)
        # share ddp pids to all processes
        self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts)

        if trainer.state.fn == TrainerFn.FITTING and self._layer_sync:
            assert self.model is not None
            self.model = self._layer_sync.apply(self.model)

        # we set the device so that optimizers can be created with distributed comms.
        assert self.lightning_module is not None
        self.lightning_module._device = self.root_device

        self.barrier()
        self.setup_optimizers(trainer)
        optimizers_to_device(self.optimizers, self.root_device)
        self.setup_precision_plugin()
Example #15
    def setup(self, trainer: "pl.Trainer") -> None:
        self.start_method = "fork"
        self.accelerator.setup(trainer)
        self.setup_optimizers(trainer)
        self.setup_precision_plugin()
        optimizers_to_device(self.optimizers, self.root_device)

        if self.debug:
            os.environ["PT_XLA_DEBUG"] = str(1)

        shared_params = find_shared_parameters(self.model)
        self.model_to_device()
        if is_overridden("on_post_move_to_device", self.lightning_module):
            self.model.module.on_post_move_to_device()
        else:
            set_shared_parameters(self.model.module, shared_params)

        self.setup_optimizers(trainer)
        self.precision_plugin.connect(self.model, None, None)