Example #1
    def select_accelerator(self):
        if self.trainer.accelerator_backend is not None:
            return self.trainer.accelerator_backend

        # ----------------------------------
        # Use the user provided accelerator
        # ----------------------------------
        # use the one the user passed in
        if self.accelerator is not None and isinstance(self.accelerator,
                                                       Accelerator):
            self.accelerator.trainer = self.trainer
            self.accelerator.ddp_plugin = self.trainer.plugin_connector.ddp_plugin
            acc = self.accelerator
            return acc

        # ----------------------------------
        # choose an accelerator for the user
        # ----------------------------------
        use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

        # torchelastic or general non_slurm ddp
        te_flags_passed = 'WORLD_SIZE' in os.environ and (
            'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
        use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

        use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
        use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"

        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

        # ddp script mode uses the same flags as TE
        # TODO: decouple from TE
        if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
            use_torchelastic_ddp = False

        cluster_env = self._select_environment()

        # choose the appropriate accelerator backend
        if self.trainer.use_ddp2:
            accelerator_backend = accelerators.DDP2Accelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_slurm:
            accelerator_backend = accelerators.DDPCPUHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_slurm_ddp:
            accelerator_backend = accelerators.DDPHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_torch_elastic:
            accelerator_backend = accelerators.DDPCPUHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_torchelastic_ddp:
            accelerator_backend = accelerators.DDPHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_spawn:
            accelerator_backend = accelerators.DDPSpawnAccelerator(
                self.trainer,
                nprocs=self.trainer.num_processes,
                cluster_environment=cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_spawn:
            accelerator_backend = accelerators.DDPCPUSpawnAccelerator(
                self.trainer,
                nprocs=self.trainer.num_processes,
                cluster_environment=cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer.distributed_backend == "ddp":
            accelerator_backend = accelerators.DDPAccelerator(
                self.trainer,
                cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer.use_dp:
            accelerator_backend = accelerators.DataParallelAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.use_horovod:
            accelerator_backend = accelerators.HorovodAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.use_single_gpu:
            accelerator_backend = accelerators.GPUAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.use_tpu:
            accelerator_backend = accelerators.TPUAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.distributed_backend is None:
            accelerator_backend = accelerators.CPUAccelerator(
                self.trainer, cluster_env)
        else:
            raise MisconfigurationException(
                f'Trainer(distributed_backend={self.trainer.distributed_backend}) is not a supported backend'
            )

        return accelerator_backend
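
Both the te_flags_passed expression and the self._is_using_torchelastic() call in the example above test for the torchelastic environment variables. The helper itself is not shown on this page; a minimal sketch of what it presumably checks, inferred from the te_flags_passed logic in the example, is:

import os


def _is_using_torchelastic():
    # Stand-in for the connector method of the same name (assumed, not shown
    # here): torchelastic sets WORLD_SIZE together with GROUP_RANK (or
    # NODE_RANK), which is exactly the te_flags_passed condition above.
    return 'WORLD_SIZE' in os.environ and (
        'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)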
Example #2
    def select_accelerator(self):
        if self.trainer.accelerator_backend is not None:
            return self.trainer.accelerator_backend

        # ----------------------------------
        # Use the user provided accelerator
        # ----------------------------------
        # use the one the user passed in
        if self.accelerator is not None and isinstance(self.accelerator,
                                                       Accelerator):
            self.accelerator.trainer = self.trainer
            self.accelerator.ddp_plugin = self.trainer.plugin_connector.ddp_plugin
            acc = self.accelerator
            return acc

        # ----------------------------------
        # choose an accelerator for the user
        # ----------------------------------
        use_slurm_ddp = (self.trainer._distrib_type
                         in (DistributedType.DDP, DistributedType.DDP_SPAWN)
                         and self.trainer.is_slurm_managing_tasks)

        # torchelastic or general non_slurm ddp
        te_flags_passed = 'WORLD_SIZE' in os.environ and (
            'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
        use_torchelastic_ddp = (self.trainer._distrib_type
                                in (DistributedType.DDP,
                                    DistributedType.DDP_SPAWN)
                                and te_flags_passed)

        use_ddp_cpu_spawn = (self.trainer._distrib_type
                             in (DistributedType.DDP,
                                 DistributedType.DDP_SPAWN)
                             and self.trainer._device_type == DeviceType.CPU)

        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

        # ddp script mode uses the same flags as TE
        # TODO: decouple from TE
        if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
            use_torchelastic_ddp = False

        cluster_env = self._select_environment()

        # TODO: clean-up this branching as most just select class and uses the very same arguments
        # choose the appropriate accelerator backend
        if self.trainer._distrib_type == DistributedType.DDP2:
            accelerator_backend = accelerators.DDP2Accelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_slurm:
            accelerator_backend = accelerators.DDPCPUHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_slurm_ddp:
            accelerator_backend = accelerators.DDPHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_torch_elastic:
            accelerator_backend = accelerators.DDPCPUHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_torchelastic_ddp:
            accelerator_backend = accelerators.DDPHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer._distrib_type == DistributedType.DDP_SPAWN:
            accelerator_backend = accelerators.DDPSpawnAccelerator(
                self.trainer,
                nprocs=self.trainer.num_processes,
                cluster_environment=cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_spawn:
            accelerator_backend = accelerators.DDPCPUSpawnAccelerator(
                self.trainer,
                nprocs=self.trainer.num_processes,
                cluster_environment=cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer.distributed_backend == "ddp":
            accelerator_backend = accelerators.DDPAccelerator(
                self.trainer,
                cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer._distrib_type == DistributedType.DP:
            accelerator_backend = accelerators.DataParallelAccelerator(
                self.trainer, cluster_env)

        elif self.trainer._distrib_type == DistributedType.HOROVOD:
            accelerator_backend = accelerators.HorovodAccelerator(
                self.trainer, cluster_env)

        elif self.trainer._device_type == DeviceType.GPU and self.trainer.num_gpus == 1:
            accelerator_backend = accelerators.GPUAccelerator(
                self.trainer, cluster_env)

        elif self.trainer._device_type == DeviceType.TPU:
            accelerator_backend = accelerators.TPUAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.distributed_backend is None:
            accelerator_backend = accelerators.CPUAccelerator(
                self.trainer, cluster_env)
        else:
            raise MisconfigurationException(
                f'`Trainer(accelerator={self.trainer.distributed_backend}, num_nodes={self.trainer.num_nodes},'
                f' num_processes={self.trainer.num_processes}, ...)` is not a supported backend for'
                f' num_gpus={self.trainer.num_gpus}')

        return accelerator_backend
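
Example #2 replaces the boolean trainer.use_* flags of Example #1 with comparisons against DistributedType and DeviceType enums. The enum definitions are not part of the snippet; a minimal sketch of comparable string-backed enums, with member names taken from the comparisons above and string values assumed, is:

from enum import Enum


class DistributedType(str, Enum):
    # Assumed members and values, inferred from the comparisons in Example #2.
    DP = 'dp'
    DDP = 'ddp'
    DDP2 = 'ddp2'
    DDP_SPAWN = 'ddp_spawn'
    HOROVOD = 'horovod'


class DeviceType(str, Enum):
    # Assumed members and values, inferred from the comparisons in Example #2.
    CPU = 'cpu'
    GPU = 'gpu'
    TPU = 'tpu'


# A str-backed enum member compares equal to its raw string value, so a check
# like trainer._distrib_type == DistributedType.DDP_SPAWN works whether the
# attribute holds an enum member or a plain string such as 'ddp_spawn'.
assert DistributedType.DDP_SPAWN == 'ddp_spawn'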