def select_accelerator(self):
    if self.trainer.accelerator_backend is not None:
        return self.trainer.accelerator_backend

    # ----------------------------------
    # Use the user provided accelerator
    # ----------------------------------
    # use the one the user passed in
    if self.accelerator is not None and isinstance(self.accelerator, Accelerator):
        self.accelerator.trainer = self.trainer
        self.accelerator.ddp_plugin = self.trainer.plugin_connector.ddp_plugin
        acc = self.accelerator
        return acc

    # ----------------------------------
    # choose an accelerator for the user
    # ----------------------------------
    use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

    # torchelastic or general non_slurm ddp
    te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
    use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

    use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
    use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"

    use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
    use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

    # ddp script mode uses the same flags as TE
    # TODO: decouple from TE
    if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
        use_torchelastic_ddp = False

    cluster_env = self._select_environment()

    # choose the appropriate accelerator backend
    if self.trainer.use_ddp2:
        accelerator_backend = accelerators.DDP2Accelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_cpu_slurm:
        accelerator_backend = accelerators.DDPCPUHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_slurm_ddp:
        accelerator_backend = accelerators.DDPHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_cpu_torch_elastic:
        accelerator_backend = accelerators.DDPCPUHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_torchelastic_ddp:
        accelerator_backend = accelerators.DDPHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_spawn:
        accelerator_backend = accelerators.DDPSpawnAccelerator(
            self.trainer,
            nprocs=self.trainer.num_processes,
            cluster_environment=cluster_env,
            ddp_plugin=self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_cpu_spawn:
        accelerator_backend = accelerators.DDPCPUSpawnAccelerator(
            self.trainer,
            nprocs=self.trainer.num_processes,
            cluster_environment=cluster_env,
            ddp_plugin=self.trainer.plugin_connector.ddp_plugin
        )

    elif self.trainer.distributed_backend == "ddp":
        accelerator_backend = accelerators.DDPAccelerator(
            self.trainer,
            cluster_env,
            ddp_plugin=self.trainer.plugin_connector.ddp_plugin
        )

    elif self.trainer.use_dp:
        accelerator_backend = accelerators.DataParallelAccelerator(self.trainer, cluster_env)

    elif self.trainer.use_horovod:
        accelerator_backend = accelerators.HorovodAccelerator(self.trainer, cluster_env)

    elif self.trainer.use_single_gpu:
        accelerator_backend = accelerators.GPUAccelerator(self.trainer, cluster_env)

    elif self.trainer.use_tpu:
        accelerator_backend = accelerators.TPUAccelerator(self.trainer, cluster_env)

    elif self.trainer.distributed_backend is None:
        accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env)
    else:
        raise MisconfigurationException(
            f'Trainer(distributed_backend={self.trainer.distributed_backend}) is not a supported backend'
        )

    return accelerator_backend
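# A minimal standalone sketch of the torchelastic detection used above (the
# `te_flags_passed` expression and `self._is_using_torchelastic()`): it only
# checks that the launcher exported WORLD_SIZE together with GROUP_RANK or
# NODE_RANK. The free-function name below is illustrative, not the library's API.
import os

def _sketch_is_using_torchelastic() -> bool:
    has_world_size = 'WORLD_SIZE' in os.environ
    has_rank = 'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ
    return has_world_size and has_rank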
def select_accelerator(self):
    if self.trainer.accelerator_backend is not None:
        return self.trainer.accelerator_backend

    # ----------------------------------
    # Use the user provided accelerator
    # ----------------------------------
    # use the one the user passed in
    if self.accelerator is not None and isinstance(self.accelerator, Accelerator):
        self.accelerator.trainer = self.trainer
        self.accelerator.ddp_plugin = self.trainer.plugin_connector.ddp_plugin
        acc = self.accelerator
        return acc

    # ----------------------------------
    # choose an accelerator for the user
    # ----------------------------------
    use_slurm_ddp = (
        self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
        and self.trainer.is_slurm_managing_tasks
    )

    # torchelastic or general non_slurm ddp
    te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
    use_torchelastic_ddp = (
        self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
        and te_flags_passed
    )

    use_ddp_cpu_spawn = (
        self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
        and self.trainer._device_type == DeviceType.CPU
    )

    use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
    use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

    # ddp script mode uses the same flags as TE
    # TODO: decouple from TE
    if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
        use_torchelastic_ddp = False

    cluster_env = self._select_environment()

    # TODO: clean-up this branching as most just select class and uses the very same arguments
    # choose the appropriate accelerator backend
    if self.trainer._distrib_type == DistributedType.DDP2:
        accelerator_backend = accelerators.DDP2Accelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_cpu_slurm:
        accelerator_backend = accelerators.DDPCPUHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_slurm_ddp:
        accelerator_backend = accelerators.DDPHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_cpu_torch_elastic:
        accelerator_backend = accelerators.DDPCPUHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif use_torchelastic_ddp:
        accelerator_backend = accelerators.DDPHPCAccelerator(
            self.trainer,
            cluster_env,
            self.trainer.plugin_connector.ddp_plugin
        )

    elif self.trainer._distrib_type == DistributedType.DDP_SPAWN:
        accelerator_backend = accelerators.DDPSpawnAccelerator(
            self.trainer,
            nprocs=self.trainer.num_processes,
            cluster_environment=cluster_env,
            ddp_plugin=self.trainer.plugin_connector.ddp_plugin
        )

    elif use_ddp_cpu_spawn:
        accelerator_backend = accelerators.DDPCPUSpawnAccelerator(
            self.trainer,
            nprocs=self.trainer.num_processes,
            cluster_environment=cluster_env,
            ddp_plugin=self.trainer.plugin_connector.ddp_plugin
        )

    elif self.trainer.distributed_backend == "ddp":
        accelerator_backend = accelerators.DDPAccelerator(
            self.trainer,
            cluster_env,
            ddp_plugin=self.trainer.plugin_connector.ddp_plugin
        )

    elif self.trainer._distrib_type == DistributedType.DP:
        accelerator_backend = accelerators.DataParallelAccelerator(self.trainer, cluster_env)

    elif self.trainer._distrib_type == DistributedType.HOROVOD:
        accelerator_backend = accelerators.HorovodAccelerator(self.trainer, cluster_env)

    elif self.trainer._device_type == DeviceType.GPU and self.trainer.num_gpus == 1:
        accelerator_backend = accelerators.GPUAccelerator(self.trainer, cluster_env)

    elif self.trainer._device_type == DeviceType.TPU:
        accelerator_backend = accelerators.TPUAccelerator(self.trainer, cluster_env)

    elif self.trainer.distributed_backend is None:
        accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env)
    else:
        raise MisconfigurationException(
            f'`Trainer(accelerator={self.trainer.distributed_backend}, num_nodes={self.trainer.num_nodes},'
            f' num_processes={self.trainer.num_processes}, ...)` is not a supported backend for'
            f' num_gpus={self.trainer.num_gpus}'
        )

    return accelerator_backend
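# A rough sketch of the clean-up hinted at by the TODO above ("most just select
# class and uses the very same arguments"): the branch ladder can be expressed
# as an ordered (condition, accelerator class) table that is scanned for the
# first truthy condition. The helper below is an illustration under that
# assumption, not the refactor the library actually shipped.
def _sketch_pick_backend_cls(ordered_choices):
    """Return the first accelerator class whose condition is truthy.

    ``ordered_choices`` is a sequence of ``(condition, accelerator_cls)`` pairs,
    e.g. ``[(use_ddp_cpu_slurm, accelerators.DDPCPUHPCAccelerator), ...]``,
    listed in the same priority order as the ``if``/``elif`` chain above.
    """
    for condition, accelerator_cls in ordered_choices:
        if condition:
            return accelerator_cls
    return None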