def select_strategy(self) -> Strategy:
    if isinstance(self.distributed_backend, Accelerator) and self.distributed_backend.strategy is not None:
        plugin = self.distributed_backend.strategy
    elif self.use_ddp2:
        plugin = DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedStrategy(
            cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
        )
    elif self.use_ddp:
        use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks()
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.detect()
        use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.detect()
        use_ddp_spawn = self._strategy_type == _StrategyType.DDP_SPAWN
        use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
        use_tpu_spawn = self.use_tpu and self._strategy_type == _StrategyType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.detect()
        use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.detect()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks()
        use_ddp_sharded = self._strategy_type == _StrategyType.DDP_SHARDED
        use_ddp_sharded_spawn = self._strategy_type == _StrategyType.DDP_SHARDED_SPAWN
        use_ddp_fully_sharded = self._strategy_type == _StrategyType.DDP_FULLY_SHARDED

        if use_tpu_spawn:
            ddp_strategy_cls = TPUSpawnStrategy
        elif use_ddp_sharded:
            ddp_strategy_cls = DDPShardedStrategy
        elif use_ddp_sharded_spawn:
            ddp_strategy_cls = DDPSpawnShardedStrategy
        elif (
            use_ddp_cpu_slurm
            or use_slurm_ddp
            or use_ddp_cpu_torch_elastic
            or use_torchelastic_ddp
            or use_kubeflow_ddp
            or use_ddp_cpu_kubeflow
        ):
            ddp_strategy_cls = DDPStrategy
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_strategy_cls = DDPSpawnStrategy
        elif use_ddp_fully_sharded:
            ddp_strategy_cls = DDPFullyShardedStrategy
        else:
            ddp_strategy_cls = DDPStrategy

        plugin = ddp_strategy_cls(
            parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
        )
    elif self.use_dp:
        plugin = DataParallelStrategy(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodStrategy(parallel_devices=self.parallel_devices)
    elif self.use_tpu and isinstance(self.tpu_cores, list):
        plugin = SingleTPUStrategy(self.tpu_id)
    elif self.use_ipu:
        plugin = IPUStrategy(parallel_devices=self.parallel_devices)
    else:
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu")
    return plugin
def test_detect_after_1_9_1():
    """Test the detection of a torchelastic environment configuration after 1.9.1."""
    # clear=True ensures ambient torchelastic variables cannot leak into the negative assertion
    with mock.patch.dict(os.environ, {}, clear=True):
        assert not TorchElasticEnvironment.detect()
    with mock.patch.dict(
        os.environ,
        {
            "TORCHELASTIC_RUN_ID": "",
        },
    ):
        assert TorchElasticEnvironment.detect()
def test_detect_before_1_9_1():
    """Test the detection of a torchelastic environment configuration before 1.9.1."""
    # clear=True ensures ambient torchelastic variables cannot leak into the negative assertion
    with mock.patch.dict(os.environ, {}, clear=True):
        assert not TorchElasticEnvironment.detect()
    with mock.patch.dict(
        os.environ,
        {
            "RANK": "",
            "GROUP_RANK": "",
            "LOCAL_RANK": "",
            "LOCAL_WORLD_SIZE": "",
        },
    ):
        assert TorchElasticEnvironment.detect()
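# The two tests above imply which environment variables TorchElasticEnvironment.detect() looks for.
# Below is a minimal sketch consistent with those tests, not the actual library implementation;
# the helper name `_detect_torchelastic` is introduced here only for illustration.
import os


def _detect_torchelastic() -> bool:
    """Return True if the process appears to be launched by torchelastic."""
    # torch >= 1.9.1 sets TORCHELASTIC_RUN_ID for every worker
    if "TORCHELASTIC_RUN_ID" in os.environ:
        return True
    # older versions set the per-worker rank/world-size group of variables instead
    required_before_1_9_1 = ("RANK", "GROUP_RANK", "LOCAL_RANK", "LOCAL_WORLD_SIZE")
    return all(var in os.environ for var in required_before_1_9_1)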
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # The current fallback and check logic only applies to string configs passed in by the user.
    # TODO: this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        strategy_flag = "ddp"
    if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"
    if (
        strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
        or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
    ) and self._accelerator_flag != "gpu":
        raise MisconfigurationException(
            f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
            "but GPU accelerator is not used."
        )

    if strategy_flag:
        self._strategy_flag = strategy_flag
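# The fallback rules above reduce to a small decision table. Below is a standalone sketch: the
# helper name `resolve_strategy_flag` and the single `cluster_managed` boolean (standing in for
# the TorchElastic/Kubeflow/SLURM detection calls) are assumptions made for this example only.
def resolve_strategy_flag(strategy_flag: str, accelerator_flag: str, cluster_managed: bool) -> str:
    # spawn-based DDP is promoted to plain DDP when an external launcher already manages the processes
    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and cluster_managed:
        return "ddp"
    # dp/ddp2 require GPUs; on CPU they fall back to ddp
    if strategy_flag in ("dp", "ddp2") and accelerator_flag == "cpu":
        return "ddp"
    return strategy_flag


assert resolve_strategy_flag("ddp_spawn", "gpu", cluster_managed=True) == "ddp"
assert resolve_strategy_flag("dp", "cpu", cluster_managed=False) == "ddp"
assert resolve_strategy_flag("ddp", "gpu", cluster_managed=False) == "ddp"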
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # The current fallback and check logic only applies to string configs passed in by the user.
    # TODO: this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        strategy_flag = "ddp"
    if strategy_flag == "dp" and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"
    if (
        strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
        or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
    ) and self._accelerator_flag not in ("cuda", "gpu"):
        raise MisconfigurationException(
            f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
            "but GPU accelerator is not used."
        )
    if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
        raise ValueError(
            f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
            " platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
        )

    if strategy_flag:
        self._strategy_flag = strategy_flag
def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]:
    """Parses the GPU ids given in the format as accepted by the :class:`~pytorch_lightning.trainer.Trainer`.

    Args:
        gpus: An int -1 or string '-1' indicates that all available GPUs should be used.
            A list of unique ints or a string containing a comma-separated list of unique integers
            indicates specific GPUs to use.
            An int 0 means that no GPUs should be used.
            Any int N > 0 indicates that GPUs [0..N) should be used.

    Returns:
        A list of GPU ids to be used, or ``None`` if no GPUs were requested.

    If no GPUs are available but GPUs were requested, a MisconfigurationException is raised.
    """
    # Check that the gpus param is None, int, str or list
    _check_data_type(gpus)

    # Handle the case when no GPUs are requested
    if gpus is None or (isinstance(gpus, int) and gpus == 0) or str(gpus).strip() in ("0", "[]"):
        return None

    # We know the user requested GPUs, therefore if some of the
    # requested GPUs are not available an exception is raised.
    gpus = _normalize_parse_gpu_string_input(gpus)
    gpus = _normalize_parse_gpu_input_to_list(gpus)
    if not gpus:
        raise MisconfigurationException("GPUs requested but none are available.")

    if TorchElasticEnvironment.detect() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1:
        # Omit the sanity check on torchelastic, which by default exposes one visible GPU per process
        return gpus

    # Check that GPUs are unique. Duplicate GPUs are not supported by the backend.
    _check_unique(gpus)

    return _sanitize_gpu_ids(gpus)
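# A minimal usage sketch for parse_gpu_ids, assuming a machine where exactly GPUs 0 and 1 are
# visible and no torchelastic environment is detected. The expected values follow from the
# docstring above; they are illustrative, not an exhaustive test.
assert parse_gpu_ids(None) is None       # no GPUs requested
assert parse_gpu_ids(0) is None          # explicit "use no GPUs"
assert parse_gpu_ids(2) == [0, 1]        # N > 0 selects GPUs [0..N)
assert parse_gpu_ids(-1) == [0, 1]       # -1 selects all available GPUs
assert parse_gpu_ids("0,1") == [0, 1]    # comma-separated string of ids
assert parse_gpu_ids([1]) == [1]         # explicit list of ids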
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # The current fallback and check logic only applies to string configs passed in by the user.
    # TODO: this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    if strategy_flag == "ddp_cpu":
        if _TPU_AVAILABLE:
            raise MisconfigurationException(
                "`accelerator='ddp_cpu'` is not supported on TPU machines. "
                "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810"
            )
        if self._devices_flag == 1 and self._num_nodes_flag > 1:
            strategy_flag = DDPStrategy.strategy_name
        else:
            strategy_flag = "ddp_spawn"
        if self._accelerator_flag == "gpu":
            rank_zero_warn(
                "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
            )
            self._accelerator_flag = "cpu"
            self.accelerator = CPUAccelerator()
    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        strategy_flag = "ddp"
    if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"

    if strategy_flag:
        self._strategy_flag = strategy_flag
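# Illustrative sketch of the "ddp_cpu" resolution rule in the variant above: with a single device
# per node but more than one node, "ddp_cpu" resolves to "ddp"; otherwise it resolves to
# "ddp_spawn". The helper name `resolve_ddp_cpu` is hypothetical and used only for this example.
def resolve_ddp_cpu(devices: int, num_nodes: int) -> str:
    return "ddp" if devices == 1 and num_nodes > 1 else "ddp_spawn"


assert resolve_ddp_cpu(devices=1, num_nodes=2) == "ddp"
assert resolve_ddp_cpu(devices=4, num_nodes=1) == "ddp_spawn"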