def select_strategy(self) -> Strategy:
    """Resolve and instantiate the training ``Strategy`` from the connector's flags.

    Precedence (first match wins): an explicit strategy attached to an
    ``Accelerator`` object, then DDP2, DeepSpeed, the DDP family, DP, Horovod,
    single-TPU, IPU, and finally a single-device (GPU or CPU) fallback.

    Returns:
        The selected ``Strategy`` instance.
    """
    # A user-provided Accelerator object with an attached strategy always wins.
    if isinstance(self.distributed_backend, Accelerator) and self.distributed_backend.strategy is not None:
        plugin = self.distributed_backend.strategy
    elif self.use_ddp2:
        plugin = DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
    # DeepSpeed must be checked before the generic DDP branch below, since it also sets use_ddp.
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedStrategy(
            cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
        )
    elif self.use_ddp:
        # Environment-driven variants: SLURM, TorchElastic, and Kubeflow launchers
        # all imply externally-managed process creation.
        use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks()
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.detect()
        use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.detect()
        use_ddp_spawn = self._strategy_type == _StrategyType.DDP_SPAWN
        use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
        use_tpu_spawn = self.use_tpu and self._strategy_type == _StrategyType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.detect()
        use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.detect()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks()
        use_ddp_sharded = self._strategy_type == _StrategyType.DDP_SHARDED
        use_ddp_sharded_spawn = self._strategy_type == _StrategyType.DDP_SHARDED_SPAWN
        use_ddp_fully_sharded = self._strategy_type == _StrategyType.DDP_FULLY_SHARDED
        # Class selection is order-sensitive: TPU spawn and sharded variants take
        # precedence over launcher-driven DDP, which takes precedence over spawn.
        if use_tpu_spawn:
            ddp_strategy_cls = TPUSpawnStrategy
        elif use_ddp_sharded:
            ddp_strategy_cls = DDPShardedStrategy
        elif use_ddp_sharded_spawn:
            ddp_strategy_cls = DDPSpawnShardedStrategy
        elif (
            use_ddp_cpu_slurm
            or use_slurm_ddp
            or use_ddp_cpu_torch_elastic
            or use_torchelastic_ddp
            or use_kubeflow_ddp
            or use_ddp_cpu_kubeflow
        ):
            # Any cluster launcher (even with a spawn-style flag) maps to plain DDP,
            # since the launcher owns process creation.
            ddp_strategy_cls = DDPStrategy
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_strategy_cls = DDPSpawnStrategy
        elif use_ddp_fully_sharded:
            ddp_strategy_cls = DDPFullyShardedStrategy
        else:
            ddp_strategy_cls = DDPStrategy
        plugin = ddp_strategy_cls(
            parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
        )
    elif self.use_dp:
        plugin = DataParallelStrategy(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodStrategy(parallel_devices=self.parallel_devices)
    elif self.use_tpu and isinstance(self.tpu_cores, list):
        # NOTE(review): the list check presumably distinguishes an explicit single-core
        # selection (e.g. tpu_cores=[1]) from an int core count — confirm against the
        # tpu_cores/tpu_id setup elsewhere in this class.
        plugin = SingleTPUStrategy(self.tpu_id)
    elif self.use_ipu:
        plugin = IPUStrategy(parallel_devices=self.parallel_devices)
    else:
        # Fallback: single device — root GPU ordinal when GPUs are in use, else CPU.
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu")
    return plugin
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # Fallback/validation currently only inspects string flags; Strategy objects pass
    # through mostly untouched (the fully-sharded check below also looks at the object).
    # TODO this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    # Under a cluster launcher, spawn-style DDP is meaningless: the launcher already
    # created the processes, so fall back to plain DDP.
    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false"):
        if TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks():
            strategy_flag = "ddp"

    # DP/DDP2 require GPUs; degrade to DDP on CPU with a warning.
    if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"

    selected_fsdp_native = strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies() or isinstance(
        self._strategy_flag, DDPFullyShardedNativeStrategy
    )
    if selected_fsdp_native and self._accelerator_flag != "gpu":
        raise MisconfigurationException(
            f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
            "but GPU accelerator is not used."
        )

    # Only overwrite the stored flag when a string was actually resolved.
    if strategy_flag:
        self._strategy_flag = strategy_flag
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment.

    Raises:
        MisconfigurationException: If a native fully-sharded strategy was selected
            without a CUDA/GPU accelerator.
        ValueError: If a fork-based strategy was requested on a platform without
            ``fork`` support (e.g. Windows).
    """
    # current fallback and check logic only apply to user pass in str config and object config
    # TODO this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    # Spawn-style DDP under a cluster launcher (TorchElastic/Kubeflow/SLURM) falls back
    # to plain DDP, since the launcher already owns process creation.
    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        strategy_flag = "ddp"
    # DP requires GPUs; degrade to DDP on CPU with a warning.
    if strategy_flag == "dp" and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"
    if (
        strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
        or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
    ) and self._accelerator_flag not in ("cuda", "gpu"):
        raise MisconfigurationException(
            f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
            "but GPU accelerator is not used."
        )
    if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
        # BUGFIX: the message previously read "recommed" — corrected to "recommend".
        raise ValueError(
            f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
            f" platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
        )
    if strategy_flag:
        self._strategy_flag = strategy_flag
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # Fallback/validation currently only inspects string flags; Strategy objects pass through.
    # TODO this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    if strategy_flag == "ddp_cpu":
        # The legacy "ddp_cpu" alias is rewritten to a concrete CPU strategy.
        if _TPU_AVAILABLE:
            raise MisconfigurationException(
                "`accelerator='ddp_cpu'` is not supported on TPU machines. "
                "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810"
            )
        # Multi-node with one device per node uses plain DDP; otherwise spawn.
        multi_node_single_device = self._devices_flag == 1 and self._num_nodes_flag > 1
        strategy_flag = DDPStrategy.strategy_name if multi_node_single_device else "ddp_spawn"
        if self._accelerator_flag == "gpu":
            rank_zero_warn(
                "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
            )
            self._accelerator_flag = "cpu"
            self.accelerator = CPUAccelerator()

    # Spawn-style DDP under a cluster launcher falls back to plain DDP, since the
    # launcher already owns process creation.
    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false"):
        if TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks():
            strategy_flag = "ddp"

    # DP/DDP2 require GPUs; degrade to DDP on CPU with a warning.
    if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"

    if strategy_flag:
        self._strategy_flag = strategy_flag
def test_detect_kubeflow():
    """Kubeflow detection should succeed here — presumably the surrounding fixture sets
    the Kubeflow environment variables (not visible in this chunk)."""
    detected = KubeflowEnvironment.detect()
    assert detected
def test_detect_torchelastic_over_kubeflow():
    """Kubeflow detection should fail here — presumably the surrounding fixture sets up a
    TorchElastic environment instead (not visible in this chunk)."""
    detected = KubeflowEnvironment.detect()
    assert not detected