def select_cluster_environment(self):
    """Return the cluster environment this object should use.

    An environment already stored on ``self.cluster_environment`` is returned
    unchanged. Otherwise the choice follows ``self.is_slurm_managing_tasks``
    and then ``self.is_using_torchelastic``; when neither is truthy a
    ``TorchElasticEnvironment`` is still returned.

    Side effect: in the TorchElastic branch the process environment variable
    ``PL_IN_DDP_SUBPROCESS`` is set to ``"1"``.
    """
    # An explicitly provided environment always takes precedence.
    if self.cluster_environment is not None:
        return self.cluster_environment

    if self.is_slurm_managing_tasks:
        return SLURMEnvironment()

    if self.is_using_torchelastic:
        # TODO: decouple DDP from TE
        #   maybe introduce a DefaultEnvironment?
        selected = TorchElasticEnvironment()
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        return selected

    # TODO: maybe introduce a DefaultEnvironment?
    return TorchElasticEnvironment()
def environment_combinations():
    """Yield ``(environment, variables, expected)`` triples for three cluster environments.

    ``environment`` is a freshly constructed environment object, ``variables``
    is a dict mapping environment-variable names to string values, and
    ``expected`` is the dict of rank / world-size values associated with that
    combination. The very same ``expected`` dict object is yielded each time.
    Each environment is only instantiated when the generator reaches it.
    """
    expected = {"global_rank": 3, "local_rank": 1, "node_rank": 1, "world_size": 4}
    visible_devices = "0,1,2,4"

    # Lightning
    # NOTE(review): WORLD_SIZE is "8" here while the expected world size is 4;
    # presumably LightningEnvironment does not take its world size from this
    # variable -- confirm against the consumer of this generator.
    lightning_variables = {
        "CUDA_VISIBLE_DEVICES": visible_devices,
        "LOCAL_RANK": "1",
        "NODE_RANK": "1",
        "WORLD_SIZE": "8",
    }
    yield LightningEnvironment(), lightning_variables, expected

    # SLURM
    slurm_variables = {
        "CUDA_VISIBLE_DEVICES": visible_devices,
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
    }
    yield SLURMEnvironment(), slurm_variables, expected

    # TorchElastic
    torchelastic_variables = {
        "CUDA_VISIBLE_DEVICES": visible_devices,
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    }
    yield TorchElasticEnvironment(), torchelastic_variables, expected
def select_cluster_environment(self) -> ClusterEnvironment:
    """Return the cluster environment this object should use.

    An environment already stored on ``self._cluster_environment`` is returned
    unchanged. Otherwise ``self.is_slurm_managing_tasks`` is checked first,
    then ``self.is_using_torchelastic``; when neither is truthy a
    ``TorchElasticEnvironment`` is still returned.

    Side effect: in both the SLURM and the TorchElastic branch the process
    environment variable ``PL_IN_DDP_SUBPROCESS`` is set to ``"1"``.
    """
    # An explicitly provided environment always takes precedence.
    if self._cluster_environment is not None:
        return self._cluster_environment

    if self.is_slurm_managing_tasks:
        # TODO: decouple DDP from SLURM
        #   refactor and let generic cluster env hold the information about who spawns the processes
        selected = SLURMEnvironment()
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        return selected

    if self.is_using_torchelastic:
        # TODO: decouple DDP from TE
        #   refactor and let generic cluster env hold the information about who spawns the processes
        selected = TorchElasticEnvironment()
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        return selected

    # TODO: maybe introduce a DefaultEnvironment?
    return TorchElasticEnvironment()
def test_attributes_from_environment_variables():
    """Check that a TorchElastic environment reports the values taken from environment variables.

    NOTE(review): the environment variables are not set inside this function;
    presumably a patch or fixture outside this block provides them -- confirm.
    """
    elastic_env = TorchElasticEnvironment()

    # master address / port
    assert elastic_env.master_address() == "1.2.3.4"
    assert elastic_env.master_port() == 500

    # world size and ranks
    assert elastic_env.world_size() == 20
    assert elastic_env.local_rank() == 2
    assert elastic_env.node_rank() == 3
def test_sync_batchnorm_ddp(tmpdir):
    """Fit a ``SyncBNModule`` on two GPUs with ``ddp_spawn`` and sync batchnorm enabled.

    First, three batches are pushed through a ``SyncBNModule`` outside of any
    trainer and the second element of each forward result is collected and
    moved to CUDA. Those tensors are then handed to a second ``SyncBNModule``
    as ``bn_targets`` and the module is trained for three steps; the test
    passes when the trainer ends in the ``FINISHED`` state.

    NOTE(review): how ``SyncBNModule`` uses ``bn_targets`` is not visible here;
    presumably it compares them with the synchronized outputs -- confirm.
    """
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    datamodule = MNISTDataModule()
    datamodule.prepare_data()
    datamodule.setup(stage=None)
    loader = datamodule.train_dataloader()

    module = SyncBNModule()

    # shuffle is false by default
    reference_outputs = []
    for step, (inputs, _) in enumerate(loader):
        _, bn_out = module.forward(inputs, step)
        reference_outputs.append(bn_out)

        # get 3 steps
        if step == 2:
            break

    reference_outputs = [output.cuda() for output in reference_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    datamodule = MNISTDataModule(batch_size=16, dist_sampler=True)
    datamodule.prepare_data()
    datamodule.setup(stage=None)

    module = SyncBNModule(gpu_count=2, bn_targets=reference_outputs)

    # Build the plugin up front; the remaining Trainer arguments are plain literals.
    devices = [torch.device("cuda", 0), torch.device("cuda", 1)]
    spawn_plugin = DDPSpawnPlugin(
        parallel_devices=devices,
        num_nodes=1,
        sync_batchnorm=True,
        cluster_environment=TorchElasticEnvironment(),
        find_unused_parameters=True,
    )

    trainer = Trainer(
        gpus=2,
        num_nodes=1,
        accelerator="ddp_spawn",
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
        plugins=[spawn_plugin],
    )

    trainer.fit(module, datamodule)
    assert trainer.state == TrainerState.FINISHED, "Sync batchnorm failing with DDP"
def select_cluster_environment(self) -> ClusterEnvironment:
    """Return the cluster environment this object should use.

    An environment already stored on ``self._cluster_environment`` is returned
    unchanged. Otherwise the first matching option is instantiated, in this
    order: SLURM (``self.is_slurm_managing_tasks``), TorchElastic
    (``TorchElasticEnvironment.is_using_torchelastic()``), and finally
    ``LightningEnvironment`` as the fallback.
    """
    # An explicitly provided environment always takes precedence.
    if self._cluster_environment is not None:
        return self._cluster_environment

    if self.is_slurm_managing_tasks:
        return SLURMEnvironment()

    if TorchElasticEnvironment.is_using_torchelastic():
        return TorchElasticEnvironment()

    return LightningEnvironment()
def test_default_attributes():
    """Check the values a TorchElastic environment reports when no environment variables are set."""
    elastic_env = TorchElasticEnvironment()

    assert elastic_env.creates_children()

    # master address / port defaults
    assert elastic_env.master_address() == "127.0.0.1"
    assert elastic_env.master_port() == 12910

    assert elastic_env.world_size() is None

    # local rank is required to be passed as env variable
    with pytest.raises(KeyError):
        elastic_env.local_rank()

    assert elastic_env.node_rank() == 0
def test_default_attributes():
    """Check the values a TorchElastic environment reports when no environment variables are set."""
    elastic_env = TorchElasticEnvironment()

    assert elastic_env.creates_processes_externally

    # main address / port defaults
    assert elastic_env.main_address == "127.0.0.1"
    assert elastic_env.main_port == 12910

    # world size is required to be passed as env variable
    with pytest.raises(KeyError):
        elastic_env.world_size()

    # local rank is required to be passed as env variable
    with pytest.raises(KeyError):
        elastic_env.local_rank()

    assert elastic_env.node_rank() == 0
def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir):
    """Run a fast-dev-run fit of ``BoringModel`` on two GPUs through ``CustomDDPPlugin``.

    NOTE(review): no deprecation warning is asserted in this function;
    presumably ``CustomDDPPlugin`` performs that check itself -- confirm.
    """
    model = BoringModel()

    # Build the plugin first so the Trainer call stays short.
    devices = [torch.device("cuda", 0), torch.device("cuda", 1)]
    ddp_plugin = CustomDDPPlugin(
        parallel_devices=devices,
        cluster_environment=TorchElasticEnvironment(),
    )

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        gpus=2,
        accelerator="ddp_spawn",
        plugins=[ddp_plugin],
    )
    trainer.fit(model)
def select_cluster_environment(self) -> ClusterEnvironment:
    """Return the cluster environment this object should use.

    An environment already stored on ``self._cluster_environment`` is returned
    unchanged. Otherwise the first matching option is instantiated, in this
    order: SLURM (``self._is_slurm_managing_tasks()``), TorchElastic, Kubeflow,
    LSF, and finally ``LightningEnvironment`` as the fallback. Selecting SLURM
    also emits an informational message through ``rank_zero_info``.
    """
    # An explicitly provided environment always takes precedence.
    if self._cluster_environment is not None:
        return self._cluster_environment

    if self._is_slurm_managing_tasks():
        slurm_env = SLURMEnvironment()
        rank_zero_info("Multiprocessing is handled by SLURM.")
        return slurm_env

    if TorchElasticEnvironment.is_using_torchelastic():
        return TorchElasticEnvironment()

    if KubeflowEnvironment.is_using_kubeflow():
        return KubeflowEnvironment()

    if LSFEnvironment.is_using_lsf():
        return LSFEnvironment()

    return LightningEnvironment()
def test_attributes_from_environment_variables(caplog):
    """Check that a TorchElastic environment reports the values taken from environment variables.

    Also verifies that ``set_global_rank`` and ``set_world_size`` leave the
    reported values untouched and log a message saying so.

    NOTE(review): the environment variables are not set inside this function;
    presumably a patch or fixture outside this block provides them -- confirm.
    """
    logger_name = "pytorch_lightning.plugins.environments"
    elastic_env = TorchElasticEnvironment()

    assert elastic_env.main_address == "1.2.3.4"
    assert elastic_env.main_port == 500
    assert elastic_env.world_size() == 20
    assert elastic_env.global_rank() == 1
    assert elastic_env.local_rank() == 2
    assert elastic_env.node_rank() == 3

    # setter should be no-op
    with caplog.at_level(logging.DEBUG, logger=logger_name):
        elastic_env.set_global_rank(100)
    assert elastic_env.global_rank() == 1
    assert "setting global rank is not allowed" in caplog.text

    # Start from an empty log before checking the second setter.
    caplog.clear()

    with caplog.at_level(logging.DEBUG, logger=logger_name):
        elastic_env.set_world_size(100)
    assert elastic_env.world_size() == 20
    assert "setting world size is not allowed" in caplog.text