Example #1
def select_cluster_environment(self):
    if self.cluster_environment is not None:
        return self.cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
    elif self.is_using_torchelastic:
        env = TorchElasticEnvironment()
        # TODO: decouple DDP from TE
        #   maybe introduce a DefaultEnvironment?
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
    else:
        # TODO: maybe introduce a DefaultEnvironment?
        env = TorchElasticEnvironment()
    return env
Example #2
def environment_combinations():
    expected = dict(global_rank=3, local_rank=1, node_rank=1, world_size=4)
    # Lightning
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "NODE_RANK": "1",
        "WORLD_SIZE": "8"
    }
    environment = LightningEnvironment()
    yield environment, variables, expected
    # SLURM
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
    }
    environment = SLURMEnvironment()
    yield environment, variables, expected
    # TorchElastic
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    }
    environment = TorchElasticEnvironment()
    yield environment, variables, expected
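A generator like this is usually consumed by a parametrized test that patches the listed variables into os.environ before querying the cluster environment. Below is a minimal sketch of that pattern, not the original test: the test name is made up, the import path assumes pytorch_lightning.plugins.environments, and the method-style rank accessors are the ones used throughout these examples.

import os
from unittest import mock

import pytest

from pytorch_lightning.plugins.environments import (
    LightningEnvironment,
    SLURMEnvironment,
    TorchElasticEnvironment,
)


@pytest.mark.parametrize("environment, variables, expected", list(environment_combinations()))
def test_ranks_from_environment_variables(environment, variables, expected):
    # patch the variables only for the duration of this test case
    with mock.patch.dict(os.environ, variables):
        assert environment.local_rank() == expected["local_rank"]
        assert environment.node_rank() == expected["node_rank"]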
Example #3
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
        # TODO: decouple DDP from SLURM
        #   refactor and let generic cluster env hold the information about who spawns the processes
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
    elif self.is_using_torchelastic:
        env = TorchElasticEnvironment()
        # TODO: decouple DDP from TE
        #   refactor and let generic cluster env hold the information about who spawns the processes
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
    else:
        # TODO: maybe introduce a DefaultEnvironment?
        env = TorchElasticEnvironment()
    return env
Example #4
def test_attributes_from_environment_variables():
    """ Test that the torchelastic cluster environment takes the attributes from the environment variables. """
    env = TorchElasticEnvironment()
    assert env.master_address() == "1.2.3.4"
    assert env.master_port() == 500
    assert env.world_size() == 20
    assert env.local_rank() == 2
    assert env.node_rank() == 3
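On its own this test would fail, since TorchElasticEnvironment reads these attributes from environment variables; the original presumably patches them in before the assertions run. A hedged reconstruction of such a patch follows: the decorator itself is an assumption, but the variable names are the ones TorchElastic sets and the plugin reads.

import os
from unittest import mock

@mock.patch.dict(
    os.environ,
    {
        "MASTER_ADDR": "1.2.3.4",
        "MASTER_PORT": "500",
        "WORLD_SIZE": "20",
        "LOCAL_RANK": "2",
        "GROUP_RANK": "3",
    },
)
def test_attributes_from_environment_variables():
    ...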
Example #5
def test_sync_batchnorm_ddp(tmpdir):
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)

    trainer = Trainer(gpus=2,
                      num_nodes=1,
                      accelerator='ddp_spawn',
                      max_epochs=1,
                      max_steps=3,
                      sync_batchnorm=True,
                      num_sanity_val_steps=0,
                      replace_sampler_ddp=False,
                      plugins=[
                          DDPSpawnPlugin(
                              parallel_devices=[
                                  torch.device("cuda", 0),
                                  torch.device("cuda", 1)
                              ],
                              num_nodes=1,
                              sync_batchnorm=True,
                              cluster_environment=TorchElasticEnvironment(),
                              find_unused_parameters=True)
                      ])

    trainer.fit(model, dm)
    assert trainer.state == TrainerState.FINISHED, "Sync batchnorm failing with DDP"
Example #6
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
    elif TorchElasticEnvironment.is_using_torchelastic():
        env = TorchElasticEnvironment()
    else:
        env = LightningEnvironment()
    return env
Example #7
def test_default_attributes():
    """ Test the default attributes when no environment variables are set. """
    env = TorchElasticEnvironment()
    assert env.creates_children()
    assert env.master_address() == "127.0.0.1"
    assert env.master_port() == 12910
    assert env.world_size() is None
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    assert env.node_rank() == 0
Example #8
def test_default_attributes():
    """Test the default attributes when no environment variables are set."""
    env = TorchElasticEnvironment()
    assert env.creates_processes_externally
    assert env.main_address == "127.0.0.1"
    assert env.main_port == 12910
    with pytest.raises(KeyError):
        # world size is required to be passed as env variable
        env.world_size()
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    assert env.node_rank() == 0
Example #9
def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir):
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        gpus=2,
        accelerator="ddp_spawn",
        plugins=[
            CustomDDPPlugin(
                parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)],
                cluster_environment=TorchElasticEnvironment(),
            )
        ]
    )
    trainer.fit(model)
Example #10
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self._is_slurm_managing_tasks():
        env = SLURMEnvironment()
        rank_zero_info("Multiprocessing is handled by SLURM.")
    elif TorchElasticEnvironment.is_using_torchelastic():
        env = TorchElasticEnvironment()
    elif KubeflowEnvironment.is_using_kubeflow():
        env = KubeflowEnvironment()
    elif LSFEnvironment.is_using_lsf():
        env = LSFEnvironment()
    else:
        env = LightningEnvironment()
    return env
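The first branch is what lets a user-supplied environment take precedence over auto-detection: anything passed through the Trainer's plugins ends up in _cluster_environment and is returned before the SLURM, TorchElastic, Kubeflow or LSF checks run. A minimal sketch of overriding the rendezvous endpoint this way, assuming the property-style API from Examples #8 and #11 and a Trainer version that accepts ClusterEnvironment instances in plugins; the MyClusterEnvironment subclass is hypothetical.

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import LightningEnvironment


class MyClusterEnvironment(LightningEnvironment):
    # hypothetical: reuse LightningEnvironment but pin the rendezvous endpoint
    @property
    def main_address(self) -> str:
        return "10.0.0.1"

    @property
    def main_port(self) -> int:
        return 29500


# select_cluster_environment() returns this instance directly via its first branch
trainer = Trainer(gpus=2, plugins=[MyClusterEnvironment()])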
Example #11
def test_attributes_from_environment_variables(caplog):
    """Test that the torchelastic cluster environment takes the attributes from the environment variables."""
    env = TorchElasticEnvironment()
    assert env.main_address == "1.2.3.4"
    assert env.main_port == 500
    assert env.world_size() == 20
    assert env.global_rank() == 1
    assert env.local_rank() == 2
    assert env.node_rank() == 3
    # setter should be no-op
    with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
        env.set_global_rank(100)
    assert env.global_rank() == 1
    assert "setting global rank is not allowed" in caplog.text

    caplog.clear()

    with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
        env.set_world_size(100)
    assert env.world_size() == 20
    assert "setting world size is not allowed" in caplog.text