Пример #1
0
def _test__native_dist_model_create_from_backend_dist(init_method, local_rank,
                                                      rank, world_size,
                                                      backend, true_device):
    """Exercise ``_NativeDistModel.create_from_backend`` with ``RANK`` set.

    Exports ``RANK`` for the current process, creates the model for
    ``backend`` with the given ``init_method``, checks the default process
    group and the model attributes, then finalizes the model.

    Args:
        init_method: value forwarded to ``create_from_backend``; when it is
            ``None`` the model is expected to report ``"env://"``.
        local_rank: expected ``local_rank`` of the model.
        rank: global rank; exported via ``RANK`` and expected on the model.
        world_size: expected ``world_size`` and ``nproc_per_node``.
        backend: distributed backend name to create the model with.
        true_device: expected ``device`` entry passed to ``_assert_model``.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)
    os.environ["RANK"] = f"{rank}"

    # Clean up RANK in ``finally`` so that a failing assertion does not leak
    # the variable into whatever runs next in this process.
    try:
        assert "MASTER_ADDR" not in os.environ
        assert "MASTER_PORT" not in os.environ

        model = _NativeDistModel.create_from_backend(backend=backend,
                                                     timeout=timeout,
                                                     init_method=init_method)

        assert dist.is_available() and dist.is_initialized()
        assert dist.get_backend() == backend

        # A second creation must be rejected while the default group exists.
        with pytest.raises(
                RuntimeError,
                match=
                r"Can not create new distributed process group if default one is"):
            _NativeDistModel.create_from_backend(backend=backend,
                                                 timeout=timeout)

        _assert_model(
            model,
            {
                "device": true_device,
                "local_rank": local_rank,
                "rank": rank,
                "world_size": world_size,
                "node_index": 0,
                "nnodes": 1,
                "nproc_per_node": world_size,
            },
        )

        if init_method is None:
            assert model._init_method == "env://"
        else:
            assert model._init_method == init_method

        model.finalize()
    finally:
        os.environ.pop("RANK", None)

    # After finalize and cleanup none of these variables may remain.
    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ
    assert "RANK" not in os.environ
Пример #2
0
def test__native_dist_model_create_from_backend_bad_config():
    """Expect a ``RuntimeError`` about env variables when ``RANK`` is set.

    ``RANK`` is exported before calling ``create_from_backend`` with the
    ``gloo`` backend and is always removed afterwards.
    """
    import os
    from datetime import timedelta

    os.environ["RANK"] = "1"

    # ``finally`` guarantees RANK is removed even if the expectation fails,
    # so later tests do not inherit it.
    try:
        with pytest.raises(
                RuntimeError,
                match=
                r"PyTorch distributed configuration should define env variables"):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10))
    finally:
        os.environ.pop("RANK", None)
Пример #3
0
def test__native_dist_model_init_method_is_not_none(world_size, local_rank,
                                                    get_fixed_dirname):
    """Passing only one of ``world_size``/``rank`` with ``init_method`` fails.

    Both calls use a ``file://`` init method and are expected to raise
    ``ValueError``.
    """
    shared_dir = get_fixed_dirname('native_dist_model_init_method_is_not_none')
    init_method = f"file://{shared_dir}/shared"
    expected_message = r"Both rank and world_size should be provided"

    # Each case supplies exactly one of the two arguments.
    incomplete_cases = (
        {"world_size": world_size},
        {"rank": local_rank},
    )
    for partial_kwargs in incomplete_cases:
        with pytest.raises(ValueError, match=expected_message):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 init_method=init_method,
                                                 **partial_kwargs)
Пример #4
0
def test__native_dist_model_create_from_backend_bad_slurm_config():
    """Check error and warning paths of ``create_from_backend`` under SLURM.

    Steps, in order:
      1. only ``SLURM_JOB_ID`` set -> ``RuntimeError``;
      2. explicit ``rank``/``world_size`` with SLURM -> ``ValueError``;
      3. the SLURM variables below plus ``RANK`` set -> ``UserWarning``,
         and the model is still created and finalized.

    Every environment variable set here is removed on exit, pass or fail.
    """
    import os
    from datetime import timedelta

    # All variables this test may set; removed in ``finally`` so a failing
    # expectation cannot leak SLURM settings into other tests.
    managed_vars = (
        "SLURM_JOB_ID",
        "SLURM_PROCID",
        "SLURM_LOCALID",
        "SLURM_NTASKS",
        "SLURM_JOB_NODELIST",
        "SLURM_JOB_NUM_NODES",
        "RANK",
    )

    os.environ["SLURM_JOB_ID"] = "1"

    try:
        with pytest.raises(RuntimeError,
                           match=r"SLURM distributed configuration is missing"):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10))

        with pytest.raises(
                ValueError,
                match=
                r"Arguments rank and world_size should not be specified with SLURM"
        ):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10),
                                                 rank=1,
                                                 init_method="",
                                                 world_size=1)

        os.environ["SLURM_PROCID"] = "0"
        os.environ["SLURM_LOCALID"] = "0"
        os.environ["SLURM_NTASKS"] = "1"
        os.environ["SLURM_JOB_NODELIST"] = "localhost"
        os.environ["SLURM_JOB_NUM_NODES"] = "1"

        os.environ["RANK"] = "1"

        with pytest.warns(UserWarning,
                          match=r"We detected the following env variables"):
            model = _NativeDistModel.create_from_backend(
                backend="gloo", timeout=timedelta(seconds=10))
            model.finalize()
    finally:
        for var_name in managed_vars:
            os.environ.pop(var_name, None)
Пример #5
0
def test__native_dist_model():
    """``available_backends`` must mirror what ``torch.distributed`` reports.

    Also checks that an unknown backend name is rejected with ``ValueError``.
    """
    listed_backends = _NativeDistModel.available_backends

    # Backend name paired with the ``dist`` predicate reporting availability.
    availability_checks = (
        ("nccl", dist.is_nccl_available),
        ("gloo", dist.is_gloo_available),
        ("mpi", dist.is_mpi_available),
    )
    for backend_name, is_available in availability_checks:
        is_listed = backend_name in listed_backends
        if is_available():
            assert is_listed
        else:
            assert not is_listed

    with pytest.raises(ValueError, match=r"Backend should be one of"):
        _NativeDistModel.create_from_backend("abc")
Пример #6
0
def _test__native_dist_model_create_from_backend_dist(local_rank, rank,
                                                      world_size, backend,
                                                      true_device):
    """Exercise ``_NativeDistModel.create_from_backend`` with ``RANK`` set.

    Exports ``RANK``, creates the model for ``backend``, checks the default
    process group and the model attributes, then finalizes the model.

    Args:
        local_rank: expected ``local_rank`` of the model.
        rank: global rank; exported via ``RANK`` and expected on the model.
        world_size: expected ``world_size`` and ``nproc_per_node``.
        backend: distributed backend name to create the model with.
        true_device: expected ``device`` entry passed to ``_assert_model``.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)
    os.environ["RANK"] = f"{rank}"

    # Remove RANK in ``finally`` so a failing assertion does not leak it
    # into whatever runs next in this process.
    try:
        model = _NativeDistModel.create_from_backend(backend=backend,
                                                     timeout=timeout)

        assert dist.is_available() and dist.is_initialized()
        assert dist.get_backend() == backend

        # A second creation must be rejected while the default group exists.
        with pytest.raises(
                RuntimeError,
                match=
                r"Can not create new distributed process group if default one is"):
            _NativeDistModel.create_from_backend(backend=backend,
                                                 timeout=timeout)

        _assert_model(
            model,
            {
                "device": true_device,
                "local_rank": local_rank,
                "rank": rank,
                "world_size": world_size,
                "node_index": 0,
                "nnodes": 1,
                "nproc_per_node": world_size,
            },
        )

        model.finalize()
    finally:
        os.environ.pop("RANK", None)
Пример #7
0
def test__native_dist_model_create_from_backend_bad_slurm_config():
    """Check the error paths of ``create_from_backend`` under SLURM.

    Steps, in order:
      1. only ``SLURM_JOB_ID`` set -> ``RuntimeError``;
      2. explicit ``rank``/``world_size`` with SLURM -> ``ValueError``;
      3. the SLURM variables below set -> ``FileNotFoundError`` mentioning
         ``scontrol``;
      4. ``RANK`` additionally set -> ``RuntimeError``.

    Every environment variable set here is removed on exit, pass or fail.
    """
    import os
    from datetime import timedelta

    # All variables this test may set; removed in ``finally`` so a failing
    # expectation cannot leak SLURM settings into other tests.
    managed_vars = (
        "SLURM_JOB_ID",
        "SLURM_PROCID",
        "SLURM_LOCALID",
        "SLURM_NTASKS",
        "SLURM_JOB_NODELIST",
        "RANK",
    )

    os.environ["SLURM_JOB_ID"] = "1"

    try:
        with pytest.raises(RuntimeError,
                           match=r"SLURM distributed configuration is missing"):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10))

        with pytest.raises(
                ValueError,
                match=
                r"Arguments rank and world_size should not be specified with SLURM"
        ):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10),
                                                 rank=1,
                                                 init_method="",
                                                 world_size=1)

        os.environ["SLURM_PROCID"] = "0"
        os.environ["SLURM_LOCALID"] = "0"
        os.environ["SLURM_NTASKS"] = "1"
        os.environ["SLURM_JOB_NODELIST"] = "localhost"

        with pytest.raises(FileNotFoundError,
                           match=r"No such file or directory: 'scontrol'"):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10))

        os.environ["RANK"] = "1"

        with pytest.raises(RuntimeError, match=r"Defined env variables"):
            _NativeDistModel.create_from_backend(backend="gloo",
                                                 timeout=timedelta(seconds=10))
    finally:
        for var_name in managed_vars:
            os.environ.pop(var_name, None)
Пример #8
0
def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
    """Create the model without any distributed launcher and verify it.

    The model is expected to describe a single process: rank 0, world size 1,
    one node with one process per node.

    Args:
        backend: distributed backend name to create the model with.
        true_device: expected ``device`` entry passed to ``_assert_model``.
    """
    from datetime import timedelta

    twenty_seconds = timedelta(seconds=20)
    model = _NativeDistModel.create_from_backend(backend=backend,
                                                 timeout=twenty_seconds)

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    single_process_config = dict(
        device=true_device,
        local_rank=0,
        rank=0,
        world_size=1,
        node_index=0,
        nnodes=1,
        nproc_per_node=1,
    )
    _assert_model(model, single_process_config)

    model.finalize()
Пример #9
0
def test__native_dist_model_init_method_is_none(world_size):
    """Passing ``world_size`` without ``init_method`` raises ``ValueError``."""
    expected_message = r"Arguments rank and world_size should be None"
    call_kwargs = {"backend": "gloo", "world_size": world_size}

    with pytest.raises(ValueError, match=expected_message):
        _NativeDistModel.create_from_backend(**call_kwargs)
Пример #10
0
def _test__native_dist_model_create_from_backend_slurm(local_rank, rank,
                                                       world_size, backend,
                                                       true_device):
    """Exercise ``_NativeDistModel.create_from_backend`` with SLURM variables.

    Temporarily replaces ``WORLD_SIZE``/``LOCAL_RANK`` with the ``SLURM_*``
    variables set below, creates the model, checks the default process group
    and the model attributes, then finalizes the model.

    Args:
        local_rank: exported as ``SLURM_LOCALID``; expected ``local_rank``.
        rank: exported as ``SLURM_PROCID``; expected ``rank``.
        world_size: exported as ``SLURM_NTASKS``; expected ``world_size``
            and ``nproc_per_node``.
        backend: distributed backend name to create the model with.
        true_device: expected ``device`` entry passed to ``_assert_model``.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ

    slurm_env = {
        "SLURM_JOB_ID": "15000",
        "SLURM_PROCID": str(rank),
        "SLURM_LOCALID": str(local_rank),
        "SLURM_NTASKS": str(world_size),
        "SLURM_JOB_NODELIST": "localhost",
        "SLURM_JOB_NUM_NODES": "1",
    }

    # These are expected to be present on entry (KeyError otherwise) and are
    # put back in ``finally`` below.
    del os.environ["WORLD_SIZE"]
    del os.environ["LOCAL_RANK"]

    # ``finally`` restores the environment even when an assertion fails, so
    # later tests see neither SLURM variables nor missing launcher variables.
    try:
        os.environ.update(slurm_env)

        model = _NativeDistModel.create_from_backend(backend=backend,
                                                     timeout=timeout)

        assert dist.is_available() and dist.is_initialized()
        assert dist.get_backend() == backend

        # A second creation must be rejected while the default group exists.
        with pytest.raises(
                RuntimeError,
                match=
                r"Can not create new distributed process group if default one is"):
            _NativeDistModel.create_from_backend(backend=backend,
                                                 timeout=timeout)

        _assert_model(
            model,
            {
                "device": true_device,
                "local_rank": local_rank,
                "rank": rank,
                "world_size": world_size,
                "node_index": 0,
                "nnodes": 1,
                "nproc_per_node": world_size,
            },
        )

        model.finalize()

        # Checked after finalize: none of these variables may remain.
        assert "MASTER_ADDR" not in os.environ
        assert "MASTER_PORT" not in os.environ
        assert "RANK" not in os.environ
    finally:
        for var_name in slurm_env:
            os.environ.pop(var_name, None)
        os.environ["WORLD_SIZE"] = str(world_size)
        os.environ["LOCAL_RANK"] = str(local_rank)