Example #1
def test_init_two_actors(ray_start_distributed_multigpu_2_nodes_4_gpus,
                         group_name):
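    # Every worker should report that the collective group was initialized successfully.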
    world_size = 2
    actors, results = create_collective_multigpu_workers(
        world_size, group_name)
    for i in range(world_size):
        assert (results[i])
Example #2
def test_sendrecv(ray_start_distributed_multigpu_2_nodes_4_gpus, array_size,
                  src_rank, dst_rank, src_gpu_index, dst_gpu_index):
    if src_rank == dst_rank:
        return
    world_size = 2
    actors, _ = create_collective_multigpu_workers(num_workers=world_size)
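    # Give each of the four GPU buffers a distinct value (2, 3, 4, 5) so the transferred tensor can be identified.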

    ray.get(actors[0].set_buffer.remote(array_size, value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote(array_size, value0=4, value1=5))

    refs = []
    for i in range(world_size):
        refs.append(actors[i].get_buffer.remote())
    refs[src_rank][src_gpu_index] = actors[src_rank].do_send_multigpu.remote(
        dst_rank=dst_rank,
        dst_gpu_index=dst_gpu_index,
        src_gpu_index=src_gpu_index)
    refs[dst_rank][dst_gpu_index] = actors[dst_rank].do_recv_multigpu.remote(
        src_rank=src_rank,
        src_gpu_index=src_gpu_index,
        dst_gpu_index=dst_gpu_index)
    results = []
    results_flattened = ray.get(refs[0] + refs[1])
    results.append([results_flattened[0], results_flattened[1]])
    results.append([results_flattened[2], results_flattened[3]])
    assert (results[src_rank][src_gpu_index] == cp.ones(
        array_size, dtype=cp.float32) * (
            (src_rank + 1) * 2 + src_gpu_index)).all()
    assert (results[dst_rank][dst_gpu_index] == cp.ones(
        array_size, dtype=cp.float32) * (
            (src_rank + 1) * 2 + src_gpu_index)).all()
    ray.get([a.destroy_group.remote() for a in actors])
Example #3
def test_allreduce_multigpu_different_op(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
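    # The four GPU buffers hold 2, 3, 4, and 5; verify the PRODUCT, MIN, and MAX reductions.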
    # check product
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get(
        [a.do_allreduce_multigpu.remote(op=ReduceOp.PRODUCT) for a in actors]
    )
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * 120).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * 120).all()

    # check min
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([a.do_allreduce_multigpu.remote(op=ReduceOp.MIN) for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * 2).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * 2).all()

    # check max
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([a.do_allreduce_multigpu.remote(op=ReduceOp.MAX) for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * 5).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * 5).all()
Example #4
def test_broadcast_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus,
                              src_rank, src_gpu_index):
    import torch

    world_size = 2
    num_gpu_per_worker = 2
    actors, _ = create_collective_multigpu_workers(world_size)
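    # Actor 0 holds cupy buffers while actor 1 holds torch buffers; the broadcast must work across both backends.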
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10],
                                        value0=4,
                                        value1=5,
                                        tensor_type0="torch",
                                        tensor_type1="torch"))
    results = ray.get([
        a.do_broadcast_multigpu.remote(src_rank=src_rank,
                                       src_gpu_index=src_gpu_index)
        for a in actors
    ])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            val = (src_rank + 1) * 2 + src_gpu_index
            if i == 0:
                assert (results[i][j] == cp.ones([10], dtype=cp.float32) *
                        val).all()
            else:
                assert (results[i][j] == torch.ones([10]).cuda(j) * val).all()
Example #5
def test_destroy_group(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    # Now destroy the group at actor0
    ray.wait([actors[0].destroy_group.remote()])
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert not actor0_is_init

    # should go well as the group `random` does not exist at all
    ray.wait([actors[0].destroy_group.remote("random")])

    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("random")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("default")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert not actor1_is_init

    # Now reconstruct the group using the same name
    init_results = ray.get([
        actor.init_group.remote(world_size, i)
        for i, actor in enumerate(actors)
    ])
    for i in range(world_size):
        assert init_results[i]
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
Example #6
def test_allreduce_multigpu_destroy(
    ray_start_distributed_multigpu_2_nodes_4_gpus, backend="nccl", group_name="default"
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
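    # 2 workers x 2 GPUs each, so the allreduce runs over 4 participating GPUs.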

    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()

    # destroy the group and try do work, should fail
    ray.get([a.destroy_group.remote() for a in actors])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])

    # reinit the same group and all reduce
    ray.get(
        [
            actor.init_group.remote(world_size, i, backend, group_name)
            for i, actor in enumerate(actors)
        ]
    )
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (
        results[0]
        == cp.ones((10,), dtype=cp.float32) * actual_world_size * actual_world_size
    ).all()
    assert (
        results[1]
        == cp.ones((10,), dtype=cp.float32) * actual_world_size * actual_world_size
    ).all()
Example #7
def test_reduce_different_op(ray_start_distributed_multigpu_2_nodes_4_gpus,
                             dst_rank, dst_gpu_index):
    world_size = 2
    num_gpu_per_worker = 2
    actors, _ = create_collective_multigpu_workers(world_size)
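    # Reduce onto (dst_rank, dst_gpu_index); every other GPU buffer must keep its original value.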

    # check product
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([
        a.do_reduce_multigpu.remote(dst_rank=dst_rank,
                                    dst_gpu_index=dst_gpu_index,
                                    op=ReduceOp.PRODUCT) for a in actors
    ])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * 120).all()
            else:
                val = (i + 1) * 2 + j
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * val).all()

    # check min
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([
        a.do_reduce_multigpu.remote(dst_rank=dst_rank,
                                    dst_gpu_index=dst_gpu_index,
                                    op=ReduceOp.MIN) for a in actors
    ])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * 2).all()
            else:
                val = (i + 1) * 2 + j
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * val).all()

    # check max
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([
        a.do_reduce_multigpu.remote(dst_rank=dst_rank,
                                    dst_gpu_index=dst_gpu_index,
                                    op=ReduceOp.MAX) for a in actors
    ])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * 5).all()
            else:
                val = (i + 1) * 2 + j
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * val).all()
Example #8
def test_broadcast_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus,
                                src_rank, src_gpu_index):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
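    # Broadcasting from an out-of-range rank or GPU index should raise a ValueError.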
    with pytest.raises(ValueError):
        _ = ray.get([
            a.do_broadcast_multigpu.remote(
                src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors
        ])
Example #9
def test_availability(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
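    # NCCL should be reported as available on these GPU workers, while gloo should not.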
    actor0_nccl_availability = ray.get(
        actors[0].report_nccl_availability.remote())
    assert actor0_nccl_availability
    actor0_gloo_availability = ray.get(
        actors[0].report_gloo_availability.remote())
    assert not actor0_gloo_availability
Example #10
def test_allreduce_multigpu_different_dtype(
    ray_start_distributed_multigpu_2_nodes_4_gpus, dtype
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
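    # The summed result should equal the 4 participating GPUs (2 workers x 2 GPUs) for every dtype.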
    ray.get([a.set_buffer.remote([10], dtype=dtype) for a in actors])
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,), dtype=dtype) * actual_world_size).all()
    assert (results[1] == cp.ones((10,), dtype=dtype) * actual_world_size).all()
Example #11
def test_allreduce_multigpu_different_name(
    ray_start_distributed_multigpu_2_nodes_4_gpus, group_name
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(
        num_workers=world_size, group_name=group_name
    )
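    # The allreduce must address the group registered under `group_name`.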
    results = ray.get([a.do_allreduce_multigpu.remote(group_name) for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()
Example #12
def test_is_group_initialized(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    # check group is_init
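    # Only the "default" group exists, so lookups under other names must return False.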
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote("random"))
    assert not actor0_is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote("123"))
    assert not actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote("456"))
    assert not actor1_is_init
Example #13
def test_broadcast_different_array_size(
        ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, src_rank,
        src_gpu_index):
    world_size = 2
    num_gpu_per_worker = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    ray.get(actors[0].set_buffer.remote([array_size], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([array_size], value0=4, value1=5))
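    # After the broadcast, every GPU on both workers should hold the source GPU's value.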
    results = ray.get([
        a.do_broadcast_multigpu.remote(
            src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors
    ])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            val = (src_rank + 1) * 2 + src_gpu_index
            assert (results[i][j] == cp.ones(
                (array_size, ), dtype=cp.float32) * val).all()
Example #14
def test_allgather_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus):
    import torch
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    shape = [10, 10]
    actors, _ = create_collective_multigpu_workers(world_size)

    # tensor is pytorch, list is cupy
    for i, a in enumerate(actors):
        ray.get([
            a.set_buffer.remote(shape,
                                tensor_type0="torch",
                                tensor_type1="torch")
        ])
        ray.get([
            a.set_list_buffer.remote(shape,
                                     tensor_type0="cupy",
                                     tensor_type1="cupy")
        ])
    results = ray.get([a.do_allgather_multigpu.remote() for a in actors])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            for k in range(actual_world_size):
                assert (results[i][j][k] == cp.ones(shape,
                                                    dtype=cp.float32)).all()

    # tensor is cupy, list is pytorch
    for i, a in enumerate(actors):
        ray.get([
            a.set_buffer.remote(shape,
                                tensor_type0="cupy",
                                tensor_type1="cupy")
        ])
        ray.get([
            a.set_list_buffer.remote(shape,
                                     tensor_type0="torch",
                                     tensor_type1="torch")
        ])
    results = ray.get([a.do_allgather_multigpu.remote() for a in actors])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            for k in range(actual_world_size):
                assert (results[i][j][k] == torch.ones(
                    shape, dtype=torch.float32).cuda(j)).all()
Example #15
def test_allreduce_multigpu_multiple_group(
        ray_start_distributed_multigpu_2_nodes_4_gpus,
        backend="nccl",
        num_groups=5):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
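    # The "default" group already exists; register num_groups - 1 additional groups named "1" through "4".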
    for group_name in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, i, backend, str(group_name))
            for i, actor in enumerate(actors)
        ])
    for i in range(num_groups):
        group_name = "default" if i == 0 else str(i)
        results = ray.get(
            [a.do_allreduce_multigpu.remote(group_name) for a in actors])
        assert (results[0] == cp.ones(
            (10, ), dtype=cp.float32) * (actual_world_size**(i + 1))).all()
Example #16
def test_allreduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus):
    # import torch
    world_size = 2
    actual_world_size = 4
    actors, _ = create_collective_multigpu_workers(world_size)
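    # Mix cupy and torch buffers across the two workers; the allreduce should produce the same sum either way.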
    ray.get(actors[0].set_buffer.remote([10]))
    ray.get(
        actors[1].set_buffer.remote([10], tensor_type0="torch", tensor_type1="torch")
    )
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,)) * actual_world_size).all()

    ray.get(
        actors[0].set_buffer.remote([10], tensor_type0="cupy", tensor_type1="torch")
    )
    ray.get(
        actors[1].set_buffer.remote([10], tensor_type0="torch", tensor_type1="cupy")
    )
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,)) * actual_world_size).all()
Example #17
def test_allgather_different_array_size(
        ray_start_distributed_multigpu_2_nodes_4_gpus, array_size,
        tensor_backend):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
    init_tensors_for_gather_scatter_multigpu(actors,
                                             array_size=array_size,
                                             tensor_backend=tensor_backend)
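    # Every GPU should gather one tensor from each of the 4 participating GPUs.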
    results = ray.get([a.do_allgather_multigpu.remote() for a in actors])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            for k in range(actual_world_size):
                if tensor_backend == "cupy":
                    assert (results[i][j][k] == cp.ones(
                        array_size, dtype=cp.float32)).all()
                else:
                    assert (results[i][j][k] == torch.ones(
                        array_size, dtype=torch.float32).cuda(j)).all()
Example #18
def test_reduce_different_name(ray_start_distributed_multigpu_2_nodes_4_gpus,
                               group_name, dst_rank, dst_gpu_index):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(num_workers=world_size,
                                                   group_name=group_name)
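    # Only the destination GPU should hold the reduced value (sum over 4 GPUs); other buffers remain all ones.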
    results = ray.get([
        a.do_reduce_multigpu.remote(group_name,
                                    dst_rank=dst_rank,
                                    dst_gpu_index=dst_gpu_index)
        for a in actors
    ])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones(
                    (10, ), dtype=cp.float32) * actual_world_size).all()
            else:
                assert (results[i][j] == cp.ones((10, ),
                                                 dtype=cp.float32)).all()
Example #19
def test_get_rank(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name, and different
    # orders of ranks.
    new_group_name = "default2"
    ranks = list(range(world_size))
    shuffle(ranks)
    ray.get([
        actor.init_group.remote(world_size,
                                ranks[i],
                                group_name=new_group_name)
        for i, actor in enumerate(actors)
    ])
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == ranks[0]
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == ranks[1]
Example #20
def test_report_num_gpus(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, results = create_collective_multigpu_workers(world_size)
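    # Each multi-GPU worker node is expected to report 2 GPUs.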
    num_gpus = ray.get([actor.report_num_gpus.remote() for actor in actors])
    assert num_gpus == [2, 2]