Example #1
def test_sendrecv(
    ray_start_distributed_2_nodes_4_gpus, group_name, array_size, src_rank, dst_rank
):
    if src_rank == dst_rank:
        return
    world_size = 4
    actors, _ = create_collective_workers(num_workers=world_size, group_name=group_name)
    ray.get(
        [
            a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32) * (i + 1))
            for i, a in enumerate(actors)
        ]
    )
    refs = []
    for i in range(world_size):
        refs.append(actors[i].get_buffer.remote())
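    # swap in the collective calls: src_rank sends its buffer, dst_rank receives into its own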
    refs[src_rank] = actors[src_rank].do_send.remote(group_name, dst_rank)
    refs[dst_rank] = actors[dst_rank].do_recv.remote(group_name, src_rank)
    results = ray.get(refs)
    assert (
        results[src_rank] == cp.ones(array_size, dtype=cp.float32) * (src_rank + 1)
    ).all()
    assert (
        results[dst_rank] == cp.ones(array_size, dtype=cp.float32) * (src_rank + 1)
    ).all()
    ray.get([a.destroy_group.remote(group_name) for a in actors])
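All of the snippets on this page are excerpted from a larger pytest module, so their imports and helpers are defined elsewhere. A minimal preamble they assume looks roughly like the sketch below; `create_collective_workers` and `init_tensors_for_gather_scatter` are helpers from Ray's own collective test utilities (not public API), and the `ray_start_*` fixtures and the `group_name`/`array_size`/`dtype`/rank parametrizations come from the surrounding conftest.

import pytest
import cupy as cp
import torch
from random import shuffle

import ray
from ray.util.collective.types import ReduceOp
# Test-suite helpers, assumed importable from Ray's collective tests:
# from ray.util.collective.tests.util import (
#     create_collective_workers, init_tensors_for_gather_scatter)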
Example #2
def test_allreduce_different_name(ray_start_single_node_2_gpus, group_name):
    world_size = 2
    actors, _ = create_collective_workers(num_workers=world_size,
                                          group_name=group_name)
    results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * world_size).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * world_size).all()
Example #3
def test_allreduce_different_op(ray_start_single_node_2_gpus):
    world_size = 2
    actors, _ = create_collective_workers(world_size)

    # check product
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get(
        [a.do_allreduce.remote(op=ReduceOp.PRODUCT) for a in actors])
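    # ranks 0 and 1 hold 2 and 3, so the elementwise product is 2 * 3 = 6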
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 6).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 6).all()

    # check min
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MIN) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 2).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 2).all()

    # check max
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MAX) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 3).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 3).all()
Example #4
def test_allreduce_different_dtype(ray_start_single_node_2_gpus, dtype):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([a.set_buffer.remote(cp.ones(10, dtype=dtype)) for a in actors])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=dtype) * world_size).all()
    assert (results[1] == cp.ones((10, ), dtype=dtype) * world_size).all()
Example #5
def test_allreduce_destroy(ray_start_single_node_2_gpus,
                           backend="nccl",
                           group_name="default"):
    world_size = 2
    actors, _ = create_collective_workers(world_size)

    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * world_size).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * world_size).all()

    # destroy the group and then try to do work; it should fail
    ray.get([a.destroy_group.remote() for a in actors])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce.remote() for a in actors])

    # re-init the same group and allreduce again; the buffers still hold
    # world_size from the first allreduce, so the new result doubles
    ray.get([
        actor.init_group.remote(world_size, i, backend, group_name)
        for i, actor in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones(
        (10, ), dtype=cp.float32) * world_size * 2).all()
    assert (results[1] == cp.ones(
        (10, ), dtype=cp.float32) * world_size * 2).all()
Example #6
def test_destroy_group(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    # Now destroy the group at actor0
    ray.wait([actors[0].destroy_group.remote()])
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert not actor0_is_init

    # should succeed silently, since the group `random` does not exist
    ray.wait([actors[0].destroy_group.remote("random")])

    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("random")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("default")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert not actor1_is_init
    for i in [2, 3]:
        ray.wait([actors[i].destroy_group.remote("default")])

    # Now reconstruct the group using the same name
    init_results = ray.get([
        actor.init_group.remote(world_size, i)
        for i, actor in enumerate(actors)
    ])
    for i in range(world_size):
        assert init_results[i]
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
Example #7
def test_reducescatter_different_dtype(ray_start_single_node_2_gpus, dtype):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    for i in range(world_size):
        assert (results[i] == cp.ones(10, dtype=dtype) * world_size).all()
Example #8
def test_availability(ray_start_single_node_2_gpus):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    actor0_nccl_availability = ray.get(
        actors[0].report_nccl_availability.remote())
    assert actor0_nccl_availability
    actor0_gloo_availability = ray.get(
        actors[0].report_gloo_availability.remote())
    assert not actor0_gloo_availability
Example #9
def test_allgather_different_dtype(ray_start_distributed_2_nodes_4_gpus,
                                   dtype):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == cp.ones(10, dtype=dtype) * (j + 1)).all()
Example #10
def test_availability(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    actor0_nccl_availability = ray.get(
        actors[0].report_nccl_availability.remote())
    assert actor0_nccl_availability
    actor0_mpi_availability = ray.get(
        actors[0].report_mpi_availability.remote())
    assert not actor0_mpi_availability
Example #11
def test_unmatched_tensor_list_length(ray_start_single_node_2_gpus, length):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    list_buffer = [cp.ones(10, dtype=cp.float32) for _ in range(length)]
    ray.wait([a.set_list_buffer.remote(list_buffer) for a in actors])
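    # allgather expects exactly one receive slot per rank; any other length should raise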
    if length != world_size:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
    else:
        ray.get([a.do_allgather.remote() for a in actors])
Example #12
def test_allgather_torch_cupy(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    shape = [10, 10]
    actors, _ = create_collective_workers(world_size)

    # tensor is pytorch, list is cupy
    for i, a in enumerate(actors):
        t = torch.ones(shape, dtype=torch.float32).cuda() * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            cp.ones(shape, dtype=cp.float32) for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == cp.ones(shape, dtype=cp.float32) *
                    (j + 1)).all()

    # tensor is cupy, list is pytorch
    for i, a in enumerate(actors):
        t = cp.ones(shape, dtype=cp.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            torch.ones(shape, dtype=torch.float32).cuda()
            for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == torch.ones(
                shape, dtype=torch.float32).cuda() * (j + 1)).all()

    # some tensors in the list are pytorch, some are cupy
    for i, a in enumerate(actors):
        t = cp.ones(shape, dtype=cp.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = []
        for j in range(world_size):
            if j % 2 == 0:
                list_buffer.append(
                    torch.ones(shape, dtype=torch.float32).cuda())
            else:
                list_buffer.append(cp.ones(shape, dtype=cp.float32))
        ray.wait([a.set_list_buffer.remote(list_buffer)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            if j % 2 == 0:
                assert (results[i][j] == torch.ones(
                    shape, dtype=torch.float32).cuda() * (j + 1)).all()
            else:
                assert (results[i][j] == cp.ones(shape, dtype=cp.float32) *
                        (j + 1)).all()
Example #13
def test_allreduce_different_array_size(
    ray_start_distributed_2_nodes_4_gpus, array_size
):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    ray.wait(
        [a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32)) for a in actors]
    )
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((array_size,), dtype=cp.float32) * world_size).all()
    assert (results[1] == cp.ones((array_size,), dtype=cp.float32) * world_size).all()
Example #14
def test_unmatched_tensor_shape(ray_start_distributed_2_nodes_4_gpus, shape):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors, array_size=10)
    list_buffer = [cp.ones(shape, dtype=cp.float32) for _ in range(world_size)]
    ray.get([a.set_list_buffer.remote(list_buffer) for a in actors])
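    # each receive slot must match the senders' 10-element buffers; other shapes should raise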
    if shape != 10:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
    else:
        ray.get([a.do_allgather.remote() for a in actors])
Example #15
def test_allreduce_torch_cupy(ray_start_single_node_2_gpus):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ).cuda())])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((10, )) * world_size).all()

    ray.wait([actors[0].set_buffer.remote(torch.ones(10, ))])
    ray.wait([actors[1].set_buffer.remote(cp.ones(10, ))])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce.remote() for a in actors])
Example #16
def test_reduce_torch_cupy(ray_start_distributed_2_nodes_4_gpus, dst_rank):
    import torch
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ).cuda())])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    if dst_rank == 0:
        assert (results[0] == cp.ones((10, )) * world_size).all()
        assert (results[1] == torch.ones((10, )).cuda()).all()
    else:
        assert (results[0] == cp.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )).cuda() * world_size).all()
Example #17
def test_reduce_different_name(ray_start_single_node_2_gpus, group_name,
                               dst_rank):
    world_size = 2
    actors, _ = create_collective_workers(num_workers=world_size,
                                          group_name=group_name)
    results = ray.get(
        [a.do_reduce.remote(group_name, dst_rank) for a in actors])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * world_size).all()
        else:
            assert (results[i] == cp.ones((10, ), dtype=cp.float32)).all()
Example #18
def test_broadcast_different_array_size(ray_start_distributed_2_nodes_4_gpus,
                                        array_size, src_rank):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    ray.wait([
        a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get(
        [a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    for i in range(world_size):
        assert (results[i] == cp.ones(
            (array_size, ), dtype=cp.float32) * (src_rank + 2)).all()
Example #19
def test_broadcast_torch_cupy(ray_start_single_node_2_gpus, src_rank):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait(
        [actors[1].set_buffer.remote(torch.ones(10, ).cuda() * world_size)])
    results = ray.get(
        [a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    if src_rank == 0:
        assert (results[0] == cp.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )).cuda()).all()
    else:
        assert (results[0] == cp.ones((10, )) * world_size).all()
        assert (results[1] == torch.ones((10, )).cuda() * world_size).all()
Example #20
def test_reducescatter_different_array_size(
        ray_start_distributed_2_nodes_4_gpus, array_size, tensor_backend):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(
        actors, array_size=array_size, tensor_backend=tensor_backend)
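    # every input shard is ones, so each reduced (summed) shard equals world_size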
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    for i in range(world_size):
        if tensor_backend == "cupy":
            assert (results[i] == cp.ones(array_size, dtype=cp.float32) *
                    world_size).all()
        else:
            assert (results[i] == torch.ones(
                array_size, dtype=torch.float32).cuda() * world_size).all()
Example #21
def test_reduce_different_op(ray_start_distributed_2_nodes_4_gpus, dst_rank):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
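    # ranks 0..3 hold 2, 3, 4, 5 -> product 120, min 2, max 5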

    # check product
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.PRODUCT)
        for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * 120).all()
        else:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * (i + 2)).all()

    # check min
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MIN) for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones((10, ), dtype=cp.float32) * 2).all()
        else:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * (i + 2)).all()

    # check max
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MAX) for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones((10, ), dtype=cp.float32) * 5).all()
        else:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * (i + 2)).all()
Example #22
def test_allreduce_multiple_group(ray_start_single_node_2_gpus,
                                  backend="nccl",
                                  num_groups=5):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    for group_name in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, i, backend, str(group_name))
            for i, actor in enumerate(actors)
        ])
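    # allreduce is in-place, so results compound across groups: world_size ** (i + 1)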
    for i in range(num_groups):
        group_name = "default" if i == 0 else str(i)
        results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
        assert (results[0] == cp.ones(
            (10, ), dtype=cp.float32) * (world_size**(i + 1))).all()
Example #23
def test_reduce_different_array_size(ray_start_single_node_2_gpus, array_size,
                                     dst_rank):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([
        a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32))
        for a in actors
    ])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones(
                (array_size, ), dtype=cp.float32) * world_size).all()
        else:
            assert (results[i] == cp.ones((array_size, ),
                                          dtype=cp.float32)).all()
Example #24
def test_broadcast_different_name(ray_start_single_node_2_gpus, group_name,
                                  src_rank):
    world_size = 2
    actors, _ = create_collective_workers(num_workers=world_size,
                                          group_name=group_name)
    ray.wait([
        a.set_buffer.remote(cp.ones((10, ), dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_broadcast.remote(group_name=group_name, src_rank=src_rank)
        for a in actors
    ])
    for i in range(world_size):
        assert (results[i] == cp.ones(
            (10, ), dtype=cp.float32) * (src_rank + 2)).all()
Example #25
def test_allgather_different_array_size(ray_start_single_node_2_gpus,
                                        array_size, tensor_backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors,
                                    array_size=array_size,
                                    tensor_backend=tensor_backend)
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            if tensor_backend == "cupy":
                assert (
                    results[i][j] == cp.ones(array_size, dtype=cp.float32) *
                    (j + 1)).all()
            else:
                assert (results[i][j] == torch.ones(
                    array_size, dtype=torch.float32).cuda() * (j + 1)).all()
Example #26
def test_is_group_initialized(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    # check group is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor0_is_init = ray.get(
        actors[0].report_is_group_initialized.remote("random"))
    assert not actor0_is_init
    actor0_is_init = ray.get(
        actors[0].report_is_group_initialized.remote("123"))
    assert not actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    actor1_is_init = ray.get(
        actors[1].report_is_group_initialized.remote("456"))
    assert not actor1_is_init
Example #27
def test_sendrecv_different_name(ray_start_single_node_2_gpus, group_name,
                                 array_size, dst_rank):
    world_size = 2
    actors, _ = create_collective_workers(num_workers=world_size,
                                          group_name=group_name)
    ray.wait([
        a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32) * (i + 1))
        for i, a in enumerate(actors)
    ])
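    # with two ranks, whichever rank is not dst_rank is the sender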
    src_rank = 1 - dst_rank
    refs = []
    for i, actor in enumerate(actors):
        if i != dst_rank:
            ref = actor.do_send.remote(group_name, dst_rank)
        else:
            ref = actor.do_recv.remote(group_name, src_rank)
        refs.append(ref)
    results = ray.get(refs)
    for i in range(world_size):
        assert (results[i] == cp.ones(array_size, dtype=cp.float32) *
                (src_rank + 1)).all()
Example #28
def test_get_rank(ray_start_distributed_2_nodes_4_gpus, world_size):
    actors, _ = create_collective_workers(world_size)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name, and a
    # different order of ranks.
    new_group_name = "default2"
    ranks = list(range(world_size))
    shuffle(ranks)
    _ = ray.get([
        actor.init_group.remote(
            world_size, ranks[i], group_name=new_group_name)
        for i, actor in enumerate(actors)
    ])
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == ranks[0]
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == ranks[1]
Example #29
def test_sendrecv_torch_cupy(ray_start_single_node_2_gpus, dst_rank):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ).cuda() * 2)])
    src_rank = 1 - dst_rank

    refs = []
    for i, actor in enumerate(actors):
        if i != dst_rank:
            ref = actor.do_send.remote(dst_rank=dst_rank)
        else:
            ref = actor.do_recv.remote(src_rank=src_rank)
        refs.append(ref)
    results = ray.get(refs)
    if dst_rank == 0:
        assert (results[0] == cp.ones((10, )) * 2).all()
        assert (results[1] == torch.ones((10, )).cuda() * 2).all()
    else:
        assert (results[0] == cp.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )).cuda()).all()
Example #30
def test_get_rank(ray_start_single_node_2_gpus):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name,
    # and a different order of ranks.
    new_group_name = "default2"
    ray.get([
        actor.init_group.remote(world_size,
                                world_size - 1 - i,
                                group_name=new_group_name)
        for i, actor in enumerate(actors)
    ])
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == 1
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == 0