Example #1
def test_allreduce_different_dtype(ray_start_single_node, dtype, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([a.set_buffer.remote(np.ones(10, dtype=dtype)) for a in actors])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == np.ones((10,), dtype=dtype) * world_size).all()
    assert (results[1] == np.ones((10,), dtype=dtype) * world_size).all()
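All of the examples in this listing depend on a `create_collective_workers` helper and a worker actor exposing methods such as `init_group`, `set_buffer`, and `do_allreduce`, none of which are shown here. The following is a minimal sketch of what they presumably look like on top of `ray.util.collective`; the `Worker` class body, the `"gloo"` default backend, and the default 10-element float32 buffer are assumptions inferred from the assertions, not code taken from the listing. The real test utility also provides `do_reduce`, `do_broadcast`, `do_send`/`do_recv`, and the list-buffer methods used in later examples.

import numpy as np
import ray
import ray.util.collective as col
from ray.util.collective.types import ReduceOp


@ray.remote
class Worker:
    def __init__(self):
        # The assertions start from an all-ones (10,) float32 buffer (assumed default).
        self.buffer = np.ones((10,), dtype=np.float32)

    def init_group(self, world_size, rank, backend="gloo", group_name="default"):
        col.init_collective_group(world_size, rank, backend=backend,
                                  group_name=group_name)
        return True

    def set_buffer(self, data):
        self.buffer = data
        return self.buffer

    def do_allreduce(self, group_name="default", op=ReduceOp.SUM):
        # allreduce works in place on the buffer; returning it lets the driver assert.
        col.allreduce(self.buffer, group_name=group_name, op=op)
        return self.buffer


def create_collective_workers(num_workers=2, group_name="default", backend="gloo"):
    # Spawn the actors and initialize one collective group spanning all of them.
    actors = [Worker.remote() for _ in range(num_workers)]
    init_results = ray.get([
        actor.init_group.remote(num_workers, rank, backend, group_name)
        for rank, actor in enumerate(actors)
    ])
    return actors, init_results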
Example #2
def test_sendrecv_different_name(
    ray_start_single_node, group_name, array_size, dst_rank, backend
):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name, backend=backend
    )
    ray.wait(
        [
            a.set_buffer.remote(np.ones(array_size, dtype=np.float32) * (i + 1))
            for i, a in enumerate(actors)
        ]
    )
    src_rank = 1 - dst_rank
    refs = []
    for i, actor in enumerate(actors):
        if i != dst_rank:
            ref = actor.do_send.remote(group_name, dst_rank)
        else:
            ref = actor.do_recv.remote(group_name, src_rank)
        refs.append(ref)
    results = ray.get(refs)
    for i in range(world_size):
        assert (
            results[i] == np.ones(array_size, dtype=np.float32) * (src_rank + 1)
        ).all()
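The parameters these functions receive (`backend`, `group_name`, `array_size`, `dst_rank`, `dtype`, ...) are supplied by pytest fixtures and `pytest.mark.parametrize` decorators that were stripped when the examples were extracted. A hedged reconstruction for the test above could look like this; the concrete parameter values and the `ray_start_single_node` fixture definition are assumptions, not part of the listing.

import pytest


# Assumed parametrization; the actual values may differ.
@pytest.mark.parametrize("group_name", ["default", "test"])
@pytest.mark.parametrize("array_size", [2 ** 10, 2 ** 15, 2 ** 20])
@pytest.mark.parametrize("dst_rank", [0, 1])
@pytest.mark.parametrize("backend", ["gloo", "nccl"])
def test_sendrecv_different_name(
    ray_start_single_node, group_name, array_size, dst_rank, backend
):
    ...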
Example #3
def test_sendrecv_torch_numpy(ray_start_single_node, dst_rank, backend):
    import torch

    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10) * 2)])
    src_rank = 1 - dst_rank

    refs = []
    for i, actor in enumerate(actors):
        if i != dst_rank:
            ref = actor.do_send.remote(dst_rank=dst_rank)
        else:
            ref = actor.do_recv.remote(src_rank=src_rank)
        refs.append(ref)
    results = ray.get(refs)
    if dst_rank == 0:
        assert (results[0] == np.ones((10,)) * 2).all()
        assert (results[1] == torch.ones((10,)) * 2).all()
    else:
        assert (results[0] == np.ones((10,))).all()
        assert (results[1] == torch.ones((10,))).all()
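The send/recv tests additionally assume `do_send` and `do_recv` methods on the same worker actor. A hedged sketch, in the spirit of the `Worker` outline after Example #1 (the in-place receive into the preset buffer is an assumption consistent with the assertions above):

    # Assumed additions to the sketched Worker actor.
    def do_send(self, group_name="default", dst_rank=0):
        # Send this rank's buffer; the sender keeps and returns its own copy,
        # which is why the assertions compare the sender's result to its input.
        col.send(self.buffer, dst_rank, group_name)
        return self.buffer

    def do_recv(self, group_name="default", src_rank=0):
        # Receive in place into the preset buffer, then return it.
        col.recv(self.buffer, src_rank, group_name)
        return self.buffer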
Example #4
def test_broadcast_invalid_rank(ray_start_distributed_2_nodes,
                                backend,
                                src_rank=9):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    with pytest.raises(ValueError):
        _ = ray.get([a.do_broadcast.remote(src_rank=src_rank) for a in actors])
Example #5
def test_destroy_group(ray_start_single_node, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Now destroy the group at actor0
    ray.wait([actors[0].destroy_group.remote()])
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert not actor0_is_init

    # Should be a no-op, since the group `random` does not exist at all
    ray.wait([actors[0].destroy_group.remote("random")])

    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("random")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("default")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert not actor1_is_init

    # Now reconstruct the group using the same name
    init_results = ray.get([
        actor.init_group.remote(world_size, i)
        for i, actor in enumerate(actors)
    ])
    for i in range(world_size):
        assert init_results[i]
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
Example #6
def test_init_two_actors(ray_start_distributed_2_nodes, world_size, group_name,
                         backend):
    actors, results = create_collective_workers(world_size,
                                                group_name,
                                                backend=backend)
    for i in range(world_size):
        assert results[i]
Example #7
def test_init_two_actors(ray_start_single_node, group_name, backend):
    world_size = 2
    actors, results = create_collective_workers(world_size,
                                                group_name,
                                                backend=backend)
    for i in range(world_size):
        assert results[i]
Example #8
def test_allreduce_different_op(ray_start_distributed_2_nodes, backend):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)

    # check product
    ray.wait([
        a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get(
        [a.do_allreduce.remote(op=ReduceOp.PRODUCT) for a in actors])
    product = 1
    for i in range(world_size):
        product = product * (i + 2)
    assert (results[0] == np.ones((10, ), dtype=np.float32) * product).all()
    assert (results[1] == np.ones((10, ), dtype=np.float32) * product).all()

    # check min
    ray.wait([
        a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MIN) for a in actors])
    assert (results[0] == np.ones((10, ), dtype=np.float32) * 2).all()
    assert (results[1] == np.ones((10, ), dtype=np.float32) * 2).all()

    # check max
    ray.wait([
        a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MAX) for a in actors])
    assert (results[0] == np.ones((10, ), dtype=np.float32) * 9).all()
    assert (results[1] == np.ones((10, ), dtype=np.float32) * 9).all()
Example #9
def test_allreduce_torch_numpy(ray_start_single_node, backend):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10))])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == np.ones((10,)) * world_size).all()

    ray.wait([actors[0].set_buffer.remote(torch.ones(10))])
    ray.wait([actors[1].set_buffer.remote(np.ones(10, dtype=np.float32))])
    ray.get([a.do_allreduce.remote() for a in actors])
Example #10
def test_allreduce_destroy(ray_start_distributed_2_nodes,
                           backend,
                           group_name="default"):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)

    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == np.ones((10, ), dtype=np.float32) * world_size).all()
    assert (results[1] == np.ones((10, ), dtype=np.float32) * world_size).all()

    # Destroy the group and try to do work; it should fail
    ray.get([a.destroy_group.remote() for a in actors])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce.remote() for a in actors])

    # Reinitialize the same group and allreduce again
    ray.get([
        actor.init_group.remote(world_size, i, backend, group_name)
        for i, actor in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == np.ones(
        (10, ), dtype=np.float32) * world_size * world_size).all()
    assert (results[1] == np.ones(
        (10, ), dtype=np.float32) * world_size * world_size).all()
Example #11
def test_allreduce_different_name(ray_start_single_node, group_name, backend):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name, backend=backend
    )
    results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
    assert (results[0] == np.ones((10,), dtype=np.float32) * world_size).all()
    assert (results[1] == np.ones((10,), dtype=np.float32) * world_size).all()
Example #12
def test_reducescatter_different_dtype(ray_start_single_node, dtype, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    for i in range(world_size):
        assert (results[i] == np.ones(10, dtype=dtype) * world_size).all()
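The gather/scatter examples (this one and those that follow, plus Examples #15, #16, and #22) also call an `init_tensors_for_gather_scatter` helper that the listing omits. Judging from the assertions (rank i ends up with a buffer of `ones * (i + 1)` and a list of `world_size` all-ones tensors), a plausible sketch, with every detail assumed, is:

import numpy as np
import ray


def init_tensors_for_gather_scatter(actors, array_size=10, dtype=np.float32,
                                    tensor_backend="numpy"):
    import torch  # only needed for the "torch" tensor_backend

    world_size = len(actors)
    for i, a in enumerate(actors):
        # Buffer per rank: ones * (rank + 1), which is what allgather collects.
        if tensor_backend == "numpy":
            t = np.ones(array_size, dtype=dtype) * (i + 1)
        else:
            t = torch.ones(array_size, dtype=torch.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])

    # List buffer per rank: world_size all-ones tensors, the input to
    # reducescatter and the destination for allgather.
    if tensor_backend == "numpy":
        list_buffer = [np.ones(array_size, dtype=dtype) for _ in range(world_size)]
    else:
        list_buffer = [torch.ones(array_size, dtype=torch.float32)
                       for _ in range(world_size)]
    ray.wait([a.set_list_buffer.remote(list_buffer, copy=True) for a in actors])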
Example #13
def test_allgather_different_dtype(ray_start_distributed_2_nodes, dtype, backend):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == np.ones(10, dtype=dtype) * (j + 1)).all()
Example #14
def test_unmatched_tensor_list_length(ray_start_distributed_2_nodes, length, backend):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    list_buffer = [np.ones(10, dtype=np.float32) for _ in range(length)]
    ray.wait([a.set_list_buffer.remote(list_buffer, copy=True) for a in actors])
    if length != world_size:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
    else:
        ray.get([a.do_allgather.remote() for a in actors])
Example #15
def test_allgather_torch_numpy(ray_start_single_node, backend):
    world_size = 2
    shape = [10, 10]
    actors, _ = create_collective_workers(world_size, backend=backend)

    # tensor is pytorch, list is numpy
    for i, a in enumerate(actors):
        t = torch.ones(shape, dtype=torch.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            np.ones(shape, dtype=np.float32) for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer, copy=True)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == np.ones(shape, dtype=np.float32) *
                    (j + 1)).all()

    # tensor is numpy, list is pytorch
    for i, a in enumerate(actors):
        t = np.ones(shape, dtype=np.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            torch.ones(shape, dtype=torch.float32) for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer, copy=True)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == torch.ones(shape, dtype=torch.float32) *
                    (j + 1)).all()

    # some tensors in the list are pytorch, some are numpy
    for i, a in enumerate(actors):
        t = np.ones(shape, dtype=np.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = []
        for j in range(world_size):
            if j % 2 == 0:
                list_buffer.append(torch.ones(shape, dtype=torch.float32))
            else:
                list_buffer.append(np.ones(shape, dtype=np.float32))
        ray.wait([a.set_list_buffer.remote(list_buffer, copy=True)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            if j % 2 == 0:
                assert (
                    results[i][j] == torch.ones(shape, dtype=torch.float32) *
                    (j + 1)).all()
            else:
                assert (results[i][j] == np.ones(shape, dtype=np.float32) *
                        (j + 1)).all()
Example #16
def test_unmatched_tensor_shape(ray_start_single_node, shape, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(actors, array_size=10)
    list_buffer = [np.ones(shape, dtype=np.float32) for _ in range(world_size)]
    ray.get([a.set_list_buffer.remote(list_buffer, copy=True) for a in actors])
    if shape != 10:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
    else:
        ray.get([a.do_allgather.remote() for a in actors])
Example #17
def test_allreduce_different_array_size(
    ray_start_distributed_2_nodes, array_size, backend
):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait(
        [a.set_buffer.remote(np.ones(array_size, dtype=np.float32)) for a in actors]
    )
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == np.ones((array_size,), dtype=np.float32) * world_size).all()
    assert (results[1] == np.ones((array_size,), dtype=np.float32) * world_size).all()
Example #18
def test_reduce_torch_numpy(ray_start_single_node, dst_rank, backend):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ))])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    if dst_rank == 0:
        assert (results[0] == np.ones((10, )) * world_size).all()
        assert (results[1] == torch.ones((10, ))).all()
    else:
        assert (results[0] == np.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )) * world_size).all()
Example #19
def test_broadcast_different_array_size(ray_start_distributed_2_nodes,
                                        array_size, src_rank, backend):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([
        a.set_buffer.remote(np.ones(array_size, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get(
        [a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    for i in range(world_size):
        assert (results[i] == np.ones(
            (array_size, ), dtype=np.float32) * (src_rank + 2)).all()
Example #20
def test_broadcast_torch_numpy(ray_start_distributed_2_nodes, src_rank,
                               backend):
    import torch
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ) * world_size)])
    results = ray.get(
        [a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    if src_rank == 0:
        assert (results[0] == np.ones((10, ))).all()
        assert (results[1] == torch.ones((10, ))).all()
    else:
        assert (results[0] == np.ones((10, )) * world_size).all()
        assert (results[1] == torch.ones((10, )) * world_size).all()
Example #21
def test_is_group_initialized(ray_start_distributed_2_nodes, backend):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    # check group is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote("random"))
    assert not actor0_is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote("123"))
    assert not actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote("456"))
    assert not actor1_is_init
Example #22
def test_reducescatter_different_array_size(ray_start_single_node, array_size,
                                            tensor_backend, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(
        actors, array_size=array_size, tensor_backend=tensor_backend)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    for i in range(world_size):
        if tensor_backend == "numpy":
            assert (results[i] == np.ones(array_size, dtype=np.float32) *
                    world_size).all()
        else:
            assert (results[i] == torch.ones(array_size, dtype=torch.float32) *
                    world_size).all()
Example #23
def test_reduce_different_name(ray_start_single_node, group_name, dst_rank,
                               backend):
    world_size = 2
    actors, _ = create_collective_workers(num_workers=world_size,
                                          group_name=group_name,
                                          backend=backend)
    results = ray.get(
        [a.do_reduce.remote(group_name, dst_rank) for a in actors])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == np.ones(
                (10, ), dtype=np.float32) * world_size).all()
        else:
            assert (results[i] == np.ones((10, ), dtype=np.float32)).all()
Example #24
def test_reduce_different_op(ray_start_single_node, dst_rank, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)

    # check product
    ray.wait([
        a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.PRODUCT)
        for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == np.ones((10, ), dtype=np.float32) * 6).all()
        else:
            assert (results[i] == np.ones(
                (10, ), dtype=np.float32) * (i + 2)).all()

    # check min
    ray.wait([
        a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MIN) for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == np.ones((10, ), dtype=np.float32) * 2).all()
        else:
            assert (results[i] == np.ones(
                (10, ), dtype=np.float32) * (i + 2)).all()

    # check max
    ray.wait([
        a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MAX) for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == np.ones((10, ), dtype=np.float32) * 3).all()
        else:
            assert (results[i] == np.ones(
                (10, ), dtype=np.float32) * (i + 2)).all()
Example #25
def test_allreduce_multiple_group(ray_start_distributed_2_nodes,
                                  backend,
                                  num_groups=5):
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    for group_name in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, i, backend, str(group_name))
            for i, actor in enumerate(actors)
        ])
    for i in range(num_groups):
        group_name = "default" if i == 0 else str(i)
        results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
        assert (results[0] == np.ones(
            (10, ), dtype=np.float32) * (world_size**(i + 1))).all()
Example #26
def test_broadcast_different_name(ray_start_single_node, group_name, src_rank,
                                  backend):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name, backend=backend)
    ray.get([
        a.set_buffer.remote(np.ones((10, ), dtype=np.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_broadcast.remote(group_name=group_name, src_rank=src_rank)
        for a in actors
    ])
    for i in range(world_size):
        assert (results[i] == np.ones(
            (10, ), dtype=np.float32) * (src_rank + 2)).all()
Example #27
def test_reduce_different_array_size(ray_start_single_node, array_size,
                                     dst_rank, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([
        a.set_buffer.remote(np.ones(array_size, dtype=np.float32))
        for a in actors
    ])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == np.ones(
                (array_size, ), dtype=np.float32) * world_size).all()
        else:
            assert (results[i] == np.ones((array_size, ),
                                          dtype=np.float32)).all()
Example #28
def test_get_rank(ray_start_single_node, backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name,
    # and different order of ranks.
    new_group_name = "default2"
    ray.get([
        actor.init_group.remote(world_size,
                                world_size - 1 - i,
                                group_name=new_group_name,
                                backend=backend)
        for i, actor in enumerate(actors)
    ])
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == 1
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == 0
Example #29
def test_sendrecv(ray_start_distributed_2_nodes, group_name, array_size,
                  src_rank, dst_rank, backend):
    if src_rank == dst_rank:
        return
    world_size = 8
    actors, _ = create_collective_workers(num_workers=world_size,
                                          group_name=group_name,
                                          backend=backend)
    ray.get([
        a.set_buffer.remote(np.ones(array_size, dtype=np.float32) * (i + 1))
        for i, a in enumerate(actors)
    ])
    refs = []
    for i in range(world_size):
        refs.append(actors[i].get_buffer.remote())
    refs[src_rank] = actors[src_rank].do_send.remote(group_name, dst_rank)
    refs[dst_rank] = actors[dst_rank].do_recv.remote(group_name, src_rank)
    results = ray.get(refs)
    assert (results[src_rank] == np.ones(array_size, dtype=np.float32) *
            (src_rank + 1)).all()
    assert (results[dst_rank] == np.ones(array_size, dtype=np.float32) *
            (src_rank + 1)).all()
    ray.get([a.destroy_group.remote(group_name) for a in actors])
Example #30
def test_reduce_multiple_group(ray_start_single_node,
                               dst_rank,
                               backend,
                               num_groups=5):
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    for group_name in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, i, backend, str(group_name))
            for i, actor in enumerate(actors)
        ])
    for i in range(num_groups):
        group_name = "default" if i == 0 else str(i)
        results = ray.get([
            a.do_reduce.remote(dst_rank=dst_rank, group_name=group_name)
            for a in actors
        ])
        for j in range(world_size):
            if j == dst_rank:
                assert (results[j] == np.ones(
                    (10, ), dtype=np.float32) * (i + 2)).all()
            else:
                assert (results[j] == np.ones((10, ), dtype=np.float32)).all()