def test_sendrecv(
    ray_start_distributed_2_nodes_4_gpus, group_name, array_size, src_rank, dst_rank
):
    if src_rank == dst_rank:
        return
    world_size = 4
    actors, _ = create_collective_workers(num_workers=world_size, group_name=group_name)
    ray.get(
        [
            a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32) * (i + 1))
            for i, a in enumerate(actors)
        ]
    )
    refs = []
    for i in range(world_size):
        refs.append(actors[i].get_buffer.remote())
    refs[src_rank] = actors[src_rank].do_send.remote(group_name, dst_rank)
    refs[dst_rank] = actors[dst_rank].do_recv.remote(group_name, src_rank)
    results = ray.get(refs)
    assert (
        results[src_rank] == cp.ones(array_size, dtype=cp.float32) * (src_rank + 1)
    ).all()
    assert (
        results[dst_rank] == cp.ones(array_size, dtype=cp.float32) * (src_rank + 1)
    ).all()
    ray.get([a.destroy_group.remote(group_name) for a in actors])

def test_allreduce_different_name(ray_start_single_node_2_gpus, group_name):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name)
    results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * world_size).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * world_size).all()

def test_allreduce_different_op(ray_start_single_node_2_gpus):
    world_size = 2
    actors, _ = create_collective_workers(world_size)

    # check product
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get(
        [a.do_allreduce.remote(op=ReduceOp.PRODUCT) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 6).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 6).all()

    # check min
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MIN) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 2).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 2).all()

    # check max
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MAX) for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 3).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 3).all()

def test_allreduce_different_dtype(ray_start_single_node_2_gpus, dtype):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([a.set_buffer.remote(cp.ones(10, dtype=dtype)) for a in actors])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=dtype) * world_size).all()
    assert (results[1] == cp.ones((10, ), dtype=dtype) * world_size).all()

def test_allreduce_destroy(ray_start_single_node_2_gpus,
                           backend="nccl",
                           group_name="default"):
    world_size = 2
    actors, _ = create_collective_workers(world_size)

    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((10, ), dtype=cp.float32) * world_size).all()
    assert (results[1] == cp.ones((10, ), dtype=cp.float32) * world_size).all()

    # destroy the group and try to do work, should fail
    ray.get([a.destroy_group.remote() for a in actors])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce.remote() for a in actors])

    # reinit the same group and allreduce
    ray.get([
        actor.init_group.remote(world_size, i, backend, group_name)
        for i, actor in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones(
        (10, ), dtype=cp.float32) * world_size * 2).all()
    assert (results[1] == cp.ones(
        (10, ), dtype=cp.float32) * world_size * 2).all()

def test_destroy_group(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    actors, _ = create_collective_workers(world_size)

    # Now destroy the group at actor0
    ray.wait([actors[0].destroy_group.remote()])
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert not actor0_is_init

    # should go well as the group `random` does not exist at all
    ray.wait([actors[0].destroy_group.remote("random")])

    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("random")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("default")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert not actor1_is_init
    for i in [2, 3]:
        ray.wait([actors[i].destroy_group.remote("default")])

    # Now reconstruct the group using the same name
    init_results = ray.get([
        actor.init_group.remote(world_size, i)
        for i, actor in enumerate(actors)
    ])
    for i in range(world_size):
        assert init_results[i]
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init

def test_reducescatter_different_dtype(ray_start_single_node_2_gpus, dtype):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    for i in range(world_size):
        assert (results[i] == cp.ones(10, dtype=dtype) * world_size).all()

def test_availability(ray_start_single_node_2_gpus):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    actor0_nccl_availability = ray.get(
        actors[0].report_nccl_availability.remote())
    assert actor0_nccl_availability
    actor0_gloo_availability = ray.get(
        actors[0].report_gloo_availability.remote())
    assert not actor0_gloo_availability

def test_allgather_different_dtype(ray_start_distributed_2_nodes_4_gpus,
                                   dtype):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == cp.ones(10, dtype=dtype) * (j + 1)).all()

def test_availability(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    actor0_nccl_availability = ray.get(
        actors[0].report_nccl_availability.remote())
    assert actor0_nccl_availability
    actor0_mpi_availability = ray.get(
        actors[0].report_mpi_availability.remote())
    assert not actor0_mpi_availability

def test_unmatched_tensor_list_length(ray_start_single_node_2_gpus, length):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    list_buffer = [cp.ones(10, dtype=cp.float32) for _ in range(length)]
    ray.wait([a.set_list_buffer.remote(list_buffer) for a in actors])
    if length != world_size:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
    else:
        ray.get([a.do_allgather.remote() for a in actors])

def test_allgather_torch_cupy(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    shape = [10, 10]
    actors, _ = create_collective_workers(world_size)

    # tensor is pytorch, list is cupy
    for i, a in enumerate(actors):
        t = torch.ones(shape, dtype=torch.float32).cuda() * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            cp.ones(shape, dtype=cp.float32) for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == cp.ones(shape, dtype=cp.float32) *
                    (j + 1)).all()

    # tensor is cupy, list is pytorch
    for i, a in enumerate(actors):
        t = cp.ones(shape, dtype=cp.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            torch.ones(shape, dtype=torch.float32).cuda()
            for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == torch.ones(
                shape, dtype=torch.float32).cuda() * (j + 1)).all()

    # some tensors in the list are pytorch, some are cupy
    for i, a in enumerate(actors):
        t = cp.ones(shape, dtype=cp.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = []
        for j in range(world_size):
            if j % 2 == 0:
                list_buffer.append(
                    torch.ones(shape, dtype=torch.float32).cuda())
            else:
                list_buffer.append(cp.ones(shape, dtype=cp.float32))
        ray.wait([a.set_list_buffer.remote(list_buffer)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            if j % 2 == 0:
                assert (results[i][j] == torch.ones(
                    shape, dtype=torch.float32).cuda() * (j + 1)).all()
            else:
                assert (results[i][j] == cp.ones(shape, dtype=cp.float32) *
                        (j + 1)).all()

def test_allreduce_different_array_size(
    ray_start_distributed_2_nodes_4_gpus, array_size
):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    ray.wait(
        [a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32)) for a in actors]
    )
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((array_size,), dtype=cp.float32) * world_size).all()
    assert (results[1] == cp.ones((array_size,), dtype=cp.float32) * world_size).all()

def test_unmatched_tensor_shape(ray_start_distributed_2_nodes_4_gpus, shape):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(actors, array_size=10)
    list_buffer = [cp.ones(shape, dtype=cp.float32) for _ in range(world_size)]
    ray.get([a.set_list_buffer.remote(list_buffer) for a in actors])
    if shape != 10:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
    else:
        ray.get([a.do_allgather.remote() for a in actors])

def test_allreduce_torch_cupy(ray_start_single_node_2_gpus):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ).cuda())])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    assert (results[0] == cp.ones((10, )) * world_size).all()

    # mixing a CPU torch tensor with a GPU cupy tensor should fail
    ray.wait([actors[0].set_buffer.remote(torch.ones(10, ))])
    ray.wait([actors[1].set_buffer.remote(cp.ones(10, ))])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce.remote() for a in actors])

def test_reduce_torch_cupy(ray_start_distributed_2_nodes_4_gpus, dst_rank):
    import torch
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ).cuda())])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    if dst_rank == 0:
        assert (results[0] == cp.ones((10, )) * world_size).all()
        assert (results[1] == torch.ones((10, )).cuda()).all()
    else:
        assert (results[0] == cp.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )).cuda() * world_size).all()

def test_reduce_different_name(ray_start_single_node_2_gpus, group_name,
                               dst_rank):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name)
    results = ray.get(
        [a.do_reduce.remote(group_name, dst_rank) for a in actors])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * world_size).all()
        else:
            assert (results[i] == cp.ones((10, ), dtype=cp.float32)).all()

def test_broadcast_different_array_size(ray_start_distributed_2_nodes_4_gpus,
                                        array_size, src_rank):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    ray.wait([
        a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get(
        [a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    for i in range(world_size):
        assert (results[i] == cp.ones(
            (array_size, ), dtype=cp.float32) * (src_rank + 2)).all()

def test_broadcast_torch_cupy(ray_start_single_node_2_gpus, src_rank):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait(
        [actors[1].set_buffer.remote(torch.ones(10, ).cuda() * world_size)])
    results = ray.get(
        [a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    if src_rank == 0:
        assert (results[0] == cp.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )).cuda()).all()
    else:
        assert (results[0] == cp.ones((10, )) * world_size).all()
        assert (results[1] == torch.ones((10, )).cuda() * world_size).all()

def test_reducescatter_different_array_size(
        ray_start_distributed_2_nodes_4_gpus, array_size, tensor_backend):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(
        actors, array_size=array_size, tensor_backend=tensor_backend)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    for i in range(world_size):
        if tensor_backend == "cupy":
            assert (results[i] == cp.ones(array_size, dtype=cp.float32) *
                    world_size).all()
        else:
            assert (results[i] == torch.ones(
                array_size, dtype=torch.float32).cuda() * world_size).all()

def test_reduce_different_op(ray_start_distributed_2_nodes_4_gpus, dst_rank):
    world_size = 4
    actors, _ = create_collective_workers(world_size)

    # check product
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.PRODUCT)
        for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * 120).all()
        else:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * (i + 2)).all()

    # check min
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MIN) for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones((10, ), dtype=cp.float32) * 2).all()
        else:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * (i + 2)).all()

    # check max
    ray.wait([
        a.set_buffer.remote(cp.ones(10, dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MAX) for a in actors
    ])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones((10, ), dtype=cp.float32) * 5).all()
        else:
            assert (results[i] == cp.ones(
                (10, ), dtype=cp.float32) * (i + 2)).all()

def test_allreduce_multiple_group(ray_start_single_node_2_gpus,
                                  backend="nccl",
                                  num_groups=5):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    for group_name in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, i, backend, str(group_name))
            for i, actor in enumerate(actors)
        ])
    for i in range(num_groups):
        group_name = "default" if i == 0 else str(i)
        results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
        assert (results[0] == cp.ones(
            (10, ), dtype=cp.float32) * (world_size**(i + 1))).all()

def test_reduce_different_array_size(ray_start_single_node_2_gpus, array_size,
                                     dst_rank):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([
        a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32))
        for a in actors
    ])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    for i in range(world_size):
        if i == dst_rank:
            assert (results[i] == cp.ones(
                (array_size, ), dtype=cp.float32) * world_size).all()
        else:
            assert (results[i] == cp.ones(
                (array_size, ), dtype=cp.float32)).all()

def test_broadcast_different_name(ray_start_single_node_2_gpus, group_name,
                                  src_rank):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name)
    ray.wait([
        a.set_buffer.remote(cp.ones((10, ), dtype=cp.float32) * (i + 2))
        for i, a in enumerate(actors)
    ])
    results = ray.get([
        a.do_broadcast.remote(group_name=group_name, src_rank=src_rank)
        for a in actors
    ])
    for i in range(world_size):
        assert (results[i] == cp.ones(
            (10, ), dtype=cp.float32) * (src_rank + 2)).all()

def test_allgather_different_array_size(ray_start_single_node_2_gpus,
                                        array_size, tensor_backend):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    init_tensors_for_gather_scatter(
        actors, array_size=array_size, tensor_backend=tensor_backend)
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            if tensor_backend == "cupy":
                assert (results[i][j] == cp.ones(array_size, dtype=cp.float32)
                        * (j + 1)).all()
            else:
                assert (results[i][j] == torch.ones(
                    array_size, dtype=torch.float32).cuda() * (j + 1)).all()

def test_is_group_initialized(ray_start_distributed_2_nodes_4_gpus):
    world_size = 4
    actors, _ = create_collective_workers(world_size)
    # check group is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor0_is_init = ray.get(
        actors[0].report_is_group_initialized.remote("random"))
    assert not actor0_is_init
    actor0_is_init = ray.get(
        actors[0].report_is_group_initialized.remote("123"))
    assert not actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    actor1_is_init = ray.get(
        actors[1].report_is_group_initialized.remote("456"))
    assert not actor1_is_init

def test_sendrecv_different_name(ray_start_single_node_2_gpus, group_name,
                                 array_size, dst_rank):
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name)
    ray.wait([
        a.set_buffer.remote(cp.ones(array_size, dtype=cp.float32) * (i + 1))
        for i, a in enumerate(actors)
    ])
    src_rank = 1 - dst_rank
    refs = []
    for i, actor in enumerate(actors):
        if i != dst_rank:
            ref = actor.do_send.remote(group_name, dst_rank)
        else:
            ref = actor.do_recv.remote(group_name, src_rank)
        refs.append(ref)
    results = ray.get(refs)
    for i in range(world_size):
        assert (results[i] == cp.ones(array_size, dtype=cp.float32) *
                (src_rank + 1)).all()

def test_get_rank(ray_start_distributed_2_nodes_4_gpus, world_size):
    actors, _ = create_collective_workers(world_size)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name, and different
    # orders of ranks.
    new_group_name = "default2"
    ranks = list(range(world_size))
    shuffle(ranks)
    _ = ray.get([
        actor.init_group.remote(
            world_size, ranks[i], group_name=new_group_name)
        for i, actor in enumerate(actors)
    ])
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == ranks[0]
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == ranks[1]

def test_sendrecv_torch_cupy(ray_start_single_node_2_gpus, dst_rank):
    import torch
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10, ).cuda() * 2)])
    src_rank = 1 - dst_rank

    refs = []
    for i, actor in enumerate(actors):
        if i != dst_rank:
            ref = actor.do_send.remote(dst_rank=dst_rank)
        else:
            ref = actor.do_recv.remote(src_rank=src_rank)
        refs.append(ref)
    results = ray.get(refs)
    if dst_rank == 0:
        assert (results[0] == cp.ones((10, )) * 2).all()
        assert (results[1] == torch.ones((10, )).cuda() * 2).all()
    else:
        assert (results[0] == cp.ones((10, ))).all()
        assert (results[1] == torch.ones((10, )).cuda()).all()

def test_get_rank(ray_start_single_node_2_gpus):
    world_size = 2
    actors, _ = create_collective_workers(world_size)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name,
    # and different order of ranks.
    new_group_name = "default2"
    ray.get([
        actor.init_group.remote(
            world_size, world_size - 1 - i, group_name=new_group_name)
        for i, actor in enumerate(actors)
    ])
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == 1
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == 0