def test_init_two_actors(ray_start_distributed_multigpu_2_nodes_4_gpus, group_name):
    world_size = 2
    actors, results = create_collective_multigpu_workers(world_size, group_name)
    for i in range(world_size):
        assert results[i]
def test_sendrecv(
    ray_start_distributed_multigpu_2_nodes_4_gpus,
    array_size,
    src_rank,
    dst_rank,
    src_gpu_index,
    dst_gpu_index,
):
    # Send/recv only makes sense between two different ranks.
    if src_rank == dst_rank:
        return
    world_size = 2
    actors, _ = create_collective_multigpu_workers(num_workers=world_size)
    ray.get(actors[0].set_buffer.remote(array_size, value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote(array_size, value0=4, value1=5))
    refs = []
    for i in range(world_size):
        refs.append(actors[i].get_buffer.remote())
    refs[src_rank][src_gpu_index] = actors[src_rank].do_send_multigpu.remote(
        dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, src_gpu_index=src_gpu_index
    )
    refs[dst_rank][dst_gpu_index] = actors[dst_rank].do_recv_multigpu.remote(
        src_rank=src_rank, src_gpu_index=src_gpu_index, dst_gpu_index=dst_gpu_index
    )
    results = []
    results_flattened = ray.get(refs[0] + refs[1])
    results.append([results_flattened[0], results_flattened[1]])
    results.append([results_flattened[2], results_flattened[3]])
    # The value placed on the source GPU is (src_rank + 1) * 2 + src_gpu_index;
    # after the send/recv, the destination GPU should hold the same value.
    expected = (src_rank + 1) * 2 + src_gpu_index
    assert (
        results[src_rank][src_gpu_index]
        == cp.ones(array_size, dtype=cp.float32) * expected
    ).all()
    assert (
        results[dst_rank][dst_gpu_index]
        == cp.ones(array_size, dtype=cp.float32) * expected
    ).all()
    ray.get([a.destroy_group.remote() for a in actors])
def test_allreduce_multigpu_different_op(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)

    # check product
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get(
        [a.do_allreduce_multigpu.remote(op=ReduceOp.PRODUCT) for a in actors]
    )
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * 120).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * 120).all()

    # check min
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([a.do_allreduce_multigpu.remote(op=ReduceOp.MIN) for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * 2).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * 2).all()

    # check max
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get([a.do_allreduce_multigpu.remote(op=ReduceOp.MAX) for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * 5).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * 5).all()
def test_broadcast_torch_cupy(
    ray_start_distributed_multigpu_2_nodes_4_gpus, src_rank, src_gpu_index
):
    import torch

    world_size = 2
    num_gpu_per_worker = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    # actor 0 holds cupy buffers, actor 1 holds torch buffers
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(
        actors[1].set_buffer.remote(
            [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch"
        )
    )
    results = ray.get(
        [
            a.do_broadcast_multigpu.remote(
                src_rank=src_rank, src_gpu_index=src_gpu_index
            )
            for a in actors
        ]
    )
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            # Every GPU should end up with the value set on the source GPU.
            val = (src_rank + 1) * 2 + src_gpu_index
            if i == 0:
                assert (results[i][j] == cp.ones([10], dtype=cp.float32) * val).all()
            else:
                assert (results[i][j] == torch.ones([10]).cuda(j) * val).all()
def test_destroy_group(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    # Now destroy the group at actor0
    ray.wait([actors[0].destroy_group.remote()])
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert not actor0_is_init

    # should go well as the group `random` does not exist at all
    ray.wait([actors[0].destroy_group.remote("random")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("random")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    ray.wait([actors[1].destroy_group.remote("default")])
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert not actor1_is_init

    # Now reconstruct the group using the same name
    init_results = ray.get(
        [actor.init_group.remote(world_size, i) for i, actor in enumerate(actors)]
    )
    for i in range(world_size):
        assert init_results[i]
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
def test_allreduce_multigpu_destroy(
    ray_start_distributed_multigpu_2_nodes_4_gpus, backend="nccl", group_name="default"
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)

    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()

    # destroy the group and try to do work, should fail
    ray.get([a.destroy_group.remote() for a in actors])
    with pytest.raises(RuntimeError):
        results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])

    # reinit the same group and allreduce
    ray.get(
        [
            actor.init_group.remote(world_size, i, backend, group_name)
            for i, actor in enumerate(actors)
        ]
    )
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (
        results[0]
        == cp.ones((10,), dtype=cp.float32) * actual_world_size * actual_world_size
    ).all()
    assert (
        results[1]
        == cp.ones((10,), dtype=cp.float32) * actual_world_size * actual_world_size
    ).all()
def test_reduce_different_op(
    ray_start_distributed_multigpu_2_nodes_4_gpus, dst_rank, dst_gpu_index
):
    world_size = 2
    num_gpu_per_worker = 2
    actors, _ = create_collective_multigpu_workers(world_size)

    # check product
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get(
        [
            a.do_reduce_multigpu.remote(
                dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.PRODUCT
            )
            for a in actors
        ]
    )
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32) * 120).all()
            else:
                val = (i + 1) * 2 + j
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32) * val).all()

    # check min
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get(
        [
            a.do_reduce_multigpu.remote(
                dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MIN
            )
            for a in actors
        ]
    )
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32) * 2).all()
            else:
                val = (i + 1) * 2 + j
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32) * val).all()

    # check max
    ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5))
    results = ray.get(
        [
            a.do_reduce_multigpu.remote(
                dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MAX
            )
            for a in actors
        ]
    )
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32) * 5).all()
            else:
                val = (i + 1) * 2 + j
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32) * val).all()
def test_broadcast_invalid_rank(
    ray_start_distributed_multigpu_2_nodes_4_gpus, src_rank, src_gpu_index
):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    with pytest.raises(ValueError):
        _ = ray.get(
            [
                a.do_broadcast_multigpu.remote(
                    src_rank=src_rank, src_gpu_index=src_gpu_index
                )
                for a in actors
            ]
        )
def test_availability(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    actor0_nccl_availability = ray.get(actors[0].report_nccl_availability.remote())
    assert actor0_nccl_availability
    actor0_gloo_availability = ray.get(actors[0].report_gloo_availability.remote())
    assert not actor0_gloo_availability
def test_allreduce_multigpu_different_dtype(
    ray_start_distributed_multigpu_2_nodes_4_gpus, dtype
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
    ray.get([a.set_buffer.remote([10], dtype=dtype) for a in actors])
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,), dtype=dtype) * actual_world_size).all()
    assert (results[1] == cp.ones((10,), dtype=dtype) * actual_world_size).all()
def test_allreduce_multigpu_different_name(
    ray_start_distributed_multigpu_2_nodes_4_gpus, group_name
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(
        num_workers=world_size, group_name=group_name
    )
    results = ray.get([a.do_allreduce_multigpu.remote(group_name) for a in actors])
    assert (results[0] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()
    assert (results[1] == cp.ones((10,), dtype=cp.float32) * actual_world_size).all()
def test_is_group_initialized(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    # check group is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote())
    assert actor0_is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote("random"))
    assert not actor0_is_init
    actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote("123"))
    assert not actor0_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote())
    assert actor1_is_init
    actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote("456"))
    assert not actor1_is_init
def test_broadcast_different_array_size(
    ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, src_rank, src_gpu_index
):
    world_size = 2
    num_gpu_per_worker = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    ray.get(actors[0].set_buffer.remote([array_size], value0=2, value1=3))
    ray.get(actors[1].set_buffer.remote([array_size], value0=4, value1=5))
    results = ray.get(
        [
            a.do_broadcast_multigpu.remote(
                src_rank=src_rank, src_gpu_index=src_gpu_index
            )
            for a in actors
        ]
    )
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            val = (src_rank + 1) * 2 + src_gpu_index
            assert (
                results[i][j] == cp.ones((array_size,), dtype=cp.float32) * val
            ).all()
def test_allgather_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    shape = [10, 10]
    actors, _ = create_collective_multigpu_workers(world_size)

    # tensor is pytorch, list is cupy
    for a in actors:
        ray.get(
            [a.set_buffer.remote(shape, tensor_type0="torch", tensor_type1="torch")]
        )
        ray.get(
            [a.set_list_buffer.remote(shape, tensor_type0="cupy", tensor_type1="cupy")]
        )
    results = ray.get([a.do_allgather_multigpu.remote() for a in actors])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            for k in range(actual_world_size):
                assert (results[i][j][k] == cp.ones(shape, dtype=cp.float32)).all()

    # tensor is cupy, list is pytorch
    for a in actors:
        ray.get(
            [a.set_buffer.remote(shape, tensor_type0="cupy", tensor_type1="cupy")]
        )
        ray.get(
            [a.set_list_buffer.remote(shape, tensor_type0="torch", tensor_type1="torch")]
        )
    results = ray.get([a.do_allgather_multigpu.remote() for a in actors])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            for k in range(actual_world_size):
                assert (
                    results[i][j][k] == torch.ones(shape, dtype=torch.float32).cuda(j)
                ).all()
def test_allreduce_multigpu_multiple_group(
    ray_start_distributed_multigpu_2_nodes_4_gpus, backend="nccl", num_groups=5
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
    for group_name in range(1, num_groups):
        ray.get(
            [
                actor.init_group.remote(world_size, i, backend, str(group_name))
                for i, actor in enumerate(actors)
            ]
        )
    for i in range(num_groups):
        group_name = "default" if i == 0 else str(i)
        results = ray.get([a.do_allreduce_multigpu.remote(group_name) for a in actors])
        # The allreduce result stays in the buffer, so each successive allreduce
        # (on a different group) multiplies the value by actual_world_size.
        assert (
            results[0]
            == cp.ones((10,), dtype=cp.float32) * (actual_world_size ** (i + 1))
        ).all()
def test_allreduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actual_world_size = 4
    actors, _ = create_collective_multigpu_workers(world_size)
    # actor 0 holds cupy buffers, actor 1 holds torch buffers
    ray.get(actors[0].set_buffer.remote([10]))
    ray.get(
        actors[1].set_buffer.remote([10], tensor_type0="torch", tensor_type1="torch")
    )
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,)) * actual_world_size).all()

    # mixed cupy/torch buffers within each actor
    ray.get(
        actors[0].set_buffer.remote([10], tensor_type0="cupy", tensor_type1="torch")
    )
    ray.get(
        actors[1].set_buffer.remote([10], tensor_type0="torch", tensor_type1="cupy")
    )
    results = ray.get([a.do_allreduce_multigpu.remote() for a in actors])
    assert (results[0] == cp.ones((10,)) * actual_world_size).all()
def test_allgather_different_array_size(
    ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, tensor_backend
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(world_size)
    init_tensors_for_gather_scatter_multigpu(
        actors, array_size=array_size, tensor_backend=tensor_backend
    )
    results = ray.get([a.do_allgather_multigpu.remote() for a in actors])
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            for k in range(actual_world_size):
                if tensor_backend == "cupy":
                    assert (
                        results[i][j][k] == cp.ones(array_size, dtype=cp.float32)
                    ).all()
                else:
                    assert (
                        results[i][j][k]
                        == torch.ones(array_size, dtype=torch.float32).cuda(j)
                    ).all()
def test_reduce_different_name(
    ray_start_distributed_multigpu_2_nodes_4_gpus, group_name, dst_rank, dst_gpu_index
):
    world_size = 2
    num_gpu_per_worker = 2
    actual_world_size = world_size * num_gpu_per_worker
    actors, _ = create_collective_multigpu_workers(
        num_workers=world_size, group_name=group_name
    )
    results = ray.get(
        [
            a.do_reduce_multigpu.remote(
                group_name, dst_rank=dst_rank, dst_gpu_index=dst_gpu_index
            )
            for a in actors
        ]
    )
    for i in range(world_size):
        for j in range(num_gpu_per_worker):
            if i == dst_rank and j == dst_gpu_index:
                assert (
                    results[i][j]
                    == cp.ones((10,), dtype=cp.float32) * actual_world_size
                ).all()
            else:
                assert (results[i][j] == cp.ones((10,), dtype=cp.float32)).all()
def test_get_rank(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, _ = create_collective_multigpu_workers(world_size)
    actor0_rank = ray.get(actors[0].report_rank.remote())
    assert actor0_rank == 0
    actor1_rank = ray.get(actors[1].report_rank.remote())
    assert actor1_rank == 1

    # create a second group with a different name, and different
    # orders of ranks.
    new_group_name = "default2"
    ranks = list(range(world_size))
    shuffle(ranks)
    ray.get(
        [
            actor.init_group.remote(world_size, ranks[i], group_name=new_group_name)
            for i, actor in enumerate(actors)
        ]
    )
    actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name))
    assert actor0_rank == ranks[0]
    actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name))
    assert actor1_rank == ranks[1]
def test_report_num_gpus(ray_start_distributed_multigpu_2_nodes_4_gpus):
    world_size = 2
    actors, results = create_collective_multigpu_workers(world_size)
    num_gpus = ray.get([actor.report_num_gpus.remote() for actor in actors])
    assert num_gpus == [2, 2]