def test_placement_group_task_resource_ids(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=1)
    def f():
        return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 2}])
        o1 = f.options(placement_group=g1).remote()

        resources = ray.get(o1)
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources
        assert "CPU_group_0_" not in list(resources.keys())[0], resources

        # Now retry with a bundle index constraint.
        o1 = f.options(placement_group=g1, placement_group_bundle_index=0).remote()
        resources = ray.get(o1)
        assert len(resources) == 2, resources
        keys = list(resources.keys())
        assert "CPU_group_" in keys[0], resources
        assert "CPU_group_" in keys[1], resources
        assert "CPU_group_0_" in keys[0] or "CPU_group_0_" in keys[1], resources

        placement_group_assert_no_leak([g1])

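# Note on the assertions above (as this test relies on them): when a task is
# scheduled into a placement group, its resources are renamed to group-scoped
# resources. Scheduling against the group as a whole yields a wildcard
# resource of the form "CPU_group_<pg_id>", while pinning to a bundle
# additionally yields an indexed resource of the form
# "CPU_group_<bundle_index>_<pg_id>" -- hence the two keys observed when
# placement_group_bundle_index=0 is set.
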
def test_placement_group_gpu_set(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # Two nodes, each with one CPU and one GPU.
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="PACK",
            bundles=[{"CPU": 1, "GPU": 1}, {"CPU": 1, "GPU": 1}],
        )

        @ray.remote(num_gpus=1)
        def get_gpus():
            return ray.get_gpu_ids()

        result = get_gpus.options(
            placement_group=placement_group, placement_group_bundle_index=0
        ).remote()
        result = ray.get(result)
        assert result == [0]

        result = get_gpus.options(
            placement_group=placement_group, placement_group_bundle_index=1
        ).remote()
        result = ray.get(result)
        assert result == [0]

def test_placement_group_hang(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=1)
    def f():
        return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Warm workers up, so that this triggers the hang race condition.
        ray.get(f.remote())

        g1 = ray.util.placement_group([{"CPU": 2}])
        # This will start out infeasible. The placement group will then be
        # created and it transitions to feasible.
        o1 = f.options(placement_group=g1).remote()

        resources = ray.get(o1)
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources

        placement_group_assert_no_leak([g1])

def test_placement_group_wait(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    [cluster.add_node(num_cpus=2) for _ in range(2)]
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):
        # Wait on a placement group that can be created.
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{"CPU": 2}, {"CPU": 2}],
        )
        ready, unready = ray.wait([placement_group.ready()])
        assert len(unready) == 0
        assert len(ready) == 1
        table = ray.util.placement_group_table(placement_group)
        assert table["state"] == "CREATED"

        pg = ray.get(placement_group.ready())
        assert pg.bundle_specs == placement_group.bundle_specs
        assert pg.id.binary() == placement_group.id.binary()

def test_check_bundle_index(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name", strategy="SPREAD", bundles=[{"CPU": 2}, {"CPU": 2}]
        )

        with pytest.raises(ValueError, match="bundle index 3 is invalid"):
            Actor.options(
                placement_group=placement_group, placement_group_bundle_index=3
            ).remote()

        with pytest.raises(ValueError, match="bundle index -2 is invalid"):
            Actor.options(
                placement_group=placement_group, placement_group_bundle_index=-2
            ).remote()

        with pytest.raises(ValueError, match="bundle index must be -1"):
            Actor.options(placement_group_bundle_index=0).remote()

        placement_group_assert_no_leak([placement_group])

def test_placement_group_synchronous_registration(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # One node which only has one CPU.
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that has two bundles and the `STRICT_PACK`
        # strategy, so its registration succeeds but scheduling fails.
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[{"CPU": 1}, {"CPU": 1}],
        )
        # Make sure we can properly remove it immediately,
        # as its registration is synchronous.
        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))

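# A minimal sketch of the pattern the test above relies on: placement group
# registration is synchronous (the PlacementGroup handle is valid as soon as
# ray.util.placement_group() returns), while bundle scheduling is asynchronous
# and only observable via ready() or placement_group_table(). Hypothetical
# usage, assuming an initialized cluster:
#
#     pg = ray.util.placement_group(bundles=[{"CPU": 1}])  # registered now
#     ray.util.remove_placement_group(pg)  # valid even before scheduling
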
def test_pending_placement_group_wait(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    [cluster.add_node(num_cpus=2) for _ in range(1)]
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):
        # Wait on a placement group that cannot be created.
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{"CPU": 2}, {"CPU": 2}, {"GPU": 2}],
        )
        ready, unready = ray.wait([placement_group.ready()], timeout=0.1)
        assert len(unready) == 1
        assert len(ready) == 0
        table = ray.util.placement_group_table(placement_group)
        assert table["state"] == "PENDING"
        with pytest.raises(ray.exceptions.GetTimeoutError):
            ray.get(placement_group.ready(), timeout=0.1)

def test_placement_group_strict_spread(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 3
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_SPREAD",
            bundles=[{"CPU": 2}, {"CPU": 2}, {"CPU": 2}],
        )
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=0
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=1
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=2
        ).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        # Get all actors.
        actor_infos = ray.state.actors()

        # Make sure all the actors are located on separate nodes.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
        actor_info_3 = actor_infos.get(actor_3._actor_id.hex())

        assert actor_info_1 and actor_info_2 and actor_info_3

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        node_of_actor_3 = actor_info_3["Address"]["NodeID"]
        assert node_of_actor_1 != node_of_actor_2
        assert node_of_actor_1 != node_of_actor_3
        assert node_of_actor_2 != node_of_actor_3

        placement_group_assert_no_leak([placement_group])

def test_placement_group_pack(
    ray_start_cluster, connect_to_client, gcs_actor_scheduling_enabled
):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for i in range(num_nodes):
        cluster.add_node(
            num_cpus=4,
            _system_config={
                "gcs_actor_scheduling_enabled": gcs_actor_scheduling_enabled
            }
            if i == 0
            else {},
        )
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="PACK",
            bundles=[
                # Test that a 0 resource spec doesn't break tests.
                {"CPU": 2, "GPU": 0},
                {"CPU": 2},
            ],
        )
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=0
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=1
        ).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure both actors are collocated on one node.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

        assert actor_info_1 and actor_info_2

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        assert node_of_actor_1 == node_of_actor_2

        placement_group_assert_no_leak([placement_group])

def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # Create a head node.
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 1,
        },
    )
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.worker.global_worker.current_node_id

        worker_node_ids = {
            ray.get(get_node_id.options(resources={f"foo:{i}": 1}).remote())
            for i in range(2)
        }
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)

        @ray.remote(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY)
        def task1():
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.worker.global_worker.current_node_id

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.worker.global_worker.current_node_id

        locations = []
        locations.append(task1.remote())
        while not internal_kv._internal_kv_exists("test_task1"):
            time.sleep(0.1)
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)
        locations.append(
            task2.options(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY).remote()
        )
        while not internal_kv._internal_kv_exists("test_task2"):
            time.sleep(0.1)
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")
        assert set(ray.get(locations)) == worker_node_ids

def test_placement_group_strict_pack(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[
                {
                    # Test that a memory resource spec doesn't break tests.
                    "memory": 50 * 1024 * 1024,
                    "CPU": 2,
                },
                {"CPU": 2},
            ],
        )
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=0
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=1
        ).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())

        # Get all actors.
        actor_infos = ray.state.actors()

        # Make sure both actors are collocated on one node.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

        assert actor_info_1 and actor_info_2

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        assert node_of_actor_1 == node_of_actor_2

        placement_group_assert_no_leak([placement_group])

def test_client_context_manager(ray_start_regular_shared, connect_to_client):
    import ray

    with connect_to_client_or_not(connect_to_client):
        if connect_to_client:
            # Client mode is on.
            assert client_mode_should_convert(auto_init=True)
            # We're connected to the Ray client.
            assert ray.util.client.ray.is_connected()
        else:
            assert not client_mode_should_convert(auto_init=True)
            assert not ray.util.client.ray.is_connected()

def test_placement_group_spread(
    ray_start_cluster, connect_to_client, gcs_actor_scheduling_enabled
):
    @ray.remote
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for i in range(num_nodes):
        cluster.add_node(
            num_cpus=4,
            _system_config={
                "gcs_actor_scheduling_enabled": gcs_actor_scheduling_enabled
            }
            if i == 0
            else {},
        )
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_SPREAD",
            bundles=[{"CPU": 2}, {"CPU": 2}],
        )
        ray.get(placement_group.ready())
        actors = [
            Actor.options(
                placement_group=placement_group,
                placement_group_bundle_index=i,
                num_cpus=2,
            ).remote()
            for i in range(num_nodes)
        ]

        [ray.get(actor.value.remote()) for actor in actors]

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure all the actors are located on separate nodes.
        actor_info_objs = [actor_infos.get(actor._actor_id.hex()) for actor in actors]
        assert are_pairwise_unique(
            [info_obj["Address"]["NodeID"] for info_obj in actor_info_objs]
        )

        placement_group_assert_no_leak([placement_group])

def test_cuda_visible_devices(ray_start_cluster, connect_to_client):
    @ray.remote(num_gpus=1)
    def f():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_gpus=1)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
        o1 = f.options(placement_group=g1).remote()

        devices = ray.get(o1)
        assert devices == "0", devices

def test_remove_pending_placement_group(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that cannot be scheduled now.
        placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])
        ray.util.remove_placement_group(placement_group)
        # TODO(sang): Add state check here.

        @ray.remote(num_cpus=4)
        def f():
            return 3

        # Make sure this task is still schedulable.
        assert ray.get(f.remote()) == 3

def test_placement_group_actor_resource_ids(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=1)
    class F:
        def f(self):
            return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 2}])
        a1 = F.options(placement_group=g1).remote()
        resources = ray.get(a1.f.remote())
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources

def test_schedule_placement_group_when_node_add(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that cannot be satisfied yet.
        placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])

        def is_placement_group_created():
            table = ray.util.placement_group_table(placement_group)
            if "state" not in table:
                return False
            return table["state"] == "CREATED"

        # Add a node that has GPUs.
        cluster.add_node(num_cpus=4, num_gpus=4)

        # Make sure the placement group is created.
        wait_for_condition(is_placement_group_created)

def test_placement_ready(ray_start_regular, connect_to_client):
    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def v(self):
            return 10

    # "bundle" is a resource name reserved for placement groups
    # and can't be used in bundles.
    with pytest.raises(Exception):
        ray.util.placement_group(bundles=[{"bundle": 1}])

    # This tests that even when all resources in the bundle
    # are allocated, we are still able to return from ready(),
    # since ready() uses 0 CPU.
    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(bundles=[{"CPU": 1}])
        ray.get(pg.ready())
        a = Actor.options(num_cpus=1, placement_group=pg).remote()
        ray.get(a.v.remote())
        ray.get(pg.ready())

def test_schedule_placement_groups_at_the_same_time(connect_to_client):
    ray.init(num_cpus=4)

    with connect_to_client_or_not(connect_to_client):
        pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]

        wait_pgs = {pg.ready(): pg for pg in pgs}

        def is_all_placement_group_removed():
            ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
            if ready:
                ready_pg = wait_pgs[ready[0]]
                remove_placement_group(ready_pg)
                del wait_pgs[ready[0]]

            if len(wait_pgs) == 0:
                return True
            return False

        wait_for_condition(is_all_placement_group_removed)

    ray.shutdown()

def test_placement_group_gpu_assigned(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_gpus=2)
    ray.init(address=cluster.address)
    gpu_ids_res = set()

    @ray.remote(num_gpus=1, num_cpus=0)
    def f():
        import os

        return os.environ["CUDA_VISIBLE_DEVICES"]

    with connect_to_client_or_not(connect_to_client):
        pg1 = ray.util.placement_group([{"GPU": 1}])
        pg2 = ray.util.placement_group([{"GPU": 1}])

        assert pg1.wait(10)
        assert pg2.wait(10)

        gpu_ids_res.add(ray.get(f.options(placement_group=pg1).remote()))
        gpu_ids_res.add(ray.get(f.options(placement_group=pg2).remote()))

        assert len(gpu_ids_res) == 2

def test_capture_child_tasks(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_tasks = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_tasks, num_gpus=total_num_tasks)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [
                {"CPU": 2, "GPU": 2},
                {"CPU": 2, "GPU": 2},
            ],
            strategy="STRICT_PACK",
        )
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any placement group, it should return None.
        assert get_current_placement_group() is None

        # Test if tasks capture child tasks.
        @ray.remote
        def task():
            return get_current_placement_group()

        @ray.remote
        def create_nested_task(child_cpu, child_gpu, set_none=False):
            assert get_current_placement_group() is not None
            kwargs = {
                "num_cpus": child_cpu,
                "num_gpus": child_gpu,
            }
            if set_none:
                kwargs["placement_group"] = None
            return ray.get([task.options(**kwargs).remote() for _ in range(3)])

        t = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ).remote(1, 0)
        pgs = ray.get(t)
        # Every task should have the current placement group because they
        # should be implicitly captured by default.
        assert None not in pgs

        t1 = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ).remote(1, 0, True)
        pgs = ray.get(t1)
        # Every task should have no placement group since it's set to None.
        assert set(pgs) == {None}

        # Test that tasks don't capture child tasks when the option is off.
        t2 = create_nested_task.options(
            num_cpus=0, num_gpus=1, placement_group=pg
        ).remote(0, 1)
        pgs = ray.get(t2)
        # All placement groups should be None since we don't capture child
        # tasks.
        assert not all(pgs)

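# Summary of the capture rules exercised above, as this test understands them:
# placement_group_capture_child_tasks=True makes child tasks implicitly
# inherit the parent's placement group; passing placement_group=None on a
# child opts that child out; and with the flag off (the default), children
# are scheduled outside the group. A hedged sketch of the probe pattern:
#
#     @ray.remote
#     def child():
#         return get_current_placement_group()  # pg if captured, else None
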
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any placement group, it should return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use the placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        a = Actor.options(
            placement_group=pg, placement_group_capture_child_tasks=True
        ).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node.
        # (Why? The placement group has the STRICT_PACK strategy.)
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                gcs_utils.ActorTableData.ALIVE
            ):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all node ids should be identical, the set size should be 1.
        assert len(node_id_set) == 1

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Now create an actor, but do not capture the current tasks.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are not scheduled on the same node.
        # This is because the child actors are not scheduled in the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                gcs_utils.ActorTableData.ALIVE
            ):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Lastly, make sure that when None is specified, actors are not
        # scheduled in the placement group.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure all the actors are not scheduled on the same node.
        # This is because the child actors are not scheduled in the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                gcs_utils.ActorTableData.ALIVE
            ):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

def test_mini_integration(ray_start_cluster, connect_to_client):
    # Create as many bundles as there are GPUs in the cluster.
    # Do some random work and make sure all resources are properly recovered.
    cluster = ray_start_cluster

    num_nodes = 5
    per_bundle_gpus = 2
    gpu_per_node = 4
    total_gpus = num_nodes * per_bundle_gpus * gpu_per_node
    per_node_gpus = per_bundle_gpus * gpu_per_node

    bundles_per_pg = 2
    total_num_pg = total_gpus // (bundles_per_pg * per_bundle_gpus)

    [
        cluster.add_node(num_cpus=2, num_gpus=per_bundle_gpus * gpu_per_node)
        for _ in range(num_nodes)
    ]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(num_cpus=0, num_gpus=1)
        def random_tasks():
            import random
            import time

            sleep_time = random.uniform(0.1, 0.2)
            time.sleep(sleep_time)
            return True

        pgs = []
        pg_tasks = []
        # Total bundle GPU usage = bundles_per_pg * total_num_pg * per_bundle_gpus.
        # Note this is half of the total.
        for index in range(total_num_pg):
            pgs.append(
                ray.util.placement_group(
                    name=f"name{index}",
                    strategy="PACK",
                    bundles=[{"GPU": per_bundle_gpus} for _ in range(bundles_per_pg)],
                )
            )

        # Schedule tasks.
        for i in range(total_num_pg):
            pg = pgs[i]
            pg_tasks.append(
                [
                    random_tasks.options(
                        placement_group=pg,
                        placement_group_bundle_index=bundle_index,
                    ).remote()
                    for bundle_index in range(bundles_per_pg)
                ]
            )

        # Make sure tasks are done, then remove the placement groups.
        num_removed_pg = 0
        pg_indexes = [2, 3, 1, 7, 8, 9, 0, 6, 4, 5]
        while num_removed_pg < total_num_pg:
            index = pg_indexes[num_removed_pg]
            pg = pgs[index]
            assert all(ray.get(pg_tasks[index]))
            ray.util.remove_placement_group(pg)
            num_removed_pg += 1

        @ray.remote(num_cpus=2, num_gpus=per_node_gpus)
        class A:
            def ping(self):
                return True

        # Make sure all resources are properly returned by scheduling
        # actors that take up all existing resources.
        actors = [A.remote() for _ in range(num_nodes)]
        assert all(ray.get([a.ping.remote() for a in actors]))

def test_atomic_creation(ray_start_cluster, connect_to_client):
    # Set up the cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        time.sleep(6)
        return True

    with connect_to_client_or_not(connect_to_client):
        # Schedule tasks to fail initial placement group creation.
        tasks = [bothering_task.remote() for _ in range(2)]

        # Make sure the two bothering tasks are scheduled.
        def tasks_scheduled():
            return ray.available_resources()["CPU"] == 2.0

        wait_for_condition(tasks_scheduled)

        # Create an actor that will fail bundle scheduling.
        # The strategy choice here is important to make the test less flaky.
        pg = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[
                {"CPU": bundle_cpu_size} for _ in range(num_nodes * bundle_per_node)
            ],
        )

        # Create a placement group actor.
        # This shouldn't be scheduled because atomic
        # placement group creation should've failed.
        pg_actor = NormalActor.options(
            placement_group=pg,
            placement_group_bundle_index=num_nodes * bundle_per_node - 1,
        ).remote()

        # Wait on the placement group now. It should be unready
        # because a normal actor takes resources that are required
        # for creating one of the bundles.
        ready, unready = ray.wait([pg.ready()], timeout=0.5)
        assert len(ready) == 0
        assert len(unready) == 1

        # Wait until all tasks are done.
        assert all(ray.get(tasks))

        # Wait on the placement group creation. Since resources are now
        # available, it should be ready soon.
        ready, unready = ray.wait([pg.ready()])
        assert len(ready) == 1
        assert len(unready) == 0

        # Confirm that the placement group actor is created. It would
        # raise an exception if the actor was scheduled before the placement
        # group was created; this checks atomicity.
        ray.get(pg_actor.ping.remote(), timeout=3.0)
        ray.kill(pg_actor)

        # Make sure atomic creation failure didn't impact resources.
        @ray.remote(num_cpus=bundle_cpu_size)
        def resource_check():
            return True

        # These should hang because all resources
        # are claimed by the placement group.
        check_without_pg = [
            resource_check.remote() for _ in range(bundle_per_node * num_nodes)
        ]

        # These should all be scheduled, one per bundle.
        check_with_pg = [
            resource_check.options(
                placement_group=pg, placement_group_bundle_index=i
            ).remote()
            for i in range(bundle_per_node * num_nodes)
        ]

        # Make sure these are hanging.
        ready, unready = ray.wait(check_without_pg, timeout=0)
        assert len(ready) == 0
        assert len(unready) == bundle_per_node * num_nodes

        # Make sure these are all scheduled.
        assert all(ray.get(check_with_pg))

        ray.util.remove_placement_group(pg)

        def pg_removed():
            return ray.util.placement_group_table(pg)["state"] == "REMOVED"

        wait_for_condition(pg_removed)

        # Make sure the tasks without a placement group are all
        # scheduled properly because resources are cleaned up.
        assert all(ray.get(check_without_pg))

def test_placement_group_bin_packing_priority(
    ray_start_cluster, connect_to_client, scheduling_strategy
):
    @ray.remote
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    def index_to_actor(pg, index):
        if index < 2:
            return Actor.options(
                placement_group=pg,
                placement_group_bundle_index=index,
                num_cpus=1,
            ).remote()
        elif index < 3:
            return Actor.options(
                placement_group=pg,
                placement_group_bundle_index=index,
                num_gpus=1,
            ).remote()
        else:
            return Actor.options(
                placement_group=pg,
                placement_group_bundle_index=index,
                object_store_memory=1024 * 1024 * 200,
            ).remote()

    def add_nodes_to_cluster(cluster):
        cluster.add_node(num_cpus=1)
        cluster.add_node(num_cpus=2)
        cluster.add_node(num_gpus=1)
        cluster.add_node(object_store_memory=1024 * 1024 * 250)

    default_bundles = [
        {"CPU": 1},
        {"CPU": 2},
        {"CPU": 1, "GPU": 1},
        {"CPU": 1, "object_store_memory": 1024 * 1024 * 200},
    ]

    default_num_nodes = len(default_bundles)
    cluster = ray_start_cluster
    add_nodes_to_cluster(cluster)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy=scheduling_strategy,
            bundles=default_bundles,
        )
        ray.get(placement_group.ready())

        actors = [index_to_actor(placement_group, i) for i in range(default_num_nodes)]

        [ray.get(actor.value.remote()) for actor in actors]

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure all the actors are located on separate nodes.
        actor_info_objs = [actor_infos.get(actor._actor_id.hex()) for actor in actors]
        assert are_pairwise_unique(
            [info_obj["Address"]["NodeID"] for info_obj in actor_info_objs]
        )

def test_placement_group_reschedule_when_node_dead(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, namespace="default_test_namespace")

    # Make sure the head node and both worker nodes are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{"CPU": 2}, {"CPU": 2}, {"CPU": 2}],
        )
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
        cluster.wait_for_nodes()

        actor_4 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_5 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_6 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_4.value.remote())
        ray.get(actor_5.value.remote())
        ray.get(actor_6.value.remote())
        placement_group_assert_no_leak([placement_group])
        ray.shutdown()

def test_placement_group_table(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    pgs_created = []

    with connect_to_client_or_not(connect_to_client):
        # Originally, placement group creation should be pending because
        # there are not enough resources.
        name = "name"
        strategy = "PACK"
        bundles = [{"CPU": 2, "GPU": 1}, {"CPU": 2}]
        placement_group = ray.util.placement_group(
            name=name, strategy=strategy, bundles=bundles
        )
        pgs_created.append(placement_group)
        result = ray.util.placement_group_table(placement_group)
        assert result["name"] == name
        assert result["strategy"] == strategy
        for i in range(len(bundles)):
            assert bundles[i] == result["bundles"][i]
        assert result["state"] == "PENDING"

        # Now the placement group should be scheduled.
        cluster.add_node(num_cpus=5, num_gpus=1)
        cluster.wait_for_nodes()
        actor_1 = Actor.options(
            placement_group=placement_group, placement_group_bundle_index=0
        ).remote()
        ray.get(actor_1.value.remote())

        result = ray.util.placement_group_table(placement_group)
        assert result["state"] == "CREATED"

        # Add two more placement groups for the placement group table test.
        second_strategy = "SPREAD"
        pgs_created.append(
            ray.util.placement_group(
                name="second_placement_group",
                strategy=second_strategy,
                bundles=bundles,
            )
        )
        pgs_created.append(
            ray.util.placement_group(
                name="third_placement_group",
                strategy=second_strategy,
                bundles=bundles,
            )
        )

        placement_group_table = ray.util.placement_group_table()
        assert len(placement_group_table) == 3

        true_name_set = {"name", "second_placement_group", "third_placement_group"}
        get_name_set = set()

        for _, placement_group_data in placement_group_table.items():
            get_name_set.add(placement_group_data["name"])

        assert true_name_set == get_name_set

        placement_group_assert_no_leak(pgs_created)

def test_placement_group_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=8, resources={"head": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    pg = ray.util.placement_group(bundles=[{"CPU": 1, "GPU": 1}, {"CPU": 1, "GPU": 1}])
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        worker_node_id = ray.get(get_node_id_1.options(resources={"worker": 1}).remote())

        assert (
            ray.get(
                get_node_id_1.options(
                    num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg
                    ),
                ).remote()
            )
            == worker_node_id
        )

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg),
        )
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert ray.get(get_node_id_2.remote()) == worker_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg),
        )
        class Actor1:
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_id.remote()) == worker_node_id

        @ray.remote
        class Actor2:
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor2 = Actor2.options(
            scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
        ).remote()
        assert ray.get(actor2.get_node_id.remote()) == worker_node_id

        with pytest.raises(ValueError):

            @ray.remote(
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg
                )
            )
            def func():
                return 0

            func.options(placement_group=pg).remote()

        with pytest.raises(ValueError):

            @ray.remote
            def func():
                return 0

            func.options(scheduling_strategy="XXX").remote()

        with pytest.raises(ValueError):

            @ray.remote
            def func():
                return 0

            func.options(
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=None
                )
            ).remote()

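# The test above mixes the two ways of attaching a placement group: the legacy
# options(placement_group=...) keyword and the newer
# scheduling_strategy=PlacementGroupSchedulingStrategy(...). As exercised by
# the pytest.raises blocks, specifying both at once is rejected with a
# ValueError, as are unknown strategy strings and a
# PlacementGroupSchedulingStrategy with placement_group=None.
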
def test_remove_placement_group(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def warmup():
        pass

    # Warm up the cluster.
    ray.get([warmup.remote() for _ in range(4)])

    with connect_to_client_or_not(connect_to_client):
        # First try to remove a placement group that doesn't
        # exist. This should not do anything.
        random_group_id = PlacementGroupID.from_random()
        random_placement_group = PlacementGroup(random_group_id)
        for _ in range(3):
            ray.util.remove_placement_group(random_placement_group)

        # Removing a placement group as soon as it is
        # created should work.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))

        # Now let's create a placement group.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        # Create an actor that occupies resources.
        @ray.remote(num_cpus=2)
        class A:
            def f(self):
                return 3

        # Currently, there's no way to prevent
        # tasks from being retried for a removed placement group.
        # Set max_retries=0 for testing.
        # TODO(sang): Handle this edge case.
        @ray.remote(num_cpus=2, max_retries=0)
        def long_running_task():
            print(os.getpid())
            import time

            time.sleep(50)

        # Schedule a long-running task and actor.
        task_ref = long_running_task.options(placement_group=placement_group).remote()
        a = A.options(placement_group=placement_group).remote()
        assert ray.get(a.f.remote()) == 3

        ray.util.remove_placement_group(placement_group)
        # Subsequent remove requests shouldn't do anything.
        for _ in range(3):
            ray.util.remove_placement_group(placement_group)

        # Make sure placement group resources are
        # released and we can schedule this task.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        assert ray.get(f.remote()) == 3
        # Since the placement group is removed,
        # the actor should've been killed.
        # That means this request should fail.
        with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
            ray.get(a.f.remote(), timeout=3.0)
        with pytest.raises(ray.exceptions.WorkerCrashedError):
            ray.get(task_ref)

def test_default_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(
        num_cpus=16,
        resources={"head": 1},
        _system_config={"scheduler_spread_threshold": 1},
    )
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    pg = ray.util.placement_group(bundles=[{"CPU": 1, "GPU": 1}, {"CPU": 1, "GPU": 1}])
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        head_node_id = ray.get(get_node_id_1.options(resources={"head": 1}).remote())
        worker_node_id = ray.get(get_node_id_1.options(resources={"worker": 1}).remote())

        assert ray.get(get_node_id_1.remote()) == head_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg),
        )
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert (
            ray.get(
                get_node_id_2.options(
                    scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY
                ).remote()
            )
            == head_node_id
        )

        @ray.remote
        def get_node_id_3():
            return ray.worker.global_worker.current_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg, placement_group_capture_child_tasks=True
            ),
        )
        class Actor1:
            def get_node_ids(self):
                return [
                    ray.worker.global_worker.current_node_id,
                    # Use the parent's placement group.
                    ray.get(get_node_id_3.remote()),
                    ray.get(
                        get_node_id_3.options(
                            scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY
                        ).remote()
                    ),
                ]

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_ids.remote()) == [
            worker_node_id,
            worker_node_id,
            head_node_id,
        ]