def test_actor_holding_serialized_reference(one_worker_100MiB, use_ray_put,
                                            failure):
    @ray.remote
    class GreedyActor(object):
        def __init__(self):
            pass

        def set_ref1(self, ref):
            self.ref1 = ref

        def add_ref2(self, new_ref):
            self.ref2 = new_ref

        def delete_ref1(self):
            self.ref1 = None

        def delete_ref2(self):
            self.ref2 = None

    # Test that the reference held by the actor isn't evicted.
    array_oid = put_object(
        np.zeros(20 * 1024 * 1024, dtype=np.uint8), use_ray_put)
    actor = GreedyActor.remote()
    actor.set_ref1.remote([array_oid])

    # Test that giving the same actor a duplicate reference works.
    ray.get(actor.add_ref2.remote([array_oid]))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the remote references still pin the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Test that removing only the first reference doesn't unpin the object.
    ray.get(actor.delete_ref1.remote())
    _fill_object_store_and_get(array_oid_bytes)

    if failure:
        # Test that the actor exiting stops the reference from being pinned.
        # Kill the actor and wait for the actor to exit.
        kill_actor_and_wait_for_failure(actor)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.delete_ref1.remote())
    else:
        # Test that deleting the second reference stops it from being pinned.
        ray.get(actor.delete_ref2.remote())
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
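# ---------------------------------------------------------------------------
# The test above relies on helpers defined elsewhere in this suite
# (put_object and _fill_object_store_and_get). A minimal sketch of one
# plausible shape for them, assuming the one_worker_100MiB fixture caps the
# object store at ~100 MiB; the `_identity` task and the eviction checks
# below are illustrative assumptions, not the canonical implementations.
# ---------------------------------------------------------------------------
@ray.remote
def _identity(obj):
    # Returning the argument pins it via the task's return reference.
    return obj


def put_object(obj, use_ray_put):
    # Pin the object either directly with ray.put or via a task return.
    return ray.put(obj) if use_ray_put else _identity.remote(obj)


def _fill_object_store_and_get(oid, succeed=True, object_MiB=20,
                               num_objects=5):
    # Fill the store with enough new objects to force eviction of anything
    # that is no longer pinned by a reference.
    for _ in range(num_objects):
        ray.put(np.zeros(object_MiB * 1024 * 1024, dtype=np.uint8))
    if isinstance(oid, bytes):
        oid = ray.ObjectRef(oid)
    if succeed:
        # A pinned object should still be retrievable.
        assert ray.get(oid, timeout=10) is not None
    else:
        # An unpinned object should be gone; ray.get should fail (either
        # with an object-lost error or a timeout, depending on version).
        with pytest.raises(Exception):
            ray.get(oid, timeout=1)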
def test_kill(ray_start_regular_shared):
    @ray.remote
    class Actor:
        def hang(self):
            while True:
                time.sleep(1)

    actor = Actor.remote()
    result = actor.hang.remote()
    ready, _ = ray.wait([result], timeout=0.5)
    assert len(ready) == 0
    kill_actor_and_wait_for_failure(actor)

    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(result)

    with pytest.raises(ValueError):
        ray.kill("not_an_actor_handle")
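# ---------------------------------------------------------------------------
# kill_actor_and_wait_for_failure is a shared test utility used throughout
# this file. A simplified sketch, assuming ray.state.actors() exposes a
# per-actor "State" field; the polling details are assumptions rather than
# the exact utility implementation.
# ---------------------------------------------------------------------------
import time


def kill_actor_and_wait_for_failure(actor, timeout=10, retry_interval_ms=100):
    actor_id = actor._actor_id.hex()
    ray.kill(actor)
    start = time.time()
    while time.time() - start <= timeout:
        # Poll the GCS-backed actor table until the actor is marked DEAD.
        state = ray.state.actors(actor_id)["State"]
        if state == convert_actor_state(gcs_utils.ActorTableData.DEAD):
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError(
        "Timed out waiting for actor {} to die.".format(actor_id))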
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [{"CPU": 2}, {"CPU": 2}], strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current
        # worker/driver doesn't belong to any placement group, it should
        # return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use the placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        a = Actor.options(
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top-level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node, because
        # the placement group uses the STRICT_PACK strategy.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all the node IDs should be identical, the set size should
        # be 1.
        assert len(node_id_set) == 1

        # Kill the actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Now create an actor, but do not capture child tasks.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top-level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure the actors are not all scheduled on the same node,
        # because the child actors are not scheduled in the same placement
        # group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill the actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Lastly, make sure that when placement_group=None is specified for
        # the children, they are not scheduled in the placement group.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top-level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure the actors are not all scheduled on the same node,
        # because the child actors opt out of the placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2
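# ---------------------------------------------------------------------------
# connect_to_client_or_not lets the same test body run both through Ray
# Client and directly against the cluster. A minimal sketch, assuming a
# ray_start_client_server helper is available from the shared test
# utilities (an assumption; the real context manager may differ):
# ---------------------------------------------------------------------------
from contextlib import contextmanager


@contextmanager
def connect_to_client_or_not(connect_to_client):
    if connect_to_client:
        # Route all API calls through a Ray Client server for this block.
        with ray_start_client_server():
            yield
    else:
        # Use the regular direct connection established by ray.init.
        yield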
def test_detached_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)

    # Make sure a detached placement group stays alive after the job dies.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
    [{{"CPU": 1}} for _ in range(2)],
    strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())

@ray.remote(num_cpus=1)
class Actor:
    def ready(self):
        return True

for bundle_index in range(2):
    actor = Actor.options(
        lifetime="detached", placement_group=pg,
        placement_group_bundle_index=bundle_index).remote()
    ray.get(actor.ready.remote())

ray.shutdown()
"""

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_alive_num_pg(expected_num_pg):
        alive_num_pg = 0
        for _, placement_group_info in \
                ray.util.placement_group_table().items():
            if placement_group_info["state"] == "CREATED":
                alive_num_pg += 1
        return alive_num_pg == expected_num_pg

    def assert_alive_num_actor(expected_num_actor):
        alive_num_actor = 0
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == gcs_utils.ActorTableData.ALIVE:
                alive_num_actor += 1
        return alive_num_actor == expected_num_actor

    wait_for_condition(is_job_done)

    assert assert_alive_num_pg(1)
    assert assert_alive_num_actor(2)

    # Make sure a detached placement group stays alive after its creator,
    # a detached actor, dies.
    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor_with_detached_pg(self):
            # Create a detached placement group.
            pg = ray.util.placement_group(
                [{"CPU": 1} for _ in range(2)],
                strategy="STRICT_SPREAD",
                lifetime="detached",
                name="detached_pg")
            ray.get(pg.ready())

            # Schedule nested actors with the placement group.
            for bundle_index in range(2):
                actor = NestedActor.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index,
                    lifetime="detached").remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

    a = Actor.options(lifetime="detached").remote()
    ray.get(a.ready.remote())
    # 1 parent actor and 2 child actors.
    ray.get(a.schedule_nested_actor_with_detached_pg.remote())
    # Kill the parent actor and wait until it is killed.
    kill_actor_and_wait_for_failure(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # We should have 2 alive placement groups and 4 alive actors.
    assert assert_alive_num_pg(2)
    assert assert_alive_num_actor(4)
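# ---------------------------------------------------------------------------
# wait_for_condition and run_string_as_driver above are shared test
# utilities. A minimal sketch of wait_for_condition, assuming simple
# polling with a fixed retry interval (the exact signature and error
# message are assumptions):
# ---------------------------------------------------------------------------
import time


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    # Poll the predicate until it returns True or the timeout expires.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")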