def check_actor_restart():
    """Polling predicate: True once the single registered actor is ALIVE
    again after exactly one restart.

    Asserts that exactly one actor is registered in the GCS so that misuse
    (e.g. running this while multiple actors exist) fails loudly instead of
    silently checking the wrong actor.
    """
    actors = list(ray.state.actors().values())
    assert len(actors) == 1
    # NOTE: removed a leftover debug print(actors) that spammed output on
    # every poll iteration.
    return (actors[0]["State"] == convert_actor_state(
        gcs_utils.ActorTableData.ALIVE)
            and actors[0]["NumRestarts"] == 1)
def _all_actors_dead():
    """Return True iff every actor known to the GCS is in the DEAD state."""
    # Convert the target state once instead of per-actor inside the loop.
    dead = convert_actor_state(gcs_utils.ActorTableData.DEAD)
    return all(info["State"] == dead
               for info in ray.state.actors().values())
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    """Verify `placement_group_capture_child_tasks` semantics for actors.

    Three phases on a 2-node cluster with a STRICT_PACK placement group:
    1. Parent created with capture enabled -> nested actors implicitly join
       the parent's placement group, so everything lands on ONE node.
    2. Parent created without capture -> children are scheduled outside the
       group, so the alive actors spread over BOTH nodes.
    3. Children explicitly pass `placement_group=None` -> same spread over
       both nodes.
    """
    cluster = ray_start_cluster
    total_num_actors = 4
    # Two nodes, each with enough CPUs for all actors of one phase.
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group([{
            "CPU": 2
        }, {
            "CPU": 2
        }], strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current
        # worker/driver doesn't belong to any placement group, it should
        # return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        # Phase 1: capture enabled, children inherit the placement group.
        a = Actor.options(
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node.
        # (why? The placement group has STRICT_PACK strategy).
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all node ids should be identical, the set has size 1.
        assert len(node_id_set) == 1

        # Kill the actor and wait until it is killed so its resources are
        # freed before the next phase.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Phase 2: create an actor, but do not capture the current tasks.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are NOT scheduled on the same node,
        # because the child actors are not scheduled in the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill the actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Phase 3: lastly, make sure that when `placement_group=None` is
        # specified explicitly, child actors are not scheduled on the same
        # placement group.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure all the actors are NOT scheduled on the same node,
        # because the child actors opted out of the placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2