def test_background_tasks_with_max_calls(shutdown_only):
    ray.init(num_cpus=2)

    @ray.remote
    def g():
        time.sleep(.1)
        return 0

    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return [g.remote()]

    nested = ray.get([f.remote() for _ in range(10)])

    # Should still be able to retrieve these objects, since f's workers will
    # wait for g to finish before exiting.
    ray.get([x[0] for x in nested])

    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return os.getpid(), g.remote()

    nested = ray.get([f.remote() for _ in range(10)])
    while nested:
        pid, g_id = nested.pop(0)
        ray.get(g_id)
        del g_id
        wait_for_pid_to_exit(pid)
def test_distributed_actor_handle_deletion(ray_start_regular_shared):
    @ray.remote
    class Actor:
        def method(self):
            return 1

        def getpid(self):
            return os.getpid()

    @ray.remote
    def f(actor, signal):
        ray.get(signal.wait.remote())
        return ray.get(actor.method.remote())

    SignalActor = create_remote_signal_actor(ray)
    signal = SignalActor.remote()
    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
    # Pass the handle to another task that cannot run yet.
    x_id = f.remote(a, signal)
    # Delete the original handle. The actor should not get killed yet.
    del a
    # Once the task finishes, the actor process should get killed.
    ray.get(signal.send.remote())
    assert ray.get(x_id) == 1
    wait_for_pid_to_exit(pid)
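
# `create_remote_signal_actor` above comes from Ray's test utilities and
# returns a remote signal actor class. A minimal sketch of such a signal
# actor, assuming an asyncio.Event-based implementation (the class name and
# details below are illustrative, not the actual Ray helper):
import asyncio


@ray.remote(num_cpus=0)
class _ExampleSignalActor:
    """Illustrative signal actor: `wait` blocks until `send` is called."""

    def __init__(self):
        self.ready_event = asyncio.Event()

    def send(self, clear=False):
        # Unblock all pending `wait` calls; optionally reset for reuse.
        self.ready_event.set()
        if clear:
            self.ready_event.clear()

    async def wait(self, should_wait=True):
        # Block the caller until `send` has been called.
        if should_wait:
            await self.ready_event.wait()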
def test_actor_owner_worker_dies_before_dependency_ready(ray_start_cluster):
    """Test actor owner worker dies before local dependencies are resolved.

    This test verifies the scenario where the owner worker fails before the
    actor's dependencies are resolved.
    Reference: https://github.com/ray-project/ray/pull/8045
    """

    @ray.remote
    class Actor:
        def __init__(self, dependency):
            print("actor: {}".format(os.getpid()))
            self.dependency = dependency

        def f(self):
            return self.dependency

    @ray.remote
    class Owner:
        def get_pid(self):
            return os.getpid()

        def create_actor(self, caller_handle):
            s = SignalActor.remote()
            # Create an actor which depends on an object that can never be
            # resolved.
            actor_handle = Actor.remote(s.wait.remote())

            pid = os.getpid()
            signal_handle = SignalActor.remote()
            caller_handle.call.remote(pid, signal_handle, actor_handle)
            # Wait until the `Caller` starts executing the remote `call`
            # method.
            ray.get(signal_handle.wait.remote())
            # Exit the owner worker.
            os._exit(0)

    @ray.remote
    class Caller:
        def call(self, owner_pid, signal_handle, actor_handle):
            # Notify the `Owner` that the `Caller` is executing the remote
            # `call` method.
            ray.get(signal_handle.send.remote())
            # Wait for the `Owner` to exit.
            wait_for_pid_to_exit(owner_pid)
            oid = actor_handle.f.remote()
            # It will hang without the location resolution protocol.
            ray.get(oid)

        def hang(self):
            return True

    owner = Owner.remote()
    owner_pid = ray.get(owner.get_pid.remote())

    caller = Caller.remote()
    owner.create_actor.remote(caller)
    # Wait for the `Owner` to exit.
    wait_for_pid_to_exit(owner_pid)
    # It will hang here if the location is not properly resolved.
    wait_for_condition(lambda: ray.get(caller.hang.remote()))
def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
    """Test actor restart when the actor process is killed."""

    @ray.remote(max_restarts=1, max_task_retries=-1)
    class RestartableActor:
        """An actor that will be restarted at most once."""

        def __init__(self):
            self.value = 0

        def increase(self, delay=0):
            time.sleep(delay)
            self.value += 1
            return self.value

        def get_pid(self):
            return os.getpid()

    actor = RestartableActor.remote()
    pid = ray.get(actor.get_pid.remote())
    results = [actor.increase.remote() for _ in range(100)]
    # Kill the actor process while the above tasks are still being executed.
    os.kill(pid, SIGKILL)
    wait_for_pid_to_exit(pid)
    # Check that none of the tasks failed and the actor was restarted.
    seq = list(range(1, 101))
    results = ray.get(results)
    failed_task_index = None
    # Make sure that all tasks were executed in order before and after the
    # actor's death.
    for i, res in enumerate(results):
        if res != seq[0]:
            if failed_task_index is None:
                failed_task_index = i
            assert res + failed_task_index == seq[0]
        seq.pop(0)
    # Check that we can still call the actor.
    result = actor.increase.remote()
    assert ray.get(result) == results[-1] + 1

    # Kill the actor process one more time.
    results = [actor.increase.remote() for _ in range(100)]
    pid = ray.get(actor.get_pid.remote())
    os.kill(pid, SIGKILL)
    wait_for_pid_to_exit(pid)
    # The actor has exceeded max restarts, so this task should fail.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.increase.remote())

    # Create another actor.
    actor = RestartableActor.remote()
    # Intentionally exit the actor.
    actor.__ray_terminate__.remote()
    # Check that the actor won't be restarted.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.increase.remote())
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
    """Checks that node failure detection works when the GCS server restarts.

    We set the cluster to time out nodes after 2 seconds without heartbeats.
    We then kill the GCS server, remove the worker node, and restart the GCS
    server again to check that the removed node is eventually marked dead.
    """
    cluster = ray_start_cluster_head
    worker = cluster.add_node()
    cluster.wait_for_nodes()

    # Make sure both the head node and the worker node are alive.
    nodes = ray.nodes()
    assert len(nodes) == 2
    assert nodes[0]["alive"] and nodes[1]["alive"]

    to_be_removed_node = None
    for node in nodes:
        if node["RayletSocketName"] == worker.raylet_socket_name:
            to_be_removed_node = node
    assert to_be_removed_node is not None

    head_node = cluster.head_node
    gcs_server_process = head_node.all_processes["gcs_server"][0].process
    gcs_server_pid = gcs_server_process.pid
    # Kill the GCS server.
    cluster.head_node.kill_gcs_server()
    # Wait to prevent the GCS server process from becoming a zombie.
    gcs_server_process.wait()
    wait_for_pid_to_exit(gcs_server_pid, 1000)

    raylet_process = worker.all_processes["raylet"][0].process
    raylet_pid = raylet_process.pid
    # Remove the worker node.
    cluster.remove_node(worker, allow_graceful=False)
    # Wait to prevent the raylet process from becoming a zombie.
    raylet_process.wait()
    wait_for_pid_to_exit(raylet_pid)

    # Restart the GCS server process.
    cluster.head_node.start_gcs_server()

    def condition():
        nodes = ray.nodes()
        assert len(nodes) == 2
        for node in nodes:
            if node["NodeID"] == to_be_removed_node["NodeID"]:
                return not node["alive"]
        return False

    # Wait for the removed node to be marked dead.
    wait_for_condition(condition, timeout=10)
def test_actor_deletion(ray_start_regular_shared):
    # Make sure that when an actor handle goes out of scope, the actor
    # destructor is called.
    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
    a = None
    wait_for_pid_to_exit(pid)

    actors = [Actor.remote() for _ in range(10)]
    pids = ray.get([a.getpid.remote() for a in actors])
    a = None
    actors = None
    [wait_for_pid_to_exit(pid) for pid in pids]
def test_worker_capping_fifo(shutdown_only):
    # Start 2 initial workers by setting num_cpus to 2.
    info = ray.init(num_cpus=2)
    wait_for_condition(lambda: len(get_workers()) == 2)

    time.sleep(1)

    @ray.remote
    def getpid():
        return os.getpid()

    worker1, worker2 = get_workers()
    if worker1.pid == ray.get(getpid.remote()):
        worker1, worker2 = [worker2, worker1]

    # Worker 1 is before worker 2 in the FIFO queue.
    driver_code = """
import ray
import time

ray.init(address="{}")

@ray.remote
def foo():
    pass

ray.get(foo.remote())

# Sleep a while to make sure an idle worker exits before this driver exits.
time.sleep(2)

ray.shutdown()
""".format(info["redis_address"])

    run_string_as_driver(driver_code)

    # Worker 1 should have been killed.
    wait_for_pid_to_exit(worker1.pid)
    wait_for_condition(lambda: len(get_workers()) == 1)

    assert worker2.pid == get_workers()[0].pid
def kill_actor(actor):
    """A helper function that kills an actor process."""
    pid = ray.get(actor.get_pid.remote())
    os.kill(pid, SIGKILL)
    wait_for_pid_to_exit(pid)
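
# `wait_for_pid_to_exit`, used throughout these tests, is imported from Ray's
# test utilities. A minimal sketch of the polling behavior it provides,
# assuming psutil is available (the name, signature, and timeout below are
# illustrative, not the actual helper):
def _example_wait_for_pid_to_exit(pid, timeout=20):
    import psutil

    start = time.time()
    while time.time() - start < timeout:
        # Return as soon as the process is gone.
        if not psutil.pid_exists(pid):
            return
        time.sleep(0.1)
    raise TimeoutError(
        "Process {} did not exit within {} seconds.".format(pid, timeout))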
def test_basic_reconstruction_actor_constructor(ray_start_cluster,
                                                reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    # Both the constructor and a method depend on the large object.
    @ray.remote(max_restarts=-1)
    class Actor:
        def __init__(self, x):
            pass

        def dependent_task(self, x):
            return

        def pid(self):
            return os.getpid()

    obj = large_object.options(resources={"node1": 1}).remote()
    a = Actor.options(resources={"node1": 1}).remote(obj)
    ray.get(a.dependent_task.remote(obj))
    pid = ray.get(a.pid.remote())

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    # Wait for the actor to restart.
    def probe():
        try:
            ray.get(a.dependent_task.remote(obj))
            return True
        except ray.exceptions.RayActorError:
            return False
        except (ray.exceptions.RayTaskError, ray.exceptions.ObjectLostError):
            return True

    wait_for_condition(probe)

    if reconstruction_enabled:
        ray.get(a.dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            x = a.dependent_task.remote(obj)
            print(x)
            ray.get(x)
        with pytest.raises(ray.exceptions.ObjectLostError):
            raise e.as_instanceof_cause()
def test_basic_reconstruction_actor_task(ray_start_cluster,
                                         reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(
        max_restarts=-1,
        max_task_retries=-1 if reconstruction_enabled else 0,
        resources={"node1": 1},
        num_cpus=0)
    class Actor:
        def __init__(self):
            pass

        def large_object(self):
            return np.zeros(10**7, dtype=np.uint8)

        def pid(self):
            return os.getpid()

    @ray.remote
    def dependent_task(x):
        return

    a = Actor.remote()
    pid = ray.get(a.pid.remote())
    obj = a.large_object.remote()
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.as_instanceof_cause()

    # Make sure the actor handle is still usable.
    pid = ray.get(a.pid.remote())
def test_actor_restart(ray_init_with_task_retry_delay):
    """Test actor restart when the actor process is killed."""

    @ray.remote(max_restarts=1, max_task_retries=-1)
    class RestartableActor:
        """An actor that will be restarted at most once."""

        def __init__(self):
            self.value = 0

        def increase(self, delay=0):
            time.sleep(delay)
            self.value += 1
            return self.value

        def get_pid(self):
            return os.getpid()

    actor = RestartableActor.remote()
    pid = ray.get(actor.get_pid.remote())
    results = [actor.increase.remote() for _ in range(100)]
    # Kill the actor process while the above tasks are still being executed.
    os.kill(pid, SIGKILL)
    wait_for_pid_to_exit(pid)

    # Make sure that all tasks were executed in order before the actor's
    # death.
    res = results.pop(0)
    i = 1
    while True:
        try:
            r = ray.get(res)
            if r != i:
                # Actor restarted without any failed tasks.
                break
            res = results.pop(0)
            i += 1
        except ray.exceptions.RayActorError:
            # Actor restarted.
            break

    # Find the first task to execute after the actor was restarted.
    while True:
        try:
            r = ray.get(res)
            break
        except ray.exceptions.RayActorError:
            res = results.pop(0)

    # Make sure that all tasks were executed in order after the actor's
    # death.
    i = 1
    while True:
        r = ray.get(res)
        assert r == i
        if results:
            res = results.pop(0)
            i += 1
        else:
            break

    # Check that we can still call the actor.
    result = actor.increase.remote()
    assert ray.get(result) == r + 1

    # Kill the actor process one more time.
    results = [actor.increase.remote() for _ in range(100)]
    pid = ray.get(actor.get_pid.remote())
    os.kill(pid, SIGKILL)
    wait_for_pid_to_exit(pid)
    # The actor has exceeded max restarts, so this task should fail.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.increase.remote())

    # Create another actor.
    actor = RestartableActor.remote()
    # Intentionally exit the actor.
    actor.__ray_terminate__.remote()
    # Check that the actor won't be restarted.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.increase.remote())