def test_dedupe_cluster_failure(tmp_path):
    ray.shutdown()
    """
    ======== driver 1 ===========
    1. Checkpoint the input args
        * Uploads
    2. Begin to run step
        * Crash

    ====== driver 2 ============
    1. Recover inputs
        * Creates a new object ref
    2. Finish running step
    3. Checkpoint step output
        * Should not trigger upload
    """
    lock_file = tmp_path / "lock"
    workflow_dir = tmp_path / "workflow"

    driver_script = f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(objrefs):
    with FileLock("{str(lock_file)}"):
        return objrefs

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    arg = ray.put("hello world")
    workflow.create(foo.bind([arg, arg])).run()
    assert False
"""
    lock = FileLock(lock_file)
    lock.acquire()
    run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop", "--force"])
    lock.release()
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    objref = resumed.pop()[1]
    ray.get(objref)
    # The object ref differs before and after recovery, but it points at the
    # same checkpointed value, so the second checkpoint is deduped and no
    # extra upload happens.
    assert get_num_uploads() == 1
    ray.shutdown()

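# NOTE: The tests in this file drive scripts through the
# `run_string_as_driver_nonblocking` helper from Ray's test utilities
# (ray._private.test_utils). The sketch below is a simplified, hypothetical
# stand-in, shown only to illustrate the contract the tests depend on: the
# script runs in a fresh interpreter, and the returned handle exposes
# .stdout / .stderr pipes plus .poll() / .kill() without blocking the caller.
def _run_string_as_driver_nonblocking_sketch(driver_script, env=None):
    import subprocess
    import sys

    # Feed the script over stdin so no temporary file is needed.
    proc = subprocess.Popen(
        [sys.executable, "-c", "import sys; exec(sys.stdin.read())"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
    )
    proc.stdin.write(driver_script.encode("ascii"))
    proc.stdin.close()
    return proc
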
def test_spill_logs():
    script = """
import ray
import numpy as np

ray.init(object_store_memory=200e6)

x = []
for _ in range(10):
    x.append(ray.put(np.ones(100 * 1024 * 1024, dtype=np.uint8)))
"""
    proc = run_string_as_driver_nonblocking(
        script, env={"RAY_verbose_spill_logs": "1"})
    out_str = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    print(out_str)
    assert "Spilled " in out_str

    proc = run_string_as_driver_nonblocking(
        script, env={"RAY_verbose_spill_logs": "0"})
    out_str = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    print(out_str)
    assert "Spilled " not in out_str

def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")

tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()

def test_actor_stdout():
    script = """
import ray

ray.init(num_cpus=2)

@ray.remote
class Actor1:
    def f(self):
        print("hi")

@ray.remote
class Actor2:
    def f(self):
        print("bye")

    def __repr__(self):
        return "ActorX"

a = Actor1.remote()
ray.get(a.f.remote())
b = Actor2.remote()
ray.get(b.f.remote())
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    assert "hi" in out_str, out_str
    assert "(Actor1 pid=" in out_str, out_str
    assert "bye" in out_str, out_str
    assert "(Actor2 pid=" not in out_str, out_str
    assert "(ActorX pid=" in out_str, out_str

def test_recovery_cluster_failure(tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(f"""
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    print("Executing", x)
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    else:
        return 20

if __name__ == "__main__":
    ray.init(storage="{tmp_path}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
""")
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    ray.init(storage=str(tmp_path))
    workflow.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()

def test_runtime_env_hook(skip_hook):
    ray_init_snippet = "ray.init(_skip_env_hook=True)" if skip_hook else ""
    script = f"""
import ray
import os

{ray_init_snippet}

@ray.remote
def f():
    return os.environ.get("HOOK_KEY")

print(ray.get(f.remote()))
"""
    proc = run_string_as_driver_nonblocking(
        script, env={"RAY_RUNTIME_ENV_HOOK": "ray.tests.test_output._hook"})
    out_str = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    print(out_str)
    if skip_hook:
        assert "HOOK_VALUE" not in out_str
    else:
        assert "HOOK_VALUE" in out_str

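# Both runtime-env-hook tests point RAY_RUNTIME_ENV_HOOK at
# ray.tests.test_output._hook. Broadly, such a hook is a function that
# receives the job's runtime_env dict and returns a modified one. The exact
# body of the real _hook is not shown in this file; the sketch below is an
# assumption that illustrates the behavior the assertions rely on: injecting
# the HOOK_KEY environment variable that the driver scripts read back.
def _hook_sketch(runtime_env):
    runtime_env = runtime_env or {}
    env_vars = runtime_env.setdefault("env_vars", {})
    env_vars["HOOK_KEY"] = "HOOK_VALUE"  # Observed by f() in the tests above.
    return runtime_env
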
def test_multi_stdout():
    script = """
import ray
import sys

ray.init(num_cpus=1)

@ray.remote
def foo():
    print()

@ray.remote
def bar():
    print()

@ray.remote
def baz():
    print()

ray.get(foo.remote())
ray.get(bar.remote())
ray.get(baz.remote())
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    assert "(foo pid=" in out_str, out_str
    assert "(bar pid=" in out_str, out_str
    assert "(baz pid=" in out_str, out_str

def test_chained_workflow_logs(workflow_start_regular):
    script = """
import ray
from ray import workflow

ray.init(address='auto')

@workflow.step(name="f1")
def f1():
    return 10

@workflow.step(name="f2")
def f2(x):
    return x + 1

f2.step(f1.step()).run("wid1")
"""
    proc = run_string_as_driver_nonblocking(script)
    logs = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    # on driver
    assert 'Workflow job created. [id="wid1"' in logs
    # # in WorkflowManagementActor's run_or_resume.remote()
    # assert "run_or_resume: wid1" in logs
    # assert "Workflow job [id=wid1] started." in logs
    # in _workflow_step_executor_remote
    assert "Step status [RUNNING]\t[wid1@f1" in logs
    assert "Step status [SUCCESSFUL]\t[wid1@f1" in logs
    assert "Step status [RUNNING]\t[wid1@f2" in logs
    assert "Step status [SUCCESSFUL]\t[wid1@f2" in logs

def test_driver_dead(shutdown_only):
    """Make sure all Ray workers are shut down when the driver is killed."""
    driver = """
import ray
ray.init(_system_config={"ping_gcs_rpc_server_max_retries": 1})
@ray.remote
def f():
    import time
    time.sleep(10)

num_cpus = int(ray.available_resources()["CPU"])
tasks = [f.remote() for _ in range(num_cpus)]
"""

    p = run_string_as_driver_nonblocking(driver)
    # Make sure the driver is running.
    time.sleep(1)
    assert p.poll() is None
    wait_for_condition(lambda: len(get_all_ray_worker_processes()) > 0)

    # Kill the driver process.
    p.kill()
    p.wait()
    time.sleep(0.1)

    wait_for_condition(lambda: len(get_all_ray_worker_processes()) == 0)

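# test_driver_dead and test_head_node_down both poll
# `get_all_ray_worker_processes`. A minimal sketch of what such a helper can
# look like (hypothetical; the real helper in Ray's test utilities may match
# on different markers) scans process command lines for Ray's worker
# entrypoint:
def _get_all_ray_worker_processes_sketch():
    import psutil  # Third-party; assumed available in the test environment.

    workers = []
    for proc in psutil.process_iter(["cmdline"]):
        try:
            cmdline = proc.info["cmdline"] or []
        except psutil.Error:
            continue
        # Ray worker processes run the default_worker.py entrypoint.
        if any("default_worker.py" in part for part in cmdline):
            workers.append(proc)
    return workers
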
def test_virtual_actor_logs(workflow_start_regular):
    script = """
import ray
from ray import workflow

ray.init(address='auto')

@workflow.virtual_actor
class Counter:
    def __init__(self, x: int):
        self.x = x

    def add(self, y):
        self.x += y
        return self.x

counter = Counter.get_or_create("vid", 10)
counter.add.options(name="add").run(1)
"""
    proc = run_string_as_driver_nonblocking(script)
    logs = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    print(logs)
    # on driver
    assert 'Workflow job created. [id="vid"' in logs
    # # in WorkflowManagementActor's run_or_resume.remote()
    # assert "run_or_resume: vid" in logs
    # assert "Workflow job [id=vid] started." in logs
    # in _workflow_step_executor_remote
    assert "Step status [RUNNING]\t[vid@add" in logs
    assert "Step status [SUCCESSFUL]\t[vid@add" in logs

def test_dynamic_workflow_logs(workflow_start_regular):
    script = """
import ray
from ray import workflow

ray.init(address='auto')

@workflow.step(name="f3")
def f3(x):
    return x + 1

@workflow.step(name="f4")
def f4(x):
    return f3.step(x * 2)

f4.step(10).run("wid2")
"""
    proc = run_string_as_driver_nonblocking(script)
    logs = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    # on driver
    assert 'Workflow job created. [id="wid2"' in logs
    # # in WorkflowManagementActor's run_or_resume.remote()
    # assert "run_or_resume: wid2" in logs
    # assert "Workflow job [id=wid2] started." in logs
    # in _workflow_step_executor_remote
    assert "Step status [RUNNING]\t[wid2@f3" in logs
    assert "Step status [SUCCESSFUL]\t[wid2@f3" in logs
    assert "Step status [RUNNING]\t[wid2@f4" in logs
    assert "Step status [SUCCESSFUL]\t[wid2@f4" in logs

def test_gcs_server_crash_cluster(ray_start_cluster):
    # Test that a GCS server failure crashes the driver.
    cluster = ray_start_cluster
    GCS_RECONNECTION_TIMEOUT = 5
    node = cluster.add_node(
        num_cpus=0,
        _system_config={
            "gcs_rpc_server_reconnect_timeout_s": GCS_RECONNECTION_TIMEOUT
        },
    )

    script = """
import ray
import time

ray.init(address="auto")
time.sleep(60)
"""

    # Get gcs server pid to send a signal.
    all_processes = node.all_processes
    gcs_server_process = all_processes["gcs_server"][0].process
    gcs_server_pid = gcs_server_process.pid

    proc = run_string_as_driver_nonblocking(script)
    # Wait long enough to start the driver.
    time.sleep(5)
    start = time.time()
    print(gcs_server_pid)
    os.kill(gcs_server_pid, signal.SIGKILL)
    # poll() returns a value once the process has exited, so this waits for
    # the driver to die rather than hang.
    wait_for_condition(lambda: proc.poll() is not None, timeout=10)

    # Make sure the driver exited within the timeout instead of hanging.
    # * 2 for avoiding flakiness.
    assert time.time() - start < GCS_RECONNECTION_TIMEOUT * 2

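# Several tests here (test_driver_dead, test_gcs_server_crash_cluster,
# test_head_node_down) poll with `wait_for_condition` from Ray's test
# utilities. A minimal sketch of that polling contract, assuming the real
# helper's default timeout and retry interval, looks like:
def _wait_for_condition_sketch(condition_predictor, timeout=10,
                               retry_interval_ms=100):
    """Poll `condition_predictor` until it returns True or timeout expires."""
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")
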
def test_job_timestamps(ray_start_regular):
    driver_template = """
import ray
from time import sleep

ray.init(address="{}")
print("My job id: ", str(ray.get_runtime_context().job_id))

{}
ray.shutdown()
"""
    non_hanging = driver_template.format(ray_start_regular["address"],
                                         "sleep(1)")
    hanging_driver = driver_template.format(ray_start_regular["address"],
                                            "sleep(60)")

    out = run_string_as_driver(non_hanging)
    p = run_string_as_driver_nonblocking(hanging_driver)
    # The nonblocking process needs time to connect.
    time.sleep(1)

    jobs = list(ray.state.jobs())
    jobs.sort(key=lambda x: x["JobID"])

    driver = jobs[0]
    finished = jobs[1]
    running = jobs[2]

    # The initial driver timestamp/start time go down a different code path.
    assert driver["Timestamp"] == driver["StartTime"]
    assert finished["Timestamp"] == finished["EndTime"]
    assert running["Timestamp"] == running["StartTime"]

    assert finished["EndTime"] > finished["StartTime"] > 0, out
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 5000, f"Job should've taken ~1s, {finished}"

    assert running["StartTime"] > 0
    assert running["EndTime"] == 0

    p.kill()
    # Give the second job time to clean itself up.
    time.sleep(1)

    jobs = list(ray.state.jobs())
    jobs.sort(key=lambda x: x["JobID"])

    # jobs[0] is the test case driver.
    finished = jobs[1]
    prev_running = jobs[2]

    assert finished["EndTime"] > finished["StartTime"] > 0, f"{finished}"
    assert finished["EndTime"] == finished["Timestamp"]
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 5000, f"Job should've taken ~1s {finished}"

    assert prev_running["EndTime"] > prev_running["StartTime"] > 0

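# The next three variants of test_workflow_lifetime_2 format a module-level
# `driver_script` template that is not shown in this excerpt. A plausible
# sketch (assumed, not the verbatim template) starts a workflow named
# "driver_terminated" whose first step sleeps for the formatted number of
# seconds before returning 20, so killing the driver mid-step leaves a
# still-running workflow whose output can be fetched afterwards:
_driver_script_sketch = """
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    time.sleep({})
    return 20

if __name__ == "__main__":
    ray.init(address="auto")
    workflow.init()
    workflow.create(foo.bind(0)).run_async(workflow_id="driver_terminated")
    time.sleep(1)
"""
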
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    proc = run_string_as_driver_nonblocking(driver_script.format(100))
    time.sleep(10)
    proc.kill()
    time.sleep(1)
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20

def test_workflow_lifetime_2(workflow_start_cluster):
    # Case 2: driver terminated
    address, storage_uri = workflow_start_cluster
    with patch.dict(os.environ, {"RAY_ADDRESS": address}):
        ray.init()
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        assert workflow.get_output("driver_terminated") == 20

def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20

def test_recovery_cluster_failure(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    assert ray.get(workflow.resume("cluster_failure")) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()

def test_drivers_named_actors(call_ray_start):
    # This test will create some drivers that submit some tasks to the same
    # named actor.
    address = call_ray_start

    ray.init(address=address, namespace="test")

    # Define a driver that creates a named actor then sleeps for a while.
    driver_script1 = """
import ray
import time
ray.init(address="{}", namespace="test")

@ray.remote
class Counter:
    def __init__(self):
        self.count = 0

    def increment(self):
        self.count += 1
        return self.count

counter = Counter.options(name="Counter").remote()
time.sleep(100)
""".format(
        address
    )

    # Define a driver that submits to the named actor and exits.
    driver_script2 = """
import ray
import time
ray.init(address="{}", namespace="test")

while True:
    try:
        counter = ray.get_actor("Counter")
        break
    except ValueError:
        time.sleep(1)

assert ray.get(counter.increment.remote()) == {}
print("success")
""".format(
        address, "{}"
    )

    process_handle = run_string_as_driver_nonblocking(driver_script1)
    for i in range(3):
        driver_script = driver_script2.format(i + 1)
        out = run_string_as_driver(driver_script)
        assert "success" in out

    process_handle.kill()

def test_core_worker_error_message():
    script = """
import ray
import sys

ray.init(local_mode=True)

# In local mode this generates an ERROR level log.
ray._private.utils.push_error_to_driver(
    ray.worker.global_worker, "type", "Hello there")
"""
    proc = run_string_as_driver_nonblocking(script)
    err_str = proc.stderr.read().decode("ascii")
    assert "Hello there" in err_str, err_str

def test_actor_stdout(file):
    if file == "stdout":
        file_handle = "sys.stdout"
    else:  # sys.stderr
        file_handle = "sys.stderr"

    script = f"""
import ray
import sys

ray.init(num_cpus=2)

@ray.remote
class Actor1:
    def f(self):
        print("hi", file={file_handle})

@ray.remote
class Actor2:
    def __init__(self):
        print("init", file={file_handle})
        self.name = "ActorX"

    def f(self):
        print("bye", file={file_handle})

    def __repr__(self):
        return self.name

a = Actor1.remote()
ray.get(a.f.remote())
b = Actor2.remote()
ray.get(b.f.remote())
"""
    proc = run_string_as_driver_nonblocking(script)
    if file == "stdout":
        out_str = proc.stdout.read().decode("ascii")
    else:
        out_str = proc.stderr.read().decode("ascii")

    print(out_str)
    assert "hi" in out_str, out_str
    assert "(Actor1 pid=" in out_str, out_str
    assert "bye" in out_str, out_str
    assert re.search("Actor2 pid=.*init", out_str), out_str
    assert not re.search("ActorX pid=.*init", out_str), out_str
    assert re.search("ActorX pid=.*bye", out_str), out_str
    assert not re.search("Actor2 pid=.*bye", out_str), out_str

def test_runtime_env_hook():
    script = """
import ray
import os

@ray.remote
def f():
    return os.environ.get("HOOK_KEY")

print(ray.get(f.remote()))
"""
    proc = run_string_as_driver_nonblocking(
        script, env={"RAY_RUNTIME_ENV_HOOK": "ray.tests.test_output._hook"})
    out_str = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")
    print(out_str)
    assert "HOOK_VALUE" in out_str

def test_no_verbose_output():
    script = """
import ray

@ray.remote
class Actor:
    def ping(self):
        return "ok"

@ray.remote
def getter(name):
    actor = Actor.options(
        name="foo", lifetime="detached", namespace="n",
        get_if_exists=True).remote()
    ray.get(actor.ping.remote())

def do_run(name):
    name = "actor_" + str(name)
    tasks = [getter.remote(name) for i in range(4)]
    ray.get(tasks)
    try:
        ray.kill(ray.get_actor(name, namespace="n"))  # Cleanup
    except:
        pass

for i in range(100):
    do_run(i)

print("DONE")
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii") + proc.stderr.read().decode(
        "ascii")

    # Check that there are no excessively verbose raylet error messages due
    # to actor creation races.
    out = []
    for line in out_str.split("\n"):
        if "Ray dashboard" not in line and "The object store" not in line:
            out.append(line)
    valid = "".join(out)
    assert valid.strip() == "DONE", out_str

def test_fail_importing_actor(ray_start_regular, error_pubsub):
    script = """
import os
import sys
import tempfile
import ray

ray.init()
temporary_python_file = '''
def temporary_helper_function():
    return 1
'''

f = tempfile.NamedTemporaryFile("w+", suffix=".py", prefix="_", delete=True)
f_name = f.name
f.close()
f = open(f_name, "w+")
f.write(temporary_python_file)
f.flush()
directory = os.path.dirname(f_name)
# Get the module name and strip ".py" from the end.
module_name = os.path.basename(f_name)[:-3]
sys.path.append(directory)
module = __import__(module_name)

# Define an actor that closes over this temporary module. This should
# fail when it is unpickled.
@ray.remote
class Foo:
    def __init__(self):
        self.x = module.temporary_python_file()

a = Foo.remote()

import time
time.sleep(3)  # Wait for actor start.
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    err_str = proc.stderr.read().decode("ascii")
    print(out_str)
    print(err_str)
    assert "ModuleNotFoundError: No module named" in err_str
    assert "RuntimeError: The actor with name Foo failed to import" in err_str

def test_recovery_cluster_failure_resume_all(tmp_path, shutdown_only):
    ray.shutdown()

    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    lock = FileLock(lock_file)
    lock.acquire()

    proc = run_string_as_driver_nonblocking(
        f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    )
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    lock.release()
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20

def test_autoscaler_no_spam():
    script = """
import ray
import time

ray.init(num_cpus=1)

@ray.remote(num_cpus=1)
def f():
    time.sleep(1)

ray.get([f.remote() for _ in range(5)])
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    err_str = proc.stderr.read().decode("ascii")
    print(out_str, err_str)
    assert "Tip:" not in out_str
    assert "Tip:" not in err_str

def test_worker_stdout():
    script = """
import ray
import sys

ray.init(num_cpus=2)

@ray.remote
def foo(out_str, err_str):
    print(out_str)
    print(err_str, file=sys.stderr)

ray.get(foo.remote("abc", "def"))
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    err_str = proc.stderr.read().decode("ascii")

    assert out_str.endswith("abc\n")
    assert err_str.split("\n")[-2].endswith("def")

def test_head_node_down(short_gcs_publish_timeout, ray_start_cluster):
    """Make sure all Ray workers are killed when the head node dies."""
    cluster = ray_start_cluster
    # head node.
    head = cluster.add_node(
        num_cpus=2, _system_config={"gcs_rpc_server_reconnect_timeout_s": 1}
    )

    # worker nodes.
    num_worker_nodes = 2
    for _ in range(num_worker_nodes):
        cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Start a driver.
    driver = """
import ray
ray.init(address="{}")
@ray.remote
def f():
    import time
    time.sleep(10)

num_cpus = int(ray.available_resources()["CPU"])
tasks = [f.remote() for _ in range(num_cpus)]
import time
time.sleep(100)
""".format(
        cluster.address
    )

    p = run_string_as_driver_nonblocking(driver)
    # Make sure the driver is running.
    time.sleep(1)
    wait_for_condition(lambda: p.poll() is None)
    wait_for_condition(lambda: len(get_all_ray_worker_processes()) > 0)

    cluster.remove_node(head)

    wait_for_condition(lambda: len(get_all_ray_worker_processes()) == 0)

def test_autoscaler_infeasible():
    script = """
import ray
import time

ray.init(num_cpus=1)

@ray.remote(num_gpus=1)
def foo():
    pass

x = foo.remote()
time.sleep(15)
"""
    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    err_str = proc.stderr.read().decode("ascii")
    print(out_str, err_str)
    assert "Tip:" in out_str
    assert "Error: No available node types can fulfill" in out_str

def test_env_installation_nonblocking(shutdown_only):
    """Test fix for https://github.com/ray-project/ray/issues/16226."""
    env1 = {"pip": ["pip-install-test==0.5"]}

    ray.init(runtime_env=env1)

    @ray.remote
    def f():
        return "hello"

    # Warm up a worker because it takes time to start.
    ray.get(f.remote())

    def assert_tasks_finish_quickly(total_sleep_s=0.1):
        """Call f every 0.01 seconds for total time total_sleep_s."""
        gap_s = 0.01
        for i in range(int(total_sleep_s / gap_s)):
            start = time.time()
            ray.get(f.remote())
            # Env installation takes around 10 to 60 seconds. If we fail the
            # below assert, we can be pretty sure an env installation blocked
            # the task.
            assert time.time() - start < 1.0
            time.sleep(gap_s)

    assert_tasks_finish_quickly()

    env2 = {"pip": ["pip-install-test==0.5", "requests"]}
    f.options(runtime_env=env2).remote()

    # Check that installing env2 above does not block tasks using env1.
    assert_tasks_finish_quickly()

    proc = run_string_as_driver_nonblocking(
        install_env_script.format(env=env1))
    # Check that installing env1 in a new worker in the script above does not
    # block other tasks that use env1.
    assert_tasks_finish_quickly(total_sleep_s=5)
    proc.kill()
    proc.wait()

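# test_env_installation_nonblocking above formats a module-level
# `install_env_script` template that is not shown in this excerpt. A rough
# sketch of its likely contents (an assumption, not the verbatim template)
# connects a second driver to the same cluster with the given runtime_env,
# forcing a fresh env installation in new workers:
_install_env_script_sketch = """
import ray
import time

job_config = ray.job_config.JobConfig(runtime_env={env})
ray.init(address="auto", job_config=job_config)

@ray.remote
def f():
    return "hello"

f.remote()
# Give the installation some time to start before the driver is killed.
time.sleep(5)
"""
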
def test_multi_stdout_err(file):
    if file == "stdout":
        file_handle = "sys.stdout"
    else:  # sys.stderr
        file_handle = "sys.stderr"

    script = f"""
import ray
import sys

ray.init(num_cpus=1)

@ray.remote
def foo():
    print(file={file_handle})

@ray.remote
def bar():
    print(file={file_handle})

@ray.remote
def baz():
    print(file={file_handle})

ray.get(foo.remote())
ray.get(bar.remote())
ray.get(baz.remote())
"""
    proc = run_string_as_driver_nonblocking(script)
    if file == "stdout":
        out_str = proc.stdout.read().decode("ascii")
    else:
        out_str = proc.stderr.read().decode("ascii")

    out_str = "".join(out_str.splitlines())
    assert "(foo pid=" in out_str, out_str
    assert "(bar pid=" in out_str, out_str
    assert "(baz pid=" in out_str, out_str