def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(incr.options(max_retries=0).bind(), workflow_id="incr")

    assert cnt_file.read_text() == "1"

    from ray.exceptions import RaySystemError

    # TODO(suquark): We should prevent Ray from raising "RaySystemError" in
    # workflows, because "RaySystemError" does not inherit the underlying
    # error, so users and developers cannot catch the expected error.
    # I feel this issue is very annoying.
    with pytest.raises((RaySystemError, ValueError)):
        workflow.get_output("incr")

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises((RaySystemError, ValueError)):
        workflow.get_output("incr")
    assert workflow.resume("incr") == 10
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @workflow.step
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(ray.exceptions.RaySystemError):
        incr.options(max_retries=1).step().run("incr")
    assert cnt_file.read_text() == "1"

    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert cnt_file.read_text() == "1"

    error_flag.unlink()
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
def test_output_with_name(workflow_start_regular):
    @ray.remote
    def double(v):
        return 2 * v

    inner_task = double.options(**workflow.options(name="inner")).bind(1)
    outer_task = double.options(**workflow.options(name="outer")).bind(inner_task)
    result = workflow.create(outer_task).run_async("double")
    inner = workflow.get_output("double", name="inner")
    outer = workflow.get_output("double", name="outer")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4

    @workflow.options(name="double")
    @ray.remote
    def double_2(s):
        return s * 2

    inner_task = double_2.bind(1)
    outer_task = double_2.bind(inner_task)
    workflow_id = "double_2"
    result = workflow.create(outer_task).run_async(workflow_id)
    inner = workflow.get_output(workflow_id, name="double")
    outer = workflow.get_output(workflow_id, name="double_1")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @workflow.step
    def double(v):
        return 2 * v

    # Get the results from the named steps after the workflow finished.
    assert 4 == double.options(name="outer").step(
        double.options(name="inner").step(1)).run("double")
    assert ray.get(workflow.get_output("double", name="inner")) == 2
    assert ray.get(workflow.get_output("double", name="outer")) == 4
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock_path):
                return 2 * v
        else:
            return 2 * v

    # Get the results from the named steps before the workflow finishes.
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.create(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(1, lock_path),
            lock_path,
        )).run_async("double-2")
    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]), wait.remote([inner]), wait.remote([outer])],
        timeout=1)
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the job finishes, we'll be able to get the result.
    lock.release()
    assert 4 == ray.get(output)

    # Sometimes the inner step has not been generated yet when we call
    # run_async, so there is a race condition here.
    try:
        v = ray.get(inner)
    except Exception:
        v = None
    if v is not None:
        assert 2 == v
    assert 4 == ray.get(outer)

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")
    assert 2 == ray.get(inner)
    assert 4 == ray.get(outer)
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v):
        return 2 * v

    # Get the results from the named steps after the workflow finished.
    assert 4 == workflow.create(
        update_workflow_options(double, name="outer").bind(
            update_workflow_options(double, name="inner").bind(1))).run("double")
    assert ray.get(workflow.get_output("double", name="inner")) == 2
    assert ray.get(workflow.get_output("double", name="outer")) == 4
def test_get_named_step_duplicate(workflow_start_regular):
    @workflow.step(name="f")
    def f(n, dep):
        return n

    inner = f.step(10, None)
    outer = f.step(20, inner)
    assert 20 == outer.run("duplicate")
    # The outer step is checkpointed first, so its name has no suffix.
    assert ray.get(workflow.get_output("duplicate", name="f")) == 20
    # The inner step is checkpointed after the outer one; since the name is
    # duplicated, the suffix "_1" is appended automatically.
    assert ray.get(workflow.get_output("duplicate", name="f_1")) == 10
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v):
        return 2 * v

    # Get the results from the named steps after the workflow finished.
    assert 4 == workflow.run(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(1)),
        workflow_id="double",
    )
    assert workflow.get_output("double", name="inner") == 2
    assert workflow.get_output("double", name="outer") == 4
def test_get_named_step_duplicate(workflow_start_regular):
    @workflow.options(name="f")
    @ray.remote
    def f(n, dep):
        return n

    inner = f.bind(10, None)
    outer = f.bind(20, inner)
    assert 20 == workflow.run(outer, workflow_id="duplicate")
    # The inner task finishes and is checkpointed first, so its name has
    # no suffix.
    assert workflow.get_output("duplicate", name="f") == 10
    # The outer task is checkpointed after the inner one; since the name is
    # duplicated, the suffix "_1" is appended automatically.
    assert workflow.get_output("duplicate", name="f_1") == 20
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed,
    we properly re-poll for the event."""
    from ray._private import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
def test_get_output_1(workflow_start_regular, tmp_path):
    @workflow.step
    def simple(v):
        return v

    assert 0 == simple.step(0).run("simple")
    assert 0 == ray.get(workflow.get_output("simple"))
def test_workflow_lifetime_1(workflow_start_cluster):
    # Case 1: driver exits normally
    address, storage_uri = workflow_start_cluster
    with patch.dict(os.environ, {"RAY_ADDRESS": address}):
        ray.init()
        run_string_as_driver(driver_script.format(5))
        assert workflow.get_output("driver_terminated") == 20
def test_workflow_lifetime_1(call_ray_start, reset_workflow):
    # Case 1: driver exits normally
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        run_string_as_driver(driver_script.format(5))
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20
def test_get_output_4(workflow_start_regular, tmp_path):
    """Test getting the outputs of workflow tasks that are dynamically
    generated."""
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def recursive(n):
        if n <= 0:
            with FileLock(lock_path):
                return 42
        return workflow.continuation(
            recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1)
        )

    workflow_id = "test_get_output_4"
    lock.acquire()
    obj = workflow.create(
        recursive.options(**workflow.options(name="10")).bind(10)
    ).run_async(workflow_id)

    outputs = [workflow.get_output(workflow_id, name=str(i)) for i in range(11)]
    outputs.append(obj)

    import time

    # Wait so that 'get_output' is scheduled before executing the workflow.
    time.sleep(3)
    lock.release()
    assert ray.get(outputs) == [42] * len(outputs)
def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash. Here we must call `event_checkpointed`
    twice, because there's no way to know if we called it after
    checkpointing.
    """
    _storage = storage.get_global_storage()

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")
    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
def test_get_output_1(workflow_start_regular, tmp_path):
    @ray.remote
    def simple(v):
        return v

    assert 0 == workflow.create(simple.bind(0)).run("simple")
    assert 0 == ray.get(workflow.get_output("simple"))
def test_task_id_generation(workflow_start_regular_shared, request):
    @ray.remote
    def simple(x):
        return x + 1

    x = simple.options(**workflow.options(name="simple")).bind(-1)
    n = 20
    for i in range(1, n):
        x = simple.options(**workflow.options(name="simple")).bind(x)

    workflow_id = "test_task_id_generation"
    ret = workflow.create(x).run_async(workflow_id=workflow_id)
    outputs = [workflow.get_output(workflow_id, name="simple")]
    for i in range(1, n):
        outputs.append(workflow.get_output(workflow_id, name=f"simple_{i}"))
    assert ray.get(ret) == n - 1
    assert ray.get(outputs) == list(range(n))
def test_get_named_step_output_error(workflow_start_regular, tmp_path):
    @workflow.step
    def double(v, error):
        if error:
            raise Exception()
        return v + v

    # Force the outer step to fail.
    with pytest.raises(Exception):
        double.options(name="outer").step(
            double.options(name="inner").step(1, False), True).run("double")

    # The inner step should have already been executed.
    assert 2 == ray.get(workflow.get_output("double", name="inner"))
    outer = workflow.get_output("double", name="outer")
    with pytest.raises(Exception):
        ray.get(outer)
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    proc = run_string_as_driver_nonblocking(driver_script.format(100))
    time.sleep(10)
    proc.kill()
    time.sleep(1)
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20
def test_workflow_lifetime_2(workflow_start_cluster):
    # Case 2: driver terminated
    address, storage_uri = workflow_start_cluster
    with patch.dict(os.environ, {"RAY_ADDRESS": address}):
        ray.init()
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        assert workflow.get_output("driver_terminated") == 20
def test_get_named_step_output_error(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, error):
        if error:
            raise Exception()
        return v + v

    # Force the outer step to fail.
    with pytest.raises(Exception):
        workflow.run(
            double.options(**workflow.options(name="outer")).bind(
                double.options(**workflow.options(name="inner")).bind(1, False),
                True,
            ),
            workflow_id="double",
        )

    # The inner step should have already been executed.
    assert 2 == workflow.get_output("double", name="inner")
    with pytest.raises(Exception):
        workflow.get_output("double", name="outer")
def test_get_non_exist_output(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")

    @ray.remote
    def simple():
        with FileLock(lock_path):
            return "hello"

    workflow_id = "test_get_non_exist_output"

    with FileLock(lock_path):
        dag = simple.options(**workflow.options(name="simple")).bind()
        ret = workflow.create(dag).run_async(workflow_id=workflow_id)
        exist = workflow.get_output(workflow_id, name="simple")
        non_exist = workflow.get_output(workflow_id, name="non_exist")

    assert ray.get(ret) == "hello"
    assert ray.get(exist) == "hello"
    with pytest.raises(ValueError, match="non_exist"):
        ray.get(non_exist)
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock_path):
                return 2 * v
        else:
            return 2 * v

    # Get the results from the named steps before the workflow finishes.
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.create(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(1, lock_path),
            lock_path,
        )
    ).run_async("double-2")
    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]), wait.remote([inner]), wait.remote([outer])], timeout=1
    )
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the job finishes, we'll be able to get the result.
    lock.release()
    assert [4, 2, 4] == ray.get([output, inner, outer])

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")
    assert [2, 4] == ray.get([inner, outer])
def test_get_output_2(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @workflow.step
    def simple(v):
        with FileLock(lock_path):
            return v

    lock.acquire()
    obj = simple.step(0).run_async("simple")
    obj2 = workflow.get_output("simple")
    lock.release()
    assert ray.get([obj, obj2]) == [0, 0]
def test_get_output_5(workflow_start_regular, tmp_path):
    """Test getting the output of a workflow task immediately after
    executing it asynchronously."""

    @ray.remote
    def simple():
        return 314

    workflow_id = "test_get_output_5_{}"

    outputs = []
    for i in range(20):
        workflow.create(simple.bind()).run_async(workflow_id.format(i))
        outputs.append(workflow.get_output(workflow_id.format(i)))

    assert ray.get(outputs) == [314] * len(outputs)
def test_get_named_step_default(workflow_start_regular, tmp_path):
    @workflow.step
    def factorial(n, r=1):
        if n == 1:
            return r
        return factorial.step(n - 1, r * n)

    import math

    assert math.factorial(5) == factorial.step(5).run("factorial")
    for i in range(5):
        step_name = ("test_basic_workflows_2."
                     "test_get_named_step_default.locals.factorial")
        if i != 0:
            step_name += "_" + str(i)
        # All outputs will be 120.
        assert math.factorial(5) == ray.get(
            workflow.get_output("factorial", name=step_name))
def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue

    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i
def test_get_named_step_default(workflow_start_regular, tmp_path):
    @ray.remote
    def factorial(n, r=1):
        if n == 1:
            return r
        return workflow.continuation(factorial.bind(n - 1, r * n))

    import math

    assert math.factorial(5) == workflow.run(
        factorial.bind(5), workflow_id="factorial"
    )
    for i in range(5):
        step_name = (
            "python.ray.workflow.tests.test_basic_workflows_2."
            "test_get_named_step_default.locals.factorial"
        )
        if i != 0:
            step_name += "_" + str(i)
        # All outputs will be 120.
        assert math.factorial(5) == workflow.get_output("factorial", name=step_name)
def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]