def test_wait_failure_recovery_2(workflow_start_regular_shared):
    """Cancel a workflow built on "workflow.wait", then resume it."""
    wf_id = "wait_failure_recovery_2"

    @workflow.step
    def sleep_identity(x: int):
        # Hold the step until the global mark is set, then sleep x seconds.
        while True:
            if utils.check_global_mark():
                break
            time.sleep(0.1)
        time.sleep(x)
        return x

    @workflow.step
    def identity(x):
        return x

    pending = [sleep_identity.step(delay) for delay in (2, 5, 1)]
    waited = workflow.wait(pending, num_returns=2, timeout=None)

    # With the mark unset every sleep_identity step stays blocked.
    utils.unset_global_mark()
    _ = identity.step(waited).run_async(workflow_id=wf_id)

    # Pause before cancelling; presumably long enough for "workflow.wait"
    # to have started running.
    time.sleep(10)
    workflow.cancel(wf_id)
    time.sleep(2)

    # Unblock the steps and recover the cancelled workflow.
    utils.set_global_mark()
    ready, _not_ready = ray.get(workflow.resume(wf_id))
    assert ready == [2, 1]
def test_wait_recovery_step_id(workflow_start_regular_shared):
    """Check that recovery reuses the original step id of "workflow.wait"."""
    wf_id = "test_wait_recovery_step_id"

    @workflow.step
    def identity(x: int):
        # Fails the step while the global mark is unset.
        assert utils.check_global_mark()
        return x

    waited = workflow.wait([identity.step(42)], num_returns=1, timeout=None)

    # First run: mark unset, so the input step asserts and the run fails.
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = waited.run(workflow_id=wf_id)

    # Second run: mark set, so resuming succeeds.
    utils.set_global_mark()
    ready, _not_ready = ray.get(workflow.resume(wf_id))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage

    wf_storage = workflow_storage.WorkflowStorage(
        wf_id, storage.get_global_storage()
    )
    # no new step id should have been generated for "workflow.wait"
    assert wf_storage.gen_step_id("workflow.wait") <= 1
def test_wait_for_multiple_events(workflow_start_regular_shared):
    """A workflow with several event arguments should wait for all of them
    at the same time.
    """

    class EventListener1(workflow.EventListener):
        async def poll_for_event(self):
            # Announce that this listener is polling, then wait for the trigger.
            utils.set_global_mark("listener1")
            while True:
                if utils.check_global_mark("trigger_event"):
                    return "event1"
                await asyncio.sleep(0.1)

    class EventListener2(workflow.EventListener):
        async def poll_for_event(self):
            # Announce that this listener is polling, then wait for the trigger.
            utils.set_global_mark("listener2")
            while True:
                if utils.check_global_mark("trigger_event"):
                    return "event2"
                await asyncio.sleep(0.1)

    @ray.remote
    def trivial_step(arg1, arg2):
        return f"{arg1} {arg2}"

    def both_listening():
        return utils.check_global_mark("listener1") and utils.check_global_mark(
            "listener2"
        )

    first_event = workflow.wait_for_event(EventListener1)
    second_event = workflow.wait_for_event(EventListener2)
    dag = trivial_step.bind(first_event, second_event)
    promise = workflow.create(dag).run_async()

    # The shared trigger is only fired once both listeners have started.
    while not both_listening():
        time.sleep(0.1)

    utils.set_global_mark("trigger_event")
    assert ray.get(promise) == "event1 event2"
def test_wait_failure_recovery_1(workflow_start_regular_shared):
    """A step consuming the output of "workflow.wait" can be recovered
    after it fails.
    """
    wf_id = "wait_failure_recovery"

    @workflow.step
    def get_all(ready, unready):
        return ready, unready

    @workflow.step
    def filter_all_2(wait_results):
        assert wait_results[0] == [1, 3, 2]
        # failure point: only passes once the global mark is set
        assert utils.check_global_mark()
        done, remaining = wait_results
        return get_all.step(done, remaining)

    @workflow.step
    def composite_2():
        return filter_all_2.step(wait_multiple_steps.step())

    # First attempt fails at the failure point inside filter_all_2.
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        composite_2.step().run(workflow_id=wf_id)

    # Resuming with the mark set must produce the full wait result.
    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume(wf_id))
    assert ready == [1, 3, 2]
    assert unready == [10, 12]
def test_recovery_simple(workflow_start_regular):
    """Fail a linear three-step workflow, then resume it twice."""
    expected = "foo(x[append1])[append2]"
    workflow_id = "test_recovery_simple"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple(x):
        # append1 -> the_failed_step -> append2, returned as a continuation
        failed = the_failed_step.bind(append1.bind(x))
        return workflow.continuation(append2.bind(failed))

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    first_resume = workflow.resume(workflow_id)
    assert ray.get(first_resume) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    second_resume = workflow.resume(workflow_id)
    assert ray.get(second_resume) == expected
def test_event_as_workflow(workflow_start_regular_shared):
    """Run a bare wait_for_event as the whole workflow."""

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            # Poll once per second until the global mark is set; returns None.
            while True:
                if utils.check_global_mark():
                    break
                await asyncio.sleep(1)

    utils.unset_global_mark()
    event_workflow = workflow.wait_for_event(MyEventListener)
    promise = event_workflow.run_async("wf")

    # The listener is still waiting on the mark at this point.
    status = workflow.get_status("wf")
    assert status == workflow.WorkflowStatus.RUNNING

    utils.set_global_mark()
    result = ray.get(promise)
    assert result is None
def test_recovery_simple_1(workflow_start_regular):
    """Fail a single-step workflow, then resume it twice."""
    workflow_id = "test_recovery_simple_1"
    expected = "foo(x)"

    utils.unset_global_mark()
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.run(the_failed_step.bind("x"), workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    first_result = workflow.resume(workflow_id)
    assert first_result == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    second_result = workflow.resume(workflow_id)
    assert second_result == expected
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed, we
    properly re-poll for the event.

    Also ensure that we don't re-call poll_for_event after
    `event_checkpointed` returns, even after a crash.
    """
    from ray._private import storage

    # Remember the storage URI so the restarted cluster can use the same one.
    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            # Polling must never happen once the event has been committed.
            assert not utils.check_global_mark("committed")
            # A repeated invocation (the "first" mark already exists)
            # additionally records "second".
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            # Signal the test body that it may kill the cluster, then hold
            # the event until the test sets "resume".
            utils.set_global_mark("time_to_die")
            # NOTE(review): this is a blocking time.sleep inside a coroutine;
            # left unchanged on purpose -- confirm whether blocking the event
            # loop is intended here.
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    # Wait until the listener is inside poll_for_event.
    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")

    # Tear the cluster down while the event is still pending.
    # check_output returns once the `ray stop --force` command has exited.
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Bring up a fresh cluster on the same storage and resume the workflow.
    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    # "second" is only set when poll_for_event ran again after the restart.
    assert utils.check_global_mark("second")
def test_recovery_complex(workflow_start_regular):
    """Fail the module-level `complex` workflow, then resume it twice."""
    workflow_id = "test_recovery_complex"
    expected = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        complex.step("x").run(workflow_id=workflow_id)

    utils.set_global_mark()
    first_resume = workflow.resume(workflow_id)
    assert ray.get(first_resume) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    second_resume = workflow.resume(workflow_id)
    assert ray.get(second_resume) == expected
def test_recovery_simple(workflow_start_regular):
    """Fail the module-level `simple` workflow, then resume it twice."""
    workflow_id = "test_recovery_simple"
    expected = "foo(x[append1])[append2]"

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        simple.step("x").run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    first_resume = workflow.resume(workflow_id)
    assert ray.get(first_resume) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    second_resume = workflow.resume(workflow_id)
    assert ray.get(second_resume) == expected
def test_checkpoint_dag_recovery_partial(workflow_start_regular_shared):
    """Time a failing run of checkpoint_dag(False) and its later recovery."""
    wf_id = "checkpoint_partial_recovery"

    # Failing run: the global mark is unset.
    utils.unset_global_mark()
    run_started = time.time()
    with pytest.raises(RaySystemError):
        dag = checkpoint_dag.bind(False)
        workflow.create(dag).run(workflow_id=wf_id)
    run_duration_partial = time.time() - run_started

    # Recovery: the global mark is set.
    utils.set_global_mark()
    resume_started = time.time()
    recovered = ray.get(workflow.resume(wf_id))
    recover_duration_partial = time.time() - resume_started

    expected_mean = np.arange(SIZE).mean()
    assert np.isclose(recovered, expected_mean)
    print(
        f"[partial] run_duration = {run_duration_partial}, "
        f"recover_duration = {recover_duration_partial}"
    )
def test_checkpoint_dag_recovery_whole(workflow_start_regular_shared):
    """Time a failing run of checkpoint_dag(True) and its later recovery."""
    wf_id = "checkpoint_whole_recovery"

    # Failing run: the global mark is unset.
    utils.unset_global_mark()
    run_started = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(checkpoint_dag.bind(True), workflow_id=wf_id)
    run_duration_whole = time.time() - run_started

    # Recovery: the global mark is set.
    utils.set_global_mark()
    resume_started = time.time()
    recovered = workflow.resume(wf_id)
    recover_duration_whole = time.time() - resume_started

    expected_mean = np.arange(SIZE).mean()
    assert np.isclose(recovered, expected_mean)
    print(
        f"[whole] run_duration = {run_duration_whole}, "
        f"recover_duration = {recover_duration_whole}"
    )
def test_recovery_complex(workflow_start_regular):
    """Fail a branching DAG workflow, then resume it twice."""
    workflow_id = "test_recovery_complex"
    expected = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"

    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def join(x, y):
        return f"join({x}, {y})"

    @ray.remote
    def complex(x1):
        # Nodes are bound in the same order as before; only names differ.
        src = source1.bind()
        right = join.bind(x1, src)
        appended = append1.bind(x1)
        failing = the_failed_step.bind(appended)
        src_appended = append2.bind(src)
        left = join.bind(failing, src_appended)
        return workflow.continuation(join.bind(left, right))

    utils.unset_global_mark()
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(complex.bind("x")).run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    first_resume = workflow.resume(workflow_id)
    assert ray.get(first_resume) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    second_resume = workflow.resume(workflow_id)
    assert ray.get(second_resume) == expected
def test_checkpoint_dag_recovery_skip(workflow_start_regular_shared):
    """Time a failing run of checkpoint_dag(False) with checkpoint=False
    and its later recovery.
    """
    wf_id = "checkpoint_skip_recovery"

    # Failing run: the global mark is unset.
    utils.unset_global_mark()
    run_started = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        no_checkpoint = workflow.options(checkpoint=False)
        dag = checkpoint_dag.options(**no_checkpoint).bind(False)
        workflow.create(dag).run(workflow_id=wf_id)
    run_duration_skipped = time.time() - run_started

    # Recovery: the global mark is set.
    utils.set_global_mark()
    resume_started = time.time()
    recovered = ray.get(workflow.resume(wf_id))
    recover_duration_skipped = time.time() - resume_started

    expected_mean = np.arange(SIZE).mean()
    assert np.isclose(recovered, expected_mean)
    print(
        f"[skipped] run_duration = {run_duration_skipped}, "
        f"recover_duration = {recover_duration_skipped}"
    )
def test_recovery_simple_2(workflow_start_regular):
    """Fail a workflow whose only step continues into the_failed_step,
    then resume it twice.
    """
    workflow_id = "test_recovery_simple_2"
    expected = "foo(x)"

    @ray.remote
    def simple(x):
        failing = the_failed_step.bind(x)
        return workflow.continuation(failing)

    utils.unset_global_mark()
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    first_resume = workflow.resume(workflow_id)
    assert ray.get(first_resume) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    second_resume = workflow.resume(workflow_id)
    assert ray.get(second_resume) == expected
async def event_checkpointed(self, event):
    """Mark the event as committed; hang on the first call only.

    The first invocation sets "first" and then sleeps for 1,000,000 seconds.
    Any invocation that finds "first" already set records "second" and
    returns.
    """
    utils.set_global_mark("committed")
    seen_before = utils.check_global_mark("first")
    if seen_before:
        utils.set_global_mark("second")
        return
    utils.set_global_mark("first")
    await asyncio.sleep(1000000)
async def poll_for_event(self):
    """Record poll attempts and block until the "resume" mark is set.

    Asserts that "committed" is not set, records "second" when "first"
    already exists, then sets "first" and "time_to_die".
    """
    committed = utils.check_global_mark("committed")
    assert not committed

    if utils.check_global_mark("first"):
        utils.set_global_mark("second")
    utils.set_global_mark("first")
    utils.set_global_mark("time_to_die")

    # NOTE(review): blocking time.sleep inside a coroutine is kept as-is.
    while True:
        if utils.check_global_mark("resume"):
            break
        time.sleep(0.1)
async def poll_for_event(self):
    """Set "listener2", wait for "trigger_event", then return "event2"."""
    utils.set_global_mark("listener2")
    while True:
        if utils.check_global_mark("trigger_event"):
            return "event2"
        await asyncio.sleep(0.1)
async def event_checkpointed(self, event):
    """Set the "committed" global mark; `event` itself is not used."""
    committed_mark = "committed"
    utils.set_global_mark(committed_mark)
def triggers_event():
    """Set the default global mark, then block until "event_returning" is set."""
    utils.set_global_mark()
    while True:
        if utils.check_global_mark("event_returning"):
            break
        time.sleep(0.1)
def never_ends(x):
    """Set the default global mark, sleep 1,000,000 seconds, then return x."""
    utils.set_global_mark()
    effectively_forever = 1000000
    time.sleep(effectively_forever)
    return x
def triggers_event():
    """Set the default global mark and return immediately."""
    utils.set_global_mark()
    return None
def test_checkpoint_dag_recovery(workflow_start_regular):
    """Time failing runs and recoveries of checkpoint_dag2 in three modes:
    checkpoint disabled ("skipped"), step(False) ("partial") and
    step(True) ("whole").
    """
    expected_mean = 8388607.5

    def crash_then_resume(make_step, workflow_id):
        # Run with the mark unset (expected to raise), then resume with the
        # mark set. Returns (run_duration, recover_duration) in seconds.
        utils.unset_global_mark()
        began = time.time()
        with pytest.raises(RaySystemError):
            make_step().run(workflow_id=workflow_id)
        run_duration = time.time() - began

        utils.set_global_mark()
        began = time.time()
        recovered = ray.get(workflow.resume(workflow_id))
        recover_duration = time.time() - began
        assert np.isclose(recovered, expected_mean)
        return run_duration, recover_duration

    utils.set_global_mark()
    # warm up to ensure precise timing
    for _ in range(3):
        outputs = checkpoint_dag2.step(True).run()
        assert np.isclose(outputs, expected_mean)

    run_duration_skipped, recover_duration_skipped = crash_then_resume(
        lambda: checkpoint_dag2.options(checkpoint=False).step(False),
        "checkpoint_skip2",
    )
    run_duration_partial, recover_duration_partial = crash_then_resume(
        lambda: checkpoint_dag2.step(False),
        "checkpoint_partial2",
    )
    run_duration_whole, recover_duration_whole = crash_then_resume(
        lambda: checkpoint_dag2.step(True),
        "checkpoint_whole2",
    )

    print(
        f"[skipped] run_duration = {run_duration_skipped}, "
        f"recover_duration = {recover_duration_skipped}"
    )
    print(
        f"[partial] run_duration = {run_duration_partial}, "
        f"recover_duration = {recover_duration_partial}"
    )
    print(
        f"[whole] run_duration = {run_duration_whole}, "
        f"recover_duration = {recover_duration_whole}"
    )
async def poll_for_event(self):
    """Wait for the default global mark, then set "event_returning"."""
    while True:
        if utils.check_global_mark():
            break
        await asyncio.sleep(0.1)
    utils.set_global_mark("event_returning")