def test_run_or_resume_during_running(workflow_start_regular_shared):
    """Re-running or resuming an in-flight workflow id must be rejected.

    Launches ``simple_sequential`` asynchronously under the id
    "running_workflow", then expects a second ``run_async`` and a
    ``workflow.resume`` on the same id to raise ``RuntimeError``. The first
    launch must still deliver its normal result.
    """
    wf_id = "running_workflow"
    expected = "[source1][append1][append2]"

    first_run = simple_sequential.step().run_async(workflow_id=wf_id)

    # Starting the same id again is expected to fail.
    with pytest.raises(RuntimeError):
        simple_sequential.step().run_async(workflow_id=wf_id)

    # Resuming the same id is expected to fail too.
    with pytest.raises(RuntimeError):
        workflow.resume(workflow_id=wf_id)

    assert ray.get(first_run) == expected
def test_run_or_resume_during_running():
    """Re-running or resuming an in-flight workflow id must be rejected.

    Starts Ray in the "workflow" namespace, launches ``simple_sequential``
    under the id "running_workflow", and expects both a second
    ``workflow.run`` and a ``workflow.resume`` on that id to raise
    ``ValueError``. The first run must still produce its normal output.
    """
    ray.init(namespace="workflow")
    # Guarantee shutdown even when an assertion fails, so a failure here does
    # not leave an initialized Ray runtime behind for later tests.
    try:
        output = workflow.run(
            simple_sequential.step(), workflow_id="running_workflow")
        with pytest.raises(ValueError):
            workflow.run(
                simple_sequential.step(), workflow_id="running_workflow")
        with pytest.raises(ValueError):
            workflow.resume(workflow_id="running_workflow")
        assert ray.get(output) == "[source1][append1][append2]"
    finally:
        ray.shutdown()
def test_recovery_simple(workflow_start_regular):
    """Resume the ``simple`` workflow after a failed first run.

    The first run (with the global mark unset) is expected to raise
    ``RaySystemError``. After the mark is set, ``workflow.resume`` must
    return the full result; a second resume with the mark unset again must
    return the same value.
    NOTE(review): presumably the global mark decides whether a step crashes;
    confirm in ``utils``.
    """
    wf_id = "test_recovery_simple"
    expected = "foo(x[append1])[append2]"

    utils.unset_global_mark()
    # internally we get WorkerCrashedError
    with pytest.raises(RaySystemError):
        simple.step("x").run(workflow_id=wf_id)

    utils.set_global_mark()
    resumed = workflow.resume(wf_id)
    assert ray.get(resumed) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    resumed_again = workflow.resume(wf_id)
    assert ray.get(resumed_again) == expected
def test_recovery_complex(workflow_start_regular):
    """Resume the ``complex`` workflow after a failed first run.

    The first run (with the global mark unset) is expected to raise
    ``RaySystemError``. Resuming with the mark set, and then once more with
    the mark unset, must both return the same joined string.
    NOTE(review): presumably the global mark decides whether a step crashes;
    confirm in ``utils``.
    """
    wf_id = "test_recovery_complex"
    expected = (
        "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    )

    utils.unset_global_mark()
    # internally we get WorkerCrashedError
    with pytest.raises(RaySystemError):
        complex.step("x").run(workflow_id=wf_id)

    utils.set_global_mark()
    resumed = workflow.resume(wf_id)
    assert ray.get(resumed) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    resumed_again = workflow.resume(wf_id)
    assert ray.get(resumed_again) == expected
def test_resume_different_storage(ray_start_regular):
    """Resume a workflow id from an explicitly chosen storage location.

    Runs ``constant_1`` under the id "const" with the default storage, then
    ``constant_2`` under the same id in a fresh temporary directory, and
    expects resuming "const" from that directory to yield 31416.
    """
    constant_1.step().run(workflow_id="const")
    tmp_dir = tempfile.mkdtemp()
    # mkdtemp() leaves deletion to the caller; clean up even when the run or
    # the assertion fails so no directory is leaked.
    try:
        constant_2.step().run(workflow_id="const", storage=tmp_dir)
        resumed = workflow.resume(workflow_id="const", storage=tmp_dir)
        assert ray.get(resumed) == 31416
    finally:
        shutil.rmtree(tmp_dir)
def test_get_output_3(workflow_start_regular, tmp_path):
    """``get_output`` on a failed workflow must not re-run its step.

    The ``incr`` step bumps a counter file on every execution and raises
    while the error flag file exists. The counter is used to observe how
    many times the step actually ran.
    """
    counter_path = tmp_path / "counter"
    counter_path.write_text("0")
    failure_marker = tmp_path / "error"
    failure_marker.touch()

    @workflow.step
    def incr():
        current = int(counter_path.read_text())
        counter_path.write_text(str(current + 1))
        if error_present():
            raise ValueError()
        return 10

    def error_present():
        return failure_marker.exists()

    # First run fails, but the step executed once.
    with pytest.raises(ray.exceptions.RaySystemError):
        incr.step().run("incr")
    assert counter_path.read_text() == "1"

    # Fetching the output fails and leaves the counter untouched.
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert counter_path.read_text() == "1"

    # Even with the error flag gone, get_output still fails; only resume
    # produces the value.
    failure_marker.unlink()
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
def test_recovery_simple():
    """Resume the ``simple`` workflow after a failed first run.

    With the global mark unset, running the workflow and fetching its output
    is expected to raise ``ObjectLostError``. After the mark is set,
    ``workflow.resume`` must return the full result; a second resume with
    the mark unset again must return the same value.
    NOTE(review): presumably the global mark decides whether a step crashes;
    confirm in ``utils``.
    """
    ray.init()
    # Guarantee shutdown even when an assertion fails, so a failure here does
    # not leave an initialized Ray runtime behind for later tests.
    try:
        utils.unset_global_mark()
        workflow_id = "test_recovery_simple"
        with pytest.raises(ObjectLostError):
            # internally we get WorkerCrashedError
            output = workflow.run(simple.step("x"), workflow_id=workflow_id)
            ray.get(output)

        utils.set_global_mark()
        output = workflow.resume(workflow_id)
        assert ray.get(output) == "foo(x[append1])[append2]"

        utils.unset_global_mark()
        # resume from workflow output checkpoint
        output = workflow.resume(workflow_id)
        assert ray.get(output) == "foo(x[append1])[append2]"
    finally:
        ray.shutdown()
def resume(num_records_replayed):
    """Rebuild partial storage for "complex_workflow" and resume it.

    Deletes everything under the workflow's key in the wrapped storage,
    replays records ``0 .. num_records_replayed - 1`` from ``debug_store``,
    and returns the value produced by resuming the workflow.
    """
    backing = debug_store.wrapped_storage
    prefix = backing.make_key("complex_workflow")
    asyncio_run(backing.delete_prefix(prefix))

    pending = []
    for index in range(num_records_replayed):
        pending.append(debug_store.replay(index))
    asyncio_run(asyncio.gather(*pending))

    resumed = workflow.resume(workflow_id="complex_workflow")
    return ray.get(resumed)
def test_recovery_complex():
    """Resume the ``complex`` workflow after a failed first run.

    With the global mark unset, running the workflow and fetching its output
    is expected to raise ``RayTaskError``. Resuming with the mark set, and
    then once more with the mark unset, must both return the same joined
    string.
    NOTE(review): presumably the global mark decides whether a step crashes;
    confirm in ``utils``.
    """
    # Both resumes must produce the same value, so state it once.
    expected = (
        "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    )
    ray.init()
    # Guarantee shutdown even when an assertion fails, so a failure here does
    # not leave an initialized Ray runtime behind for later tests.
    try:
        utils.unset_global_mark()
        workflow_id = "test_recovery_complex"
        with pytest.raises(RayTaskError):
            # internally we get WorkerCrashedError
            output = workflow.run(complex.step("x"), workflow_id=workflow_id)
            ray.get(output)

        utils.set_global_mark()
        output = workflow.resume(workflow_id)
        assert ray.get(output) == expected

        utils.unset_global_mark()
        # resume from workflow output checkpoint
        output = workflow.resume(workflow_id)
        assert ray.get(output) == expected
    finally:
        ray.shutdown()
def test_recovery_cluster_failure():
    """Resume a workflow after the whole Ray cluster was stopped.

    Starts a head node via the ``ray`` CLI, launches ``driver_script`` as a
    separate driver process, stops the cluster while that driver is still
    around, then connects again and expects resuming "cluster_failure" to
    yield 20.
    """
    # Pass an argv list and check the exit status: a single-string list with
    # shell=True only works by accident on POSIX, and an unchecked failure of
    # `ray start` would surface later as a confusing resume error.
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    ray.init()
    # Guarantee shutdown even when the assertion fails.
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        ray.shutdown()
def test_recovery_cluster_failure():
    """Resume a workflow after the whole Ray cluster was stopped.

    Starts a head node via the ``ray`` CLI, launches ``driver_script`` as a
    separate driver process, stops the cluster while that driver is still
    around, then calls ``workflow.init()`` and expects resuming
    "cluster_failure" to yield 20.
    """
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init()
    # Guarantee shutdown even when the assertion fails, so a failure here
    # does not leave an initialized Ray runtime behind for later tests.
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        ray.shutdown()
def test_recovery_cluster_failure():
    """Resume a workflow after the whole Ray cluster was stopped.

    Starts a head node via the ``ray`` CLI, runs ``workflows_to_fail.py``
    (located next to this file) in a child interpreter, stops the cluster
    while that child is still around, then connects again and expects
    resuming "cluster_failure" to yield 20.
    """
    # Pass an argv list and check the exit status: a single-string list with
    # shell=True only works by accident on POSIX, and an unchecked failure of
    # `ray start` would surface later as a confusing resume error.
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    script = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), "workflows_to_fail.py")
    proc = subprocess.Popen([sys.executable, script])
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    # Reap the killed child so it does not linger as a zombie process.
    proc.wait()
    time.sleep(1)
    ray.init()
    # Guarantee shutdown even when the assertion fails.
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        ray.shutdown()
def test_recovery_cluster_failure(reset_workflow, tmp_path):
    """Resume a workflow after the whole Ray cluster was stopped.

    Starts a head node via the ``ray`` CLI, launches ``driver_script``
    (formatted with ``tmp_path``) as a separate driver process, stops the
    cluster while that driver is still around, then initializes workflows
    on ``tmp_path`` and expects resuming "cluster_failure" to yield 20.
    """
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    # Reset global storage and shut Ray down even when the assertion fails,
    # so this test's storage setting and runtime do not leak into others.
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        workflow.storage.set_global_storage(None)
        ray.shutdown()
def test_recovery_non_exists_workflow():
    """Resuming an unknown workflow id must raise WorkflowNotResumableError."""
    ray.init()
    # Guarantee shutdown even when the expectation fails, so a failure here
    # does not leave an initialized Ray runtime behind for later tests.
    try:
        with pytest.raises(WorkflowNotResumableError):
            workflow.resume("this_workflow_id_does_not_exist")
    finally:
        ray.shutdown()
def test_recovery_non_exists_workflow(workflow_start_regular):
    """Resuming an unknown workflow id and fetching it must raise ValueError."""
    missing_id = "this_workflow_id_does_not_exist"
    with pytest.raises(ValueError):
        pending = workflow.resume(missing_id)
        ray.get(pending)
def test_recovery_non_exists_workflow():
    """Resuming an unknown workflow id and fetching it must raise RayTaskError.

    Ray is started in the "workflow" namespace for the duration of the test.
    """
    ray.init(namespace="workflow")
    # Guarantee shutdown even when the expectation fails, so a failure here
    # does not leave an initialized Ray runtime behind for later tests.
    try:
        with pytest.raises(RayTaskError):
            ray.get(workflow.resume("this_workflow_id_does_not_exist"))
    finally:
        ray.shutdown()
def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    """Run and resume a workflow with storage initialized at ``tmp_path``.

    Initializes workflows on ``tmp_path``, runs ``constant`` under the id
    "const", and expects resuming that id to yield 31416.
    """
    workflow.init(storage=str(tmp_path))
    # Reset the global storage even when the run or the assertion fails, so
    # this test's storage setting does not leak into later tests.
    try:
        constant.step().run(workflow_id="const")
        assert ray.get(workflow.resume(workflow_id="const")) == 31416
    finally:
        workflow.storage.set_global_storage(None)
def test_actor_writer_2(workflow_start_regular, tmp_path):
    """Exercise a virtual actor whose methods can be blocked or made to fail.

    ``SyncCounter`` takes file locks around its reader (``val``) and writer
    (``incr``) and raises whenever the matching ``*.err`` flag file exists.
    The test drives failures in construction, in the reader and in the
    writer, and finally resumes the actor to check that the pending
    increments complete.
    """
    base = Path(tmp_path)
    g_lock = str(Path(base / "g.lock"))
    incr_lock = str(Path(base / "incr.lock"))
    val_lock = str(Path(base / "val.lock"))
    val_err = str(Path(base / "val.err"))
    incr_err = str(Path(base / "incr.err"))

    @workflow.virtual_actor
    class SyncCounter:
        def __init__(self, val_lock: str, incr_lock: str, g_lock: str,
                     val_err: str, incr_err: str):
            self.val_lock = val_lock
            self.incr_lock = incr_lock
            self.g_lock = g_lock
            self.val_err = val_err
            self.incr_err = incr_err
            self.v = 0
            # Construction fails while the val error flag is present.
            if Path(self.val_err).exists():
                raise ValueError()

        @workflow.virtual_actor.readonly
        def val(self):
            with FileLock(self.val_lock), FileLock(self.g_lock):
                if Path(self.val_err).exists():
                    raise ValueError()
                return self.v

        def incr(self, create_incr_err=False):
            with FileLock(self.incr_lock), FileLock(self.g_lock):
                if Path(self.incr_err).exists():
                    raise ValueError()
                if create_incr_err:
                    # Uses the enclosing test's ``incr_err`` path, as the
                    # original does.
                    Path(incr_err).touch()
                self.v += 1
                return self.v

        def __getstate__(self):
            return (self.v, self.val_lock, self.incr_lock, self.g_lock,
                    self.val_err, self.incr_err)

        def __setstate__(self, state):
            (self.v, self.val_lock, self.incr_lock, self.g_lock,
             self.val_err, self.incr_err) = state

    # trigger error in init
    Path(val_err).touch()
    actor = SyncCounter.get_or_create(
        "sync_counter", val_lock, incr_lock, g_lock, val_err, incr_err)
    with pytest.raises(Exception):
        actor.incr.run()
    Path(val_err).unlink()

    first_batch = [actor.incr.run_async() for _ in range(9)]
    assert ray.get(first_batch) == list(range(2, 11))

    # Hold the writer lock so queued increments cannot make progress while
    # the reader is exercised.
    held_incr_lock = FileLock(incr_lock)
    held_incr_lock.acquire()

    queued = [actor.incr.run_async() for _ in range(10)]
    assert 10 == actor.val.run()
    Path(val_err).touch()
    with pytest.raises(Exception):
        actor.val.run()
    Path(val_err).unlink()
    held_incr_lock.release()
    assert ray.get(queued) == list(range(11, 21))

    # test error cases
    actor.incr.run_async()  # 21
    actor.incr.run_async()  # 22
    actor.incr.run_async(create_incr_err=True)  # 23
    actor.incr.run_async()  # 24
    fifth = actor.incr.run_async()  # 25
    with pytest.raises(Exception):
        ray.get(fifth)

    assert 23 == actor.val.run()
    Path(incr_err).unlink()
    resumed = workflow.resume("sync_counter")
    assert 25 == ray.get(resumed)[0]
    assert 25 == actor.val.run()
def test_wf_in_actor_chain(workflow_start_regular, tmp_path):
    """Step through a chained actor workflow, fail it midway, then resume.

    ``Counter.incr(n)`` increments under the n-th file lock and then chains
    to ``incr(n - 1)`` until ``n`` reaches 0. The test holds all five locks
    and releases them one at a time to advance the chain, injecting a
    failure via the "fail" flag file before resuming.
    """
    lock_paths = [str(tmp_path / str(i)) for i in range(5)]
    fail_flag = tmp_path / "fail"

    @workflow.virtual_actor
    class Counter:
        def __init__(self):
            self._counter = 0

        def incr(self, n):
            with FileLock(lock_paths[n]):
                self._counter += 1
                if fail_flag.exists():
                    raise Exception()
            if n == 0:
                return self._counter
            return self.incr.step(n - 1)

        @workflow.virtual_actor.readonly
        def val(self):
            return self._counter

        def __getstate__(self):
            return self._counter

        def __setstate__(self, v):
            self._counter = v

    locks = [FileLock(path) for path in lock_paths]
    for held in locks:
        held.acquire()

    c = Counter.get_or_create("counter")
    ray.get(c.ready())
    final_ret = c.incr.run_async(len(lock_paths) - 1)

    def expect_count(target):
        # Poll the read-only value for up to 60 rounds, one second apart,
        # then require that it reached the target.
        seen = c.val.run()
        for _ in range(60):
            if seen == target:
                break
            seen = c.val.run()
            time.sleep(1)
        assert seen == target

    # Release the locks from the last one backwards; each release lets the
    # chain advance by one increment.
    for i in range(len(lock_paths) - 2):
        locks[-i - 1].release()
        expect_count(i + 1)

    fail_flag.touch()
    locks[1 - len(lock_paths)].release()
    # Fail the pipeline
    with pytest.raises(Exception):
        ray.get(final_ret)

    fail_flag.unlink()
    workflow.resume("counter")
    # After resume, it continues from the place where it failed.
    for i in range(len(lock_paths) - 1, len(lock_paths)):
        locks[-i - 1].release()
        expect_count(i + 1)

    assert c.val.run() == 5
def test_wf_in_actor(workflow_start_regular, tmp_path):
    """Run a workflow step from inside virtual actor methods.

    ``start_session`` raises while the "fail" flag file exists; otherwise it
    bumps a counter file and returns "UP" once it can take the file lock.
    The ``Session`` actor launches that step either directly or wrapped with
    ``catch_exceptions=True`` and a follow-up status update.
    """
    fail_flag = tmp_path / "fail"
    count_file = tmp_path / "count"
    count_file.write_text(str(0))
    lock_file = tmp_path / "lock"

    @workflow.step
    def start_session():
        if fail_flag.exists():
            raise Exception()
        bumped = int(count_file.read_text()) + 1
        count_file.write_text(str(bumped))
        with FileLock(str(lock_file)):
            return "UP"

    @workflow.virtual_actor
    class Session:
        def __init__(self):
            self._session_status = "DOWN"

        @workflow.virtual_actor.readonly
        def get_status(self):
            return self._session_status

        def update_session(self, up):
            result, error = up
            self._session_status = result if error is None else error
            return self._session_status

        def session_start(self):
            return start_session.step()

        def session_start_with_status(self):
            self._session_status = "STARTING"
            guarded = start_session.options(catch_exceptions=True).step()
            return self.update_session.step(guarded)

        def __getstate__(self):
            return self._session_status

        def __setstate__(self, state):
            self._session_status = state

    actor = Session.get_or_create("session_id")
    fail_flag.touch()
    with pytest.raises(Exception):
        actor.session_start.run()
    fail_flag.unlink()
    ray.get(workflow.resume("session_id"))
    # After resume, start_session is rerun, which brings the counter to 1.
    assert count_file.read_text() == "1"
    assert actor.session_start.run() == "UP"
    assert count_file.read_text() == "2"
    assert actor.session_start_with_status.run() == "UP"
    assert count_file.read_text() == "3"

    # Now test a new session.
    actor = Session.get_or_create("session_id")
    fail_flag.touch()
    assert isinstance(actor.session_start_with_status.run(), Exception)
    assert count_file.read_text() == "3"

    # Hold the lock so start_session bumps the counter but cannot return.
    lock = FileLock(str(lock_file))
    lock.acquire()
    fail_flag.unlink()
    pending = actor.session_start_with_status.run_async()
    for _ in range(60):
        if count_file.read_text() == "4":
            break
        time.sleep(1)
    assert count_file.read_text() == "4"
    # While the step is still blocked, the status set by
    # session_start_with_status is already visible.
    assert actor.get_status.run() == "STARTING"
    lock.release()
    assert ray.get(pending) == "UP"