Example #1
def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash. Here we must call `event_checkpointed`
    twice, because there's no way to know if we called it after
    checkpointing.
    """
    _storage = storage.get_global_storage()
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
Example #2
def test_init_twice_2(call_ray_start, reset_workflow, tmp_path):
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        run_string_as_driver(driver_script)
        with pytest.raises(
            RuntimeError, match=".*different from the workflow manager.*"
        ):
            workflow.init(str(tmp_path))
Example #3
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    from ray.internal.storage import _storage_uri

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    store = workflow_storage.get_workflow_storage(workflow_id)
    serialization.dump_to_storage("key", wrapped, workflow_id, store)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object store.
    # Shutting down the cluster should guarantee we don't accidentally get the
    # old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    ray.init(storage=_storage_uri)
    workflow.init()
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = storage2._get("key")
    assert ray.get(result.refs) == [1, 2]
Example #4
def test_dedupe_cluster_failure(tmp_path):
    """
    ======== driver 1 ===========
    1. Checkpoint the input args
        * Uploads
    2. Begin to run step
        * Crash

    ====== driver 2 ============
    1. Recover inputs
        * Creates a new object ref
    2. Finish running step
    3. Checkpoint step output
        * Should not trigger upload
    """
    ray.shutdown()
    lock_file = tmp_path / "lock"
    workflow_dir = tmp_path / "workflow"

    driver_script = f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(objrefs):
    with FileLock("{str(lock_file)}"):
        return objrefs

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    arg = ray.put("hello world")

    workflow.create(foo.bind([arg, arg])).run()
    assert False
    """

    lock = FileLock(lock_file)
    lock.acquire()

    run_string_as_driver_nonblocking(driver_script)

    time.sleep(10)

    subprocess.check_call(["ray", "stop", "--force"])

    lock.release()
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    objref = resumed.pop()[1]
    ray.get(objref)

    # The object ref is different before and after recovery, but deduplication
    # should ensure the underlying object is uploaded only once.
    assert get_num_uploads() == 1
    ray.shutdown()
Example #5
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__
    base_storage = storage.get_global_storage()

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    url = base_storage.storage_url

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    promise = serialization.dump_to_storage(["key"], wrapped, workflow_id,
                                            base_storage)
    workflow_storage.asyncio_run(promise)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object store.
    # Shutting down the cluster should guarantee we don't accidentally get the
    # old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    workflow.init(url)
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = workflow_storage.asyncio_run(storage2._get(["key"]))
    assert ray.get(result.refs) == [1, 2]
Example #6
def test_recovery_cluster_failure(tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(f"""
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    print("Executing", x)
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    else:
        return 20

if __name__ == "__main__":
    ray.init(storage="{tmp_path}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
""")
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
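    # Restart with the same storage and resume; the workflow should still reach 20.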
    ray.init(storage=str(tmp_path))
    workflow.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()
Example #7
def test_workflow_lifetime_1(call_ray_start, reset_workflow):
    # Case 1: driver exits normally
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        run_string_as_driver(driver_script.format(5))
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20
Example #8
def test_workflow_concurrency_limit_reinit(shutdown_only):
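    # Re-initializing with the same limits (or without specifying them) is allowed;
    # conflicting limits raise ValueError.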
    workflow.init(max_running_workflows=5, max_pending_workflows=6)
    workflow.init(max_running_workflows=5, max_pending_workflows=6)
    with pytest.raises(ValueError):
        workflow.init(max_running_workflows=7, max_pending_workflows=8)
    workflow.init()
    workflow.init(max_running_workflows=None, max_pending_workflows=None)
Example #9
def test_workflow_queuing_3(shutdown_only, tmp_path):
    """This test ensures the queuing workflow is indeed pending."""
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=1, max_pending_workflows=1)

    import time
    import filelock
    from ray.exceptions import GetTimeoutError

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        (tmp_path / str(x)).write_text(str(x))
        with filelock.FileLock(lock_path):
            return x

    workflow_id = "test_workflow_queuing_3"

    with filelock.FileLock(lock_path):
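        # While the lock is held, the first workflow runs and the second stays pending.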
        wf_1 = workflow.run_async(long_running.bind(1),
                                  workflow_id=f"{workflow_id}_1")
        wf_2 = workflow.run_async(long_running.bind(2),
                                  workflow_id=f"{workflow_id}_2")
        time.sleep(5)
        assert (tmp_path / str(1)).exists()
        assert not (tmp_path / str(2)).exists()
        assert workflow.get_status(
            workflow_id=f"{workflow_id}_1") == workflow.RUNNING
        assert workflow.get_status(
            workflow_id=f"{workflow_id}_2") == workflow.PENDING
        with pytest.raises(GetTimeoutError):
            ray.get(wf_2, timeout=5)

    assert ray.get([wf_1, wf_2]) == [1, 2]
Example #10
def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    @ray.remote
    def constant():
        return 31416

    workflow.init(storage=str(tmp_path))
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
    workflow.storage.set_global_storage(None)
Example #11
def test_resume_different_storage(shutdown_only, tmp_path):
    @ray.remote
    def constant():
        return 31416

    ray.init(storage=str(tmp_path))
    workflow.init()
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
Example #12
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    proc = run_string_as_driver_nonblocking(driver_script.format(100))
    time.sleep(10)
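    # Kill the driver mid-run; the workflow should keep running inside the cluster
    # and still complete.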
    proc.kill()
    time.sleep(1)
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20
Example #13
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20
Example #14
def test_workflow_error_message():
    storage_url = r"c:\ray"
    expected_error_msg = "Invalid url: {}.".format(storage_url)
    if os.name == "nt":
        expected_error_msg += (
            " Try using file://{} or file:///{} for Windows file paths.".
            format(storage_url, storage_url))
    with pytest.raises(ValueError) as e:
        workflow.init(storage_url)
    assert str(e.value) == expected_error_msg
Example #15
def test_recovery_cluster_failure(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    assert ray.get(workflow.resume("cluster_failure")) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()
Example #16
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed, we
    properly re-poll for the event."""

    from ray._private import storage

    storage_uri = storage._storage_uri

    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash."""

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
Example #17
def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
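        # With the lock held, two workflows run, two wait as pending, and a fifth
        # submission overflows the queue.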
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i
Example #18
def test_recovery_cluster_failure_resume_all(tmp_path, shutdown_only):
    ray.shutdown()

    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    lock = FileLock(lock_file)
    lock.acquire()

    proc = run_string_as_driver_nonblocking(
        f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    )
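    # Let the driver start the workflow (it blocks on the lock), then stop the
    # cluster to simulate a failure.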
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
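    # Release the lock and restart the cluster; resuming should complete the
    # interrupted workflow.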
    lock.release()
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20
Example #19
def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [
        workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
        for i in range(4)
    ]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
Example #20
def test_recovery_cluster_failure_resume_all(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    driver_script = f"""
import time
from ray import workflow
from filelock import FileLock
@workflow.step
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    workflow.init("{str(workflow_dir)}")
    assert foo.step(0).run(workflow_id="cluster_failure") == 20
"""
    lock = FileLock(lock_file)
    lock.acquire()

    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    lock.release()
    workflow.init(str(workflow_dir))
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()
Example #21
from ray import workflow


@workflow.step
def hello(msg: str) -> None:
    print(msg)


@workflow.step
def wait_all(*args) -> None:
    pass


if __name__ == "__main__":
    workflow.init()
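    # Fan out one hello step per message and join on both with wait_all.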
    children = []
    for msg in ["hello world", "goodbye world"]:
        children.append(hello.step(msg))
    wait_all.step(*children).run()
Example #22
def test_workflow_concurrency_limit_argument(shutdown_only):
    with pytest.raises(TypeError):
        workflow.init(1, 2)

    with pytest.raises(TypeError):
        workflow.init(max_running_workflows=1.7)

    with pytest.raises(TypeError):
        workflow.init(max_pending_workflows=1.7)

    with pytest.raises(ValueError):
        workflow.init(max_running_workflows=-2)

    with pytest.raises(ValueError):
        workflow.init(max_pending_workflows=-2)

    with pytest.raises(ValueError):
        workflow.init(max_running_workflows=0)
Example #23
def test_delete(workflow_start_regular):
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.create(never_ends.bind("hello world")).run_async("never_finishes")

    # Make sure the step is actually executing before killing the cluster
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)
    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        result = workflow.get_output("never_finishes")
        ray.get(result)

    workflow.delete("never_finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.create(
        basic_step.bind("hello world")).run(workflow_id="finishes")
    assert result == "hello world"
    output = workflow.get_output("finishes")
    assert ray.get(output) == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.create(
        basic_step.bind("123")).run(workflow_id="finishes") == "123"
Example #24
def test_init_twice(call_ray_start, reset_workflow, tmp_path):
    workflow.init()
    with pytest.raises(RuntimeError):
        workflow.init(str(tmp_path))
Example #25
def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    workflow.init(storage=str(tmp_path))
    constant.step().run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
    workflow.storage.set_global_storage(None)
Example #26
def test_init_twice_2(call_ray_start, reset_workflow, tmp_path):
    run_string_as_driver(driver_script)
    with pytest.raises(RuntimeError):
        workflow.init(str(tmp_path))
Example #27
def _alter_storage(new_storage):
    # alter the global storage
    set_global_storage(new_storage)
    ray.shutdown()
    os.system("ray stop --force")
    workflow.init(new_storage)
Example #28
def test_workflow_queuing_resume_all(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        _refs = [  # noqa: F841
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

        # kill all workflows
        ray.shutdown()

    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    with filelock.FileLock(lock_path):
        workflow_ids, outputs = zip(*sorted(workflow.resume_all()))
        # We should see the same running and pending workflows, because
        # resume_all() gives previously running workflows higher priority.
        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

    assert workflow_ids == (
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
    )

    assert ray.get(list(outputs)) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
Example #29
def test_workflow_lifetime_1(call_ray_start, reset_workflow):
    # Case 1: driver exits normally
    run_string_as_driver(driver_script.format(5))
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20