Example #1
def test_run_or_resume_during_running(workflow_start_regular_shared):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple_sequential():
        x = source1.bind()
        y = append1.bind(x)
        return workflow.continuation(append2.bind(y))

    output = workflow.create(
        simple_sequential.bind()).run_async(workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.create(
            simple_sequential.bind()).run_async(workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.resume(workflow_id="running_workflow")
    assert ray.get(output) == "[source1][append1][append2]"
Example #2
def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash. Here we must call `event_checkpointed`
    twice, because there's no way to know if we called it after
    checkpointing.
    """
    _storage = storage.get_global_storage()
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
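These tests coordinate across worker processes, and even across "ray stop" and restart, through the utils global-mark helpers (set_global_mark, check_global_mark, unset_global_mark), which the snippets never show; the module is presumably ray.workflow.tests.utils. A minimal sketch of such helpers, assuming they are flag files in the system temp directory (names and paths here are illustrative, not the exact source):

import tempfile
from pathlib import Path

def _mark_path(name: str = "workflow") -> Path:
    # One flag file per named mark; visible to every process on the node,
    # so a mark survives worker crashes and cluster restarts.
    return Path(tempfile.gettempdir()) / f"ray_workflow_test_mark_{name}"

def set_global_mark(name: str = "workflow") -> None:
    _mark_path(name).touch()

def check_global_mark(name: str = "workflow") -> bool:
    return _mark_path(name).exists()

def unset_global_mark(name: str = "workflow") -> None:
    _mark_path(name).unlink(missing_ok=True)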
Example #3
def test_recovery_simple(workflow_start_regular):
    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple(x):
        x = append1.bind(x)
        y = the_failed_step.bind(x)
        z = append2.bind(y)
        return workflow.continuation(z)

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    assert workflow.get_status(
        workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
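The `the_failed_step` used above is defined at module level in the original test file and isn't shown in these snippets. Reconstructed from the assertions (the worker crashes until the global mark is set, and the step returns "foo(<input>)"), it presumably looks like this sketch:

import os

@ray.remote
def the_failed_step(x):
    # Crash the worker (surfacing internally as WorkerCrashedError) until
    # the test sets the global mark, then behave normally.
    if not utils.check_global_mark():
        os.kill(os.getpid(), 9)
    return "foo(" + x + ")"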
Example #4
def test_run_or_resume_during_running(workflow_start_regular_shared):
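    # `simple_sequential` is a module-level @workflow.step pipeline, presumably
    # the same source1 -> append1 -> append2 chain shown in Example #1 (which
    # uses the newer bind API); the expected output below matches it.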
    output = simple_sequential.step().run_async(workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        simple_sequential.step().run_async(workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.resume(workflow_id="running_workflow")
    assert ray.get(output) == "[source1][append1][append2]"
Example #5
File: test_recovery.py Project: parasj/ray
def test_recovery_simple_1(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_1"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.run(the_failed_step.bind("x"), workflow_id=workflow_id)

    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    assert workflow.resume(workflow_id) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert workflow.resume(workflow_id) == "foo(x)"
Example #6
def test_recovery_complex(workflow_start_regular):
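    # `complex` is a module-level step that isn't shown here; the same DAG
    # appears inline in Example #19 (newer API), whose expected output string
    # matches the one asserted below.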
    utils.unset_global_mark()
    workflow_id = "test_recovery_complex"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        complex.step("x").run(workflow_id=workflow_id)
    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
Example #7
def test_wait_recovery_step_id(workflow_start_regular_shared):
    # This test ensures the workflow reuses the original directory and
    # step id for "workflow.wait" during recovery.

    @workflow.step
    def identity(x: int):
        # block the step by a global mark
        assert utils.check_global_mark()
        return x

    w = workflow.wait([identity.step(42)], num_returns=1, timeout=None)
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = w.run(workflow_id="test_wait_recovery_step_id")
    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("test_wait_recovery_step_id"))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage

    global_storage = storage.get_global_storage()
    wf_storage = workflow_storage.WorkflowStorage("test_wait_recovery_step_id",
                                                  global_storage)
    index = wf_storage.gen_step_id("workflow.wait")
    # no new step id
    assert index <= 1
Example #8
def test_failed_and_resumed_workflow(workflow_start_regular, tmp_path):

    workflow_id = "simple"
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def simple():
        if error_flag.exists():
            raise ValueError()
        return 0

    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.create(simple.bind()).run(workflow_id)

    workflow_metadata_failed = workflow.get_metadata(workflow_id)
    assert workflow_metadata_failed["status"] == "FAILED"

    error_flag.unlink()
    ref = workflow.resume(workflow_id)
    assert ray.get(ref) == 0

    workflow_metadata_resumed = workflow.get_metadata(workflow_id)
    assert workflow_metadata_resumed["status"] == "SUCCESSFUL"

    # make sure resume updated running metrics
    assert (
        workflow_metadata_resumed["stats"]["start_time"]
        > workflow_metadata_failed["stats"]["start_time"]
    )
    assert (
        workflow_metadata_resumed["stats"]["end_time"]
        > workflow_metadata_failed["stats"]["end_time"]
    )
Example #9
def test_wait_failure_recovery_1(workflow_start_regular_shared):
    # This tests that a step using the output of "workflow.wait" as its
    # input can be recovered after failure.
    @workflow.step
    def get_all(ready, unready):
        return ready, unready

    @workflow.step
    def filter_all_2(wait_results):
        assert wait_results[0] == [1, 3, 2]
        # failure point
        assert utils.check_global_mark()
        ready, unready = wait_results
        return get_all.step(ready, unready)

    @workflow.step
    def composite_2():
        w = wait_multiple_steps.step()
        return filter_all_2.step(w)

    utils.unset_global_mark()

    with pytest.raises(RaySystemError):
        composite_2.step().run(workflow_id="wait_failure_recovery")

    utils.set_global_mark()

    ready, unready = ray.get(workflow.resume("wait_failure_recovery"))
    assert ready == [1, 3, 2]
    assert unready == [10, 12]
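`wait_multiple_steps` is another module-level helper not shown in the snippet. Reconstructed from the assertions (ready == [1, 3, 2] and unready == [10, 12], with both lists preserving submission order), it presumably looks like this sketch; the exact sleep durations are inferred, not confirmed:

import time

@workflow.step
def sleep_identity(x: int):
    time.sleep(x)
    return x

@workflow.step
def wait_multiple_steps():
    # Five sleeping steps; with num_returns=3 the three fastest (1s, 2s, 3s)
    # end up in `ready`, listed in submission order: [1, 3, 2].
    ws = [
        sleep_identity.step(1),
        sleep_identity.step(3),
        sleep_identity.step(10),
        sleep_identity.step(2),
        sleep_identity.step(12),
    ]
    return workflow.wait(ws, num_returns=3, timeout=None)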
Example #10
def test_wait_failure_recovery_2(workflow_start_regular_shared):
    # Test failing "workflow.wait" and its input steps.

    @workflow.step
    def sleep_identity(x: int):
        # block the step by a global mark
        while not utils.check_global_mark():
            time.sleep(0.1)
        time.sleep(x)
        return x

    @workflow.step
    def identity(x):
        return x

    ws = [
        sleep_identity.step(2),
        sleep_identity.step(5),
        sleep_identity.step(1),
    ]
    w = workflow.wait(ws, num_returns=2, timeout=None)
    utils.unset_global_mark()
    _ = identity.step(w).run_async(workflow_id="wait_failure_recovery_2")
    # wait until "workflow.wait" has been running
    time.sleep(10)
    workflow.cancel("wait_failure_recovery_2")
    time.sleep(2)

    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("wait_failure_recovery_2"))
    assert ready == [2, 1]
Example #11
def test_failed_and_resumed_workflow(workflow_start_regular, tmp_path):

    workflow_id = "simple"
    error_flag = tmp_path / "error"
    error_flag.touch()

    @workflow.step
    def simple():
        if error_flag.exists():
            raise ValueError()
        return 0

    with pytest.raises(ray.exceptions.RaySystemError):
        simple.step().run(workflow_id)

    workflow_metadata_failed = workflow.get_metadata(workflow_id)
    assert workflow_metadata_failed["status"] == "FAILED"

    error_flag.unlink()
    ref = workflow.resume(workflow_id)
    assert ray.get(ref) == 0

    workflow_metadata_resumed = workflow.get_metadata(workflow_id)
    assert workflow_metadata_resumed["status"] == "SUCCESSFUL"

    # make sure resume updated running metrics
    assert workflow_metadata_resumed["stats"]["start_time"] \
           > workflow_metadata_failed["stats"]["start_time"]
    assert workflow_metadata_resumed["stats"]["end_time"] \
           > workflow_metadata_failed["stats"]["end_time"]
Example #12
def test_recovery_cluster_failure(tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(f"""
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    print("Executing", x)
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    else:
        return 20

if __name__ == "__main__":
    ray.init(storage="{tmp_path}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
""")
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    ray.init(storage=str(tmp_path))
    workflow.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()
Example #13
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(incr.options(max_retries=0).bind(), workflow_id="incr")

    assert cnt_file.read_text() == "1"

    from ray.exceptions import RaySystemError

    # TODO(suquark): We should prevent Ray from raising "RaySystemError"
    #   in workflow, because "RaySystemError" does not inherit the underlying
    #   error, so users and developers cannot catch the expected error.
    #   I find this issue very annoying.
    with pytest.raises((RaySystemError, ValueError)):
        workflow.get_output("incr")

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises((RaySystemError, ValueError)):
        workflow.get_output("incr")
    assert workflow.resume("incr") == 10
Example #14
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @workflow.step
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(ray.exceptions.RaySystemError):
        incr.options(max_retries=1).step().run("incr")

    assert cnt_file.read_text() == "1"

    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
Example #15
File: test_events.py Project: tchordia/ray
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed, we
    properly re-poll for the event."""

    from ray.internal import storage

    storage_uri = storage._storage_uri
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash."""
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.create(wait_then_finish.bind(event_promise)).run_async("workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume("workflow")
    utils.set_global_mark("resume")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
Example #16
def test_recovery_simple(workflow_start_regular):
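    # `simple` is a module-level step not shown here; presumably the same
    # append1 -> the_failed_step -> append2 DAG shown inline in Example #3
    # (newer API), whose expected output matches the asserts below.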
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        simple.step("x").run(workflow_id=workflow_id)

    assert workflow.get_status(
        workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
Example #17
def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    @ray.remote
    def constant():
        return 31416

    workflow.init(storage=str(tmp_path))
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
    workflow.storage.set_global_storage(None)
Example #18
def test_resume_different_storage(shutdown_only, tmp_path):
    @ray.remote
    def constant():
        return 31416

    ray.init(storage=str(tmp_path))
    workflow.init()
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
Example #19
def test_recovery_complex(workflow_start_regular):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def join(x, y):
        return f"join({x}, {y})"

    @ray.remote
    def complex(x1):
        x2 = source1.bind()
        v = join.bind(x1, x2)
        y = append1.bind(x1)
        y = the_failed_step.bind(y)
        z = append2.bind(x2)
        u = join.bind(y, z)
        return workflow.continuation(join.bind(u, v))

    utils.unset_global_mark()
    workflow_id = "test_recovery_complex"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(complex.bind("x")).run(workflow_id=workflow_id)

    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
Example #20
        def resume(num_records_replayed):
            key = debug_store.wrapped_storage.make_key("complex_workflow")
            asyncio_run(debug_store.wrapped_storage.delete_prefix(key))

            async def replay():
                # We need to replay one by one to avoid conflict
                for i in range(num_records_replayed):
                    await debug_store.replay(i)

            asyncio_run(replay())
            return ray.get(workflow.resume(workflow_id="complex_workflow"))
Example #21
def test_recovery_simple_2(workflow_start_regular):
    @ray.remote
    def simple(x):
        return workflow.continuation(the_failed_step.bind(x))

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_2"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
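The checkpoint tests that follow (Examples #22 and #24-#29) share a `checkpoint_dag` helper plus large_input, identity, and average steps that are not shown. Reconstructed from the step names asserted on and the expected result (np.arange(SIZE).mean() with SIZE = 2 ** 24, i.e. 8388607.5), a sketch under those assumptions:

import numpy as np

SIZE = 2 ** 24

@ray.remote
def large_input():
    return np.arange(SIZE)

@ray.remote
def identity(x):
    # The *_recovery variants presumably also crash here until the global
    # mark is set, giving those tests their failure point.
    return x

@ray.remote
def average(x):
    return float(np.mean(x))

@ray.remote
def checkpoint_dag(checkpoint):
    # With checkpoint=False the two large intermediate steps skip their
    # output checkpoints, which is what the "skip_partial" tests assert.
    opts = workflow.options(checkpoint=checkpoint)
    x = large_input.options(**opts).bind()
    y = identity.options(**opts).bind(x)
    return workflow.continuation(average.bind(y))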
Example #22
def test_checkpoint_dag_full(workflow_start_regular_shared):
    outputs = workflow.create(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(True)
    ).run(workflow_id="checkpoint_whole")
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #23
def test_recovery_cluster_failure(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
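    # `driver_script` is a module-level template, presumably the same inline
    # script shown in Example #12 (it runs workflow_id="cluster_failure" and
    # returns 20, matching the assert below).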
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    assert ray.get(workflow.resume("cluster_failure")) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()
Example #24
def test_checkpoint_dag_skip_partial(workflow_start_regular_shared):
    outputs = workflow.run(
        checkpoint_dag.options(**workflow.options(
            name="checkpoint_dag")).bind(False),
        workflow_id="checkpoint_partial",
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = workflow.resume("checkpoint_partial")
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #25
def test_checkpoint_dag_full(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = utils.run_workflow_dag_with_options(
        checkpoint_dag, (True,), workflow_id="checkpoint_whole", name="checkpoint_dag"
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #26
def test_checkpoint_dag_skip_partial(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = (
        checkpoint_dag.options(name="checkpoint_dag")
        .step(False)
        .run(workflow_id="checkpoint_partial")
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_partial"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #27
def test_checkpoint_dag_recovery_partial(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(RaySystemError):
        workflow.create(checkpoint_dag.bind(False)).run(
            workflow_id="checkpoint_partial_recovery")
    run_duration_partial = time.time() - start

    utils.set_global_mark()

    start = time.time()
    recovered = ray.get(workflow.resume("checkpoint_partial_recovery"))
    recover_duration_partial = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())
    print(f"[partial] run_duration = {run_duration_partial}, "
          f"recover_duration = {recover_duration_partial}")
Example #28
def test_checkpoint_dag_recovery_whole(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(checkpoint_dag.bind(True),
                     workflow_id="checkpoint_whole_recovery")
    run_duration_whole = time.time() - start

    utils.set_global_mark()

    start = time.time()
    recovered = workflow.resume("checkpoint_whole_recovery")
    recover_duration_whole = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())

    print(f"[whole] run_duration = {run_duration_whole}, "
          f"recover_duration = {recover_duration_whole}")
Example #29
def test_checkpoint_dag_recovery_skip(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.create(
            checkpoint_dag.options(**workflow.options(
                checkpoint=False)).bind(False)).run(
                    workflow_id="checkpoint_skip_recovery")
    run_duration_skipped = time.time() - start

    utils.set_global_mark()

    start = time.time()
    recovered = ray.get(workflow.resume("checkpoint_skip_recovery"))
    recover_duration_skipped = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())

    print(f"[skipped] run_duration = {run_duration_skipped}, "
          f"recover_duration = {recover_duration_skipped}")
Example #30
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For sync between jobs
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]
    # Test list_all; it should list all running jobs
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release lock and make sure all tasks finished
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r
    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0
    # Half of them failed and half succeeded
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48