Example #1
def test_workflow_queuing_3(shutdown_only, tmp_path):
    """This test ensures the queuing workflow is indeed pending."""
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=1, max_pending_workflows=1)

    import time
    import filelock
    from ray.exceptions import GetTimeoutError

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        (tmp_path / str(x)).write_text(str(x))
        with filelock.FileLock(lock_path):
            return x

    workflow_id = "test_workflow_queuing_3"

    with filelock.FileLock(lock_path):
        wf_1 = workflow.run_async(long_running.bind(1),
                                  workflow_id=f"{workflow_id}_1")
        wf_2 = workflow.run_async(long_running.bind(2),
                                  workflow_id=f"{workflow_id}_2")
        time.sleep(5)
        assert (tmp_path / str(1)).exists()
        assert not (tmp_path / str(2)).exists()
        assert workflow.get_status(
            workflow_id=f"{workflow_id}_1") == workflow.RUNNING
        assert workflow.get_status(
            workflow_id=f"{workflow_id}_2") == workflow.PENDING
        with pytest.raises(GetTimeoutError):
            ray.get(wf_2, timeout=5)

    assert ray.get([wf_1, wf_2]) == [1, 2]
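
A note on the knobs exercised here: `max_running_workflows` caps concurrent execution and `max_pending_workflows` caps the queue; a submission beyond both raises `queue.Full` (see Example #19). A minimal sketch of the setup, assuming a fresh Ray instance and an illustrative local storage path:

import ray
from ray import workflow

ray.init(storage="/tmp/wf_demo")  # illustrative storage path
workflow.init(max_running_workflows=1, max_pending_workflows=1)

@ray.remote
def noop(x):
    return x

# With a long-running first workflow, the second submission would sit
# in the PENDING queue and a third would raise queue.Full.
first = workflow.run_async(noop.bind(1), workflow_id="wf_a")
second = workflow.run_async(noop.bind(2), workflow_id="wf_b")
assert ray.get([first, second]) == [1, 2]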
Example #2
def test_running_and_canceled_workflow(workflow_start_regular, tmp_path):

    workflow_id = "simple"
    flag = tmp_path / "flag"

    @ray.remote
    def simple():
        flag.touch()
        time.sleep(1000)
        return 0

    workflow.run_async(simple.bind(), workflow_id=workflow_id)

    # Wait until step runs to make sure pre-run metadata is written
    while not flag.exists():
        time.sleep(1)

    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "RUNNING"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]

    workflow.cancel(workflow_id)

    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "CANCELED"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]
Example #3
def test_run_or_resume_during_running(workflow_start_regular_shared):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple_sequential():
        x = source1.bind()
        y = append1.bind(x)
        return workflow.continuation(append2.bind(y))

    output = workflow.run_async(simple_sequential.bind(),
                                workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.run_async(simple_sequential.bind(),
                           workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.resume_async(workflow_id="running_workflow")
    assert ray.get(output) == "[source1][append1][append2]"
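
`workflow.continuation` is the key construct here: it lets a task return a sub-DAG that the engine keeps executing within the same workflow. A minimal sketch, assuming an initialized session with an illustrative storage path:

import ray
from ray import workflow

ray.init(storage="/tmp/wf_demo")  # illustrative storage path
workflow.init()

@ray.remote
def add_one(x):
    return x + 1

@ray.remote
def start():
    # Returning a continuation tells the engine to keep executing the
    # returned DAG as part of the same workflow.
    return workflow.continuation(add_one.bind(41))

assert workflow.run(start.bind()) == 42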
Example #4
def test_output_with_name(workflow_start_regular):
    @ray.remote
    def double(v):
        return 2 * v

    inner_task = double.options(**workflow.options(name="inner")).bind(1)
    outer_task = double.options(**workflow.options(
        name="outer")).bind(inner_task)
    result = workflow.run_async(outer_task, workflow_id="double")
    inner = workflow.get_output_async("double", name="inner")
    outer = workflow.get_output_async("double", name="outer")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4

    @workflow.options(name="double")
    @ray.remote
    def double_2(s):
        return s * 2

    inner_task = double_2.bind(1)
    outer_task = double_2.bind(inner_task)
    workflow_id = "double_2"
    result = workflow.run_async(outer_task, workflow_id=workflow_id)

    inner = workflow.get_output_async(workflow_id, name="double")
    outer = workflow.get_output_async(workflow_id, name="double_1")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4
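
The second half of the test relies on automatic task-ID deduplication: when several tasks share a name, the engine appends `_1`, `_2`, and so on in creation order. A hedged sketch of the rule (assumes an initialized session):

import ray
from ray import workflow

@ray.remote
def inc(x):
    return x + 1

step = inc.options(**workflow.options(name="inc")).bind(0)
step = inc.options(**workflow.options(name="inc")).bind(step)
step = inc.options(**workflow.options(name="inc")).bind(step)
ref = workflow.run_async(step, workflow_id="dedup_demo")

# Task IDs become "inc", "inc_1", "inc_2"; each output is addressable.
assert ray.get(workflow.get_output_async("dedup_demo", name="inc_1")) == 2
assert ray.get(ref) == 3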
Example #5
def test_user_metadata_not_dict(workflow_start_regular):
    @ray.remote
    def simple():
        return 0

    with pytest.raises(ValueError):
        workflow.run_async(
            simple.options(**workflow.options(metadata="x")).bind())

    with pytest.raises(ValueError):
        workflow.run(simple.bind(), metadata="x")
Example #6
def test_user_metadata_not_json_serializable(workflow_start_regular):
    @ray.remote
    def simple():
        return 0

    class X:
        pass

    with pytest.raises(ValueError):
        workflow.run_async(
            simple.options(**workflow.options(metadata={"x": X()})).bind())

    with pytest.raises(ValueError):
        workflow.run(simple.bind(), metadata={"x": X()})
Example #7
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed, we
    properly re-poll for the event."""

    from ray._private import storage

    storage_uri = storage._storage_uri

    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash."""

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
Example #8
def test_get_output_5(workflow_start_regular, tmp_path):
    """Test getting output of a workflow task immediately after executing it
    asynchronously."""
    @ray.remote
    def simple():
        return 314

    workflow_id = "test_get_output_5_{}"

    outputs = []
    for i in range(20):
        workflow.run_async(simple.bind(), workflow_id=workflow_id.format(i))
        outputs.append(workflow.get_output_async(workflow_id.format(i)))

    assert ray.get(outputs) == [314] * len(outputs)
Example #9
def test_step_resources(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    # We use a signal actor here because we can't guarantee the order of
    # tasks sent from the worker to the raylet.
    signal_actor = SignalActor.remote()

    @ray.remote
    def step_run():
        ray.wait([signal_actor.send.remote()])
        with FileLock(lock_path):
            return None

    @ray.remote(num_cpus=1)
    def remote_run():
        return None

    lock = FileLock(lock_path)
    lock.acquire()
    ret = workflow.run_async(step_run.options(num_cpus=2).bind())
    ray.wait([signal_actor.wait.remote()])
    obj = remote_run.remote()
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(obj, timeout=2)
    lock.release()
    assert ray.get(ret) is None
    assert ray.get(obj) is None
Example #10
def test_get_output_4(workflow_start_regular, tmp_path):
    """Test getting output of a workflow tasks that are dynamically generated."""
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def recursive(n):
        if n <= 0:
            with FileLock(lock_path):
                return 42
        return workflow.continuation(
            recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1))

    workflow_id = "test_get_output_4"
    lock.acquire()
    obj = workflow.run_async(
        recursive.options(**workflow.options(name="10")).bind(10),
        workflow_id=workflow_id,
    )

    outputs = [
        workflow.get_output_async(workflow_id, name=str(i)) for i in range(11)
    ]
    outputs.append(obj)

    import time

    # Wait so that 'get_output' is scheduled before the workflow executes.
    time.sleep(3)
    lock.release()
    assert ray.get(outputs) == [42] * len(outputs)
Example #11
def test_cancellation(tmp_path, workflow_start_regular):
    lock_a = tmp_path / "lock_a"
    lock_b = tmp_path / "lock_b"

    @ray.remote
    def simple():
        with filelock.FileLock(lock_a):
            with filelock.FileLock(lock_b):
                pass

    workflow_id = "test_cancellation"

    with filelock.FileLock(lock_b):
        r = workflow.run_async(simple.bind(), workflow_id=workflow_id)
        with pytest.raises(GetTimeoutError):
            ray.get(r, timeout=5)

        assert workflow.get_status(workflow_id) == WorkflowStatus.RUNNING

        workflow.cancel(workflow_id)
        with pytest.raises(workflow.WorkflowCancellationError):
            ray.get(r)
        lock = filelock.FileLock(lock_a)
        lock.acquire(timeout=5)

        assert workflow.get_status(workflow_id) == WorkflowStatus.CANCELED
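
Condensed, the cancellation semantics asserted above: `workflow.cancel` flips the status to CANCELED and makes pending `ray.get` calls raise `WorkflowCancellationError`. A hedged sketch, assuming an initialized session:

import time
import ray
from ray import workflow

@ray.remote
def sleeper():
    time.sleep(3600)

ref = workflow.run_async(sleeper.bind(), workflow_id="to_cancel")
workflow.cancel("to_cancel")
try:
    ray.get(ref)
except workflow.WorkflowCancellationError:
    assert workflow.get_status("to_cancel") == workflow.CANCELED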
Example #12
def test_wait_for_multiple_events(workflow_start_regular_shared):
    """If a workflow has multiple event arguments, it should wait for them at the
    same time.
    """

    class EventListener1(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener1")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event1"

    class EventListener2(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener2")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event2"

    @ray.remote
    def trivial_step(arg1, arg2):
        return f"{arg1} {arg2}"

    event1_promise = workflow.wait_for_event(EventListener1)
    event2_promise = workflow.wait_for_event(EventListener2)

    promise = workflow.run_async(trivial_step.bind(event1_promise, event2_promise))

    while not (
        utils.check_global_mark("listener1") and utils.check_global_mark("listener2")
    ):
        time.sleep(0.1)

    utils.set_global_mark("trigger_event")
    assert ray.get(promise) == "event1 event2"
Example #13
def test_async_execution(workflow_start_regular_shared):
    @ray.remote
    def blocking():
        time.sleep(10)
        return 314

    start = time.time()
    output = workflow.run_async(blocking.bind())
    duration = time.time() - start
    assert duration < 5  # workflow.run_async does not block
    assert ray.get(output) == 314
Example #14
def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash. Here we must call `event_checkpointed`
    twice, because there's no way to know if we called it after
    checkpointing.
    """

    from ray._private import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(event_promise, workflow_id="workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
Example #15
def test_get_output_2(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def simple(v):
        with FileLock(lock_path):
            return v

    lock.acquire()
    obj = workflow.run_async(simple.bind(0), workflow_id="simple")
    obj2 = workflow.get_output_async("simple")
    lock.release()
    assert ray.get([obj, obj2]) == [0, 0]
Example #16
def test_event_as_workflow(workflow_start_regular_shared):
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            while not utils.check_global_mark():
                await asyncio.sleep(1)

    utils.unset_global_mark()
    promise = workflow.run_async(
        workflow.wait_for_event(MyEventListener), workflow_id="wf"
    )

    assert workflow.get_status("wf") == workflow.WorkflowStatus.RUNNING

    utils.set_global_mark()
    assert ray.get(promise) is None
Example #17
def test_workflow_with_pressure(workflow_start_regular_shared):
    pressure_level = 10

    dags = [
        generate_chain(),
        generate_continuation(),
        generate_random_dag(gather_and_hash),
        generate_layered_dag(gather_and_hash),
    ]

    ans = ray.get([d.execute() for d in dags])
    outputs = []
    for _ in range(pressure_level):
        for w in dags:
            outputs.append(workflow.run_async(w))

    assert ray.get(outputs) == ans * pressure_level
Example #18
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock):
                return 2 * v
        else:
            return 2 * v

    # Get the result from a named task before the workflow finishes.
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.run_async(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(
                1, lock_path),
            lock_path,
        ),
        workflow_id="double-2",
    )

    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]),
         wait.remote([inner]),
         wait.remote([outer])],
        timeout=1)
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the workflow finishes, we'll be able to get the results.
    lock.release()
    assert [4, 2, 4] == ray.get([output, inner, outer])

    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")
    assert [2, 4] == ray.get([inner, outer])
Example #19
def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i
Example #20
def test_task_id_generation(workflow_start_regular_shared, request):
    @ray.remote
    def simple(x):
        return x + 1

    x = simple.options(**workflow.options(name="simple")).bind(-1)
    n = 20
    for i in range(1, n):
        x = simple.options(**workflow.options(name="simple")).bind(x)

    workflow_id = "test_task_id_generation"
    ret = workflow.run_async(x, workflow_id=workflow_id)
    outputs = [workflow.get_output_async(workflow_id, name="simple")]
    for i in range(1, n):
        outputs.append(
            workflow.get_output_async(workflow_id, name=f"simple_{i}"))
    assert ray.get(ret) == n - 1
    assert ray.get(outputs) == list(range(n))
Example #21
def test_get_non_exist_output(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")

    @ray.remote
    def simple():
        with FileLock(lock_path):
            return "hello"

    workflow_id = "test_get_non_exist_output"

    with FileLock(lock_path):
        dag = simple.options(**workflow.options(name="simple")).bind()
        ret = workflow.run_async(dag, workflow_id=workflow_id)
        exist = workflow.get_output_async(workflow_id, name="simple")
        non_exist = workflow.get_output_async(workflow_id, name="non_exist")

    assert ray.get(ret) == "hello"
    assert ray.get(exist) == "hello"
    with pytest.raises(ValueError, match="non_exist"):
        ray.get(non_exist)
Example #22
    def _trigger_lineage_reconstruction(with_workflow):
        (tmp_path / "f2").unlink(missing_ok=True)
        (tmp_path / "num_executed").write_text("0")

        worker_node_1 = cluster.add_node(
            num_cpus=2, resources={"worker_1": 1}, storage=str(tmp_path)
        )
        worker_node_2 = cluster.add_node(
            num_cpus=2, resources={"worker_2": 1}, storage=str(tmp_path)
        )
        worker_node_id_1 = ray.get(
            get_node_id.options(num_cpus=0, resources={"worker_1": 1}).remote()
        )
        worker_node_id_2 = ray.get(
            get_node_id.options(num_cpus=0, resources={"worker_2": 1}).remote()
        )
        dag = f2.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id_2, soft=True
            )
        ).bind(
            f1.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    worker_node_id_1, soft=True
                )
            ).bind()
        )

        with FileLock(lock_path):
            if with_workflow:
                ref = workflow.run_async(dag)
            else:
                ref = dag.execute()
            while not (tmp_path / "f2").exists():
                time.sleep(0.1)
            cluster.remove_node(worker_node_1, allow_graceful=False)
            cluster.remove_node(worker_node_2, allow_graceful=False)
        return ray.get(ref).sum()
Example #23
def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [
        workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
        for i in range(4)
    ]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
Example #24
def test_delete(workflow_start_regular):
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.run_async(never_ends.bind("hello world"), workflow_id="never_finishes")

    # Make sure the step is actually executing before killing the cluster
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)
    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        workflow.get_output("never_finishes")

    workflow.delete("never_finishes")

    with pytest.raises(ray.exceptions.RaySystemError):
        # TODO(suquark): we should raise "ValueError" without
        #  blocking on the result.
        workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.run(basic_step.bind("hello world"), workflow_id="finishes")
    assert result == "hello world"
    assert workflow.get_output("finishes") == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ray.exceptions.RaySystemError):
        # TODO(suquark): we should raise "ValueError" without
        #  blocking on the result.
        workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.run(basic_step.bind("123"), workflow_id="finishes") == "123"
Example #25
def test_workflow_queuing_resume_all(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        _refs = [  # noqa: F841
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

        # kill all workflows
        ray.shutdown()

    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    with filelock.FileLock(lock_path):
        workflow_ids, outputs = zip(*sorted(workflow.resume_all()))
        # We should see the same running and pending workflows, because
        # resume_all() gives previously running workflows higher priority.
        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

    assert workflow_ids == (
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
    )

    assert ray.get(list(outputs)) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
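
A hedged sketch of the resume path after a restart, assuming the same storage path is passed back to `ray.init` so the persisted workflows are visible:

import ray
from ray import workflow

ray.init(storage="/tmp/wf_demo")  # must match the pre-crash storage path
workflow.init(max_running_workflows=2, max_pending_workflows=2)

# resume_all() returns (workflow_id, ObjectRef) pairs. Under queuing,
# formerly RUNNING workflows reclaim the running slots first and the
# formerly PENDING ones re-enter the queue.
for wf_id, ref in workflow.resume_all():
    print(wf_id, ray.get(ref))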
Example #26
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # Failure injection: even-numbered workflows raise while this flag exists
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.run_async(long_running.bind(i), workflow_id=str(i))
        for i in range(100)
    ]
    # Test list_all; it should list all running workflows
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all workflows finish
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r
    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0
    # Half of them failed and half succeeded
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all({
        workflow.WorkflowStatus.SUCCESSFUL,
        workflow.WorkflowStatus.FAILED,
        workflow.WorkflowStatus.RUNNING,
    })
    assert len(all_tasks_status) == 100
    assert failed_jobs == [(k, v) for (k, v) in all_tasks_status
                           if v == workflow.WorkflowStatus.FAILED]
    assert finished_jobs == [(k, v) for (k, v) in all_tasks_status
                             if v == workflow.WorkflowStatus.SUCCESSFUL]

    # Test get_status
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume_async("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel
    lock.acquire()
    workflow.resume_async("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48
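
As the test shows, `list_all` accepts a status filter as a plain string, a single `WorkflowStatus`, or a set of them, and returns `(workflow_id, status)` pairs. A compact sketch of the equivalent forms (assumes an initialized session):

from ray import workflow

# Equivalent single-status filters:
failed = workflow.list_all("FAILED")
failed_too = workflow.list_all(workflow.WorkflowStatus.FAILED)

# Multiple statuses at once; returns [(workflow_id, status), ...] pairs.
mixed = workflow.list_all({workflow.WorkflowStatus.SUCCESSFUL,
                           workflow.WorkflowStatus.FAILED})
assert all(isinstance(pair, tuple) for pair in mixed)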