Example #1
def test_running_and_canceled_workflow(workflow_start_regular, tmp_path):

    workflow_id = "simple"
    flag = tmp_path / "flag"

    @ray.remote
    def simple():
        flag.touch()
        time.sleep(1000)
        return 0

    workflow.create(simple.bind()).run_async(workflow_id)

    # Wait until the step runs to make sure pre-run metadata is written
    while not flag.exists():
        time.sleep(1)

    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "RUNNING"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]

    workflow.cancel(workflow_id)

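    # A canceled workflow keeps its start time but never records an end time.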
    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "CANCELED"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]
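
These snippets come from Ray's workflow test suite and omit their imports and test fixtures. A minimal header they assume might look like the sketch below; `workflow_start_regular` (and its `_shared` variant) and `utils` are conftest helpers from that suite, named here only as assumptions, and the exact import path of `WorkflowStatus` may vary across Ray versions.

import re
import time

import filelock                    # used as filelock.FileLock in Example #3
from filelock import FileLock      # used bare in Example #5
import pytest

import ray
from ray import workflow
from ray.exceptions import GetTimeoutError

# Assumed: the WorkflowStatus enum is also reachable as workflow.WorkflowStatus;
# import it from the ray.workflow package matching your Ray version.
# Assumed: workflow_start_regular / workflow_start_regular_shared are pytest
# fixtures that start Ray and initialize workflow storage; tmp_path is the
# built-in pytest fixture; utils provides the global-mark helpers of Example #2.
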
Example #2
def test_wait_failure_recovery_2(workflow_start_regular_shared):
    # Test failure recovery of "workflow.wait" and its input steps.

    @workflow.step
    def sleep_identity(x: int):
        # Block the step until the global mark is set
        while not utils.check_global_mark():
            time.sleep(0.1)
        time.sleep(x)
        return x

    @workflow.step
    def identity(x):
        return x

    ws = [
        sleep_identity.step(2),
        sleep_identity.step(5),
        sleep_identity.step(1),
    ]
    w = workflow.wait(ws, num_returns=2, timeout=None)
    utils.unset_global_mark()
    _ = identity.step(w).run_async(workflow_id="wait_failure_recovery_2")
    # wait until "workflow.wait" has started running
    time.sleep(10)
    workflow.cancel("wait_failure_recovery_2")
    time.sleep(2)

    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("wait_failure_recovery_2"))
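    # Only the two fastest steps (the 2s and 1s sleeps) become ready;
    # the 5s step stays in the unready set.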
    assert ready == [2, 1]
Example #3
def test_cancellation(tmp_path, workflow_start_regular):
    lock_a = tmp_path / "lock_a"
    lock_b = tmp_path / "lock_b"

    @ray.remote
    def simple():
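        # Acquire lock_a, then block on lock_b, which the test below holds.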
        with filelock.FileLock(lock_a):
            with filelock.FileLock(lock_b):
                pass

    workflow_id = "test_cancellation"

    with filelock.FileLock(lock_b):
        r = workflow.run_async(simple.bind(), workflow_id=workflow_id)
        try:
            ray.get(r, timeout=5)
        except GetTimeoutError:
            pass
        else:
            assert False

        assert workflow.get_status(workflow_id) == WorkflowStatus.RUNNING

        workflow.cancel(workflow_id)
        with pytest.raises(workflow.WorkflowCancellationError):
            ray.get(r)
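        # Cancellation terminates the running step, so lock_a is released
        # and can be re-acquired here.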
        lock = filelock.FileLock(lock_a)
        lock.acquire(timeout=5)

        assert workflow.get_status(workflow_id) == WorkflowStatus.CANCELED
Example #4
def test_no_init(shutdown_only):
    @ray.remote
    def f():
        pass

    fail_wf_init_error_msg = re.escape(
        "`workflow.init()` must be called prior to using the workflows API.")

    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.create(f.bind()).run()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.list_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.resume_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.cancel("wf")
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.get_actor("wf")
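
The error message asserted above implies the required setup order. A minimal sketch, assuming default workflow storage (the `workflow.init()` signature varies across Ray versions):

import ray
from ray import workflow

ray.init()
# Initialize the workflows API before creating or running any workflow;
# some Ray versions accept a storage location argument here.
workflow.init()

@ray.remote
def f():
    return "ok"

# With init done, the calls that raised RuntimeError above now work.
workflow.create(f.bind()).run(workflow_id="wf")
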
Example #5
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For sync between jobs
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

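        # Even-numbered workflows fail while the flag file exists;
        # odd-numbered ones return 100.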
        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]
    # Test list_all; it should list all currently running jobs
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all tasks finish
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r
    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0
    # Half of them failed and half succeeded
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all: of the 50 failed workflows, "0" has since succeeded
    # and "2" was canceled, leaving 48 to resume
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48
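
As Example #5 shows, `workflow.list_all` appears to accept the status filter in several forms; a brief recap under the same API assumptions:

# Single status, as a string, a module-level alias, or the enum itself.
workflow.list_all("FAILED")
workflow.list_all(workflow.RUNNING)
workflow.list_all(workflow.WorkflowStatus.SUCCESSFUL)

# A set of statuses matches workflows in any of them; results come back
# as (workflow_id, status) pairs.
workflow.list_all({workflow.WorkflowStatus.FAILED,
                   workflow.WorkflowStatus.SUCCESSFUL})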