Example #1
def test_output_with_name(workflow_start_regular):
    @ray.remote
    def double(v):
        return 2 * v

    inner_task = double.options(**workflow.options(name="inner")).bind(1)
    outer_task = double.options(**workflow.options(
        name="outer")).bind(inner_task)
    result = workflow.run_async(outer_task, workflow_id="double")
    inner = workflow.get_output_async("double", name="inner")
    outer = workflow.get_output_async("double", name="outer")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4

    @workflow.options(name="double")
    @ray.remote
    def double_2(s):
        return s * 2

    inner_task = double_2.bind(1)
    outer_task = double_2.bind(inner_task)
    workflow_id = "double_2"
    result = workflow.run_async(outer_task, workflow_id=workflow_id)

    inner = workflow.get_output_async(workflow_id, name="double")
    outer = workflow.get_output_async(workflow_id, name="double_1")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4
Example #2
def test_nested_catch_exception_3(workflow_start_regular_shared, tmp_path):
    """Test the case where the exception is not raised by the output task of
    a nested DAG."""

    @ray.remote
    def f3():
        return 10

    @ray.remote
    def f3_exc():
        raise ValueError()

    @ray.remote
    def f2(x):
        return x

    @ray.remote
    def f1(exc):
        if exc:
            return workflow.continuation(f2.bind(f3_exc.bind()))
        else:
            return workflow.continuation(f2.bind(f3.bind()))

    ret, err = workflow.run(
        f1.options(**workflow.options(catch_exceptions=True)).bind(True)
    )
    assert ret is None
    assert isinstance(err, ValueError)

    assert (10, None) == workflow.run(
        f1.options(**workflow.options(catch_exceptions=True)).bind(False)
    )
Example #3
def test_step_failure(workflow_start_regular_shared, tmp_path):
    (tmp_path / "test").write_text("0")

    @ray.remote
    def unstable_step():
        v = int((tmp_path / "test").read_text())
        (tmp_path / "test").write_text(f"{v + 1}")
        if v < 10:
            raise ValueError("Invalid")
        return v

    with pytest.raises(Exception):
        workflow.create(
            unstable_step.options(**workflow.options(max_retries=-2)).bind())

    with pytest.raises(Exception):
        workflow.create(
            unstable_step.options(**workflow.options(
                max_retries=2)).bind()).run()
    assert (10 == workflow.create(
        unstable_step.options(**workflow.options(max_retries=7)).bind()).run())
    (tmp_path / "test").write_text("0")
    (ret, err) = workflow.create(
        unstable_step.options(**workflow.options(
            max_retries=2, catch_exceptions=True)).bind()).run()
    assert ret is None
    assert isinstance(err, ValueError)
    (ret, err) = workflow.create(
        unstable_step.options(**workflow.options(
            max_retries=7, catch_exceptions=True)).bind()).run()
    assert ret == 10
    assert err is None
Example #4
@ray.remote
def inplace_test():
    from ray.worker import global_worker

    worker_id = global_worker.worker_id
    x = check_and_update.options(**workflow.options(allow_inplace=True)).bind(
        "@", worker_id)
    y = check_and_update.bind(x, worker_id)
    z = check_and_update.options(**workflow.options(allow_inplace=True)).bind(
        y, worker_id)
    return workflow.continuation(z)
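
# `check_and_update` is not shown in this excerpt. A minimal hypothetical
# stand-in (not the original test helper), consistent with how it is used
# above: it appends a marker recording whether the task ran on the given
# worker process (i.e. whether it was executed inplace).
@ray.remote
def check_and_update(x, worker_id):
    from ray.worker import global_worker

    if global_worker.worker_id == worker_id:
        return x + "1"  # ran inplace on the caller's worker
    return x + "0"  # ran on a different worker
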
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock_path):
                return 2 * v
        else:
            return 2 * v

    # Get the result from a named step before the workflow has finished
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.create(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(
                1, lock_path),
            lock_path,
        )).run_async("double-2")

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]),
         wait.remote([inner]),
         wait.remote([outer])],
        timeout=1)
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the job has finished, we'll be able to get the result.
    lock.release()
    assert 4 == ray.get(output)

    # Sometimes the "inner" step has not been generated yet when get_output
    # is called right after run_async, so there is a race condition here.
    try:
        v = ray.get(inner)
    except Exception:
        v = None
    if v is not None:
        assert 2 == v
    assert 4 == ray.get(outer)

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")
    assert 2 == ray.get(inner)
    assert 4 == ray.get(outer)
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v):
        return 2 * v

    # Get the result from a named step after the workflow has finished
    assert 4 == workflow.create(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(
                name="inner")).bind(1))).run("double")
    assert ray.get(workflow.get_output("double", name="inner")) == 2
    assert ray.get(workflow.get_output("double", name="outer")) == 4
Example #7
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.create(incr.options(**workflow.options(max_retries=0)).bind()).run(
            "incr"
        )

    assert cnt_file.read_text() == "1"

    from ray.exceptions import RaySystemError

    # TODO(suquark): We should prevent Ray from raising "RaySystemError" in
    #   workflow, because "RaySystemError" does not inherit from the underlying
    #   error, so users and developers cannot catch the expected error.
    #   I find this issue very annoying.
    with pytest.raises((RaySystemError, ValueError)):
        ray.get(workflow.get_output("incr"))

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises((RaySystemError, ValueError)):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
Example #8
def recursive(n):
    if n <= 0:
        with FileLock(lock_path):
            return 42
    return workflow.continuation(
        recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1)
    )
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(ray.exceptions.RaySystemError):
        workflow.create(
            incr.options(**workflow.options(max_retries=0)).bind()).run("incr")

    assert cnt_file.read_text() == "1"

    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
Example #10
def test_get_output_4(workflow_start_regular, tmp_path):
    """Test getting output of a workflow tasks that are dynamically generated."""
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def recursive(n):
        if n <= 0:
            with FileLock(lock_path):
                return 42
        return workflow.continuation(
            recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1))

    workflow_id = "test_get_output_4"
    lock.acquire()
    obj = workflow.run_async(
        recursive.options(**workflow.options(name="10")).bind(10),
        workflow_id=workflow_id,
    )

    outputs = [
        workflow.get_output_async(workflow_id, name=str(i)) for i in range(11)
    ]
    outputs.append(obj)

    import time

    # wait so that 'get_output' is scheduled before executing the workflow
    time.sleep(3)
    lock.release()
    assert ray.get(outputs) == [42] * len(outputs)
Example #11
def test_options_update():
    from ray.workflow.common import WORKFLOW_OPTIONS

    # Options are given in the decorators first and then overridden in .options().
    @workflow.options(name="old_name", metadata={"k": "v"})
    @ray.remote(num_cpus=2, max_retries=1)
    def f():
        return

    # The name is updated from the old name in the decorator to the new name
    # given in .options(), and the workflow metadata is replaced.
    # num_cpus comes from the decorator, while num_returns is added by .options().
    # max_retries is only defined in the decorator, so it is preserved.
    new_f = f.options(
        num_returns=2,
        **workflow.options(name="new_name", metadata={"extra_k2": "extra_v2"}),
    )
    options = new_f.bind().get_options()
    assert options == {
        "num_cpus": 2,
        "num_returns": 2,
        "max_retries": 1,
        "_metadata": {
            WORKFLOW_OPTIONS: {
                "name": "new_name",
                "metadata": {
                    "extra_k2": "extra_v2"
                },
            }
        },
    }
Example #12
def test_dynamic_output(workflow_start_regular_shared):
    @ray.remote
    def exponential_fail(k, n):
        if n > 0:
            if n < 3:
                raise Exception("Failed intentionally")
            return workflow.continuation(
                exponential_fail.options(**workflow.options(
                    name=f"step_{n}")).bind(k * 2, n - 1))
        return k

    # When the workflow fails, the dynamic output should point to the
    # latest successful step.
    try:
        workflow.run(
            exponential_fail.options(**workflow.options(name="step_0")).bind(
                3, 10),
            workflow_id="dynamic_output",
        )
    except Exception:
        pass
    from ray.workflow.workflow_storage import get_workflow_storage

    wf_storage = get_workflow_storage(workflow_id="dynamic_output")
    result = wf_storage.inspect_step("step_0")
    assert result.output_step_id == "step_3"
Example #13
def exponential_fail(k, n):
    if n > 0:
        if n < 3:
            raise Exception("Failed intentionally")
        return workflow.continuation(
            exponential_fail.options(**workflow.options(
                name=f"step_{n}")).bind(k * 2, n - 1))
    return k
Example #14
def test_task_id_generation(workflow_start_regular_shared, request):
    @ray.remote
    def simple(x):
        return x + 1

    x = simple.options(**workflow.options(name="simple")).bind(-1)
    n = 20
    for i in range(1, n):
        x = simple.options(**workflow.options(name="simple")).bind(x)

    workflow_id = "test_task_id_generation"
    ret = workflow.create(x).run_async(workflow_id=workflow_id)
    outputs = [workflow.get_output(workflow_id, name="simple")]
    for i in range(1, n):
        outputs.append(workflow.get_output(workflow_id, name=f"simple_{i}"))
    assert ray.get(ret) == n - 1
    assert ray.get(outputs) == list(range(n))
Example #15
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock_path):
                return 2 * v
        else:
            return 2 * v

    # Get the result from a named step before the workflow has finished
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.run_async(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(
                1, lock_path),
            lock_path,
        ),
        workflow_id="double-2",
    )

    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]),
         wait.remote([inner]),
         wait.remote([outer])],
        timeout=1)
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the job has finished, we'll be able to get the result.
    lock.release()
    assert [4, 2, 4] == ray.get([output, inner, outer])

    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")
    assert [2, 4] == ray.get([inner, outer])
Example #16
@ray.remote
def checkpoint_dag(checkpoint):
    @ray.remote
    def large_input():
        return np.arange(2**24)

    @ray.remote
    def identity(x):
        return x

    @ray.remote
    def average(x):
        return np.mean(x)

    x = large_input.options(
        **workflow.options(name="large_input", checkpoint=checkpoint)).bind()
    y = identity.options(
        **workflow.options(name="identity", checkpoint=checkpoint)).bind(x)
    return workflow.continuation(
        average.options(**workflow.options(name="average")).bind(y))
Example #17
def test_user_metadata_not_dict(workflow_start_regular):
    @ray.remote
    def simple():
        return 0

    with pytest.raises(ValueError):
        workflow.create(simple.options(**workflow.options(metadata="x")).bind())

    with pytest.raises(ValueError):
        workflow.create(simple.bind()).run(metadata="x")
Example #18
    @ray.remote
    def tail_recursion(n):
        import inspect

        # Check that the call stack is not growing as the recursion proceeds.
        assert len(inspect.stack(0)) < 20
        if n <= 0:
            return "ok"
        return workflow.continuation(
            tail_recursion.options(**workflow.options(
                allow_inplace=True)).bind(n - 1))
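
    # Hypothetical driver for `tail_recursion` above (the recursion depth and
    # workflow id are illustrative, not from the original test). With
    # allow_inplace, each continuation runs on the same worker process while
    # the assert above verifies that the Python call stack stays bounded.
    assert "ok" == workflow.run(
        tail_recursion.options(**workflow.options(allow_inplace=True)).bind(30),
        workflow_id="tail_recursion",
    )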
Example #19
def test_get_named_step_output_error(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, error):
        if error:
            raise Exception()
        return v + v

    # Force it to fail for the outer step
    with pytest.raises(Exception):
        workflow.create(
            double.options(**workflow.options(name="outer")).bind(
                double.options(**workflow.options(name="inner")).bind(
                    1, False), True)).run("double")

    # For the inner step, it should have already been executed.
    assert 2 == ray.get(workflow.get_output("double", name="inner"))
    outer = workflow.get_output("double", name="outer")
    with pytest.raises(Exception):
        ray.get(outer)
Example #20
@ray.remote
def checkpoint_dag(checkpoint):
    @ray.remote
    def large_input():
        return np.arange(SIZE)

    @ray.remote
    def identity(x):
        if not utils.check_global_mark():
            import os

            os.kill(os.getpid(), 9)
        return x

    @ray.remote
    def average(x):
        return np.mean(x)

    x = large_input.options(**workflow.options(checkpoint=checkpoint)).bind()
    y = identity.options(**workflow.options(checkpoint=checkpoint)).bind(x)
    return workflow.continuation(average.bind(y))
Example #21
def test_nested_catch_exception(workflow_start_regular_shared, tmp_path):
    @ray.remote
    def f2():
        return 10

    @ray.remote
    def f1():
        return workflow.continuation(f2.bind())

    assert (10, None) == workflow.create(
        f1.options(**workflow.options(catch_exceptions=True)).bind()).run()
Example #22
def test_nested_catch_exception_2(workflow_start_regular_shared, tmp_path):
    @ray.remote
    def f1(n):
        if n == 0:
            raise ValueError()
        else:
            return workflow.continuation(f1.bind(n - 1))

    ret, err = workflow.create(
        f1.options(**workflow.options(catch_exceptions=True)).bind(5)).run()
    assert ret is None
    assert isinstance(err, ValueError)
Example #23
def test_user_metadata_not_json_serializable(workflow_start_regular):
    @ray.remote
    def simple():
        return 0

    class X:
        pass

    with pytest.raises(ValueError):
        workflow.create(simple.options(**workflow.options(metadata={"x": X()})).bind())

    with pytest.raises(ValueError):
        workflow.create(simple.bind()).run(metadata={"x": X()})
Example #24
def test_checkpoint_dag_full(workflow_start_regular_shared):
    outputs = workflow.create(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(True)
    ).run(workflow_id="checkpoint_whole")
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #25
@ray.remote
def exp_inplace(k, n, worker_id=None):
    from ray.worker import global_worker

    _worker_id = global_worker.worker_id
    if worker_id is not None:
        # sub-workflows running inplace
        assert _worker_id == worker_id
    worker_id = _worker_id

    if n == 0:
        return k
    return workflow.continuation(
        exp_inplace.options(**workflow.options(allow_inplace=True)).bind(
            2 * k, n - 1, worker_id))
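
# Hypothetical driver for `exp_inplace` above (the arguments and workflow id
# are illustrative, not from the original test): 3 * 2**4 == 48, and every
# continuation should run inplace on the same worker process.
assert 48 == workflow.run(
    exp_inplace.options(**workflow.options(allow_inplace=True)).bind(3, 4),
    workflow_id="exp_inplace",
)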
Example #26
def test_checkpoint_dag_skip_partial(workflow_start_regular_shared):
    outputs = workflow.run(
        checkpoint_dag.options(**workflow.options(
            name="checkpoint_dag")).bind(False),
        workflow_id="checkpoint_partial",
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = workflow.resume("checkpoint_partial")
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #27
@ray.remote
def custom_retry_strategy(func: Any, num_retries: int, delay_s: int) -> str:
    import time

    @ray.remote
    def handle_result(res: Tuple[Optional[str], Optional[Exception]]) -> str:
        result, error = res
        if result:
            return result
        elif num_retries <= 0:
            raise error
        else:
            print("Retrying exception after delay", error)
            time.sleep(delay_s)
            return workflow.continuation(
                custom_retry_strategy.bind(func, num_retries - 1, delay_s))

    res = func.options(**workflow.options(catch_exceptions=True)).bind()
    return workflow.continuation(handle_result.bind(res))
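
# Hypothetical usage sketch of the retry strategy above. `flaky_step`, the
# retry parameters, and the workflow id are illustrative, not part of the
# original example; it assumes `custom_retry_strategy` is decorated with
# @ray.remote as shown above.
@ray.remote
def flaky_step() -> str:
    import random

    if random.random() < 0.5:
        raise RuntimeError("transient failure")
    return "ok"


# Retry the flaky task up to 3 times, waiting 1 second between attempts.
print(workflow.run(custom_retry_strategy.bind(flaky_step, 3, 1)))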
Example #28
def test_checkpoint_dag_recovery_skip(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(RaySystemError):
        workflow.create(
            checkpoint_dag.options(**workflow.options(
                checkpoint=False)).bind(False)).run(
                    workflow_id="checkpoint_skip_recovery")
    run_duration_skipped = time.time() - start

    utils.set_global_mark()

    start = time.time()
    recovered = ray.get(workflow.resume("checkpoint_skip_recovery"))
    recover_duration_skipped = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())

    print(f"[skipped] run_duration = {run_duration_skipped}, "
          f"recover_duration = {recover_duration_skipped}")
Example #29
def test_get_non_exist_output(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")

    @ray.remote
    def simple():
        with FileLock(lock_path):
            return "hello"

    workflow_id = "test_get_non_exist_output"

    with FileLock(lock_path):
        dag = simple.options(**workflow.options(name="simple")).bind()
        ret = workflow.run_async(dag, workflow_id=workflow_id)
        exist = workflow.get_output_async(workflow_id, name="simple")
        non_exist = workflow.get_output_async(workflow_id, name="non_exist")

    assert ray.get(ret) == "hello"
    assert ray.get(exist) == "hello"
    with pytest.raises(ValueError, match="non_exist"):
        ray.get(non_exist)
Example #30
@ray.remote
def celebrate(result: str) -> None:
    print("Success!", result)


@ray.remote
def send_email(result: str) -> None:
    print("Sending email", result)


@ray.remote
def exit_handler(res: Tuple[Optional[str], Optional[Exception]]) -> None:
    result, error = res
    email = send_email.bind(f"Raw result: {result}, {error}")
    if error:
        handler = cry.bind(error)
    else:
        handler = celebrate.bind(result)
    return workflow.continuation(wait_all.bind(handler, email))


@ray.remote
def wait_all(*deps):
    return "done"


if __name__ == "__main__":
    res = intentional_fail.options(**workflow.options(catch_exceptions=True)).bind()
    print(workflow.run(exit_handler.bind(res)))