Example #1
def test_connect_with_disconnected_node(shutdown_only):
    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_period_milliseconds": 10,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    p = init_error_pubsub()
    errors = get_error_message(p, 1, timeout=5)
    assert len(errors) == 0
    # This node will be killed by SIGKILL; ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is also killed by SIGKILL; ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is removed gracefully (SIGTERM); ray_monitor will not report it.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    # There should be no connection errors to the dead node.
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    p.close()
Example #2
# `sync` is presumably supplied by parametrizing the test over both variants.
@pytest.mark.parametrize("sync", [True, False])
def test_warning_many_actor_tasks_queued(shutdown_only, sync: bool):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class SyncFoo:
        def f(self):
            import time

            time.sleep(1)

    @ray.remote(num_cpus=1)
    class AsyncFoo:
        async def f(self):
            import asyncio

            await asyncio.sleep(1)

    Foo = SyncFoo if sync else AsyncFoo
    a = Foo.remote()
    [a.f.remote() for _ in range(50000)]
    errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING)
    msgs = [e.error_message for e in errors]
    assert "Warning: More than 5000 tasks are pending submission to actor" in msgs[0]
    assert "Warning: More than 10000 tasks are pending submission to actor" in msgs[1]
    assert "Warning: More than 20000 tasks are pending submission to actor" in msgs[2]
    assert "Warning: More than 40000 tasks are pending submission to actor" in msgs[3]
Example #3
def test_warning_for_too_many_actors(shutdown_only):
    # Check that we receive a warning if we run a workload that requires too
    # many workers to be started.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)

    p = init_error_pubsub()

    @ray.remote
    class Foo:
        def __init__(self):
            time.sleep(1000)

    # NOTE: We must keep references to the actors; otherwise they go out of
    # scope and are garbage collected.
    actor_group1 = [Foo.remote() for _ in range(num_cpus * 10)]
    assert len(actor_group1) == num_cpus * 10
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR

    actor_group2 = [Foo.remote() for _ in range(num_cpus * 3)]
    assert len(actor_group2) == num_cpus * 3
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()
Example #4
def test_agent_report_unexpected_raylet_death_large_file(shutdown_only):
    """Test agent reports Raylet death if it is not SIGTERM."""

    ray.init(include_dashboard=True)
    p = init_error_pubsub()

    node = ray._private.worker._global_node
    all_processes = node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # Append well over 1 MB of data to the Raylet log file.
    with open(os.path.join(node.get_session_dir_path(), "logs", "raylet.out"),
              "a") as f:
        f.write("test data\n" * 1024**2)

    # The agent should be dead if the raylet exits.
    raylet_proc.kill()
    raylet_proc.wait()
    agent_proc.wait(5)

    # Reading and publishing logs should still work.
    errors = get_error_message(p, 1, ray_constants.RAYLET_DIED_ERROR)
    assert len(errors) == 1, errors
    err = errors[0]
    assert err.type == ray_constants.RAYLET_DIED_ERROR
    assert "Termination is unexpected." in err.error_message, err.error_message
    assert "Raylet logs:" in err.error_message, err.error_message
Example #5
def test_detached_warning(shutdown_only):
    ray.init()

    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    error_pubsub = init_error_pubsub()
    actor = DetachedActor.options(  # noqa: F841
        name="Pinger", lifetime="detached").remote()
    errors = get_error_message(error_pubsub, 1, None)
    error = errors.pop()
    assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR
Example #6
def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that we receive a warning if we run a workload that requires too
    # many workers to be started.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)

    ray.get([
        remote_wait.locked.remote(),
        nested_wait.locked.remote(),
    ])

    @ray.remote(num_cpus=0.25)
    def f():
        time.sleep(1000)
        return 1

    @ray.remote(num_cpus=0.25)
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote(num_cpus=0.25)
    def g(remote_waits, nested_waits):
        # Block so that the f tasks all get submitted to the scheduler after
        # the g tasks.
        remote_wait.release.remote()
        # Wait until every lock has been released.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Block the remote tasks until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()
Example #7
def test_warning_actor_waiting_on_actor(shutdown_only):
    ray.init(num_cpus=1,
             _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        pass

    a = Actor.remote()  # noqa
    b = Actor.remote()  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
Example #8
def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
    # Check that we cannot place an actor on a 0 CPU machine and that we get an
    # infeasibility warning (even though the actor creation task itself
    # requires no CPUs).

    ray.init(num_cpus=0)
    p = init_error_pubsub()

    @ray.remote
    class Foo:
        pass

    # The actor creation should be infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
    p.close()
Example #9
def test_raylet_and_agent_share_fate(shutdown_only):
    """Test raylet and agent share fate."""

    ray.init(include_dashboard=True)
    p = init_error_pubsub()

    node = ray._private.worker._global_node
    all_processes = node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # The agent should be dead if the raylet exits.
    raylet_proc.terminate()
    raylet_proc.wait()
    agent_proc.wait(5)

    # No error should be reported for graceful termination.
    errors = get_error_message(p, 1, ray_constants.RAYLET_DIED_ERROR)
    assert len(errors) == 0, errors

    ray.shutdown()

    ray.init(include_dashboard=True)
    all_processes = ray._private.worker._global_node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)
    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # The raylet should be dead if the agent exits.
    agent_proc.kill()
    agent_proc.wait()
    raylet_proc.wait(5)
Example #10
# `sync` is presumably supplied by parametrizing the test over both variants.
@pytest.mark.parametrize("sync", [True, False])
def test_no_warning_many_actor_tasks_queued_when_sequential(shutdown_only, sync: bool):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class SyncFoo:
        def f(self):
            return 1

    @ray.remote(num_cpus=1)
    class AsyncFoo:
        async def f(self):
            return 1

    Foo = SyncFoo if sync else AsyncFoo
    a = Foo.remote()
    for _ in range(10000):
        assert ray.get(a.f.remote()) == 1
    errors = get_error_message(p, 1, ray_constants.EXCESS_QUEUEING_WARNING, timeout=1)
    assert len(errors) == 0
Example #11
def test_warning_all_tasks_blocked(shutdown_only):
    ray.init(num_cpus=1,
             _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            return 0

    @ray.remote
    def f():
        # Creating all of these actors is not possible: only 1 CPU is available.
        actors = [Foo.remote() for _ in range(3)]
        for a in actors:
            ray.get(a.f.remote())

    # Run in a task to check that the blocked-task case is handled correctly.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
Example #12
def test_warning_task_waiting_on_actor(shutdown_only):
    ray.init(num_cpus=1,
             _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        def hello(self):
            pass

    a = Actor.remote()  # noqa
    ray.get(a.hello.remote())

    @ray.remote(num_cpus=1)
    def f():
        print("f running")
        time.sleep(999)

    ids = [f.remote()]  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
Example #13
def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)

    # If a gRPC call exceeds its timeout, the call is cancelled on the client
    # side but the server may still reply to it, leading to a missed message.
    # Using a sequence number to ensure that no message is dropped could be
    # the long-term solution, but its complexity, and the fact that Ray
    # subscribers do not use deadlines in production, make it less preferable.
    # Therefore, a simpler workaround is used instead: a different subscriber
    # is used for each get_error_message() call.
    subscribers = [init_error_pubsub() for _ in range(3)]

    # There shouldn't be any errors yet.
    errors = get_error_message(subscribers[0], 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(subscribers[1], 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import init_error_pubsub, get_error_message

ray.init(address="{}")
subscribers = [init_error_pubsub() for _ in range(2)]
time.sleep(1)
errors = get_error_message(subscribers[0], 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(subscribers[1], 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(
        address, error_string2, error_string2
    )

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(subscribers[2], 1)
    assert len(errors) == 1
Example #14
# Presumably registered as a pytest fixture (the standard yield-fixture shape).
@pytest.fixture
def error_pubsub():
    p = init_error_pubsub()
    yield p
    p.close()
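When registered as above, pytest injects the subscriber into any test that lists error_pubsub as a parameter and closes it on teardown. A hypothetical usage sketch follows (the test name is illustrative; ray_start_regular is assumed to be the cluster-starting fixture from Ray's test conftest, listed first so Ray is already running when the subscriber is created):

from ray._private.test_utils import get_error_message

def test_no_errors_on_clean_start(ray_start_regular, error_pubsub):
    # Both arguments are fixtures: the first starts a Ray instance, the second
    # subscribes to the error channel and closes the subscriber on teardown.
    errors = get_error_message(error_pubsub, 1, timeout=2)
    # A freshly started cluster should not have published any errors.
    assert len(errors) == 0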
Example #15
def test_drain_api(shutdown_only):
    """E2E test of the autoscaler's use of the DrainNode API.

    Adapted from test_autoscaler_fake_multinode.py.

    The strategy is to mock out Ray node process termination in
    FakeMultiNodeProvider, leaving node termination to the DrainNode API.

    Scale-down is verified via `ray.cluster_resources`. It is also verified
    that no removed_node errors are issued after scale-down.

    Validity of this test depends on the current implementation of DrainNode.
    DrainNode currently works by asking the GCS to de-register and shut down
    Ray nodes.
    """
    # Autoscaling cluster with Ray process termination mocked out in the node
    # provider.
    cluster = MockAutoscalingCluster(
        head_resources={"CPU": 1},
        worker_node_types={
            "gpu_node": {
                "resources": {
                    "CPU": 1,
                    "GPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
        },
    )

    try:
        cluster.start()
        ray.init("auto")

        # Triggers the addition of a GPU node.
        @ray.remote(num_gpus=1)
        def f():
            print("gpu ok")

        ray.get(f.remote())

        # Verify scale-up
        assert ray.cluster_resources().get("GPU", 0) == 1
        # Sleep for double the idle timeout of 6 seconds.
        time.sleep(12)

        # Verify scale-down
        assert ray.cluster_resources().get("GPU", 0) == 0

        # Check that no errors were raised while draining nodes.
        # (Logic copied from test_failure4::test_gcs_drain.)
        try:
            p = init_error_pubsub()
            errors = get_error_message(p,
                                       1,
                                       ray_constants.REMOVED_NODE_ERROR,
                                       timeout=5)
            assert len(errors) == 0
        finally:
            p.close()
    finally:
        cluster.shutdown()
Example #16
def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)
    p = init_error_pubsub()

    # There shouldn't be any errors yet.
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(p, 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import (init_error_pubsub, get_error_message)

ray.init(address="{}")
p = init_error_pubsub()
time.sleep(1)
errors = get_error_message(p, 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(p, 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(p, 1)
    assert len(errors) == 1
    p.close()
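All of the examples above share one shape: subscribe with init_error_pubsub(), trigger a failure, collect the published records with get_error_message(), assert on their type and error_message fields, and close the subscriber. The sketch below distills that pattern and adds the imports the snippets omit (the paths assume a recent Ray layout where the test utilities live in ray._private.test_utils, as in Example #13's driver script; expect_errors is a hypothetical helper, not part of Ray):

import ray
from ray._private.test_utils import init_error_pubsub, get_error_message


def expect_errors(trigger, num, error_type=None, timeout=20):
    """Hypothetical helper: subscribe, run `trigger`, return published errors."""
    p = init_error_pubsub()
    try:
        trigger()
        # Blocks until `num` matching errors arrive or `timeout` seconds pass.
        return get_error_message(p, num, error_type, timeout=timeout)
    finally:
        p.close()


if __name__ == "__main__":
    ray.init(num_cpus=1)

    @ray.remote
    def fail():
        raise RuntimeError("boom")

    def trigger():
        try:
            ray.get(fail.remote())
        except Exception:
            pass

    # A failing task publishes an error record whose message contains the
    # exception text (see Examples #13 and #16).
    errors = expect_errors(trigger, 1)
    assert len(errors) == 1 and "boom" in errors[0].error_message
    ray.shutdown()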