Example #1
def test_pending_task_dependency_pinning(one_worker_100MiB):
    @ray.remote
    def pending(input1, input2):
        return

    # The object implicitly ray.put here (as a large task argument) will go
    # out of scope immediately, so if pending task dependencies aren't
    # considered, it will be evicted before the ray.get below due to the
    # subsequent ray.puts that fill up the object store.
    np_array = np.zeros(40 * 1024 * 1024, dtype=np.uint8)
    signal = SignalActor.remote()
    obj_ref = pending.remote(np_array, signal.wait.remote())

    for _ in range(2):
        ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    ray.get(signal.send.remote())
    ray.get(obj_ref)
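
Note: every example on this page depends on a SignalActor helper from Ray's test utilities. Its definition isn't shown here, but a minimal sketch consistent with the inline Signal actors in Examples #4, #5, and #11 (and with the send(clear=True) calls in Examples #21 and #24) would be:

import asyncio

import ray


@ray.remote(num_cpus=0)
class SignalActor:
    def __init__(self):
        self.ready_event = asyncio.Event()

    def send(self, clear=False):
        # Wake up every pending wait(); optionally reset the event so that
        # subsequent wait() calls block again.
        self.ready_event.set()
        if clear:
            self.ready_event.clear()

    async def wait(self):
        await self.ready_event.wait()
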
Example #2
def test_async_callback(ray_start_regular_shared):
    global_set = set()

    ref = ray.put(None)
    ref._on_completed(lambda _: global_set.add("completed-1"))
    wait_for_condition(lambda: "completed-1" in global_set)

    signal = SignalActor.remote()

    @ray.remote
    def wait():
        ray.get(signal.wait.remote())

    ref = wait.remote()
    ref._on_completed(lambda _: global_set.add("completed-2"))
    assert "completed-2" not in global_set
    signal.send.remote()
    wait_for_condition(lambda: "completed-2" in global_set)
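
wait_for_condition is another test-utility helper not defined on this page: it polls a predicate until it becomes true. A rough sketch of the behavior these tests rely on (the defaults are assumptions; the assert in Example #10 implies a version that returned a bool rather than raising on timeout):

import time


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    # Poll the predicate until it returns True or the timeout expires.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return True
        time.sleep(retry_interval_ms / 1000.0)
    return False
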
Example #3
def test_recursively_nest_ids(one_worker_100MiB, use_ray_put, failure):
    @ray.remote(max_retries=1)
    def recursive(ref, signal, max_depth, depth=0):
        unwrapped = ray.get(ref[0])
        if depth == max_depth:
            ray.get(signal.wait.remote())
            if failure:
                os._exit(0)
            return
        else:
            return recursive.remote(unwrapped, signal, max_depth, depth + 1)

    signal = SignalActor.remote()

    max_depth = 5
    array_oid = put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    nested_oid = array_oid
    for _ in range(max_depth):
        nested_oid = ray.put([nested_oid])
    head_oid = recursive.remote([nested_oid], signal, max_depth)

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid, nested_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    try:
        ray.get(tail_oid)
        assert not failure
    # TODO(edoakes): this should raise WorkerError.
    except ray.exceptions.ObjectLostError:
        assert failure

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
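
This example (and several that follow) uses three helpers from Ray's reference-counting tests whose definitions are not shown here: the one_worker_100MiB fixture, put_object, and _fill_object_store_and_get. A sketch of plausible definitions, inferred from how they are called on this page (the exception type raised for an evicted object varied across Ray versions, as the except clauses in Examples #3 and #20 show):

import numpy as np
import pytest

import ray


@pytest.fixture
def one_worker_100MiB(request):
    # A single worker with a small (100 MiB) object store, so a handful of
    # 40 MiB arrays is enough to force eviction of unpinned objects.
    yield ray.init(num_cpus=1, object_store_memory=100 * 1024 * 1024)
    ray.shutdown()


@ray.remote
def _put(obj):
    return obj


def put_object(obj, use_ray_put):
    # Create the object either directly via ray.put() or as the return
    # value of a task, depending on the parametrization.
    return ray.put(obj) if use_ray_put else _put.remote(obj)


def _fill_object_store_and_get(obj, succeed=True, object_MiB=40,
                               num_objects=5):
    # Fill the object store so that unpinned objects are evicted, then
    # check whether the object under test is still retrievable.
    for _ in range(num_objects):
        ray.put(np.zeros(object_MiB * 1024 * 1024, dtype=np.uint8))
    if isinstance(obj, bytes):
        obj = ray.ObjectID(obj)
    if succeed:
        ray.get(obj)
    else:
        with pytest.raises(ray.exceptions.ObjectLostError):
            ray.get(obj)
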
Example #4
def test_pass_returned_object_id(one_worker_100MiB, use_ray_put):
    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    @ray.remote
    def put():
        return

    @ray.remote
    def return_an_id():
        return [
            put_object(np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
        ]

    @ray.remote
    def pending(ref):
        ray.get(ref[0])
        return ref[0]

    signal = SignalActor.remote()
    outer_oid = return_an_id.remote()
    pending_oid = pending.remote([outer_oid])

    # Remove the local reference to the returned ID.
    del outer_oid

    # Check that the inner ID is pinned by the remote task ID.
    _fill_object_store_and_get(pending_oid, succeed=False)
    ray.get(signal.send.remote())
    inner_oid = ray.get(pending_oid)
    inner_oid_binary = inner_oid.binary()
    _fill_object_store_and_get(inner_oid_binary)

    del pending_oid
    del inner_oid
    _fill_object_store_and_get(inner_oid_binary, succeed=False)
Example #5
def test_recursive_serialized_reference(one_worker_100MiB, use_ray_put):
    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    @ray.remote
    def recursive(ref, signal, max_depth, depth=0):
        ray.get(ref[0])
        if depth == max_depth:
            return ray.get(signal.wait.remote())
        else:
            return recursive.remote(ref, signal, max_depth, depth + 1)

    signal = SignalActor.remote()

    max_depth = 5
    array_oid = put_object(np.zeros(40 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    head_oid = recursive.remote([array_oid], signal, max_depth)

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    assert ray.get(tail_oid) is None

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
Example #6
def test_dying_worker_get(ray_start_2_cpus):
    @ray.remote
    def sleep_forever(signal):
        ray.get(signal.send.remote())
        time.sleep(10**6)

    @ray.remote
    def get_worker_pid():
        return os.getpid()

    signal = SignalActor.remote()

    x_id = sleep_forever.remote(signal)
    ray.get(signal.wait.remote())
    # Get the PID of the other worker.
    worker_pid = ray.get(get_worker_pid.remote())

    @ray.remote
    def f(id_in_a_list):
        ray.get(id_in_a_list[0])

    # Have the worker wait in a get call.
    result_id = f.remote([x_id])
    time.sleep(1)

    # Make sure the task hasn't finished.
    ready_ids, _ = ray.wait([result_id], timeout=0)
    assert len(ready_ids) == 0

    # Kill the worker.
    os.kill(worker_pid, SIGKILL)
    time.sleep(0.1)

    # Make sure the sleep task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0
    # Seal the object so the store attempts to notify the worker that the
    # get has been fulfilled.
    obj = np.ones(200 * 1024, dtype=np.uint8)
    ray.worker.global_worker.put_object(obj, x_id)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray._private.services.remaining_processes_alive()
Example #7
def _setup_cluster_for_test(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    worker_should_exit = SignalActor.remote()

    # Generate some metrics from a task and an actor.
    @ray.remote
    def f():
        counter = Count("test_counter", description="desc")
        counter.record(1)
        ray.get(worker_should_exit.wait.remote())

    @ray.remote
    class A:
        async def ping(self):
            histogram = Histogram("test_histogram",
                                  description="desc",
                                  boundaries=[0.1, 1.6])
            histogram.record(1.5)
            ray.get(worker_should_exit.wait.remote())

    a = A.remote()
    obj_refs = [f.remote(), a.ping.remote()]

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")

    yield prom_addresses

    ray.get(worker_should_exit.send.remote())
    ray.get(obj_refs)
    ray.shutdown()
    cluster.shutdown()
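
Because _setup_cluster_for_test yields, it is presumably registered as a pytest fixture in the original module; a hypothetical consumer would then receive the Prometheus scrape addresses as its value:

# Hypothetical consumer of the fixture defined above:
def test_metrics_export(_setup_cluster_for_test):
    prom_addresses = _setup_cluster_for_test
    for addr in prom_addresses:
        # Scrape f"http://{addr}" and assert on the exported metrics.
        pass
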
Example #8
def test_map_async(pool_4_processes):
    def f(args):
        index, signal = args
        ray.get(signal.wait.remote())
        return index, os.getpid()

    signal = SignalActor.remote()
    async_result = pool_4_processes.map_async(f, [(i, signal)
                                                  for i in range(1000)])
    assert not async_result.ready()
    with pytest.raises(TimeoutError):
        async_result.get(timeout=0.01)
    async_result.wait(timeout=0.01)

    # Send the signal to finish the tasks.
    ray.get(signal.send.remote())
    async_result.wait(timeout=10)
    assert async_result.ready()
    assert async_result.successful()

    results = async_result.get()
    assert len(results) == 1000

    pid_counts = defaultdict(int)
    for i, (index, pid) in enumerate(results):
        assert i == index
        pid_counts[pid] += 1

    # Check that the functions are spread somewhat evenly.
    for count in pid_counts.values():
        assert count > 100

    def bad_func(index):
        if index == 50:
            raise Exception("test_map_async failure")

    async_result = pool_4_processes.map_async(bad_func, range(100))
    async_result.wait(10)
    assert async_result.ready()
    assert not async_result.successful()

    with pytest.raises(Exception, match="test_map_async failure"):
        async_result.get()
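
The pool_4_processes fixture used here and in test_close below is not defined on this page. Given the map_async/close/join API, it is presumably Ray's drop-in multiprocessing Pool with four worker processes; a plausible sketch:

import pytest

import ray
from ray.util.multiprocessing import Pool


@pytest.fixture
def pool_4_processes():
    pool = Pool(processes=4)
    yield pool
    # Tear down the pool and the Ray instance it started.
    pool.terminate()
    pool.join()
    ray.shutdown()
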
Example #9
def test_close(pool_4_processes):
    def f(signal):
        ray.get(signal.wait.remote())
        return "hello"

    signal = SignalActor.remote()
    result = pool_4_processes.map_async(f, [signal for _ in range(4)])
    assert not result.ready()
    pool_4_processes.close()
    assert not result.ready()

    # Signal the head-of-line tasks to finish.
    ray.get(signal.send.remote())
    pool_4_processes.join()

    # close() shouldn't interrupt pending tasks, so check that they succeeded.
    result.wait(timeout=10)
    assert result.ready()
    assert result.successful()
    assert result.get() == ["hello"] * 4
Example #10
def test_pass_returned_object_id(one_worker_100MiB, use_ray_put, failure):
    @ray.remote
    def return_an_id():
        return [
            put_object(
                np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
        ]

    # TODO(edoakes): this fails with an ActorError with max_retries=1.
    @ray.remote(max_retries=0)
    def pending(ref, signal):
        ray.get(signal.wait.remote())
        ray.get(ref[0])
        if failure:
            os._exit(0)

    signal = SignalActor.remote()
    outer_oid = return_an_id.remote()
    inner_oid_binary = ray.get(outer_oid)[0].binary()
    pending_oid = pending.remote([outer_oid], signal)

    # Remove the local reference to the returned ID.
    del outer_oid

    # Check that the inner ID is pinned by the remote task and that
    # finishing the task unpins the object.
    ray.get(signal.send.remote())
    try:
        # Should succeed because inner_oid is pinned if no failure.
        ray.get(pending_oid)
        assert not failure
    except ray.exceptions.RayWorkerError:
        assert failure

    def ref_not_exists():
        worker = ray.worker.global_worker
        inner_oid = ray.ObjectID(inner_oid_binary)
        return not worker.core_worker.object_exists(inner_oid)

    assert wait_for_condition(ref_not_exists)
Example #11
def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put):
    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    @ray.remote
    def child(dep1, dep2):
        return

    @ray.remote
    def launch_pending_task(ref, signal):
        return child.remote(ref[0], signal.wait.remote())

    signal = SignalActor.remote()

    # Test that the reference held by the pending task isn't evicted.
    array_oid = put_object(np.zeros(40 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    child_return_id = ray.get(launch_pending_task.remote([array_oid], signal))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the reference prevents the object from being evicted.
    _fill_object_store_and_get(array_oid_bytes)

    ray.get(signal.send.remote())
    ray.get(child_return_id)
    del child_return_id

    _fill_object_store_and_get(array_oid_bytes, succeed=False)
Example #12
def test_recursively_pass_returned_object_id(one_worker_100MiB):
    @ray.remote
    def put():
        return np.zeros(40 * 1024 * 1024, dtype=np.uint8)

    @ray.remote
    def return_an_id():
        return [put.remote()]

    @ray.remote
    def recursive(ref, signal, max_depth, depth=0):
        ray.get(ref[0])
        if depth == max_depth:
            return ray.get(signal.wait.remote())
        else:
            return recursive.remote(ref, signal, max_depth, depth + 1)

    max_depth = 5
    outer_oid = return_an_id.remote()
    inner_oid_bytes = ray.get(outer_oid)[0].binary()
    signal = SignalActor.remote()
    head_oid = recursive.remote([outer_oid], signal, max_depth)

    # Remove the local reference.
    del outer_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(inner_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    ray.get(tail_oid)

    # Reference should be gone, check that returned ID gets evicted.
    _fill_object_store_and_get(inner_oid_bytes, succeed=False)
Example #13
def test_recursively_nest_ids(one_worker_100MiB):
    @ray.remote
    def recursive(ref, signal, max_depth, depth=0):
        unwrapped = ray.get(ref[0])
        if depth == max_depth:
            return ray.get(signal.wait.remote())
        else:
            return recursive.remote(unwrapped, signal, max_depth, depth + 1)

    @ray.remote
    def put():
        return np.zeros(40 * 1024 * 1024, dtype=np.uint8)

    signal = SignalActor.remote()

    max_depth = 5
    array_oid = put.remote()
    nested_oid = array_oid
    for _ in range(max_depth):
        nested_oid = ray.put([nested_oid])
    head_oid = recursive.remote([nested_oid], signal, max_depth)

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid, nested_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    ray.get(tail_oid)

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
Example #14
def test_fast(shutdown_only, use_force):
    ray.init(num_cpus=2)

    @ray.remote
    def fast(y):
        return y

    signaler = SignalActor.remote()
    ids = list()
    for _ in range(100):
        x = fast.remote("a")
        # NOTE: If a non-force cancellation is attempted in the window
        # between a worker receiving a task and actually executing it
        # (specifically, the Python execution), the cancellation can fail.
        if not use_force:
            time.sleep(0.1)
        ray.cancel(x, force=use_force)
        ids.append(x)

    @ray.remote
    def wait_for(y):
        return y

    sig = signaler.wait.remote()
    for _ in range(5000):
        x = wait_for.remote(sig)
        ids.append(x)

    for idx in range(100, 5100):
        if random.random() > 0.95:
            ray.cancel(ids[idx], force=use_force)
    signaler.send.remote()
    for i, obj_ref in enumerate(ids):
        try:
            ray.get(obj_ref, timeout=120)
        except Exception as e:
            assert isinstance(
                e, valid_exceptions(use_force)), f"Failure on iteration: {i}"
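
valid_exceptions(use_force) is also undefined on this page. Since a forced cancel kills the worker process while a non-forced cancel interrupts the task, a plausible definition is:

from ray.exceptions import (RayTaskError, TaskCancelledError,
                            WorkerCrashedError)


def valid_exceptions(use_force):
    # A forced cancel kills the worker, so the get may surface a worker
    # crash; a non-forced cancel only raises TaskCancelledError.
    if use_force:
        return (RayTaskError, TaskCancelledError, WorkerCrashedError)
    return (TaskCancelledError, )
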
Example #15
def test_serve_graceful_shutdown(serve_instance):
    signal = SignalActor.remote()

    @serve.deployment(name="wait", max_concurrent_queries=10)
    class Wait:
        async def __call__(self, request):
            signal_actor = await request.body()
            await signal_actor.wait.remote()
            return ""

    Wait.config.experimental_graceful_shutdown_wait_loop_s = 0.5
    Wait.config.experimental_graceful_shutdown_timeout_s = 1000
    Wait.deploy()
    handle = Wait.get_handle()
    refs = [handle.remote(signal) for _ in range(10)]

    # Wait for all the queries to be enqueued
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(refs, timeout=1)

    @ray.remote(num_cpus=0)
    def do_blocking_delete():
        Wait.delete()

    # Now delete the backend. This should trigger the shutdown sequence.
    delete_ref = do_blocking_delete.remote()

    # The queries should be enqueued but not executed because they are
    # blocked by the signal actor.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(refs, timeout=1)

    signal.send.remote()

    # All the queries should be drained and executed without error.
    ray.get(refs)
    # Blocking delete should complete.
    ray.get(delete_ref)
Example #16
def test_basic_serialized_reference(one_worker_100MiB, use_ray_put):
    @ray.remote
    def pending(ref, dep):
        ray.get(ref[0])

    array_oid = put_object(np.zeros(40 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    signal = SignalActor.remote()
    oid = pending.remote([array_oid], signal.wait.remote())

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the task to finish.
    ray.get(signal.send.remote())
    ray.get(oid)

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
Example #17
def test_remote_cancel(ray_start_regular, use_force):
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(y):
        return ray.get(y[0])

    @ray.remote
    def remote_wait(sg):
        return [wait_for.remote([sg[0]])]

    sig = signaler.wait.remote()

    outer = remote_wait.remote([sig])
    inner = ray.get(outer)[0]

    with pytest.raises(GetTimeoutError):
        ray.get(inner, timeout=1)

    ray.cancel(inner, force=use_force)

    with pytest.raises(valid_exceptions(use_force)):
        ray.get(inner, timeout=10)
Example #18
def test_ref_in_handle_input(serve_instance):
    # https://github.com/ray-project/ray/issues/12593

    unblock_worker_signal = SignalActor.remote()

    @serve.deployment
    async def blocked_by_ref(data):
        assert not isinstance(data, ray.ObjectRef)

    blocked_by_ref.deploy()
    handle = blocked_by_ref.get_handle()

    # Pass in a ref that's not ready yet
    ref = unblock_worker_signal.wait.remote()
    worker_result = handle.remote(ref)

    # Worker shouldn't execute the request
    with pytest.raises(GetTimeoutError):
        ray.get(worker_result, timeout=1)

    # Now unblock the worker
    unblock_worker_signal.send.remote()
    ray.get(worker_result)
Example #19
def test_ref_in_handle_input(serve_instance):
    # https://github.com/ray-project/ray/issues/12593

    unblock_worker_signal = SignalActor.remote()

    async def blocked_by_ref(serve_request):
        data = await serve_request.body()
        assert not isinstance(data, ray.ObjectRef)

    serve.create_backend("ref", blocked_by_ref)
    serve.create_endpoint("ref", backend="ref")
    handle = serve.get_handle("ref")

    # Pass in a ref that's not ready yet
    ref = unblock_worker_signal.wait.remote()
    worker_result = handle.remote(ref)

    # Worker shouldn't execute the request
    with pytest.raises(GetTimeoutError):
        ray.get(worker_result, timeout=1)

    # Now unblock the worker
    unblock_worker_signal.send.remote()
    ray.get(worker_result)
Example #20
def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put,
                                             failure):
    @ray.remote(max_retries=1)
    def child(dep1, dep2):
        if failure:
            os._exit(0)
        return

    @ray.remote
    def launch_pending_task(ref, signal):
        return child.remote(ref[0], signal.wait.remote())

    signal = SignalActor.remote()

    # Test that the reference held by the pending task isn't evicted.
    array_oid = put_object(np.zeros(40 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    child_return_id = ray.get(launch_pending_task.remote([array_oid], signal))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the reference prevents the object from being evicted.
    _fill_object_store_and_get(array_oid_bytes)

    ray.get(signal.send.remote())
    try:
        ray.get(child_return_id)
        assert not failure
    except (ray.exceptions.RayWorkerError,
            ray.exceptions.UnreconstructableError):
        assert failure
    del child_return_id

    _fill_object_store_and_get(array_oid_bytes, succeed=False)
Example #21
File: test_router.py Project: thalvari/ray
async def test_router_use_max_concurrency(serve_instance):
    # The VisibleRouter::get_queues method needs to pickle queries,
    # so we register a serializer here. In the regular code path, query
    # serialization is done manually by Serve for performance.
    ray.register_custom_serializer(Query, Query.ray_serialize,
                                   Query.ray_deserialize)

    signal = SignalActor.remote()

    @ray.remote
    class MockWorker:
        async def handle_request(self, request):
            await signal.wait.remote()
            return "DONE"

        def ready(self):
            pass

    class VisibleRouter(Router):
        def get_queues(self):
            return self.queries_counter, self.backend_queues

    worker = MockWorker.remote()
    q = ray.remote(VisibleRouter).remote()
    await q.setup.remote("")
    backend_name = "max-concurrent-test"
    config = BackendConfig({"max_concurrent_queries": 1})
    await q.set_traffic.remote("svc", TrafficPolicy({backend_name: 1.0}))
    await q.add_new_worker.remote(backend_name, "replica-tag", worker)
    await q.set_backend_config.remote(backend_name, config)

    # We send over two queries
    first_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)
    second_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)

    # Neither query should be ready yet
    with pytest.raises(ray.exceptions.RayTimeoutError):
        ray.get([first_query, second_query], timeout=0.2)

    # Let's retrieve the router's internal state
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should be just one inflight request
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # The second query is buffered
    assert len(backend_queues["max-concurrent-test"]) == 1

    # Let's unblock the first query
    await signal.send.remote(clear=True)
    assert await first_query == "DONE"

    # The internal state of router should have changed.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should still be one inflight request
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # But there shouldn't be any queries in the queue
    assert len(backend_queues["max-concurrent-test"]) == 0

    # Unblocking the second query
    await signal.send.remote(clear=True)
    assert await second_query == "DONE"

    # Checking the internal state of the router one more time
    queries_counter, backend_queues = await q.get_queues.remote()
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 0
    assert len(backend_queues["max-concurrent-test"]) == 0
Example #22
def test_dependency_refcounts(ray_start_regular):
    @ray.remote
    def one_dep(dep, signal=None, fail=False):
        if signal is not None:
            ray.get(signal.wait.remote())
        if fail:
            raise Exception("failed on purpose")

    @ray.remote
    def one_dep_large(dep, signal=None):
        if signal is not None:
            ray.get(signal.wait.remote())
        # This will be spilled to plasma.
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    # Test that regular plasma dependency refcounts are decremented once the
    # task finishes.
    signal = SignalActor.remote()
    large_dep = ray.put(np.zeros(10 * 1024 * 1024, dtype=np.uint8))
    result = one_dep.remote(large_dep, signal=signal)
    check_refcounts({large_dep: (1, 1), result: (1, 0)})
    ray.get(signal.send.remote())
    # Reference count should be removed once the task finishes.
    check_refcounts({large_dep: (1, 0), result: (1, 0)})
    del large_dep, result
    check_refcounts({})

    # Test that inlined dependency refcounts are decremented once they are
    # inlined.
    signal = SignalActor.remote()
    dep = one_dep.remote(None, signal=signal)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal.send.remote())
    # Reference count should be removed as soon as the dependency is inlined.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})

    # Test that spilled plasma dependency refcounts are decremented once
    # the task finishes.
    signal1, signal2 = SignalActor.remote(), SignalActor.remote()
    dep = one_dep_large.remote(None, signal=signal1)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep, signal=signal2)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal1.send.remote())
    ray.get(dep, timeout=10)
    # Reference count should remain because the dependency is in plasma.
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal2.send.remote())
    # Reference count should be removed because the task finished.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})

    # Test that regular plasma dependency refcounts are decremented if a task
    # fails.
    signal = SignalActor.remote()
    large_dep = ray.put(np.zeros(10 * 1024 * 1024, dtype=np.uint8))
    result = one_dep.remote(large_dep, signal=signal, fail=True)
    check_refcounts({large_dep: (1, 1), result: (1, 0)})
    ray.get(signal.send.remote())
    # Reference count should be removed once the task finishes.
    check_refcounts({large_dep: (1, 0), result: (1, 0)})
    del large_dep, result
    check_refcounts({})

    # Test that spilled plasma dependency refcounts are decremented if a task
    # fails.
    signal1, signal2 = SignalActor.remote(), SignalActor.remote()
    dep = one_dep_large.remote(None, signal=signal1)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep, signal=signal2, fail=True)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal1.send.remote())
    ray.get(dep, timeout=10)
    # Reference count should remain because the dependency is in plasma.
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal2.send.remote())
    # Reference count should be removed because the task finished.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})
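
check_refcounts asserts on pairs of (local reference count, submitted-task reference count) per object. Its definition isn't shown here; a sketch of how such a helper could poll the core worker's bookkeeping (the get_all_reference_counts call and its dict layout are assumptions based on Ray internals of this era):

import time

import ray


def check_refcounts(expected, timeout=10):
    # expected: {ObjectRef: (num_local_refs, num_submitted_task_refs)}
    core_worker = ray.worker.global_worker.core_worker
    start = time.time()
    while True:
        actual = core_worker.get_all_reference_counts()
        try:
            assert len(expected) == len(actual)
            for ref, (local, submitted) in expected.items():
                assert actual[ref]["local"] == local
                assert actual[ref]["submitted"] == submitted
            return
        except (AssertionError, KeyError):
            if time.time() - start > timeout:
                raise
            time.sleep(0.1)
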
Example #23
File: test_failure.py Project: qyou/ray
def test_async_actor_task_retries(ray_start_regular):
    # https://github.com/ray-project/ray/issues/11683

    signal = SignalActor.remote()

    @ray.remote
    class DyingActor:
        def __init__(self):
            print("DyingActor init called")
            self.should_exit = False

        def set_should_exit(self):
            print("DyingActor.set_should_exit called")
            self.should_exit = True

        async def get(self, x, wait=False):
            print(f"DyingActor.get called with x={x}, wait={wait}")
            if self.should_exit:
                os._exit(0)
            if wait:
                await signal.wait.remote()
            return x

    # Normal, in-order actor task retries should work
    dying = DyingActor.options(
        max_restarts=-1,
        max_task_retries=-1,
    ).remote()

    assert ray.get(dying.get.remote(1)) == 1
    ray.get(dying.set_should_exit.remote())
    assert ray.get(dying.get.remote(42)) == 42

    # Now let's try out of order retries:
    # Task seqno 0 will return
    # Task seqno 1 will be pending and retried later
    # Task seqno 2 will return
    # Task seqno 3 will crash the actor and retried later
    dying = DyingActor.options(
        max_restarts=-1,
        max_task_retries=-1,
    ).remote()

    # seqno 0
    ref_0 = dying.get.remote(0)
    assert ray.get(ref_0) == 0
    # seqno 1
    ref_1 = dying.get.remote(1, wait=True)
    # seqno 2
    ref_2 = dying.set_should_exit.remote()
    assert ray.get(ref_2) is None
    # seqno 3; this will crash the actor because the previous task set
    # should_exit to True.
    ref_3 = dying.get.remote(3)

    # At this point the actor should be restarted. The two pending tasks
    # [ref_1, ref_3] should be retried, but not the completed tasks [ref_0,
    # ref_2]. Critically, if ref_2 was retried, ref_3 can never return.
    ray.get(signal.send.remote())
    assert ray.get(ref_1) == 1
    assert ray.get(ref_3) == 3
Example #24
File: test_router.py Project: johnnyL7/ray
async def test_router_use_max_concurrency(serve_instance):
    signal = SignalActor.remote()

    @ray.remote
    class MockWorker:
        async def handle_request(self, request):
            await signal.wait.remote()
            return "DONE"

        def ready(self):
            pass

    class VisibleRouter(Router):
        def get_queues(self):
            return self.queries_counter, self.backend_queues

    worker = MockWorker.remote()
    q = ray.remote(VisibleRouter).remote()
    await q.setup.remote("", serve_instance._controller_name)
    backend_name = "max-concurrent-test"
    config = BackendConfig(max_concurrent_queries=1)
    await q.set_traffic.remote("svc", TrafficPolicy({backend_name: 1.0}))
    await q.add_new_worker.remote(backend_name, "replica-tag", worker)
    await q.set_backend_config.remote(backend_name, config)

    # We send over two queries
    first_query = q.enqueue_request.remote(
        RequestMetadata(get_random_letters(10), "svc", None), 1)
    second_query = q.enqueue_request.remote(
        RequestMetadata(get_random_letters(10), "svc", None), 1)

    # Neither query should be ready yet
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get([first_query, second_query], timeout=0.2)

    # Let's retrieve the router's internal state
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should be just one inflight request
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # The second query is buffered
    assert len(backend_queues["max-concurrent-test"]) == 1

    # Let's unblock the first query
    await signal.send.remote(clear=True)
    assert await first_query == "DONE"

    # The internal state of router should have changed.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should still be one inflight request
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # But there shouldn't be any queries in the queue
    assert len(backend_queues["max-concurrent-test"]) == 0

    # Unblocking the second query
    await signal.send.remote(clear=True)
    assert await second_query == "DONE"

    # Checking the internal state of the router one more time
    queries_counter, backend_queues = await q.get_queues.remote()
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 0
    assert len(backend_queues["max-concurrent-test"]) == 0
Example #25
def test_metrics_export_end_to_end(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(
        _internal_config=json.dumps({"metrics_report_interval_ms": 1000}))
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    signal = SignalActor.remote()

    # Generate some metrics from tasks and an actor.
    @ray.remote
    def f():
        counter = Count("test_counter", "desc", "unit", [])
        ray.get(signal.send.remote())
        while True:
            counter.record(1, {})
            time.sleep(0.1)

    @ray.remote
    class A:
        async def ready(self):
            pass

        async def ping(self):
            histogram = Histogram("test_histogram", "desc", "unit", [0, 1, 2],
                                  [])
            while True:
                histogram.record(1, {})
                await asyncio.sleep(0.1)

    obj_refs = [f.remote() for _ in range(30)]
    a = A.remote()
    obj_refs.append(a.ping.remote())

    # Make sure both histogram and counter are created
    ray.get(a.ready.remote())
    ray.get(signal.wait.remote())

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")

    # Make sure we can ping Prometheus endpoints.
    def fetch_prometheus(prom_addresses):
        components_dict = {}
        metric_names = set()
        for address in prom_addresses:
            if address not in components_dict:
                components_dict[address] = set()
            try:
                # Query this node's metrics endpoint rather than the loop
                # variable leaked from the enclosing scope.
                response = requests.get(f"http://{address}")
            except requests.exceptions.ConnectionError:
                return components_dict, metric_names

            for line in response.text.split("\n"):
                for family in text_string_to_metric_families(line):
                    for sample in family.samples:
                        # print(sample)
                        metric_names.add(sample.name)
                        if "Component" in sample.labels:
                            components_dict[address].add(
                                sample.labels["Component"])
        return components_dict, metric_names

    def test_prometheus_endpoint():
        # TODO(Simon): Add a gcs_server after fixing metrics.
        components_dict, metric_names = fetch_prometheus(prom_addresses)

        # Raylet should be on every node
        expected_components = {"raylet"}
        components_found = all(
            expected_components.issubset(components)
            for components in components_dict.values())

        # Core worker should be on at least one node
        components_found = components_found and any(
            "core_worker" in components
            for components in components_dict.values())

        expected_metric_names = {"ray_test_counter", "ray_test_histogram_max"}
        metric_names_found = expected_metric_names.issubset(metric_names)

        return components_found and metric_names_found

    try:
        wait_for_condition(
            test_prometheus_endpoint,
            timeout=20,
            retry_interval_ms=1000,  # Yield resource for other processes
        )
    except RuntimeError:
        # This is for debugging when the test fails.
        raise RuntimeError(
            "All components were not visible to "
            "Prometheus endpoints in time. "
            f"The components are {fetch_prometheus(prom_addresses)}")
    ray.shutdown()
Example #26
async def test_replica_set(ray_instance):
    signal = SignalActor.remote()

    @ray.remote(num_cpus=0)
    class MockWorker:
        _num_queries = 0

        async def handle_request(self, request):
            self._num_queries += 1
            await signal.wait.remote()
            return "DONE"

        async def num_queries(self):
            return self._num_queries

    # We will test a scenario with two replicas in the replica set.
    rs = ReplicaSet()
    workers = [MockWorker.remote() for _ in range(2)]
    rs.set_max_concurrent_queries(1)
    rs.update_worker_replicas(workers)

    # Send two queries. They should go through the router but be blocked by
    # the signal actor.
    query = Query([], {}, TaskContext.Python,
                  RequestMetadata("request-id", "endpoint",
                                  TaskContext.Python))
    first_ref = await rs.assign_replica(query)
    second_ref = await rs.assign_replica(query)

    # These should be blocked by the signal actor.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get([first_ref, second_ref], timeout=1)

    # Each replica should have exactly one inflight query. Let's make sure
    # the queries arrived there.
    for worker in workers:
        while await worker.num_queries.remote() != 1:
            await asyncio.sleep(1)

    # Let's try to send another query.
    third_ref_pending_task = asyncio.get_event_loop().create_task(
        rs.assign_replica(query))
    # We should fail to assign a replica, so this coroutine should still be
    # pending after some time.
    await asyncio.sleep(0.2)
    assert not third_ref_pending_task.done()

    # Let's unblock the two workers
    await signal.send.remote()
    assert await first_ref == "DONE"
    assert await second_ref == "DONE"

    # The third request should be unblocked and sent to the first worker.
    # This means we should be able to get the object ref.
    third_ref = await third_ref_pending_task

    # Now that we have the object ref, let's get its result.
    await signal.send.remote()
    assert await third_ref == "DONE"

    # Finally, make sure that one of the replicas processed the third query.
    num_queries_set = {(await worker.num_queries.remote())
                       for worker in workers}
    assert num_queries_set == {2, 1}