Example #1
def test_autoscaler_shutdown_node_http_everynode(
    shutdown_ray, call_ray_stop_only  # noqa: F811
):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 2},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 4,
                    "IS_WORKER": 100,
                },
                "node_config": {},
                "max_workers": 1,
            },
        },
        idle_timeout_minutes=0.05,
    )
    cluster.start()
    ray.init(address="auto")

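    # "EveryNode" runs one Serve HTTP proxy actor on each node in the cluster.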
    serve.start(http_options={"location": "EveryNode"})

    @ray.remote
    class Placeholder:
        def ready(self):
            return 1

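    # IS_WORKER only exists on worker nodes, so this actor forces the
    # autoscaler to start one.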
    a = Placeholder.options(resources={"IS_WORKER": 1}).remote()
    assert ray.get(a.ready.remote()) == 1

    # 2 HTTP proxies, 1 Serve controller, and 1 placeholder actor.
    wait_for_condition(lambda: len(ray._private.state.actors()) == 4)
    assert len(ray.nodes()) == 2

    # Now make sure the placeholder actor exits.
    ray.kill(a)
    # The HTTP proxy on the worker node should exit as well.
    wait_for_condition(
        lambda: len(
            list(
                filter(
                    lambda a: a["State"] == "ALIVE",
                    ray._private.state.actors().values(),
                )
            )
        )
        == 2
    )
    # Only the head node should be alive now.
    wait_for_condition(
        lambda: len(list(filter(lambda n: n["Alive"], ray.nodes()))) == 1
    )
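Example #2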
def test_scaledown_shared_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 1},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 100 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 4,
            },
        },
        idle_timeout_minutes=0.05,
    )

    try:
        cluster.start(
            _system_config={"scheduler_report_pinned_bytes_only": True})
        ray.init("auto")

        # Each actor requires 1 CPU, triggering the addition of CPU nodes.
        @ray.remote(num_cpus=1)
        class Actor:
            def f(self):
                pass

            def recv(self, obj):
                pass

        actors = [Actor.remote() for _ in range(5)]
        ray.get([a.f.remote() for a in actors])
        print("All five nodes launched")

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 5)

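        # Broadcast a single shared object to all actors so every worker node
        # holds a copy.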
        data = ray.put(np.zeros(1024 * 1024 * 5))
        ray.get([a.recv.remote(data) for a in actors])
        print("Data broadcast successfully, deleting actors.")
        del actors

        # Verify scale-down.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 1,
                           timeout=30)
    finally:
        cluster.shutdown()
Example #3
def test_fake_autoscaler_basic_e2e(shutdown_only):
    # __example_begin__
    cluster = AutoscalingCluster(
        head_resources={"CPU": 2},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 4,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
            "gpu_node": {
                "resources": {
                    "CPU": 2,
                    "GPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
        },
    )

    try:
        cluster.start()
        ray.init("auto")

        # Triggers the addition of a GPU node.
        @ray.remote(num_gpus=1)
        def f():
            print("gpu ok")

        # Triggers the addition of a CPU node.
        @ray.remote(num_cpus=3)
        def g():
            print("cpu ok")

        ray.get(f.remote())
        ray.get(g.remote())
        ray.shutdown()
    finally:
        cluster.shutdown()
Example #4
def _ray_start_chaos_cluster(request):
    param = getattr(request, "param", {})
    kill_interval = param.pop("kill_interval", None)
    config = param.pop("_system_config", {})
    config.update(
        {
            "num_heartbeats_timeout": 10,
            "raylet_heartbeat_period_milliseconds": 100,
            "task_retry_delay_ms": 100,
        }
    )
    # Config of workers that are re-started.
    head_resources = param.pop("head_resources")
    worker_node_types = param.pop("worker_node_types")
    cluster = AutoscalingCluster(
        head_resources,
        worker_node_types,
        idle_timeout_minutes=10,  # Don't take down nodes.
        **param,
    )
    cluster.start(_system_config=config)
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1

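    # Optionally start the node killer, which kills nodes at the given
    # interval to inject failures.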
    if kill_interval is not None:
        node_killer = get_and_run_node_killer(kill_interval)

    yield cluster

    if kill_interval is not None:
        ray.get(node_killer.stop_run.remote())
        killed = ray.get(node_killer.get_total_killed_nodes.remote())
        assert len(killed) > 0
        died = {node["NodeID"] for node in ray.nodes() if not node["Alive"]}
        assert died.issubset(killed), (
            f"Raylets {died - killed} crashed even though we did not kill them"
        )
        )

    ray.shutdown()
    cluster.shutdown()
Example #5
def ray_start_chaos_cluster(request):
    """Returns the cluster and chaos thread.
    """
    os.environ["RAY_num_heartbeats_timeout"] = "5"
    os.environ["RAY_raylet_heartbeat_period_milliseconds"] = "100"
    param = getattr(request, "param", {})
    kill_interval = param.get("kill_interval", 2)
    # Config of workers that are re-started.
    head_resources = param["head_resources"]
    worker_node_types = param["worker_node_types"]

    cluster = AutoscalingCluster(head_resources, worker_node_types)
    cluster.start()
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_killer = get_and_run_node_killer(kill_interval)
    yield node_killer
    assert ray.get(node_killer.get_total_killed_nodes.remote()) > 0
    ray.shutdown()
    cluster.shutdown()
    del os.environ["RAY_num_heartbeats_timeout"]
    del os.environ["RAY_raylet_heartbeat_period_milliseconds"]
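Example #6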
def test_no_scaledown_with_spilled_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 75 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
        },
        idle_timeout_minutes=0.05,
    )

    try:
        cluster.start(_system_config={
            "scheduler_report_pinned_bytes_only": True,
            "min_spilling_size": 0,
        })
        ray.init("auto")

        actors = [Actor.remote() for _ in range(2)]
        ray.get([a.f.remote() for a in actors])

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 2)
        print("All nodes launched")

        # Put 10 x 80MiB objects into the object store with 75MiB memory limit.
        obj_size = 10 * 1024 * 1024
        objs = []
        for i in range(10):
            obj = actors[0].create.remote(obj_size)
            ray.get(actors[1].recv.remote(obj))
            objs.append(obj)
            print(f"obj {i}={obj.hex()}")
            del obj

        # At least 9 out of the 10 objects should have spilled.
        check_memory([obj.hex() for obj in objs], num_spilled_objects=9)
        print("Objects spilled, deleting actors and object references.")

        # Assume the 1st object always gets spilled.
        spilled_obj = objs[0]
        del objs
        del actors

        # Verify scale-down to 1 node.
        def scaledown_to_one():
            cpu = ray.cluster_resources().get("CPU", 0)
            assert cpu > 0, "Scale-down should keep at least 1 node"
            return cpu == 1

        wait_for_condition(scaledown_to_one, timeout=30)

        # Verify the spilled object still exists, and there is no object in the
        # plasma store.
        check_memory([spilled_obj.hex()], num_plasma_objects=0)

        # Delete the spilled object; the remaining worker node should then be
        # scaled down.
        del spilled_obj
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 0)
        check_memory([], num_plasma_objects=0)
    finally:
        cluster.shutdown()
Example #7
def test_demand_report_when_scale_up(shutdown_only):
    # https://github.com/ray-project/ray/issues/22122
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 10,
                "max_workers": 10,
            },
        },
    )

    cluster.start()

    info = ray.init("auto")

    @ray.remote
    def f():
        time.sleep(10000)

    @ray.remote
    def g():
        ray.get(h.remote())

    @ray.remote
    def h():
        time.sleep(10000)

    tasks = [f.remote() for _ in range(5000)] + [  # noqa: F841
        g.remote() for _ in range(5000)]

    global_state_accessor = make_global_state_accessor(info)

    def check_backlog_info():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = resource_usage.resource_load_by_shape.resource_demands

        if len(aggregate_resource_load) != 1:
            return False

        (backlog_size, num_ready_requests_queued, shape) = (
            aggregate_resource_load[0].backlog_size,
            aggregate_resource_load[0].num_ready_requests_queued,
            aggregate_resource_load[0].shape,
        )
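        # 10 workers x 1 CPU leaves only 10 CPUs, so at most 10 of the 10,000
        # submitted tasks can run; the other 9990 should be reported as
        # backlog plus queued demand.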
        if backlog_size + num_ready_requests_queued != 9990:
            return False

        if shape != {"CPU": 1.0}:
            return False
        return True

    # ASAN tests are slow, so wait up to 20s for the cluster to come up and
    # the backlog to be reported.
    wait_for_condition(check_backlog_info, 20)
    cluster.shutdown()
Example #8
def test_demand_report_for_node_affinity_scheduling_strategy(shutdown_only):
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 1,
                "max_workers": 1,
            },
        },
    )

    cluster.start()
    info = ray.init(address="auto")

    @ray.remote(num_cpus=1)
    def f(sleep_s):
        time.sleep(sleep_s)
        return ray.get_runtime_context().node_id

    worker_node_id = ray.get(f.remote(0))

    tasks = []
    tasks.append(f.remote(10000))
    # This is not reported since there is a feasible node.
    tasks.append(
        f.options(scheduling_strategy=NodeAffinitySchedulingStrategy(
            worker_node_id, soft=False)).remote(0))
    # This is reported since there is no feasible node and soft is True.
    tasks.append(
        f.options(
            num_gpus=1,
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                ray.NodeID.from_random().hex(), soft=True),
        ).remote(0))

    global_state_accessor = make_global_state_accessor(info)

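    # Only the infeasible {"CPU": 1, "GPU": 1} request (soft affinity to a
    # nonexistent node) should show up in the reported demand.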
    def check_resource_demand():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = resource_usage.resource_load_by_shape.resource_demands

        if len(aggregate_resource_load) != 1:
            return False

        if aggregate_resource_load[0].num_infeasible_requests_queued != 1:
            return False

        if aggregate_resource_load[0].shape != {"CPU": 1.0, "GPU": 1.0}:
            return False

        return True

    wait_for_condition(check_resource_demand, 20)
    cluster.shutdown()
Example #9
def ray_start_chaos_cluster(request):
    """Returns the cluster and chaos thread.

    Run chaos_thread.start() to start the chaos testing.
    NOTE: `cluster` is not thread-safe and shouldn't be
    modified by another thread once chaos_thread.start()
    is called.
    """
    os.environ["RAY_num_heartbeats_timeout"] = "5"
    os.environ["RAY_raylet_heartbeat_period_milliseconds"] = "100"
    param = getattr(request, "param", {})
    kill_interval = param.get("kill_interval", 2)
    # Config of workers that are re-started.
    head_resources = param["head_resources"]
    worker_node_types = param["worker_node_types"]
    timeout = param["timeout"]

    # Use the shutdown RPC instead of signals because we can't
    # raise a signal in a non-main thread.
    def kill_raylet(ip, port, graceful=False):
        raylet_address = f"{ip}:{port}"
        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        print(f"Sending a shutdown request to {ip}:{port}")
        stub.ShutdownRaylet(
            node_manager_pb2.ShutdownRayletRequest(graceful=graceful))

    cluster = AutoscalingCluster(head_resources, worker_node_types)
    cluster.start()
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1
    head_node_port = nodes[0]["NodeManagerPort"]
    killed_port = set()

    def run_chaos_cluster():
        start = time.time()
        while True:
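            # Pick a live worker raylet that has not been killed yet.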
            node_to_kill_ip = None
            node_to_kill_port = None
            for node in ray.nodes():
                addr = node["NodeManagerAddress"]
                port = node["NodeManagerPort"]
                if (node["Alive"] and port != head_node_port
                        and port not in killed_port):
                    node_to_kill_ip = addr
                    node_to_kill_port = port
                    break

            if node_to_kill_port is not None:
                kill_raylet(node_to_kill_ip, node_to_kill_port, graceful=False)
                killed_port.add(node_to_kill_port)
            time.sleep(kill_interval)
            print(len(ray.nodes()))
            if time.time() - start > timeout:
                break
        assert len(killed_port) > 0, (
            "None of nodes are killed by the conftest. It is a bug.")

    chaos_thread = threading.Thread(target=run_chaos_cluster)
    yield chaos_thread
    chaos_thread.join()
    ray.shutdown()
    cluster.shutdown()
    del os.environ["RAY_num_heartbeats_timeout"]
    del os.environ["RAY_raylet_heartbeat_period_milliseconds"]