Пример #1
0
def test_kill_pending_actor_with_no_restart_true():
    """Killing a pending actor with ``no_restart=True`` clears its demand.

    The actor asks for a custom ``WORKER`` resource that this ``ray.init()``
    call does not provide, so it is expected to stay pending. After
    ``ray.kill`` the resource usage report is polled until it lists no
    outstanding resource demands.

    Cleanup runs in ``finally`` blocks because this test uses no shutdown
    fixture; a failed wait must not leak the Ray runtime into later tests.
    """
    cluster = ray.init()
    try:
        global_state_accessor = make_global_state_accessor(cluster)
        try:
            @ray.remote(resources={"WORKER": 1.0})
            class PendingActor:
                pass

            # Kill actor with `no_restart=True`.
            actor = PendingActor.remote()
            # TODO(ffbin): The raylet doesn't guarantee the order when dealing
            # with RequestWorkerLease and CancelWorkerLease. If we kill the
            # actor immediately after creating the actor, we may not be able
            # to clean up the request cached by the raylet.
            # See https://github.com/ray-project/ray/issues/13545 for details.
            time.sleep(1)
            ray.kill(actor, no_restart=True)

            def no_demands_left():
                message = global_state_accessor.get_all_resource_usage()
                resource_usages = gcs_utils.ResourceUsageBatchData.FromString(
                    message)
                demands = resource_usages.resource_load_by_shape.resource_demands
                return len(demands) == 0

            # Actor is dead, so the infeasible task queue length is 0.
            wait_for_condition(no_demands_left, timeout=10)
        finally:
            global_state_accessor.disconnect()
    finally:
        ray.shutdown()
Пример #2
0
def test_no_node_name():
    """Starting Ray with no node name should yield ``node_name == ip_address``.

    The first entry of the node table is decoded and its ``node_name`` is
    compared with ``ray.util.get_node_ip_address()``. The accessor is
    disconnected and Ray is shut down even when the assertion fails.
    """
    new_head_context = ray.init(include_dashboard=False)
    try:
        global_state_accessor = make_global_state_accessor(new_head_context)
        try:
            node_data = global_state_accessor.get_node_table()[0]
        finally:
            global_state_accessor.disconnect()
        node = gcs_utils.GcsNodeInfo.FromString(node_data)
        assert node.node_name == ray.util.get_node_ip_address()
    finally:
        ray.shutdown()
Пример #3
0
def test_actor_resource_demand(shutdown_only):
    """Check the reported resource demand for schedulable and oversized actors.

    On a 3-CPU cluster, a 2-CPU actor that has already answered a call should
    leave no demand behind, while each 80-CPU actor should show up as one more
    infeasible request under a single ``{"CPU": 80.0}`` shape.
    """
    ray.shutdown()
    cluster = ray.init(num_cpus=3)
    global_state_accessor = make_global_state_accessor(cluster)

    def fetch_demands():
        # Pull a fresh usage report and return its per-shape demand list.
        raw = global_state_accessor.get_all_resource_usage()
        usage = gcs_utils.ResourceUsageBatchData.FromString(raw)
        return usage.resource_load_by_shape.resource_demands

    @ray.remote(num_cpus=2)
    class Actor:
        def foo(self):
            return "ok"

    a = Actor.remote()
    ray.get(a.foo.remote())
    time.sleep(1)

    # The actor has been placed, so nothing should still be waiting.
    assert len(fetch_demands()) == 0

    @ray.remote(num_cpus=80)
    class Actor2:
        pass

    actors = [Actor2.remote()]
    time.sleep(1)

    # One oversized actor: a single shape with one infeasible request.
    demands = fetch_demands()
    assert len(demands) == 1
    assert demands[0].shape == {"CPU": 80.0}
    assert demands[0].num_infeasible_requests_queued == 1

    actors.append(Actor2.remote())
    time.sleep(1)

    # Second oversized actor: same shape, now two infeasible requests.
    demands = fetch_demands()
    assert len(demands) == 1
    assert demands[0].num_infeasible_requests_queued == 2

    global_state_accessor.disconnect()
Пример #4
0
def test_node_name_init():
    """Passing ``_node_name`` to ``ray.init`` should set the node's name.

    The first entry of the node table is decoded and its ``node_name`` must
    equal the name given to ``ray.init``. The accessor is disconnected and Ray
    is shut down even when the assertion fails.
    """
    new_head_context = ray.init(_node_name="new_head_node",
                                include_dashboard=False)
    try:
        global_state_accessor = make_global_state_accessor(new_head_context)
        try:
            node_data = global_state_accessor.get_node_table()[0]
        finally:
            global_state_accessor.disconnect()
        node = gcs_utils.GcsNodeInfo.FromString(node_data)
        assert node.node_name == "new_head_node"
    finally:
        ray.shutdown()
Пример #5
0
def test_heartbeat_ip(shutdown_only):
    """The resource usage report should carry this node's IP address.

    Polls the usage report until the first batch entry's
    ``node_manager_address`` equals ``ray.util.get_node_ip_address()``.
    """
    cluster = ray.init(num_cpus=1)
    global_state_accessor = make_global_state_accessor(cluster)
    try:
        self_ip = ray.util.get_node_ip_address()

        def self_ip_is_set():
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return False

            resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
            # A report without any per-node entry yet is "not ready", not an
            # error: keep polling instead of raising IndexError on batch[0].
            if len(resource_usage.batch) == 0:
                return False
            resources_data = resource_usage.batch[0]
            return resources_data.node_manager_address == self_ip

        wait_for_condition(self_ip_is_set, timeout=2)
    finally:
        # Release the accessor connection even when the wait times out.
        global_state_accessor.disconnect()
Пример #6
0
def test_backlog_report(shutdown_only):
    """A non-zero worker backlog should appear in the resource usage report.

    With one CPU and at most one pending lease request per scheduling
    category, ten tasks are submitted; once the short first task finishes the
    report is polled until its single demand entry has ``backlog_size > 0``.
    """
    system_config = {"max_pending_lease_requests_per_scheduling_category": 1}
    cluster = ray.init(num_cpus=1, _system_config=system_config)

    global_state_accessor = make_global_state_accessor(cluster)

    @ray.remote(num_cpus=1)
    def foo(x):
        print(".")
        time.sleep(x)
        return None

    def backlog_size_set():
        raw = global_state_accessor.get_all_resource_usage()
        if raw is None:
            return False

        usage = gcs_utils.ResourceUsageBatchData.FromString(raw)
        demands = usage.resource_load_by_shape.resource_demands
        if len(demands) != 1:
            return False

        reported_backlog = demands[0].backlog_size
        print(reported_backlog)
        # An exact backlog value would be the ideal assertion, but the order
        # in which submissions reach the raylet is too hard to guarantee, so
        # any positive backlog is accepted.
        return reported_backlog > 0

    # The first task is short so that it completes; the remaining nine are
    # long-running and should all be submitted before it finishes. That leaves
    # one request running, one queued in the raylet and the rest in the worker
    # backlog.
    refs = [foo.remote(0.5)]
    refs += [foo.remote(1000) for _ in range(9)]

    # Once the first request finishes, the second starts running and the next
    # lease request goes to the raylet with the remaining backlog attached.
    ray.get(refs[0])

    wait_for_condition(backlog_size_set, timeout=2)
    global_state_accessor.disconnect()
Пример #7
0
def test_node_name_cluster(ray_start_cluster):
    """Node names given to ``cluster.add_node`` should appear in the node table.

    A head node and a worker node are added with explicit names; every node
    table entry must carry the name matching its role, where the head is the
    entry whose hex node id equals the driver context's ``node_id``.
    """
    cluster = ray_start_cluster
    cluster.add_node(node_name="head_node", include_dashboard=False)
    head_context = ray.init(address=cluster.address, include_dashboard=False)
    cluster.add_node(node_name="worker_node", include_dashboard=False)
    cluster.wait_for_nodes()

    global_state_accessor = make_global_state_accessor(head_context)
    node_table = global_state_accessor.get_node_table()
    assert len(node_table) == 2

    for raw_node in node_table:
        node = gcs_utils.GcsNodeInfo.FromString(raw_node)
        node_id_hex = ray._private.utils.binary_to_hex(node.node_id)
        is_head = node_id_hex == head_context.address_info["node_id"]
        expected_name = "head_node" if is_head else "worker_node"
        assert node.node_name == expected_name

    global_state_accessor.disconnect()
    ray.shutdown()
    cluster.shutdown()
Пример #8
0
def test_placement_group_load_report(ray_start_cluster):
    """Pending placement groups should be reflected in the load report.

    The number of entries in ``placement_group_load.placement_group_data`` is
    tracked through three stages: two pending groups, one pending group after
    a node providing resource ``A`` is added, and two again after a third
    group asking for resource ``C`` is created.
    """
    cluster = ray_start_cluster
    # Add a head node that doesn't have gpu resource.
    cluster.add_node(num_cpus=4)

    global_state_accessor = make_global_state_accessor(
        ray.init(address=cluster.address)
    )

    class PgLoadChecker:
        # Each public predicate compares the number of placement groups in the
        # latest load report with the count expected at that stage.

        def nothing_is_ready(self):
            return self._reported_pg_count() == 2

        def only_first_one_ready(self):
            return self._reported_pg_count() == 1

        def two_infeasible_pg(self):
            return self._reported_pg_count() == 2

        def _reported_pg_count(self):
            # Returns None (which equals no expected count) when no report is
            # available or it carries no placement group load field.
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return None
            if not resource_usage.HasField("placement_group_load"):
                return None
            return len(resource_usage.placement_group_load.placement_group_data)

        def _read_resource_usage(self):
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return False
            return gcs_utils.ResourceUsageBatchData.FromString(message)

    checker = PgLoadChecker()

    # Neither "A" nor "B" exists in the cluster yet, so both groups start out
    # unschedulable.
    pg_feasible = ray.util.placement_group([{"A": 1}])
    pg_infeasible = ray.util.placement_group([{"B": 1}])
    _, unready = ray.wait([pg_feasible.ready(), pg_infeasible.ready()], timeout=0)
    assert len(unready) == 2
    wait_for_condition(checker.nothing_is_ready)

    # Adding a node with resource "A" lets the first group be placed; the
    # load report should then list only the remaining one.
    cluster.add_node(resources={"A": 1})
    ray.get(pg_feasible.ready())
    wait_for_condition(checker.only_first_one_ready)

    # A further group that cannot be placed should bring the count back to two.
    pg_infeasible_second = ray.util.placement_group([{"C": 1}])
    _, unready = ray.wait([pg_infeasible_second.ready()], timeout=0)
    assert len(unready) == 1
    wait_for_condition(checker.two_infeasible_pg)
    global_state_accessor.disconnect()
Пример #9
0
def test_load_report(shutdown_only, max_shapes):
    """The load report should honour ``max_resource_shapes_per_load_report``.

    Tasks are queued with three shapes: default, resource ``A`` (which the
    node provides) and resource ``B`` (which it does not). Once enough shapes
    are reported, the report must not exceed ``max_shapes`` unless it is -1,
    and demands for ``B`` must be counted as infeasible while the others are
    counted as ready.

    NOTE(review): ``max_shapes`` is presumably supplied by a parametrization
    or fixture defined outside this block -- confirm.
    """
    provided_resource = "A"
    missing_resource = "B"
    cluster = ray.init(
        num_cpus=1,
        resources={provided_resource: 1},
        _system_config={
            "max_resource_shapes_per_load_report": max_shapes,
        },
    )

    global_state_accessor = make_global_state_accessor(cluster)

    @ray.remote
    def sleep():
        time.sleep(1000)

    sleep.remote()
    for _ in range(3):
        sleep.remote()
        sleep.options(resources={provided_resource: 1}).remote()
        sleep.options(resources={missing_resource: 1}).remote()

    class Checker:
        def __init__(self):
            self.report = None

        def check_load_report(self):
            raw = global_state_accessor.get_all_resource_usage()
            if raw is None:
                return False

            usage = gcs_utils.ResourceUsageBatchData.FromString(raw)
            self.report = usage.resource_load_by_shape.resource_demands
            # Minimum number of shapes to wait for: none when the limit is 0,
            # two when the limit is 2, otherwise all three.
            required_shapes = {0: 0, 2: 2}.get(max_shapes, 3)
            return len(self.report) >= required_shapes

    # Wait for load information to arrive.
    checker = Checker()
    wait_for_condition(checker.check_load_report)

    # The limit is only enforced when it is not -1.
    if max_shapes != -1:
        assert len(checker.report) <= max_shapes

    print(checker.report)

    if max_shapes > 0:
        # Infeasible and ready requests must be reported separately.
        for demand in checker.report:
            if missing_resource in demand.shape:
                assert demand.num_infeasible_requests_queued > 0
                assert demand.num_ready_requests_queued == 0
            else:
                assert demand.num_ready_requests_queued > 0
                assert demand.num_infeasible_requests_queued == 0
    global_state_accessor.disconnect()
Пример #10
0
def test_demand_report_when_scale_up(shutdown_only):
    """The demand report should stay accurate while the cluster scales up.

    Regression test for https://github.com/ray-project/ray/issues/22122.
    Ten thousand long-running tasks are submitted to an autoscaling cluster
    with ten 1-CPU workers. The usage report is polled until it shows a single
    ``{"CPU": 1.0}`` shape whose backlog plus ready-queued count is 9990.

    NOTE(review): 9990 is presumably the 10,000 submitted tasks minus the 10
    that occupy the workers -- confirm.
    """
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 10,
                "max_workers": 10,
            },
        },
    )

    cluster.start()
    try:
        info = ray.init("auto")

        @ray.remote
        def f():
            time.sleep(10000)

        @ray.remote
        def g():
            ray.get(h.remote())

        @ray.remote
        def h():
            time.sleep(10000)

        # Keep the object refs alive for the duration of the test. The list is
        # extended in a separate statement because list.extend() returns None,
        # so chaining it onto the literal would bind None and drop every ref.
        tasks = [f.remote() for _ in range(5000)]
        tasks.extend(g.remote() for _ in range(5000))

        global_state_accessor = make_global_state_accessor(info)
        try:
            def check_backlog_info():
                message = global_state_accessor.get_all_resource_usage()
                if message is None:
                    return False

                resource_usage = gcs_utils.ResourceUsageBatchData.FromString(
                    message)
                aggregate_resource_load = (
                    resource_usage.resource_load_by_shape.resource_demands)

                if len(aggregate_resource_load) != 1:
                    return False

                demand = aggregate_resource_load[0]
                queued = demand.backlog_size + demand.num_ready_requests_queued
                if queued != 9990:
                    return False

                return demand.shape == {"CPU": 1.0}

            # In ASAN test it's slow.
            # Wait for 20s for the cluster to be up
            wait_for_condition(check_backlog_info, timeout=20)
        finally:
            global_state_accessor.disconnect()
    finally:
        # Always tear the autoscaling cluster down; the shutdown fixture is
        # not handed this cluster object.
        cluster.shutdown()
Пример #11
0
def test_demand_report_for_node_affinity_scheduling_strategy(shutdown_only):
    """Only the infeasible soft node-affinity task should be reported as demand.

    On an autoscaling cluster with a single 1-CPU worker, three tasks are
    queued: a long-running plain task, a hard-affinity task pinned to the
    existing worker, and a soft-affinity task that asks for a GPU and targets
    a random node id. The usage report is polled until it contains exactly one
    demand, of shape ``{"CPU": 1.0, "GPU": 1.0}``, with one infeasible request.
    """
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 1,
                "max_workers": 1,
            },
        },
    )

    cluster.start()
    try:
        info = ray.init(address="auto")

        @ray.remote(num_cpus=1)
        def f(sleep_s):
            time.sleep(sleep_s)
            return ray.get_runtime_context().node_id

        worker_node_id = ray.get(f.remote(0))

        tasks = []
        tasks.append(f.remote(10000))
        # This is not reported since there is feasible node.
        tasks.append(
            f.options(scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id, soft=False)).remote(0))
        # This is reported since there is no feasible node and soft is True.
        tasks.append(
            f.options(
                num_gpus=1,
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    ray.NodeID.from_random().hex(), soft=True),
            ).remote(0))

        global_state_accessor = make_global_state_accessor(info)
        try:
            def check_resource_demand():
                message = global_state_accessor.get_all_resource_usage()
                if message is None:
                    return False

                resource_usage = gcs_utils.ResourceUsageBatchData.FromString(
                    message)
                aggregate_resource_load = (
                    resource_usage.resource_load_by_shape.resource_demands)

                if len(aggregate_resource_load) != 1:
                    return False

                demand = aggregate_resource_load[0]
                if demand.num_infeasible_requests_queued != 1:
                    return False

                return demand.shape == {"CPU": 1.0, "GPU": 1.0}

            wait_for_condition(check_resource_demand, timeout=20)
        finally:
            global_state_accessor.disconnect()
    finally:
        # Always tear the autoscaling cluster down; the shutdown fixture is
        # not handed this cluster object.
        cluster.shutdown()