def test_kill_pending_actor_with_no_restart_true():
    cluster = ray.init()
    global_state_accessor = make_global_state_accessor(cluster)

    @ray.remote(resources={"WORKER": 1.0})
    class PendingActor:
        pass

    # Kill the actor with `no_restart=True`.
    actor = PendingActor.remote()
    # TODO(ffbin): The raylet doesn't guarantee the order when dealing with
    # RequestWorkerLease and CancelWorkerLease. If we kill the actor
    # immediately after creating it, we may not be able to clean up the
    # request cached by the raylet.
    # See https://github.com/ray-project/ray/issues/13545 for details.
    time.sleep(1)
    ray.kill(actor, no_restart=True)

    def condition1():
        message = global_state_accessor.get_all_resource_usage()
        resource_usages = gcs_utils.ResourceUsageBatchData.FromString(message)
        return len(resource_usages.resource_load_by_shape.resource_demands) == 0

    # The actor is dead, so the infeasible task queue length should be 0.
    wait_for_condition(condition1, timeout=10)

    global_state_accessor.disconnect()
    ray.shutdown()
def test_no_node_name():
    # Starting Ray with no node name should result in node_name being set to
    # the node's IP address.
    new_head_context = ray.init(include_dashboard=False)
    global_state_accessor = make_global_state_accessor(new_head_context)
    node_data = global_state_accessor.get_node_table()[0]
    node = gcs_utils.GcsNodeInfo.FromString(node_data)
    assert node.node_name == ray.util.get_node_ip_address()
    global_state_accessor.disconnect()
    ray.shutdown()
def test_actor_resource_demand(shutdown_only):
    # Make sure any previous session is torn down before re-initializing.
    ray.shutdown()
    cluster = ray.init(num_cpus=3)
    global_state_accessor = make_global_state_accessor(cluster)

    @ray.remote(num_cpus=2)
    class Actor:
        def foo(self):
            return "ok"

    a = Actor.remote()
    ray.get(a.foo.remote())
    time.sleep(1)

    message = global_state_accessor.get_all_resource_usage()
    resource_usages = gcs_utils.ResourceUsageBatchData.FromString(message)

    # The actor is scheduled, so there should be no demands left.
    assert len(resource_usages.resource_load_by_shape.resource_demands) == 0

    @ray.remote(num_cpus=80)
    class Actor2:
        pass

    actors = []
    actors.append(Actor2.remote())
    time.sleep(1)

    # This actor cannot be scheduled.
    message = global_state_accessor.get_all_resource_usage()
    resource_usages = gcs_utils.ResourceUsageBatchData.FromString(message)
    assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
    assert resource_usages.resource_load_by_shape.resource_demands[0].shape == {
        "CPU": 80.0
    }
    assert (
        resource_usages.resource_load_by_shape.resource_demands[
            0
        ].num_infeasible_requests_queued
        == 1
    )

    actors.append(Actor2.remote())
    time.sleep(1)

    # Two actors cannot be scheduled.
    message = global_state_accessor.get_all_resource_usage()
    resource_usages = gcs_utils.ResourceUsageBatchData.FromString(message)
    assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
    assert (
        resource_usages.resource_load_by_shape.resource_demands[
            0
        ].num_infeasible_requests_queued
        == 2
    )

    global_state_accessor.disconnect()
def test_node_name_init():
    # Test ray.init with _node_name set directly.
    new_head_context = ray.init(_node_name="new_head_node", include_dashboard=False)
    global_state_accessor = make_global_state_accessor(new_head_context)
    node_data = global_state_accessor.get_node_table()[0]
    node = gcs_utils.GcsNodeInfo.FromString(node_data)
    assert node.node_name == "new_head_node"
    global_state_accessor.disconnect()
    ray.shutdown()
def test_heartbeat_ip(shutdown_only):
    cluster = ray.init(num_cpus=1)
    global_state_accessor = make_global_state_accessor(cluster)
    self_ip = ray.util.get_node_ip_address()

    def self_ip_is_set():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False
        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        resources_data = resource_usage.batch[0]
        return resources_data.node_manager_address == self_ip

    wait_for_condition(self_ip_is_set, timeout=2)
    global_state_accessor.disconnect()
def test_backlog_report(shutdown_only):
    cluster = ray.init(
        num_cpus=1,
        _system_config={"max_pending_lease_requests_per_scheduling_category": 1},
    )
    global_state_accessor = make_global_state_accessor(cluster)

    @ray.remote(num_cpus=1)
    def foo(x):
        print(".")
        time.sleep(x)
        return None

    def backlog_size_set():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False
        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = (
            resource_usage.resource_load_by_shape.resource_demands
        )
        if len(aggregate_resource_load) == 1:
            backlog_size = aggregate_resource_load[0].backlog_size
            print(backlog_size)
            # Ideally we'd want to assert backlog_size == 8, but guaranteeing
            # the order in which submissions occur is too hard/flaky.
            return backlog_size > 0
        return False

    # We want this first task to finish.
    refs = [foo.remote(0.5)]
    # These tasks should all start _before_ the first one finishes.
    refs.extend([foo.remote(1000) for _ in range(9)])
    # Now there's 1 request running, 1 queued in the raylet, and 8 queued in
    # the worker backlog.

    ray.get(refs[0])
    # The first request finishes, the second request is now running, and the
    # third lease request is sent to the raylet with backlog=7.

    wait_for_condition(backlog_size_set, timeout=2)
    global_state_accessor.disconnect()
def test_node_name_cluster(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(node_name="head_node", include_dashboard=False)
    head_context = ray.init(address=cluster.address, include_dashboard=False)
    cluster.add_node(node_name="worker_node", include_dashboard=False)
    cluster.wait_for_nodes()

    global_state_accessor = make_global_state_accessor(head_context)
    node_table = global_state_accessor.get_node_table()
    assert len(node_table) == 2
    for node_data in node_table:
        node = gcs_utils.GcsNodeInfo.FromString(node_data)
        if (
            ray._private.utils.binary_to_hex(node.node_id)
            == head_context.address_info["node_id"]
        ):
            assert node.node_name == "head_node"
        else:
            assert node.node_name == "worker_node"

    global_state_accessor.disconnect()
    ray.shutdown()
    cluster.shutdown()
def test_placement_group_load_report(ray_start_cluster):
    cluster = ray_start_cluster
    # Add a head node that has none of the custom resources ("A", "B", "C")
    # requested by the placement groups below.
    cluster.add_node(num_cpus=4)
    global_state_accessor = make_global_state_accessor(
        ray.init(address=cluster.address)
    )

    class PgLoadChecker:
        def nothing_is_ready(self):
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return False
            if resource_usage.HasField("placement_group_load"):
                pg_load = resource_usage.placement_group_load
                return len(pg_load.placement_group_data) == 2
            return False

        def only_first_one_ready(self):
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return False
            if resource_usage.HasField("placement_group_load"):
                pg_load = resource_usage.placement_group_load
                return len(pg_load.placement_group_data) == 1
            return False

        def two_infeasible_pg(self):
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return False
            if resource_usage.HasField("placement_group_load"):
                pg_load = resource_usage.placement_group_load
                return len(pg_load.placement_group_data) == 2
            return False

        def _read_resource_usage(self):
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return None
            return gcs_utils.ResourceUsageBatchData.FromString(message)

    checker = PgLoadChecker()

    # Create 2 placement groups that are infeasible.
    pg_feasible = ray.util.placement_group([{"A": 1}])
    pg_infeasible = ray.util.placement_group([{"B": 1}])
    _, unready = ray.wait([pg_feasible.ready(), pg_infeasible.ready()], timeout=0)
    assert len(unready) == 2
    wait_for_condition(checker.nothing_is_ready)

    # Add a node that makes the first pg feasible. Make sure the load
    # reflects this change.
    cluster.add_node(resources={"A": 1})
    ray.get(pg_feasible.ready())
    wait_for_condition(checker.only_first_one_ready)

    # Create one more infeasible pg and make sure the load is properly
    # updated.
    pg_infeasible_second = ray.util.placement_group([{"C": 1}])
    _, unready = ray.wait([pg_infeasible_second.ready()], timeout=0)
    assert len(unready) == 1
    wait_for_condition(checker.two_infeasible_pg)
    global_state_accessor.disconnect()
@pytest.mark.parametrize("max_shapes", [0, 2, -1])
def test_load_report(shutdown_only, max_shapes):
    resource1 = "A"
    resource2 = "B"
    cluster = ray.init(
        num_cpus=1,
        resources={resource1: 1},
        _system_config={
            "max_resource_shapes_per_load_report": max_shapes,
        },
    )
    global_state_accessor = make_global_state_accessor(cluster)

    @ray.remote
    def sleep():
        time.sleep(1000)

    sleep.remote()
    for _ in range(3):
        sleep.remote()
    sleep.options(resources={resource1: 1}).remote()
    sleep.options(resources={resource2: 1}).remote()

    class Checker:
        def __init__(self):
            self.report = None

        def check_load_report(self):
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return False

            resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
            self.report = resource_usage.resource_load_by_shape.resource_demands
            if max_shapes == 0:
                return True
            elif max_shapes == 2:
                return len(self.report) >= 2
            else:
                return len(self.report) >= 3

    # Wait for load information to arrive.
    checker = Checker()
    wait_for_condition(checker.check_load_report)

    # Check that we respect the max shapes limit (-1 means unlimited).
    if max_shapes != -1:
        assert len(checker.report) <= max_shapes

    print(checker.report)

    if max_shapes > 0:
        # Check that we differentiate between infeasible and ready tasks.
        for demand in checker.report:
            if resource2 in demand.shape:
                assert demand.num_infeasible_requests_queued > 0
                assert demand.num_ready_requests_queued == 0
            else:
                assert demand.num_ready_requests_queued > 0
                assert demand.num_infeasible_requests_queued == 0
    global_state_accessor.disconnect()
def test_demand_report_when_scale_up(shutdown_only):
    # https://github.com/ray-project/ray/issues/22122
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 10,
                "max_workers": 10,
            },
        },
    )

    cluster.start()
    info = ray.init("auto")

    @ray.remote
    def f():
        time.sleep(10000)

    @ray.remote
    def g():
        ray.get(h.remote())

    @ray.remote
    def h():
        time.sleep(10000)

    # Hold references to the submitted tasks so they stay pending.
    tasks = [f.remote() for _ in range(5000)]
    tasks.extend([g.remote() for _ in range(5000)])

    global_state_accessor = make_global_state_accessor(info)

    def check_backlog_info():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = (
            resource_usage.resource_load_by_shape.resource_demands
        )

        if len(aggregate_resource_load) != 1:
            return False

        (backlog_size, num_ready_requests_queued, shape) = (
            aggregate_resource_load[0].backlog_size,
            aggregate_resource_load[0].num_ready_requests_queued,
            aggregate_resource_load[0].shape,
        )
        # The driver submits 10,000 tasks but the cluster will only have 10
        # CPUs, so 9990 lease requests should be reported as ready or
        # backlogged.
        if backlog_size + num_ready_requests_queued != 9990:
            return False

        if shape != {"CPU": 1.0}:
            return False

        return True

    # In the ASAN test this is slow, so wait up to 20s for the cluster to be
    # up and the load to be reported.
    wait_for_condition(check_backlog_info, 20)
    global_state_accessor.disconnect()
    cluster.shutdown()
def test_demand_report_for_node_affinity_scheduling_strategy(shutdown_only):
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 1,
                "max_workers": 1,
            },
        },
    )

    cluster.start()
    info = ray.init(address="auto")

    @ray.remote(num_cpus=1)
    def f(sleep_s):
        time.sleep(sleep_s)
        return ray.get_runtime_context().node_id

    worker_node_id = ray.get(f.remote(0))

    tasks = []
    tasks.append(f.remote(10000))
    # This is not reported since there is a feasible node.
    tasks.append(
        f.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id, soft=False
            )
        ).remote(0)
    )
    # This is reported since there is no feasible node and soft is True.
    tasks.append(
        f.options(
            num_gpus=1,
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                ray.NodeID.from_random().hex(), soft=True
            ),
        ).remote(0)
    )
    global_state_accessor = make_global_state_accessor(info)

    def check_resource_demand():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = (
            resource_usage.resource_load_by_shape.resource_demands
        )

        if len(aggregate_resource_load) != 1:
            return False

        if aggregate_resource_load[0].num_infeasible_requests_queued != 1:
            return False

        if aggregate_resource_load[0].shape != {"CPU": 1.0, "GPU": 1.0}:
            return False

        return True

    wait_for_condition(check_resource_demand, 20)
    global_state_accessor.disconnect()
    cluster.shutdown()
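# All of the tests above rely on the `make_global_state_accessor` helper. In
# the Ray sources it is defined alongside these tests; if it is not available
# in this module, the sketch below shows roughly what it does. The import
# location (`ray._raylet`) and the `GcsClientOptions.from_gcs_address`
# constructor are assumptions based on Ray 2.x internals and may differ
# across versions.
def make_global_state_accessor(ray_context):
    from ray._raylet import GcsClientOptions, GlobalStateAccessor

    # Connect a GlobalStateAccessor to the GCS of the given ray.init()
    # context so tests can poll cluster-wide resource usage and node tables.
    gcs_options = GcsClientOptions.from_gcs_address(
        ray_context.address_info["gcs_address"]
    )
    global_state_accessor = GlobalStateAccessor(gcs_options)
    global_state_accessor.connect()
    return global_state_accessor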