def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client): cluster = ray_start_cluster # Create a head node cluster.add_node( num_cpus=0, _system_config={ "scheduler_spread_threshold": 1, }, ) ray.init(address=cluster.address) for i in range(2): cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1}) cluster.wait_for_nodes() with connect_to_client_or_not(connect_to_client): @ray.remote def get_node_id(): return ray.worker.global_worker.current_node_id worker_node_ids = { ray.get(get_node_id.options(resources={ f"foo:{i}": 1 }).remote()) for i in range(2) } # Wait for updating driver raylet's resource view. time.sleep(5) @ray.remote(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY) def task1(): internal_kv._internal_kv_put("test_task1", "task1") while internal_kv._internal_kv_exists("test_task1"): time.sleep(0.1) return ray.worker.global_worker.current_node_id @ray.remote def task2(): internal_kv._internal_kv_put("test_task2", "task2") return ray.worker.global_worker.current_node_id locations = [] locations.append(task1.remote()) while not internal_kv._internal_kv_exists("test_task1"): time.sleep(0.1) # Wait for updating driver raylet's resource view. time.sleep(5) locations.append( task2.options( scheduling_strategy=SPREAD_SCHEDULING_STRATEGY).remote()) while not internal_kv._internal_kv_exists("test_task2"): time.sleep(0.1) internal_kv._internal_kv_del("test_task1") internal_kv._internal_kv_del("test_task2") assert set(ray.get(locations)) == worker_node_ids
def test_create_upload_once(self, tmp_path, random_dir, ray_start_regular):
    """Upload happens once; deleting the KV entry triggers a re-upload."""
    uri = get_uri_for_directory(random_dir)

    # First call performs the actual upload and records it in the KV store.
    assert upload_package_if_needed(uri, tmp_path, random_dir)
    assert _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

    # Second call finds the existing entry and skips the upload.
    assert not upload_package_if_needed(uri, tmp_path, random_dir)
    assert _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

    # Delete the URI from the internal_kv. This should trigger re-upload.
    _internal_kv_del(uri, namespace=KV_NAMESPACE_PACKAGE)
    assert not _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)
    assert upload_package_if_needed(uri, tmp_path, random_dir)
def test_create_upload_once(self, empty_dir, random_dir, ray_start_regular):
    """Upload happens once; deleting the KV entry triggers a re-upload."""
    uri = get_uri_for_directory(random_dir)

    # First call performs the actual upload and records it in the KV store.
    assert upload_package_if_needed(uri, empty_dir, random_dir)
    assert _internal_kv_exists(uri)

    # Second call finds the existing entry and skips the upload.
    assert not upload_package_if_needed(uri, empty_dir, random_dir)
    assert _internal_kv_exists(uri)

    # Delete the URI from the internal_kv. This should trigger re-upload.
    _internal_kv_del(uri)
    assert not _internal_kv_exists(uri)
    assert upload_package_if_needed(uri, empty_dir, random_dir)
def test_upload_succeeds(self, ray_start_regular):
    """Check function behavior when upload succeeds."""
    uri = "gcs://test.zip"
    # Named `pkg_bytes` (was `bytes`) to avoid shadowing the builtin.
    pkg_bytes = b"test"
    assert len(pkg_bytes) < GCS_STORAGE_MAX_SIZE
    assert not _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)
    # _store_package_in_gcs reports the number of bytes stored.
    assert _store_package_in_gcs(uri, pkg_bytes) == len(pkg_bytes)
    # The stored payload must round-trip byte-for-byte.
    assert pkg_bytes == _internal_kv_get(uri, namespace=KV_NAMESPACE_PACKAGE)
def wait(self, keys: list):
    """Block until every key in ``keys`` exists in the internal KV store.

    Polls once per second. ``all(...)`` short-circuits on the first
    missing key, matching the original early-break behavior.

    Args:
        keys: Keys to wait for.

    Returns:
        True once all keys are present (does not return otherwise).
    """
    while True:
        if all(internal_kv._internal_kv_exists(key) for key in keys):
            return True
        time.sleep(1)
def wait(self, keys: list):
    """Block until every (prefixed) key in ``keys`` exists in the internal KV.

    Each key is first expanded via ``self.__concat_key_with_prefixes``.
    Polls once per second; ``all(...)`` short-circuits on the first
    missing key, matching the original early-break behavior.

    Args:
        keys: Un-prefixed keys to wait for.

    Returns:
        True once all keys are present (does not return otherwise).
    """
    while True:
        if all(
                internal_kv._internal_kv_exists(
                    self.__concat_key_with_prefixes(key)) for key in keys):
            return True
        time.sleep(1)
def package_exists(pkg_uri: str) -> bool:
    """Check whether the package with given uri exists or not.

    Args:
        pkg_uri (str): The uri of the package

    Returns:
        True for package existing and False for not.

    Raises:
        NotImplementedError: If the URI protocol is not GCS/PIN_GCS.
    """
    # Only the protocol matters here; the package name is unused.
    protocol, _ = _parse_uri(pkg_uri)
    if protocol in (Protocol.GCS, Protocol.PIN_GCS):
        return internal_kv._internal_kv_exists(pkg_uri)
    else:
        raise NotImplementedError(f"Protocol {protocol} is not supported")
def package_exists(pkg_uri: str) -> bool:
    """Check whether the package with given URI exists or not.

    Args:
        pkg_uri: The uri of the package

    Returns:
        True for package existing and False for not.

    Raises:
        NotImplementedError: If the URI protocol is not GCS.
    """
    # Only the protocol matters here; the package name is unused.
    protocol, _ = parse_uri(pkg_uri)
    if protocol == Protocol.GCS:
        return _internal_kv_exists(pkg_uri)
    else:
        raise NotImplementedError(f"Protocol {protocol} is not supported")
def test_task_failure_when_driver_local_raylet_dies(ray_start_cluster):
    """A task whose lease request is queued in the driver's raylet fails with
    LocalRayletDiedError when that raylet is killed."""
    cluster = ray_start_cluster
    head = cluster.add_node(num_cpus=4, resources={"foo": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote(resources={"foo": 1})
    def func():
        # Signal "running" via the internal KV, then hold the single "foo"
        # resource forever so the second func invocation cannot be scheduled.
        internal_kv._internal_kv_put("test_func", "func")
        while True:
            time.sleep(1)

    func.remote()
    # Wait until the first func is actually running and holding the resource.
    while not internal_kv._internal_kv_exists("test_func"):
        time.sleep(0.1)

    # The lease request should wait inside raylet
    # since there is no available resources.
    ret = func.remote()
    # Waiting for the lease request to reach raylet.
    time.sleep(1)
    head.kill_raylet()
    with pytest.raises(LocalRayletDiedError):
        ray.get(ret)
def task1():
    # Announce via the internal KV store that this task has started.
    internal_kv._internal_kv_put("test_task1", "task1")
    # Block on this node until the driver clears the flag.
    while True:
        if not internal_kv._internal_kv_exists("test_task1"):
            break
        time.sleep(0.1)
    return ray.worker.global_worker.current_node_id
def task1():
    # Announce via the internal KV store that this task has started.
    internal_kv._internal_kv_put("test_task1", "task1")
    # Block on this node until the driver clears the flag.
    while True:
        if not internal_kv._internal_kv_exists("test_task1"):
            break
        time.sleep(0.1)
    return ray.get_runtime_context().node_id.hex()
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client): cluster = ray_start_cluster # Create a head node cluster.add_node( num_cpus=0, _system_config={ "scheduler_spread_threshold": 1, }, ) ray.init(address=cluster.address) for i in range(2): cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1}) cluster.wait_for_nodes() with connect_to_client_or_not(connect_to_client): @ray.remote def get_node_id(): return ray.get_runtime_context().node_id.hex() worker_node_ids = { ray.get(get_node_id.options(resources={ f"foo:{i}": 1 }).remote()) for i in range(2) } # Wait for updating driver raylet's resource view. time.sleep(5) @ray.remote(scheduling_strategy="SPREAD") def task1(): internal_kv._internal_kv_put("test_task1", "task1") while internal_kv._internal_kv_exists("test_task1"): time.sleep(0.1) return ray.get_runtime_context().node_id.hex() @ray.remote def task2(): internal_kv._internal_kv_put("test_task2", "task2") return ray.get_runtime_context().node_id.hex() locations = [] locations.append(task1.remote()) while not internal_kv._internal_kv_exists("test_task1"): time.sleep(0.1) # Wait for updating driver raylet's resource view. time.sleep(5) locations.append(task2.options(scheduling_strategy="SPREAD").remote()) while not internal_kv._internal_kv_exists("test_task2"): time.sleep(0.1) internal_kv._internal_kv_del("test_task1") internal_kv._internal_kv_del("test_task2") assert set(ray.get(locations)) == worker_node_ids # Wait for updating driver raylet's resource view. time.sleep(5) # Make sure actors can be spreaded as well. @ray.remote(num_cpus=1) class Actor: def ping(self): return ray.get_runtime_context().node_id.hex() actors = [] locations = [] for i in range(8): actors.append(Actor.options(scheduling_strategy="SPREAD").remote()) locations.append(ray.get(actors[-1].ping.remote())) locations.sort() expected_locations = list(worker_node_ids) * 4 expected_locations.sort() assert locations == expected_locations
def test_node_affinity_scheduling_strategy(ray_start_cluster, connect_to_client):
    """NodeAffinitySchedulingStrategy semantics for tasks and actors.

    Covers: hard affinity (soft=False) pins to the target node; soft
    affinity falls back when the target node is missing or infeasible;
    hard affinity to a missing/infeasible node raises
    TaskUnschedulableError / ActorUnschedulableError; and a soft-affinity
    task retries elsewhere when its target node crashes.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=8, resources={"head": 1})
    ray.init(address=cluster.address)
    cluster.add_node(num_cpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()
    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.get_runtime_context().node_id

        # Learn each node's id by pinning via its unique custom resource.
        head_node_id = ray.get(
            get_node_id.options(num_cpus=0, resources={
                "head": 1
            }).remote())
        worker_node_id = ray.get(
            get_node_id.options(num_cpus=0, resources={
                "worker": 1
            }).remote())

        # Hard affinity (soft=False) runs the task on the requested node.
        assert worker_node_id == ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    worker_node_id, soft=False)).remote())
        assert head_node_id == ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    head_node_id, soft=False)).remote())

        # Doesn't fail when the node doesn't exist since soft is true.
        ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    ray.NodeID.from_random().hex(), soft=True)).remote())

        # Doesn't fail when the node is infeasible since soft is true.
        assert worker_node_id == ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    head_node_id, soft=True),
                resources={
                    "worker": 1
                },
            ).remote())

        # Fail when the node doesn't exist.
        with pytest.raises(ray.exceptions.TaskUnschedulableError):
            ray.get(
                get_node_id.options(
                    scheduling_strategy=NodeAffinitySchedulingStrategy(
                        ray.NodeID.from_random().hex(), soft=False)).remote())

        # Fail when the node is infeasible.
        with pytest.raises(ray.exceptions.TaskUnschedulableError):
            ray.get(
                get_node_id.options(
                    scheduling_strategy=NodeAffinitySchedulingStrategy(
                        head_node_id, soft=False),
                    resources={
                        "not_exist": 1
                    },
                ).remote())

        # Add a third node that will be crashed mid-task below.
        crashed_worker_node = cluster.add_node(num_cpus=8,
                                               resources={"crashed_worker": 1})
        cluster.wait_for_nodes()
        crashed_worker_node_id = ray.get(
            get_node_id.options(num_cpus=0, resources={
                "crashed_worker": 1
            }).remote())

        @ray.remote(
            max_retries=-1,
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                crashed_worker_node_id, soft=True),
        )
        def crashed_get_node_id():
            if ray.get_runtime_context().node_id == crashed_worker_node_id:
                # First attempt runs on the doomed node: signal via the
                # internal KV, then hang until the node is removed.
                internal_kv._internal_kv_put("crashed_get_node_id",
                                             "crashed_worker_node_id")
                while True:
                    time.sleep(1)
            else:
                # Retry after the crash lands on a surviving node.
                return ray.get_runtime_context().node_id

        r = crashed_get_node_id.remote()
        # Wait until the task is running on the doomed node, then kill it.
        while not internal_kv._internal_kv_exists("crashed_get_node_id"):
            time.sleep(0.1)
        cluster.remove_node(crashed_worker_node, allow_graceful=False)
        # Soft affinity + max_retries=-1: the retry runs on another node.
        assert ray.get(r) in {head_node_id, worker_node_id}

        @ray.remote(num_cpus=1)
        class Actor:
            def get_node_id(self):
                return ray.get_runtime_context().node_id

        # Hard affinity places the actor on the requested node.
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id, soft=False)).remote()
        assert worker_node_id == ray.get(actor.get_node_id.remote())

        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                head_node_id, soft=False)).remote()
        assert head_node_id == ray.get(actor.get_node_id.remote())

        # Wait until the target node becomes available.
        worker_actor = Actor.options(resources={"worker": 1}).remote()
        assert worker_node_id == ray.get(worker_actor.get_node_id.remote())
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(worker_node_id,
                                                               soft=True),
            resources={
                "worker": 1
            },
        ).remote()
        # Releasing worker_actor frees the "worker" resource for the new actor.
        del worker_actor
        assert worker_node_id == ray.get(actor.get_node_id.remote())

        # Doesn't fail when the node doesn't exist since soft is true.
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                ray.NodeID.from_random().hex(), soft=True)).remote()
        assert ray.get(actor.get_node_id.remote())

        # Doesn't fail when the node is infeasible since soft is true.
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(head_node_id,
                                                               soft=True),
            resources={
                "worker": 1
            },
        ).remote()
        assert worker_node_id == ray.get(actor.get_node_id.remote())

        # Fail when the node doesn't exist.
        with pytest.raises(ray.exceptions.ActorUnschedulableError):
            actor = Actor.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    ray.NodeID.from_random().hex(), soft=False)).remote()
            ray.get(actor.get_node_id.remote())

        # Fail when the node is infeasible.
        with pytest.raises(ray.exceptions.ActorUnschedulableError):
            actor = Actor.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    worker_node_id, soft=False),
                resources={
                    "not_exist": 1
                },
            ).remote()
            ray.get(actor.get_node_id.remote())