def test_basic_reconstruction_actor_constructor(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. if not reconstruction_enabled: config["lineage_pinning_enabled"] = 0 cluster = ray_start_cluster # Head node with no resources. cluster.add_node(num_cpus=0, _system_config=config, enable_object_reconstruction=reconstruction_enabled) ray.init(address=cluster.address) # Node to place the initial object. node_to_kill = cluster.add_node(num_cpus=1, resources={"node1": 1}, object_store_memory=10**8) cluster.add_node(num_cpus=1, resources={"node2": 1}, object_store_memory=10**8) cluster.wait_for_nodes() @ray.remote(max_retries=1 if reconstruction_enabled else 0) def large_object(): return np.zeros(10**7, dtype=np.uint8) # Both the constructor and a method depend on the large object. @ray.remote(max_restarts=-1) class Actor: def __init__(self, x): pass def dependent_task(self, x): return def pid(self): return os.getpid() obj = large_object.options(resources={"node1": 1}).remote() a = Actor.options(resources={"node1": 1}).remote(obj) ray.get(a.dependent_task.remote(obj)) pid = ray.get(a.pid.remote()) # Workaround to kill the actor process too since there is a bug where the # actor's plasma client hangs after the plasma store has exited. os.kill(pid, SIGKILL) cluster.remove_node(node_to_kill, allow_graceful=False) cluster.add_node(num_cpus=1, resources={"node1": 1}, object_store_memory=10**8) wait_for_pid_to_exit(pid) # Wait for the actor to restart. def probe(): try: ray.get(a.dependent_task.remote(obj)) return True except ray.exceptions.RayActorError: return False except (ray.exceptions.RayTaskError, ray.exceptions.ObjectLostError): return True wait_for_condition(probe) if reconstruction_enabled: ray.get(a.dependent_task.remote(obj)) else: with pytest.raises(ray.exceptions.RayTaskError) as e: x = a.dependent_task.remote(obj) print(x) ray.get(x) with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause()
def test_atomic_creation(ray_start_cluster):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        import time
        time.sleep(1)
        return True

    # Schedule tasks that prevent the initial placement group creation.
    tasks = [bothering_task.remote() for _ in range(2)]

    # Create a placement group whose bundle scheduling will initially fail.
    # Using the SPREAD strategy here makes the test less flaky.
    pg = ray.util.placement_group(
        name="name",
        strategy="SPREAD",
        bundles=[{
            "CPU": bundle_cpu_size
        } for _ in range(num_nodes * bundle_per_node)])

    # Create a placement group actor.
    # This shouldn't be scheduled because atomic
    # placement group creation should've failed.
    pg_actor = NormalActor.options(
        placement_group=pg,
        placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()

    # Wait on the placement group now. It should be unready
    # because the bothering tasks hold resources that are required
    # for creating one of the bundles.
    ready, unready = ray.wait([pg.ready()], timeout=0)
    assert len(ready) == 0
    assert len(unready) == 1
    # Wait until all tasks are done.
    assert all(ray.get(tasks))

    # Wait on the placement group creation. Since resources are now
    # available, it should be ready soon.
    ready, unready = ray.wait([pg.ready()])
    assert len(ready) == 1
    assert len(unready) == 0

    # Confirm that the placement group actor is created. It will raise an
    # exception if the actor was scheduled before the placement group was
    # created, which checks atomicity.
    ray.get(pg_actor.ping.remote(), timeout=3.0)
    ray.kill(pg_actor)

    # Make sure atomic creation failure didn't impact resources.
    @ray.remote(num_cpus=bundle_cpu_size)
    def resource_check():
        return True

    # These should hang because all resources are claimed by the
    # placement group.
    check_without_pg = [
        resource_check.remote() for _ in range(bundle_per_node * num_nodes)
    ]
    # These should all be scheduled, one on each bundle.
    check_with_pg = [
        resource_check.options(
            placement_group=pg, placement_group_bundle_index=i).remote()
        for i in range(bundle_per_node * num_nodes)
    ]

    # Make sure these are hanging.
    ready, unready = ray.wait(check_without_pg, timeout=0)
    assert len(ready) == 0
    assert len(unready) == bundle_per_node * num_nodes

    # Make sure these are all scheduled.
    assert all(ray.get(check_with_pg))

    ray.util.remove_placement_group(pg)

    def pg_removed():
        return ray.util.placement_group_table(pg)["state"] == "REMOVED"

    wait_for_condition(pg_removed)

    # Make sure the tasks without a placement group are all scheduled
    # properly now that the placement group resources are cleaned up.
    assert all(ray.get(check_without_pg))
def test_shadow_traffic(serve_instance): client = serve_instance @ray.remote class RequestCounter: def __init__(self): self.requests = defaultdict(int) def record(self, backend): self.requests[backend] += 1 def get(self, backend): return self.requests[backend] counter = RequestCounter.remote() def f(_): ray.get(counter.record.remote("backend1")) return "hello" def f_shadow_1(_): ray.get(counter.record.remote("backend2")) return "oops" def f_shadow_2(_): ray.get(counter.record.remote("backend3")) return "oops" def f_shadow_3(_): ray.get(counter.record.remote("backend4")) return "oops" client.create_backend("backend1", f) client.create_backend("backend2", f_shadow_1) client.create_backend("backend3", f_shadow_2) client.create_backend("backend4", f_shadow_3) client.create_endpoint("endpoint", backend="backend1", route="/api") client.shadow_traffic("endpoint", "backend2", 1.0) client.shadow_traffic("endpoint", "backend3", 0.5) client.shadow_traffic("endpoint", "backend4", 0.1) start = time.time() num_requests = 100 for _ in range(num_requests): assert requests.get("http://127.0.0.1:8000/api").text == "hello" print("Finished 100 requests in {}s.".format(time.time() - start)) def requests_to_backend(backend): return ray.get(counter.get.remote(backend)) def check_requests(): return all([ requests_to_backend("backend1") == num_requests, requests_to_backend("backend2") == requests_to_backend("backend1"), requests_to_backend("backend3") < requests_to_backend("backend2"), requests_to_backend("backend4") < requests_to_backend("backend3"), requests_to_backend("backend4") > 0, ]) wait_for_condition(check_requests)
def test_basic(ray_start_with_dashboard): """Dashboard test that starts a Ray cluster with a dashboard server running, then hits the dashboard API and asserts that it receives sensible data.""" assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True) address_info = ray_start_with_dashboard node_id = address_info["node_id"] address = address_info["redis_address"] address = address.split(":") assert len(address) == 2 client = redis.StrictRedis(host=address[0], port=int(address[1]), password=ray_constants.REDIS_DEFAULT_PASSWORD) all_processes = ray.worker._global_node.all_processes assert ray_constants.PROCESS_TYPE_DASHBOARD in all_processes assert ray_constants.PROCESS_TYPE_REPORTER not in all_processes dashboard_proc_info = all_processes[ ray_constants.PROCESS_TYPE_DASHBOARD][0] dashboard_proc = psutil.Process(dashboard_proc_info.process.pid) assert dashboard_proc.status() in [ psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING ] raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0] raylet_proc = psutil.Process(raylet_proc_info.process.pid) def _search_agent(processes): for p in processes: try: for c in p.cmdline(): if "new_dashboard/agent.py" in c: return p except Exception: pass # Test for bad imports, the agent should be restarted. logger.info("Test for bad imports.") agent_proc = _search_agent(raylet_proc.children()) prepare_test_files() agent_pids = set() try: assert agent_proc is not None agent_proc.kill() agent_proc.wait() # The agent will be restarted for imports failure. for x in range(50): agent_proc = _search_agent(raylet_proc.children()) if agent_proc: agent_pids.add(agent_proc.pid) # The agent should be restarted, # so we can break if the len(agent_pid) > 1 if len(agent_pids) > 1: break time.sleep(0.1) finally: cleanup_test_files() assert len(agent_pids) > 1, agent_pids agent_proc = _search_agent(raylet_proc.children()) if agent_proc: agent_proc.kill() agent_proc.wait() logger.info("Test agent register is OK.") wait_for_condition(lambda: _search_agent(raylet_proc.children())) assert dashboard_proc.status() in [ psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING ] agent_proc = _search_agent(raylet_proc.children()) agent_pid = agent_proc.pid # Check if agent register is OK. for x in range(5): logger.info("Check agent is alive.") agent_proc = _search_agent(raylet_proc.children()) assert agent_proc.pid == agent_pid time.sleep(1) # Check redis keys are set. logger.info("Check redis keys are set.") dashboard_address = client.get(dashboard_consts.REDIS_KEY_DASHBOARD) assert dashboard_address is not None dashboard_rpc_address = client.get( dashboard_consts.REDIS_KEY_DASHBOARD_RPC) assert dashboard_rpc_address is not None key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}" agent_ports = client.get(key) assert agent_ports is not None
def test_automatic_cleanup_job(ray_start_cluster): # Make sure the placement groups created by a # job, actor, and task are cleaned when the job is done. cluster = ray_start_cluster num_nodes = 3 num_cpu_per_node = 4 # Create 3 nodes cluster. for _ in range(num_nodes): cluster.add_node(num_cpus=num_cpu_per_node) info = ray.init(address=cluster.address) available_cpus = ray.available_resources()["CPU"] assert available_cpus == num_nodes * num_cpu_per_node driver_code = f""" import ray ray.init(address="{info["redis_address"]}") def create_pg(): pg = ray.util.placement_group( [{{"CPU": 1}} for _ in range(3)], strategy="STRICT_SPREAD") ray.get(pg.ready()) return pg @ray.remote(num_cpus=0) def f(): create_pg() @ray.remote(num_cpus=0) class A: def create_pg(self): create_pg() ray.get(f.remote()) a = A.remote() ray.get(a.create_pg.remote()) # Create 2 pgs to make sure multiple placement groups that belong # to a single job will be properly cleaned. create_pg() create_pg() ray.shutdown() """ run_string_as_driver(driver_code) # Wait until the driver is reported as dead by GCS. def is_job_done(): jobs = ray.jobs() for job in jobs: if "StopTime" in job: return True return False def assert_num_cpus(expected_num_cpus): if expected_num_cpus == 0: return "CPU" not in ray.available_resources() return ray.available_resources()["CPU"] == expected_num_cpus wait_for_condition(is_job_done) available_cpus = ray.available_resources()["CPU"] wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
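# Note: `run_string_as_driver` comes from Ray's test utilities and is not
# defined in this file. The sketch below is a minimal, assumed version of such
# a helper (the name `run_string_as_driver_sketch` and its details are
# hypothetical): it runs the given source in a fresh Python driver process and
# returns its stdout, failing if the driver exits with a non-zero code.
import subprocess
import sys


def run_string_as_driver_sketch(driver_script: str) -> str:
    # Launch a separate Python interpreter and feed the script via stdin.
    proc = subprocess.run(
        [sys.executable, "-"],
        input=driver_script,
        capture_output=True,
        text=True,
        check=True)
    return proc.stdout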
def test_reconstruction_cached_dependency(ray_start_cluster, reconstruction_enabled): config = json.dumps({ "num_heartbeats_timeout": 10, "raylet_heartbeat_timeout_milliseconds": 100, "lineage_pinning_enabled": 1 if reconstruction_enabled else 0, "free_objects_period_milliseconds": -1, }) cluster = Cluster() # Head node with no resources. cluster.add_node(num_cpus=0, _internal_config=config) # Node to place the initial object. node_to_kill = cluster.add_node(num_cpus=1, resources={"node1": 1}, object_store_memory=10**8, _internal_config=config) cluster.add_node(num_cpus=1, resources={"node2": 1}, object_store_memory=10**8, _internal_config=config) cluster.wait_for_nodes() ray.init(address=cluster.address, _internal_config=config) @ray.remote(max_retries=0) def large_object(): return np.zeros(10**7, dtype=np.uint8) @ray.remote def chain(x): return x @ray.remote def dependent_task(x): return obj = large_object.options(resources={"node2": 1}).remote() obj = chain.options(resources={"node1": 1}).remote(obj) ray.get(dependent_task.options(resources={"node1": 1}).remote(obj)) cluster.remove_node(node_to_kill, allow_graceful=False) cluster.add_node(num_cpus=1, resources={"node1": 1}, object_store_memory=10**8, _internal_config=config) assert wait_for_condition(lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10) for _ in range(20): large_object.options(resources={"node2": 1}).remote() if reconstruction_enabled: ray.get(dependent_task.remote(obj)) else: with pytest.raises(ray.exceptions.RayTaskError) as e: ray.get(dependent_task.remote(obj)) with pytest.raises(ray.exceptions.UnreconstructableError): raise e.as_instanceof_cause()
def test_delete_objects_on_worker_failure(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. temp_folder = tmp_path / "spill" temp_folder.mkdir() ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_max_retries": 4, "object_store_full_initial_delay_ms": 100, "object_spilling_config": json.dumps({ "type": "filesystem", "params": { "directory_path": str(temp_folder) } }), "min_spilling_size": 0, }) arr = np.random.rand(1024 * 1024) # 8 MB data @ray.remote class Actor: def __init__(self): self.replay_buffer = [] def get_pid(self): return os.getpid() def create_objects(self): for _ in range(80): ref = None while ref is None: ref = ray.put(arr) self.replay_buffer.append(ref) # Remove the replay buffer with 60% probability. if random.randint(0, 9) < 6: self.replay_buffer.pop() # Do random sampling. for _ in range(200): ref = random.choice(self.replay_buffer) sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) a = Actor.remote() actor_pid = ray.get(a.get_pid.remote()) ray.get(a.create_objects.remote()) os.kill(actor_pid, 9) def wait_until_actor_dead(): try: ray.get(a.get_pid.remote()) except ray.exceptions.RayActorError: return True return False wait_for_condition(wait_until_actor_dead) def is_dir_empty(): num_files = 0 for path in temp_folder.iterdir(): num_files += 1 return num_files == 0 # After all, make sure all objects are deleted upon worker failures. wait_for_condition(is_dir_empty, timeout=1000)
def test_metrics_export_end_to_end(_setup_cluster_for_test):
    TEST_TIMEOUT_S = 20
    prom_addresses, autoscaler_export_addr = _setup_cluster_for_test

    def test_cases():
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        # Raylet should be on every node.
        assert all("raylet" in components
                   for components in components_dict.values())

        # GCS server should be on one node.
        assert any("gcs_server" in components
                   for components in components_dict.values())

        # Core worker should be on at least one node.
        assert any("core_worker" in components
                   for components in components_dict.values())

        # Make sure our user-defined metrics exist.
        for metric_name in [
                "test_counter", "test_histogram", "test_driver_counter"
        ]:
            assert any(metric_name in full_name for full_name in metric_names)

        # Make sure metrics are recorded.
        for metric in _METRICS:
            assert metric in metric_names, \
                f"metric {metric} not in {metric_names}"

        # Make sure the numeric values are correct.
        test_counter_sample = [
            m for m in metric_samples if "test_counter" in m.name
        ][0]
        assert test_counter_sample.value == 4.0

        test_driver_counter_sample = [
            m for m in metric_samples if "test_driver_counter" in m.name
        ][0]
        assert test_driver_counter_sample.value == 1.0

        test_histogram_samples = [
            m for m in metric_samples if "test_histogram" in m.name
        ]
        buckets = {
            m.labels["le"]: m.value
            for m in test_histogram_samples if "_bucket" in m.name
        }
        # We recorded value 1.5 for the histogram. In the Prometheus data
        # model the histogram is cumulative, so we expect the count to appear
        # in the le="1.6" and le="+Inf" buckets.
        assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        hist_count = [m for m in test_histogram_samples
                      if "_count" in m.name][0].value
        hist_sum = [m for m in test_histogram_samples
                    if "_sum" in m.name][0].value
        assert hist_count == 1
        assert hist_sum == 1.5

        # Autoscaler metrics.
        _, autoscaler_metric_names, _ = fetch_prometheus(
            [autoscaler_export_addr])
        for metric in _AUTOSCALER_METRICS:
            # The metric name should appear with some suffix (_count, _total,
            # etc.) in the list of all names.
            assert any(name.startswith(metric)
                       for name in autoscaler_metric_names), \
                f"{metric} not in {autoscaler_metric_names}"

    def wrap_test_case_for_retry():
        try:
            test_cases()
            return True
        except AssertionError:
            return False

    try:
        wait_for_condition(
            wrap_test_case_for_retry,
            timeout=TEST_TIMEOUT_S,
            retry_interval_ms=1000,  # Yield resource for other processes.
        )
    except RuntimeError:
        print(
            f"The components are {pformat(fetch_prometheus(prom_addresses))}")
        test_cases()  # Should fail assert
def test_two_custom_resources(ray_start_cluster): cluster = ray_start_cluster cluster.add_node( num_cpus=3, resources={ "CustomResource1": 1, "CustomResource2": 2 }) custom_resource_node = cluster.add_node( num_cpus=3, resources={ "CustomResource1": 3, "CustomResource2": 4 }) ray.init(address=cluster.address) @ray.remote def foo(): # Sleep a while to emulate a slow operation. This is needed to make # sure tasks are scheduled to different nodes. time.sleep(0.1) return ray.worker.global_worker.node.unique_id # Make sure each node has at least one idle worker. wait_for_condition( lambda: len(set(ray.get([foo.remote() for _ in range(6)]))) == 2) @ray.remote(resources={"CustomResource1": 1}) def f(): time.sleep(0.001) return ray.worker.global_worker.node.unique_id @ray.remote(resources={"CustomResource2": 1}) def g(): time.sleep(0.001) return ray.worker.global_worker.node.unique_id @ray.remote(resources={"CustomResource1": 1, "CustomResource2": 3}) def h(): time.sleep(0.001) return ray.worker.global_worker.node.unique_id @ray.remote(resources={"CustomResource1": 4}) def j(): time.sleep(0.001) return ray.worker.global_worker.node.unique_id @ray.remote(resources={"CustomResource3": 1}) def k(): time.sleep(0.001) return ray.worker.global_worker.node.unique_id # The f and g tasks should be scheduled on both raylets. assert len(set(ray.get([f.remote() for _ in range(500)]))) == 2 assert len(set(ray.get([g.remote() for _ in range(500)]))) == 2 # The h tasks should be scheduled only on the second raylet. raylet_ids = set(ray.get([h.remote() for _ in range(50)])) assert len(raylet_ids) == 1 assert list(raylet_ids)[0] == custom_resource_node.unique_id # Make sure that tasks with unsatisfied custom resource requirements do # not get scheduled. ready_ids, remaining_ids = ray.wait([j.remote(), k.remote()], timeout=0.5) assert ready_ids == []
def test_detached_placement_group(ray_start_cluster): cluster = ray_start_cluster for _ in range(2): cluster.add_node(num_cpus=3) cluster.wait_for_nodes() info = ray.init(address=cluster.address) # Make sure detached placement group will alive when job dead. driver_code = f""" import ray ray.init(address="{info["redis_address"]}") pg = ray.util.placement_group( [{{"CPU": 1}} for _ in range(2)], strategy="STRICT_SPREAD", lifetime="detached") ray.get(pg.ready()) @ray.remote(num_cpus=1) class Actor: def ready(self): return True for bundle_index in range(2): actor = Actor.options(lifetime="detached", placement_group=pg, placement_group_bundle_index=bundle_index).remote() ray.get(actor.ready.remote()) ray.shutdown() """ run_string_as_driver(driver_code) # Wait until the driver is reported as dead by GCS. def is_job_done(): jobs = ray.jobs() for job in jobs: if "StopTime" in job: return True return False def assert_alive_num_pg(expected_num_pg): alive_num_pg = 0 for _, placement_group_info in ray.util.placement_group_table().items( ): if placement_group_info["state"] == "CREATED": alive_num_pg += 1 return alive_num_pg == expected_num_pg def assert_alive_num_actor(expected_num_actor): alive_num_actor = 0 for actor_info in ray.actors().values(): if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE: alive_num_actor += 1 return alive_num_actor == expected_num_actor wait_for_condition(is_job_done) assert assert_alive_num_pg(1) assert assert_alive_num_actor(2) # Make sure detached placement group will alive when its creator which # is detached actor dead. # Test actors first. @ray.remote(num_cpus=1) class NestedActor: def ready(self): return True @ray.remote(num_cpus=1) class Actor: def __init__(self): self.actors = [] def ready(self): return True def schedule_nested_actor_with_detached_pg(self): # Create placement group which is detached. pg = ray.util.placement_group( [{ "CPU": 1 } for _ in range(2)], strategy="STRICT_SPREAD", lifetime="detached", name="detached_pg") ray.get(pg.ready()) # Schedule nested actor with the placement group. for bundle_index in range(2): actor = NestedActor.options( placement_group=pg, placement_group_bundle_index=bundle_index, lifetime="detached").remote() ray.get(actor.ready.remote()) self.actors.append(actor) a = Actor.options(lifetime="detached").remote() ray.get(a.ready.remote()) # 1 parent actor and 2 children actor. ray.get(a.schedule_nested_actor_with_detached_pg.remote()) # Kill an actor and wait until it is killed. ray.kill(a) with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) # We should have 2 alive pgs and 4 alive actors. assert assert_alive_num_pg(2) assert assert_alive_num_actor(4)
def test_named_placement_group(ray_start_cluster): cluster = ray_start_cluster for _ in range(2): cluster.add_node(num_cpus=3) cluster.wait_for_nodes() info = ray.init(address=cluster.address) global_placement_group_name = "named_placement_group" # Create a detached placement group with name. driver_code = f""" import ray ray.init(address="{info["redis_address"]}") pg = ray.util.placement_group( [{{"CPU": 1}} for _ in range(2)], strategy="STRICT_SPREAD", name="{global_placement_group_name}", lifetime="detached") ray.get(pg.ready()) ray.shutdown() """ run_string_as_driver(driver_code) # Wait until the driver is reported as dead by GCS. def is_job_done(): jobs = ray.jobs() for job in jobs: if "StopTime" in job: return True return False wait_for_condition(is_job_done) @ray.remote(num_cpus=1) class Actor: def ping(self): return "pong" # Get the named placement group and schedule a actor. placement_group = ray.util.get_placement_group(global_placement_group_name) assert placement_group is not None assert placement_group.wait(5) actor = Actor.options( placement_group=placement_group, placement_group_bundle_index=0).remote() ray.get(actor.ping.remote()) # Create another placement group and make sure its creation will failed. same_name_pg = ray.util.placement_group( [{ "CPU": 1 } for _ in range(2)], strategy="STRICT_SPREAD", name=global_placement_group_name) assert not same_name_pg.wait(10) # Remove a named placement group and make sure the second creation # will successful. ray.util.remove_placement_group(placement_group) same_name_pg = ray.util.placement_group( [{ "CPU": 1 } for _ in range(2)], strategy="STRICT_SPREAD", name=global_placement_group_name) assert same_name_pg.wait(10) # Get a named placement group with a name that doesn't exist # and make sure it will raise ValueError correctly. error_count = 0 try: ray.util.get_placement_group("inexistent_pg") except ValueError: error_count = error_count + 1 assert error_count == 1
def test_multiple_directories(tmp_path, shutdown_only): num_dirs = 3 temp_dirs = [] for i in range(num_dirs): temp_folder = tmp_path / f"spill_{i}" temp_folder.mkdir() temp_dirs.append(temp_folder) # Limit our object store to 75 MiB of memory. min_spilling_size = 0 object_spilling_config = json.dumps({ "type": "filesystem", "params": { "directory_path": [str(directory) for directory in temp_dirs] } }) address = ray.init(object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 5, "object_store_full_delay_ms": 100, "object_spilling_config": object_spilling_config, "min_spilling_size": min_spilling_size, }) arr = np.ones(74 * 1024 * 1024, dtype=np.uint8) # 74MB. object_refs = [] # Now the storage is full. object_refs.append(ray.put(arr)) num_object_spilled = 20 for _ in range(num_object_spilled): object_refs.append(ray.put(arr)) num_files = defaultdict(int) for temp_dir in temp_dirs: temp_folder = temp_dir / ray.ray_constants.DEFAULT_OBJECT_PREFIX for path in temp_folder.iterdir(): num_files[str(temp_folder)] += 1 for ref in object_refs: assert np.array_equal(ray.get(ref), arr) print("Check distribution...") min_count = 5 is_distributed = [n_files >= min_count for n_files in num_files.values()] assert all(is_distributed) print("Check deletion...") # Empty object refs. object_refs = [] # Add a new object so that the last entry is evicted. ref = ray.put(arr) for temp_dir in temp_dirs: temp_folder = temp_dir wait_for_condition(lambda: is_dir_empty(temp_folder)) assert_no_thrashing(address["redis_address"]) # Now kill ray and see all directories are deleted. print("Check directories are deleted...") ray.shutdown() for temp_dir in temp_dirs: wait_for_condition(lambda: is_dir_empty(temp_dir, append_path=""))
def test_delete_objects_multi_node(multi_node_object_spilling_config, ray_start_cluster): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = multi_node_object_spilling_config cluster = ray_start_cluster # Head node. cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 2, "min_spilling_size": 20 * 1024 * 1024, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, "object_spilling_config": object_spilling_config, }) ray.init(address=cluster.address) # Add 2 worker nodes. for _ in range(2): cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024) cluster.wait_for_nodes() arr = np.random.rand(1024 * 1024) # 8 MB data @ray.remote(num_cpus=1) class Actor: def __init__(self): self.replay_buffer = [] def ping(self): return def create_objects(self): for _ in range(80): ref = None while ref is None: ref = ray.put(arr) self.replay_buffer.append(ref) # Remove the replay buffer with 60% probability. if random.randint(0, 9) < 6: self.replay_buffer.pop() # Do random sampling. for _ in range(50): ref = random.choice(self.replay_buffer) sample = ray.get(ref, timeout=10) assert np.array_equal(sample, arr) actors = [Actor.remote() for _ in range(3)] ray.get([actor.create_objects.remote() for actor in actors]) def wait_until_actor_dead(actor): try: ray.get(actor.ping.remote()) except ray.exceptions.RayActorError: return True return False # Kill actors to remove all references. for actor in actors: ray.kill(actor) wait_for_condition(lambda: wait_until_actor_dead(actor)) # The multi node deletion should work. wait_for_condition(lambda: is_dir_empty(temp_folder)) assert_no_thrashing(cluster.address)
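# Note: `is_dir_empty` and `assert_no_thrashing` are helpers from the
# surrounding spilling test module and are not defined here. A minimal sketch
# of the directory check, assuming spilled object files land directly under
# the configured spill directory (the name `is_dir_empty_sketch` is
# hypothetical, and the real helper may also account for Ray's object-file
# prefix subdirectory):
def is_dir_empty_sketch(temp_folder):
    # The folder may not exist yet if nothing was spilled.
    if not temp_folder.exists():
        return True
    # Any remaining file means spilled objects were not deleted.
    return next(temp_folder.iterdir(), None) is None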
assert not leaked_processes() run_experiments({ "demo": { "run": "PG", "env": "subproc", "num_samples": 1, "config": { "num_workers": 1, "env_config": { "tmp_file1": tmp1, "tmp_file2": tmp2, "tmp_file3": tmp3, "tmp_file4": tmp4, }, "framework": "tf", }, "stop": { "training_iteration": 1 }, }, }) # Check whether processes are still running or Env has not cleaned up # the given tmp files. wait_for_condition(lambda: not leaked_processes(), timeout=30) wait_for_condition(lambda: not os.path.exists(tmp1), timeout=30) wait_for_condition(lambda: not os.path.exists(tmp2), timeout=30) wait_for_condition(lambda: not os.path.exists(tmp3), timeout=30) wait_for_condition(lambda: not os.path.exists(tmp4), timeout=30) print("OK")
def test_local_clusters():
    """
    This tests the various behaviors of connecting to local clusters:

    * Using `ray.client("local").connect()` should always create a new
      cluster.
    * Using `ray.client().connect()` should create a new cluster if it
      doesn't connect to an existing one.
    * Using `ray.client().connect()` should only connect to a cluster if it
      was created with `ray start --head`, not from a python program.

    It tests whether two calls are in the same cluster by trying to create an
    actor with the same name in the same namespace, which will error and
    cause the script to exit with a non-zero code, which raises an exception.
    """
    driver_template = """
import ray
info = ray.client({address}).namespace("").connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(name="abc", lifetime="detached").remote()
ray.get(a.ping.remote())

import time
while True:
    time.sleep(30)
"""
    blocking_local_script = driver_template.format(
        address="'local'", blocking=True)
    blocking_noaddr_script = driver_template.format(address="", blocking=True)

    # This should start a cluster.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client("local").connect() should start a second cluster.
    p2 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client("local").connect() so it should create a third one.
    p3 = run_string_as_driver_nonblocking(blocking_noaddr_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client().connect() so it should create a fourth one.
    p4 = run_string_as_driver_nonblocking(blocking_noaddr_script)

    wait_for_condition(
        lambda: len(ray._private.services.find_redis_address()) == 4,
        retry_interval_ms=1000)

    p1.kill()
    p2.kill()
    p3.kill()
    p4.kill()
    # Prevent flakiness since fatesharing takes some time.
    subprocess.check_output("ray stop --force", shell=True)

    # Since there's a cluster started with `ray start --head`
    # we should connect to it instead.
    subprocess.check_output("ray start --head", shell=True)
    # The assertion in the driver should cause the script to fail if we start
    # a new cluster instead of connecting.
    run_string_as_driver("""
import ray
ray.client().connect()
assert len(ray._private.services.find_redis_address()) == 1
""")
    # ray.client("local").connect() should always create a new cluster even
    # if there's one running.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    wait_for_condition(
        lambda: len(ray._private.services.find_redis_address()) == 2,
        retry_interval_ms=1000)
    p1.kill()
    subprocess.check_output("ray stop --force", shell=True)
def test_dynamic_res_concurrent_res_delete(ray_start_cluster): # This test makes sure resource gets deleted correctly when a task has # already acquired the resource cluster = ray_start_cluster res_name = "test_res" res_capacity = 5 num_nodes = 5 TIMEOUT_DURATION = 1 for i in range(num_nodes): cluster.add_node() ray.init(address=cluster.address) node_ids = [node["NodeID"] for node in ray.nodes()] target_node_id = node_ids[1] @ray.remote def set_res(resource_name, resource_capacity, res_node_id): ray.experimental.set_resource(resource_name, resource_capacity, node_id=res_node_id) @ray.remote def delete_res(resource_name, res_node_id): ray.experimental.set_resource(resource_name, 0, node_id=res_node_id) # Create the resource on node 1 ray.get(set_res.remote(res_name, res_capacity, target_node_id)) def check_resources(): return ray.cluster_resources().get(res_name, None) == res_capacity wait_for_condition(check_resources) # Task to hold the resource till the driver signals to finish @ray.remote def wait_func(running_signal, finish_signal): # Signal that the task is running. ray.get(running_signal.send.remote()) # Wait until signaled by driver. ray.get(finish_signal.wait.remote()) @ray.remote def test_func(): return 1 @ray.remote(num_cpus=0) class Signal: def __init__(self): self.ready_event = asyncio.Event() def send(self): self.ready_event.set() async def wait(self): await self.ready_event.wait() running_signal = Signal.remote() finish_signal = Signal.remote() # Launch the task with resource requirement of 4, thus the new available # capacity becomes 1 task = wait_func._remote(args=[running_signal, finish_signal], resources={res_name: 4}) # Wait until wait_func is launched before updating resource ray.get(running_signal.wait.remote()) # Delete the resource ray.get(delete_res.remote(res_name, target_node_id)) # Signal task to complete ray.get(finish_signal.send.remote()) ray.get(task) # Check if scheduler state is consistent by launching a task requiring # the deleted resource This should not execute task_2 = test_func._remote(args=[], resources={res_name: 1}) # This should be infeasible successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION) assert unsuccessful # The task did not complete because it's infeasible assert res_name not in ray.available_resources()
def test_global_gc_when_full(shutdown_only): cluster = ray.cluster_utils.Cluster() for _ in range(2): cluster.add_node(num_cpus=1, num_gpus=0, object_store_memory=100 * 1024 * 1024) ray.init(address=cluster.address) class LargeObjectWithCyclicRef: def __init__(self): self.loop = self self.large_object = ray.put( np.zeros(40 * 1024 * 1024, dtype=np.uint8)) @ray.remote(num_cpus=1) class GarbageHolder: def __init__(self): gc.disable() x = LargeObjectWithCyclicRef() self.garbage = weakref.ref(x) def has_garbage(self): return self.garbage() is not None def return_large_array(self): return np.zeros(80 * 1024 * 1024, dtype=np.uint8) try: gc.disable() # Local driver. local_ref = weakref.ref(LargeObjectWithCyclicRef()) # Remote workers. actors = [GarbageHolder.remote() for _ in range(2)] assert local_ref() is not None assert all(ray.get([a.has_garbage.remote() for a in actors])) # GC should be triggered for all workers, including the local driver, # when the driver tries to ray.put a value that doesn't fit in the # object store. This should cause the captured ObjectRefs' numpy arrays # to be evicted. ray.put(np.zeros(80 * 1024 * 1024, dtype=np.uint8)) def check_refs_gced(): return (local_ref() is None and not any(ray.get([a.has_garbage.remote() for a in actors]))) wait_for_condition(check_refs_gced) # Local driver. local_ref = weakref.ref(LargeObjectWithCyclicRef()) # Remote workers. actors = [GarbageHolder.remote() for _ in range(2)] assert all(ray.get([a.has_garbage.remote() for a in actors])) # GC should be triggered for all workers, including the local driver, # when a remote task tries to put a return value that doesn't fit in # the object store. This should cause the captured ObjectRefs' numpy # arrays to be evicted. ray.get(actors[0].return_large_array.remote()) def check_refs_gced(): return (local_ref() is None and not any(ray.get([a.has_garbage.remote() for a in actors]))) wait_for_condition(check_refs_gced) finally: gc.enable()
def test_multiple_routers(): cluster = Cluster() head_node = cluster.add_node() cluster.add_node() ray.init(head_node.address) node_ids = ray.state.node_ids() assert len(node_ids) == 2 serve.init(http_port=8005) def actor_name(index): return SERVE_PROXY_NAME + "-{}-{}".format(node_ids[0], index) # Two actors should be started. def get_first_two_actors(): try: ray.get_actor(actor_name(0)) ray.get_actor(actor_name(1)) return True except ValueError: return False wait_for_condition(get_first_two_actors) # Wait for the actors to come up. ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes")) # Kill one of the servers, the HTTP server should still function. ray.kill(ray.get_actor(actor_name(0)), no_restart=True) ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes")) # Add a new node to the cluster. This should trigger a new router to get # started. new_node = cluster.add_node() def get_third_actor(): try: ray.get_actor(actor_name(2)) return True except ValueError: return False wait_for_condition(get_third_actor) # Remove the newly-added node from the cluster. The corresponding actor # should be removed as well. cluster.remove_node(new_node) def third_actor_removed(): try: ray.get_actor(actor_name(2)) return False except ValueError: return True # Check that the actor is gone and the HTTP server still functions. wait_for_condition(third_actor_removed) ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes")) # Clean up the nodes (otherwise Ray will segfault). ray.shutdown() cluster.shutdown()
def test_multiple_routers(ray_cluster): cluster = ray_cluster head_node = cluster.add_node(num_cpus=4) cluster.add_node(num_cpus=4) ray.init(head_node.address) node_ids = ray.state.node_ids() assert len(node_ids) == 2 serve.start(http_options=dict(port=8005, location="EveryNode")) def get_proxy_names(): proxy_names = [] for node_id, _ in get_all_node_ids(): proxy_names.append( format_actor_name(SERVE_PROXY_NAME, serve.api._global_client._controller_name, node_id)) return proxy_names wait_for_condition(lambda: len(get_proxy_names()) == 2) proxy_names = get_proxy_names() # Two actors should be started. def get_first_two_actors(): try: ray.get_actor(proxy_names[0]) ray.get_actor(proxy_names[1]) return True except ValueError: return False wait_for_condition(get_first_two_actors) # Wait for the actors to come up. ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes")) # Kill one of the servers, the HTTP server should still function. ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True) ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes")) # Add a new node to the cluster. This should trigger a new router to get # started. new_node = cluster.add_node() wait_for_condition(lambda: len(get_proxy_names()) == 3) third_proxy = get_proxy_names()[2] def get_third_actor(): try: ray.get_actor(third_proxy) return True # IndexErrors covers when cluster resources aren't updated yet. except (IndexError, ValueError): return False wait_for_condition(get_third_actor) # Remove the newly-added node from the cluster. The corresponding actor # should be removed as well. cluster.remove_node(new_node) def third_actor_removed(): try: ray.get_actor(third_proxy) return False except ValueError: return True # Check that the actor is gone and the HTTP server still functions. wait_for_condition(third_actor_removed) ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
def test_metrics_export_end_to_end(_setup_cluster_for_test):
    TEST_TIMEOUT_S = 20

    prom_addresses = _setup_cluster_for_test

    def test_cases():
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        # Raylet should be on every node.
        assert all("raylet" in components
                   for components in components_dict.values())

        # GCS server should be on one node.
        assert any("gcs_server" in components
                   for components in components_dict.values())

        # Core worker should be on at least one node.
        assert any("core_worker" in components
                   for components in components_dict.values())

        # Make sure our user-defined metrics exist.
        for metric_name in ["test_counter", "test_histogram"]:
            assert any(metric_name in full_name for full_name in metric_names)

        # Make sure GCS server metrics are recorded.
        assert "ray_outbound_heartbeat_size_kb_sum" in metric_names

        # Make sure the numeric value is correct.
        test_counter_sample = [
            m for m in metric_samples if "test_counter" in m.name
        ][0]
        assert test_counter_sample.value == 1.0

        # Make sure the numeric value is correct.
        test_histogram_samples = [
            m for m in metric_samples if "test_histogram" in m.name
        ]
        buckets = {
            m.labels["le"]: m.value
            for m in test_histogram_samples if "_bucket" in m.name
        }
        # We recorded value 1.5 for the histogram. In the Prometheus data
        # model the histogram is cumulative, so we expect the count to appear
        # in the le="1.6" and le="+Inf" buckets.
        assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        hist_count = [m for m in test_histogram_samples
                      if "_count" in m.name][0].value
        hist_sum = [m for m in test_histogram_samples
                    if "_sum" in m.name][0].value
        assert hist_count == 1
        assert hist_sum == 1.5

    def wrap_test_case_for_retry():
        try:
            test_cases()
            return True
        except AssertionError:
            return False

    try:
        wait_for_condition(
            wrap_test_case_for_retry,
            timeout=TEST_TIMEOUT_S,
            retry_interval_ms=1000,  # Yield resource for other processes.
        )
    except RuntimeError:
        print(
            f"The components are {pformat(fetch_prometheus(prom_addresses))}")
        test_cases()  # Should fail assert
def test_delete_objects_multi_node(tmp_path, ray_start_cluster): # Limit our object store to 75 MiB of memory. temp_folder = tmp_path / "spill" temp_folder.mkdir() cluster = ray_start_cluster # Head node. cluster.add_node( num_cpus=1, object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 2, "automatic_object_spilling_enabled": True, "object_store_full_max_retries": 4, "object_store_full_initial_delay_ms": 100, "object_spilling_config": json.dumps({ "type": "filesystem", "params": { "directory_path": str(temp_folder) } }), }) # Add 2 worker nodes. for _ in range(2): cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024) ray.init(address=cluster.address) arr = np.random.rand(1024 * 1024) # 8 MB data @ray.remote(num_cpus=1) class Actor: def __init__(self): self.replay_buffer = [] def ping(self): return def create_objects(self): for _ in range(80): ref = None while ref is None: ref = ray.put(arr) self.replay_buffer.append(ref) # Remove the replay buffer with 60% probability. if random.randint(0, 9) < 6: self.replay_buffer.pop() # Do random sampling. for _ in range(200): ref = random.choice(self.replay_buffer) sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) actors = [Actor.remote() for _ in range(3)] ray.get([actor.create_objects.remote() for actor in actors]) def wait_until_actor_dead(actor): try: ray.get(actor.ping.remote()) except ray.exceptions.RayActorError: return True return False def is_dir_empty(): num_files = 0 for path in temp_folder.iterdir(): num_files += 1 return num_files == 0 # Kill actors to remove all references. for actor in actors: ray.kill(actor) wait_for_condition(lambda: wait_until_actor_dead(actor)) # The multi node deletion should work. wait_for_condition(is_dir_empty)
def test_metrics_export_end_to_end(_setup_cluster_for_test):
    TEST_TIMEOUT_S = 20

    prom_addresses = _setup_cluster_for_test

    # Make sure we can ping Prometheus endpoints.
    def fetch_prometheus(prom_addresses):
        components_dict = {}
        metric_names = set()
        metric_samples = []
        for address in prom_addresses:
            if address not in components_dict:
                components_dict[address] = set()
            try:
                response = requests.get(f"http://{address}/metrics")
            except requests.exceptions.ConnectionError:
                continue

            for line in response.text.split("\n"):
                for family in text_string_to_metric_families(line):
                    for sample in family.samples:
                        metric_names.add(sample.name)
                        metric_samples.append(sample)
                        if "Component" in sample.labels:
                            components_dict[address].add(
                                sample.labels["Component"])
        return components_dict, metric_names, metric_samples

    def test_cases():
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        # Raylet should be on every node.
        assert all("raylet" in components
                   for components in components_dict.values())

        # GCS server should be on one node.
        assert any("gcs_server" in components
                   for components in components_dict.values())

        # Core worker should be on at least one node.
        assert any("core_worker" in components
                   for components in components_dict.values())

        # Make sure our user-defined metrics exist.
        for metric_name in ["test_counter", "test_histogram"]:
            assert any(metric_name in full_name for full_name in metric_names)

        # Make sure the numeric value is correct.
        test_counter_sample = [
            m for m in metric_samples if "test_counter" in m.name
        ][0]
        assert test_counter_sample.value == 1.0

        # Make sure the numeric value is correct.
        test_histogram_samples = [
            m for m in metric_samples if "test_histogram" in m.name
        ]
        buckets = {
            m.labels["le"]: m.value
            for m in test_histogram_samples if "_bucket" in m.name
        }
        # We recorded value 1.5 for the histogram. In the Prometheus data
        # model the histogram is cumulative, so we expect the count to appear
        # in the le="1.6" and le="+Inf" buckets.
        assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        hist_count = [m for m in test_histogram_samples
                      if "_count" in m.name][0].value
        hist_sum = [m for m in test_histogram_samples
                    if "_sum" in m.name][0].value
        assert hist_count == 1
        assert hist_sum == 1.5

    def wrap_test_case_for_retry():
        try:
            test_cases()
            return True
        except AssertionError:
            return False

    try:
        wait_for_condition(
            wrap_test_case_for_retry,
            timeout=TEST_TIMEOUT_S,
            retry_interval_ms=1000,  # Yield resource for other processes.
        )
    except RuntimeError:
        print(
            f"The components are {pformat(fetch_prometheus(prom_addresses))}")
        test_cases()  # Should fail assert
def test_automatic_cleanup_detached_actors(ray_start_cluster): # Make sure the placement groups created by a # detached actors are cleaned properly. cluster = ray_start_cluster num_nodes = 3 num_cpu_per_node = 2 # Create 3 nodes cluster. for _ in range(num_nodes): cluster.add_node(num_cpus=num_cpu_per_node) info = ray.init(address=cluster.address) available_cpus = ray.available_resources()["CPU"] assert available_cpus == num_nodes * num_cpu_per_node driver_code = f""" import ray ray.init(address="{info["redis_address"]}") def create_pg(): pg = ray.util.placement_group( [{{"CPU": 1}} for _ in range(3)], strategy="STRICT_SPREAD") ray.get(pg.ready()) return pg # TODO(sang): Placement groups created by tasks launched by detached actor # is not cleaned with the current protocol. # @ray.remote(num_cpus=0) # def f(): # create_pg() @ray.remote(num_cpus=0, max_restarts=1) class A: def create_pg(self): create_pg() def create_child_pg(self): self.a = A.options(name="B").remote() ray.get(self.a.create_pg.remote()) def kill_child_actor(self): ray.kill(self.a) try: ray.get(self.a.create_pg.remote()) except Exception: pass a = A.options(lifetime="detached", name="A").remote() ray.get(a.create_pg.remote()) # TODO(sang): Currently, child tasks are cleaned when a detached actor # is dead. We cannot test this scenario until it is fixed. # ray.get(a.create_child_pg.remote()) ray.shutdown() """ run_string_as_driver(driver_code) # Wait until the driver is reported as dead by GCS. def is_job_done(): jobs = ray.jobs() for job in jobs: if "StopTime" in job: return True return False def assert_num_cpus(expected_num_cpus): if expected_num_cpus == 0: return "CPU" not in ray.available_resources() return ray.available_resources()["CPU"] == expected_num_cpus wait_for_condition(is_job_done) assert assert_num_cpus(num_nodes) # Make sure when a child actor spawned by a detached actor # is killed, the placement group is removed. a = ray.get_actor("A") # TODO(sang): child of detached actors # seem to be killed when jobs are done. We should fix this before # testing this scenario. # ray.get(a.kill_child_actor.remote()) # assert assert_num_cpus(num_nodes) # Make sure placement groups are cleaned when detached actors are killed. ray.kill(a, no_restart=False) wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node)) # The detached actor a should've been restarted. # Recreate a placement group. ray.get(a.create_pg.remote()) wait_for_condition(lambda: assert_num_cpus(num_nodes)) # Kill it again and make sure the placement group # that is created is deleted again. ray.kill(a, no_restart=False) wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
def test_actor_restart(ray_init_with_task_retry_delay):
    """Test actor restart when actor process is killed."""

    @ray.remote(max_restarts=1)
    class RestartableActor:
        """An actor that will be restarted at most once."""

        def __init__(self):
            self.value = 0

        def increase(self, exit=False):
            if exit:
                os._exit(-1)
            self.value += 1
            return self.value

        def get_pid(self):
            return os.getpid()

    actor = RestartableActor.remote()
    # Submit some tasks and kill the actor on a task midway through.
    results = [actor.increase.remote(exit=(i == 100)) for i in range(200)]

    # Make sure that all tasks were executed in order before the actor's
    # death.
    i = 1
    while results:
        res = results[0]
        try:
            r = ray.get(res)
            if r != i:
                # Actor restarted at this task without any failed tasks in
                # between.
                break
            results.pop(0)
            i += 1
        except ray.exceptions.RayActorError:
            break

    # Skip any tasks that errored.
    while results:
        try:
            ray.get(results[0])
            # The task succeeded, so the remaining results come from after
            # the restart.
            break
        except ray.exceptions.RayActorError:
            results.pop(0)

    # Check all tasks that executed after the restart.
    if results:
        # The actor executed some tasks after the restart.
        i = 1
        while results:
            r = ray.get(results.pop(0))
            assert r == i
            i += 1

        # Check that we can still call the actor.
        result = actor.increase.remote()
        assert ray.get(result) == r + 1
    else:
        # Wait for the actor to restart.
        def ping():
            try:
                ray.get(actor.increase.remote())
                return True
            except ray.exceptions.RayActorError:
                return False

        wait_for_condition(ping)

    # The actor has restarted. Kill the actor process one more time.
    actor.increase.remote(exit=True)
    # The actor has exceeded max restarts. All tasks should fail.
    for _ in range(100):
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.increase.remote())

    # Create another actor.
    actor = RestartableActor.remote()
    # Intentionally exit the actor.
    actor.__ray_terminate__.remote()
    # Check that the actor won't be restarted.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.increase.remote())
def test_remove_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    # First try to remove a placement group that doesn't
    # exist. This should not do anything.
    random_group_id = PlacementGroupID.from_random()
    random_placement_group = PlacementGroup(random_group_id, [{"CPU": 1}])
    for _ in range(3):
        ray.util.remove_placement_group(random_placement_group)

    # Removing a placement group as soon as it is
    # created should work.
    placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
    ray.util.remove_placement_group(placement_group)

    def is_placement_group_removed():
        table = ray.util.placement_group_table(placement_group)
        if "state" not in table:
            return False
        return table["state"] == "REMOVED"

    wait_for_condition(is_placement_group_removed)

    # Now let's create a placement group.
    placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])

    # Create an actor that occupies resources.
    @ray.remote(num_cpus=2)
    class A:
        def f(self):
            return 3

    # Currently, there's no way to prevent tasks from being retried for a
    # removed placement group. Set max_retries=0 for testing.
    # TODO(sang): Handle this edge case.
    @ray.remote(num_cpus=2, max_retries=0)
    def long_running_task():
        print(os.getpid())
        import time
        time.sleep(50)

    # Schedule a long running task and actor.
    task_ref = long_running_task.options(
        placement_group=placement_group).remote()
    a = A.options(placement_group=placement_group).remote()
    assert ray.get(a.f.remote()) == 3

    ray.util.remove_placement_group(placement_group)
    # Subsequent remove requests shouldn't do anything.
    for _ in range(3):
        ray.util.remove_placement_group(placement_group)

    # Make sure placement group resources are
    # released and we can schedule this task.
    @ray.remote(num_cpus=4)
    def f():
        return 3

    assert ray.get(f.remote()) == 3
    # Since the placement group is removed,
    # the actor should've been killed.
    # That means this request should fail.
    with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
        ray.get(a.f.remote(), timeout=3.0)
    with pytest.raises(ray.exceptions.WorkerCrashedError):
        ray.get(task_ref)
def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster): """Test actor owner node dies before local dependencies are resolved. This test verifies the scenario where owner node has failed before actor dependencies are resolved. Reference: https://github.com/ray-project/ray/pull/8045 """ @ray.remote class Actor: def __init__(self, dependency): print("actor: {}".format(os.getpid())) self.dependency = dependency def f(self): return self.dependency # Make sure it is scheduled in the second node. @ray.remote(resources={"node": 1}, num_cpus=1) class Owner: def get_pid(self): return os.getpid() def create_actor(self, caller_handle): s = SignalActor.remote() # Create an actor which depends on an object that can never be # resolved. actor_handle = Actor.remote(s.wait.remote()) pid = os.getpid() signal_handle = SignalActor.remote() caller_handle.call.remote(pid, signal_handle, actor_handle) # Wait until the `Caller` start executing the remote `call` method. ray.get(signal_handle.wait.remote()) @ray.remote class Caller: def call(self, owner_pid, signal_handle, actor_handle): # Notify the `Owner` that the `Caller` is executing the remote # `call` method. ray.get(signal_handle.send.remote()) # Wait for the `Owner` to exit. wait_for_pid_to_exit(owner_pid) oid = actor_handle.f.remote() # It will hang without location resolution protocol. ray.get(oid) def hang(self): return True cluster = ray_start_cluster node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1}) owner = Owner.remote() owner_pid = ray.get(owner.get_pid.remote()) caller = Caller.remote() owner.create_actor.remote(caller) cluster.remove_node(node_to_be_broken) # Wait for the `Owner` to exit. wait_for_pid_to_exit(owner_pid) # It will hang here if location is not properly resolved. assert (wait_for_condition(lambda: ray.get(caller.hang.remote())))
def test_submit_job_validation(ray_start_with_dashboard): assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True) webui_url = ray_start_with_dashboard["webui_url"] webui_url = format_web_url(webui_url) job_root_dir = os.path.join( os.path.dirname(ray_start_with_dashboard["session_dir"]), "job") shutil.rmtree(job_root_dir, ignore_errors=True) def _ensure_available_nodes(): resp = requests.post(f"{webui_url}/jobs") resp.raise_for_status() result = resp.json() assert result["result"] is False return "no nodes available" not in result["msg"] wait_for_condition(_ensure_available_nodes, timeout=5) # Invalid value. resp = requests.post(f"{webui_url}/jobs", json={ "language": "Unsupported", "runtime_env": { "working_dir": "http://xxx/yyy.zip" }, "driver_entry": "python_file_name_without_ext", }) resp.raise_for_status() result = resp.json() assert result["result"] is False msg = result["msg"] assert "language" in msg and "value is not a valid" in msg, resp.text # Missing required field. resp = requests.post(f"{webui_url}/jobs", json={ "language": job_consts.PYTHON, "runtime_env": { "working_dir": "http://xxx/yyy.zip" }, }) resp.raise_for_status() result = resp.json() assert result["result"] is False msg = result["msg"] assert all(p in msg for p in ["missing", "driver_entry"]), resp.text # Incorrect value type. resp = requests.post(f"{webui_url}/jobs", json={ "language": job_consts.PYTHON, "runtime_env": { "working_dir": ["http://xxx/yyy.zip"] }, "driver_entry": "python_file_name_without_ext", }) resp.raise_for_status() result = resp.json() assert result["result"] is False msg = result["msg"] assert all(p in msg for p in ["working_dir", "str"]), resp.text # Invalid key. resp = requests.post(f"{webui_url}/jobs", json={ "language": job_consts.PYTHON, "runtime_env": { "working_dir": "http://xxx/yyy.zip" }, "driver_entry": "python_file_name_without_ext", "invalid_key": 1, }) resp.raise_for_status() result = resp.json() assert result["result"] is False msg = result["msg"] assert all(p in msg for p in ["not permitted", "invalid_key"]), resp.text
def test_initial_workers(shutdown_only):
    # `num_cpus` should be <= 2 because a Travis CI machine only has 2 CPU
    # cores.
    ray.init(num_cpus=1, include_dashboard=True)
    wait_for_condition(lambda: len(get_workers()) == 1)
def decorated_func(quantity):
    wait_for_condition(
        lambda: ray.available_resources()[resource_name] < quantity)
    return True
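# Note: `wait_for_condition` is Ray's test utility for polling a predicate and
# is used throughout these tests without being defined here. A minimal sketch,
# assuming it simply polls until the predicate returns a truthy value or the
# timeout expires (the name `wait_for_condition_sketch` is hypothetical; older
# variants returned a bool instead of raising, which is what the
# `assert wait_for_condition(...)` call sites above expect):
import time


def wait_for_condition_sketch(condition_predictor,
                              timeout=10,
                              retry_interval_ms=100):
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return True
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")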
def test_memory_dashboard(shutdown_only): """Test Memory table. These tests verify examples in this document. https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory """ addresses = ray.init(num_cpus=2) webui_url = addresses["webui_url"].replace("127.0.0.1", "http://127.0.0.1") assert (wait_until_server_available(addresses["webui_url"]) is True) def get_memory_table(): memory_table = requests.get(webui_url + "/api/memory_table").json() return memory_table["result"] def memory_table_ready(): """Wait until the new fresh memory table is ready.""" global prev_memory_table memory_table = get_memory_table() is_ready = memory_table["group"] != prev_memory_table prev_memory_table = memory_table["group"] return is_ready def stop_memory_table(): requests.get(webui_url + "/api/stop_memory_table").json() def test_local_reference(): @ray.remote def f(arg): return arg # a and b are local references. a = ray.put(None) # Noqa F841 b = f.remote(None) # Noqa F841 wait_for_condition(memory_table_ready) memory_table = get_memory_table() summary = memory_table["summary"] group = memory_table["group"] assert summary["total_captured_in_objects"] == 0 assert summary["total_pinned_in_memory"] == 0 assert summary["total_used_by_pending_task"] == 0 assert summary["total_local_ref_count"] == 2 for table in group.values(): for entry in table["entries"]: assert ( entry["reference_type"] == ReferenceType.LOCAL_REFERENCE) stop_memory_table() return True def test_object_pinned_in_memory(): a = ray.put(np.zeros(200 * 1024, dtype=np.uint8)) b = ray.get(a) # Noqa F841 del a wait_for_condition(memory_table_ready) memory_table = get_memory_table() summary = memory_table["summary"] group = memory_table["group"] assert summary["total_captured_in_objects"] == 0 assert summary["total_pinned_in_memory"] == 1 assert summary["total_used_by_pending_task"] == 0 assert summary["total_local_ref_count"] == 0 for table in group.values(): for entry in table["entries"]: assert ( entry["reference_type"] == ReferenceType.PINNED_IN_MEMORY) stop_memory_table() return True def test_pending_task_references(): @ray.remote def f(arg): time.sleep(1) a = ray.put(np.zeros(200 * 1024, dtype=np.uint8)) b = f.remote(a) wait_for_condition(memory_table_ready) memory_table = get_memory_table() summary = memory_table["summary"] assert summary["total_captured_in_objects"] == 0 assert summary["total_pinned_in_memory"] == 1 assert summary["total_used_by_pending_task"] == 1 assert summary["total_local_ref_count"] == 1 # Make sure the function f is done before going to the next test. # Otherwise, the memory table will be corrupted because the # task f won't be done when the next test is running. ray.get(b) stop_memory_table() return True def test_serialized_object_ref_reference(): @ray.remote def f(arg): time.sleep(1) a = ray.put(None) b = f.remote([a]) # Noqa F841 wait_for_condition(memory_table_ready) memory_table = get_memory_table() summary = memory_table["summary"] assert summary["total_captured_in_objects"] == 0 assert summary["total_pinned_in_memory"] == 0 assert summary["total_used_by_pending_task"] == 1 assert summary["total_local_ref_count"] == 2 # Make sure the function f is done before going to the next test. # Otherwise, the memory table will be corrupted because the # task f won't be done when the next test is running. 
ray.get(b) stop_memory_table() return True def test_captured_object_ref_reference(): a = ray.put(None) b = ray.put([a]) # Noqa F841 del a wait_for_condition(memory_table_ready) memory_table = get_memory_table() summary = memory_table["summary"] assert summary["total_captured_in_objects"] == 1 assert summary["total_pinned_in_memory"] == 0 assert summary["total_used_by_pending_task"] == 0 assert summary["total_local_ref_count"] == 1 stop_memory_table() return True def test_actor_handle_reference(): @ray.remote class Actor: pass a = Actor.remote() # Noqa F841 b = Actor.remote() # Noqa F841 c = Actor.remote() # Noqa F841 wait_for_condition(memory_table_ready) memory_table = get_memory_table() summary = memory_table["summary"] group = memory_table["group"] assert summary["total_captured_in_objects"] == 0 assert summary["total_pinned_in_memory"] == 0 assert summary["total_used_by_pending_task"] == 0 assert summary["total_local_ref_count"] == 0 assert summary["total_actor_handles"] == 3 for table in group.values(): for entry in table["entries"]: assert (entry["reference_type"] == ReferenceType.ACTOR_HANDLE) stop_memory_table() return True # These tests should be retried because it takes at least one second # to get the fresh new memory table. It is because memory table is updated # Whenever raylet and node info is renewed which takes 1 second. wait_for_condition(test_local_reference, timeout=30000, retry_interval_ms=1000) wait_for_condition(test_object_pinned_in_memory, timeout=30000, retry_interval_ms=1000) wait_for_condition(test_pending_task_references, timeout=30000, retry_interval_ms=1000) wait_for_condition(test_serialized_object_ref_reference, timeout=30000, retry_interval_ms=1000) wait_for_condition(test_captured_object_ref_reference, timeout=30000, retry_interval_ms=1000) wait_for_condition(test_actor_handle_reference, timeout=30000, retry_interval_ms=1000)