def test_cluster():
    """Basic test for adding and removing nodes in cluster."""
    cluster = Cluster(initialize_head=False)
    first = cluster.add_node()
    second = cluster.add_node()
    # Right after startup every process belonging to each node should be up.
    assert first.remaining_processes_alive()
    assert second.remaining_processes_alive()
    # Tear the nodes down in reverse order of creation.
    cluster.remove_node(second)
    cluster.remove_node(first)
    # Once removed, neither node may have any surviving process.
    assert not any(n.any_processes_alive() for n in (first, second))
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    """Check recovery of a `ray.put` object after its node dies.

    With reconstruction enabled the task output `result` should be
    recoverable via lineage; with it disabled, fetching it raises
    ``UnreconstructableError``.

    Fix: removed a dead `large_object` remote function that was defined
    but never called anywhere in this test.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        # Lineage must be pinned for reconstruction to be possible.
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote
    def dependent_task(x):
        return x

    obj = ray.put(np.zeros(10**7, dtype=np.uint8))
    result = dependent_task.options(resources={"node1": 1}).remote(obj)
    ray.get(result)
    del obj

    # Kill the node holding the object's primary copy, then replace it
    # with an equivalent node.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)

    # Churn the object store so any lingering cached copies are evicted.
    for _ in range(20):
        ray.put(np.zeros(10**7, dtype=np.uint8))

    if reconstruction_enabled:
        ray.get(result)
    else:
        with pytest.raises(ray.exceptions.UnreconstructableError):
            ray.get(result)
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
    """Recover an object sitting at the end of a 20-deep task chain."""
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0, _internal_config=config, object_store_memory=10**8)
    node_to_kill = cluster.add_node(
        num_cpus=1, object_store_memory=10**8, _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return x

    # Build a 20-link chain of pass-through tasks rooted at large_object.
    obj = large_object.remote()
    for _ in range(20):
        obj = chain.remote(obj)
    ray.get(dependent_task.remote(obj))

    # Kill the worker node and bring up a replacement.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, object_store_memory=10**8, _internal_config=config)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as exc_info:
            ray.get(dependent_task.remote(obj))
        # NOTE(review): `as_instanceof_cause` is invoked on the pytest
        # ExceptionInfo object; verify whether `.value` is needed here.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise exc_info.as_instanceof_cause()
def test_cached_object(ray_start_cluster):
    """An object cached on a second node must stay usable after the node
    that produced it is killed and replaced."""
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        # Only forces the dependency transfer; the return value is unused.
        return

    # Produce the object on node1 and cache a copy on node2.
    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    # Kill the producing node, then start a replacement with the same label.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    # Wait until the dead node is reflected in the cluster state.
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    # Apply memory pressure on node2 to evict cached copies.
    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
def test_connect_with_disconnected_node(shutdown_only):
    """REMOVED_NODE_ERROR is raised for SIGKILLed nodes but not for
    gracefully removed ones.

    NOTE(review): a later variant of this test with the same name exists
    in this file; in one module the second definition shadows this one.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    # No removed-node errors should exist right after startup.
    info = relevant_errors(ray_constants.REMOVED_NODE_ERROR)
    assert len(info) == 0
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(dead_node, allow_graceful=False)
    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 1)
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(dead_node, allow_graceful=False)
    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2)
    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(removing_node, allow_graceful=True)
    # A third REMOVED_NODE_ERROR must never show up.
    with pytest.raises(RayTestTimeoutException):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)
    # There is no connection error to a dead node.
    info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)
    assert len(info) == 0
def test_connect_with_disconnected_node(shutdown_only):
    """Error-pubsub variant: SIGKILLed nodes publish REMOVED_NODE_ERROR,
    gracefully removed nodes publish nothing."""
    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_timeout_milliseconds": 10,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    error_pubsub = init_error_pubsub()
    # The cluster should be error-free right after startup.
    errors = get_error_message(error_pubsub, 1, timeout=5)
    assert len(errors) == 0
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(error_pubsub, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(error_pubsub, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
    errors = get_error_message(error_pubsub, 1, timeout=2)
    assert len(errors) == 0
    # There is no connection error to a dead node.
    errors = get_error_message(error_pubsub, 1, timeout=2)
    assert len(errors) == 0
    error_pubsub.close()
def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    """Spilled files from a dead raylet are cleaned up when a new raylet
    starts on the same spill directory."""
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    cluster.add_node(
        num_cpus=0,
        object_store_memory=75 * 1024 * 1024,
        _system_config={"object_spilling_config": object_spilling_config},
    )
    ray.init(address=cluster.address)
    node2 = cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # This task will run on node 2 because node 1 has no CPU resource
    @ray.remote(num_cpus=1)
    def run_workload():
        # Two 40 MB arrays overflow the 75 MB store and force spilling.
        return [ray.put(np.random.rand(5 * 1024 * 1024)) for _ in range(2)]

    ids = ray.get(run_workload.remote())
    assert not is_dir_empty(temp_folder)

    # Kill node 2; its spilled files should remain on disk.
    cluster.remove_node(node2)
    assert not is_dir_empty(temp_folder)

    # A freshly started node must clean up the stale spill directory.
    cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
    assert is_dir_empty(temp_folder)

    # `ids` was held until here to keep the refs alive; release them now.
    del ids
    ray.shutdown()
    cluster.shutdown()
def test_multiple_routers():
    """One HTTP proxy actor per node: survives proxy death, follows node
    addition and removal."""
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()
    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    client = serve.start(http_port=8005)  # noqa: F841

    def get_proxy_names():
        # One formatted proxy actor name per live node.
        return [
            format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                              node_id) for node_id, _ in get_all_node_ids()
        ]

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()
    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexErrors covers when cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
iteration = 0 previous_ids = [1 for _ in range(100)] start_time = time.time() previous_time = start_time while True: for _ in range(100): previous_ids = [f.remote(previous_id) for previous_id in previous_ids] ray.get(previous_ids) for _ in range(100): previous_ids = [f.remote(previous_id) for previous_id in previous_ids] node_to_kill = get_other_nodes(cluster, exclude_head=True)[0] # Remove the first non-head node. cluster.remove_node(node_to_kill) cluster.add_node() new_time = time.time() print("Iteration {}:\n" " - Iteration time: {}.\n" " - Absolute time: {}.\n" " - Total elapsed time: {}.".format(iteration, new_time - previous_time, new_time, new_time - start_time)) update_progress({ "iteration": iteration, "iteration_time": new_time - previous_time, "absolute_time": new_time, "elapsed_time": new_time - start_time,
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    """A task whose dependency was only cached on the dead node should be
    recoverable iff lineage reconstruction is enabled."""
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()

    # The root object itself is never retried; only the chain output is.
    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        # Only forces the dependency transfer; the return value is unused.
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    # Kill node1 (holder of the chained copy) and start a replacement.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    # Wait until the dead node is reflected in the cluster state.
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    # Apply memory pressure on node2 to evict cached copies.
    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as exc_info:
            ray.get(dependent_task.remote(obj))
        # NOTE(review): `as_instanceof_cause` is invoked on the pytest
        # ExceptionInfo object; verify whether `.value` is needed here.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise exc_info.as_instanceof_cause()
def test_multiple_routers():
    """Index-named-proxy variant: one proxy actor per node, tracked via
    `SERVE_PROXY_NAME`-based names.

    NOTE(review): another variant of this test with the same name exists
    in this file; in one module the second definition shadows the first.
    """
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()
    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.init(http_port=8005)

    def actor_name(index):
        # Proxy actors are named after the head node id plus an index.
        return SERVE_PROXY_NAME + "-{}-{}".format(node_ids[0], index)

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(actor_name(0))
            ray.get_actor(actor_name(1))
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(actor_name(0)), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    def get_third_actor():
        try:
            ray.get_actor(actor_name(2))
            return True
        except ValueError:
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(actor_name(2))
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()