def test_failed_actor_init(ray_start_regular): error_message1 = "actor constructor failed" error_message2 = "actor method failed" @ray.remote class FailedActor(object): def __init__(self): raise Exception(error_message1) def fail_method(self): raise Exception(error_message2) a = FailedActor.remote() # Make sure that we get errors from a failed constructor. wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) assert len(errors) == 1 assert error_message1 in errors[0]["message"] # Make sure that we get errors from a failed method. a.fail_method.remote() wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) assert len(errors) == 2 assert error_message1 in errors[1]["message"]
def test_connect_with_disconnected_node(shutdown_only): config = json.dumps({ "num_heartbeats_timeout": 50, "heartbeat_timeout_milliseconds": 10, }) cluster = Cluster() cluster.add_node(num_cpus=0, _internal_config=config) ray.init(redis_address=cluster.redis_address) info = relevant_errors(ray_constants.REMOVED_NODE_ERROR) assert len(info) == 0 # This node is killed by SIGKILL, ray_monitor will mark it to dead. dead_node = cluster.add_node(num_cpus=0, _internal_config=config) cluster.remove_node(dead_node, allow_graceful=False) wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 1, timeout=2) # This node is killed by SIGKILL, ray_monitor will mark it to dead. dead_node = cluster.add_node(num_cpus=0, _internal_config=config) cluster.remove_node(dead_node, allow_graceful=False) wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2, timeout=2) # This node is killed by SIGTERM, ray_monitor will not mark it again. removing_node = cluster.add_node(num_cpus=0, _internal_config=config) cluster.remove_node(removing_node, allow_graceful=True) with pytest.raises(Exception, match=("Timing out of wait.")): wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2) # There is no connection error to a dead node. info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR) assert len(info) == 0
def test_connect_with_disconnected_node(shutdown_only): config = json.dumps({ "num_heartbeats_timeout": 50, "heartbeat_timeout_milliseconds": 10, }) cluster = Cluster() cluster.add_node(num_cpus=0, _internal_config=config) ray.init(redis_address=cluster.redis_address) info = relevant_errors(ray_constants.REMOVED_NODE_ERROR) assert len(info) == 0 # This node is killed by SIGKILL, ray_monitor will mark it to dead. dead_node = cluster.add_node(num_cpus=0, _internal_config=config) cluster.remove_node(dead_node, allow_graceful=False) wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 1, timeout=2) # This node is killed by SIGKILL, ray_monitor will mark it to dead. dead_node = cluster.add_node(num_cpus=0, _internal_config=config) cluster.remove_node(dead_node, allow_graceful=False) wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2, timeout=2) # This node is killed by SIGTERM, ray_monitor will not mark it again. removing_node = cluster.add_node(num_cpus=0, _internal_config=config) cluster.remove_node(removing_node, allow_graceful=True) with pytest.raises(Exception, match=('Timing out of wait.')): wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2) # There is no connection error to a dead node. info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR) assert len(info) == 0
def test_fail_importing_actor(ray_start_regular): # Create the contents of a temporary Python file. temporary_python_file = """ def temporary_helper_function(): return 1 """ f = tempfile.NamedTemporaryFile(suffix=".py") f.write(temporary_python_file.encode("ascii")) f.flush() directory = os.path.dirname(f.name) # Get the module name and strip ".py" from the end. module_name = os.path.basename(f.name)[:-3] sys.path.append(directory) module = __import__(module_name) # Define an actor that closes over this temporary module. This should # fail when it is unpickled. @ray.remote class Foo(object): def __init__(self): self.x = module.temporary_python_file() def get_val(self): return 1 # There should be no errors yet. assert len(ray.errors()) == 0 # Create an actor. foo = Foo.remote() # Wait for the error to arrive. wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1) errors = relevant_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR) assert "No module named" in errors[0]["message"] # Wait for the error from when the __init__ tries to run. wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) assert ("failed to be imported, and so cannot execute this method" in errors[0]["message"]) # Check that if we try to get the function it throws an exception and # does not hang. with pytest.raises(Exception): ray.get(foo.get_val.remote()) # Wait for the error from when the call to get_val. wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) assert ("failed to be imported, and so cannot execute this method" in errors[1]["message"]) f.close() # Clean up the junk we added to sys.path. sys.path.pop(-1)
def test_fail_importing_actor(ray_start_regular): # Create the contents of a temporary Python file. temporary_python_file = """ def temporary_helper_function(): return 1 """ f = tempfile.NamedTemporaryFile(suffix=".py") f.write(temporary_python_file.encode("ascii")) f.flush() directory = os.path.dirname(f.name) # Get the module name and strip ".py" from the end. module_name = os.path.basename(f.name)[:-3] sys.path.append(directory) module = __import__(module_name) # Define an actor that closes over this temporary module. This should # fail when it is unpickled. @ray.remote class Foo(object): def __init__(self): self.x = module.temporary_python_file() def get_val(self): return 1 # There should be no errors yet. assert len(ray.error_info()) == 0 # Create an actor. foo = Foo.remote() # Wait for the error to arrive. wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1) errors = relevant_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR) assert "No module named" in errors[0]["message"] # Wait for the error from when the __init__ tries to run. wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) assert ("failed to be imported, and so cannot execute this method" in errors[0]["message"]) # Check that if we try to get the function it throws an exception and # does not hang. with pytest.raises(Exception): ray.get(foo.get_val.remote()) # Wait for the error from when the call to get_val. wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) assert ("failed to be imported, and so cannot execute this method" in errors[1]["message"]) f.close() # Clean up the junk we added to sys.path. sys.path.pop(-1)
def test_failed_task(ray_start_regular): @ray.remote def throw_exception_fct1(): raise Exception("Test function 1 intentionally failed.") @ray.remote def throw_exception_fct2(): raise Exception("Test function 2 intentionally failed.") @ray.remote(num_return_vals=3) def throw_exception_fct3(x): raise Exception("Test function 3 intentionally failed.") throw_exception_fct1.remote() throw_exception_fct1.remote() wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2 for task in relevant_errors(ray_constants.TASK_PUSH_ERROR): msg = task.get("message") assert "Test function 1 intentionally failed." in msg x = throw_exception_fct2.remote() try: ray.get(x) except Exception as e: assert "Test function 2 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False x, y, z = throw_exception_fct3.remote(1.0) for ref in [x, y, z]: try: ray.get(ref) except Exception as e: assert "Test function 3 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False class CustomException(ValueError): pass @ray.remote def f(): raise CustomException("This function failed.") try: ray.get(f.remote()) except Exception as e: assert "This function failed." in str(e) assert isinstance(e, CustomException) assert isinstance(e, ray.exceptions.RayTaskError) assert "RayTaskError(CustomException)" in repr(e) else: # ray.get should throw an exception. assert False
def test_failed_task(ray_start_regular): @ray.remote def throw_exception_fct1(): raise Exception("Test function 1 intentionally failed.") @ray.remote def throw_exception_fct2(): raise Exception("Test function 2 intentionally failed.") @ray.remote(num_return_vals=3) def throw_exception_fct3(x): raise Exception("Test function 3 intentionally failed.") throw_exception_fct1.remote() throw_exception_fct1.remote() wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2 for task in relevant_errors(ray_constants.TASK_PUSH_ERROR): msg = task.get("message") assert "Test function 1 intentionally failed." in msg x = throw_exception_fct2.remote() try: ray.get(x) except Exception as e: assert "Test function 2 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False x, y, z = throw_exception_fct3.remote(1.0) for ref in [x, y, z]: try: ray.get(ref) except Exception as e: assert "Test function 3 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False @ray.remote def f(): raise Exception("This function failed.") try: ray.get(f.remote()) except Exception as e: assert "This function failed." in str(e) else: # ray.get should throw an exception. assert False
def test_warning_for_dead_node(ray_start_cluster_2_nodes): cluster = ray_start_cluster_2_nodes cluster.wait_for_nodes() node_ids = {item["NodeID"] for item in ray.nodes()} # Try to make sure that the monitor has received at least one heartbeat # from the node. time.sleep(0.5) # Kill both raylets. cluster.list_all_nodes()[1].kill_raylet() cluster.list_all_nodes()[0].kill_raylet() # Check that we get warning messages for both raylets. wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2, timeout=40) # Extract the client IDs from the error messages. This will need to be # changed if the error message changes. warning_node_ids = { item["message"].split(" ")[5] for item in relevant_errors(ray_constants.REMOVED_NODE_ERROR) } assert node_ids == warning_node_ids
def test_warning_for_dead_node(ray_start_two_nodes): cluster = ray_start_two_nodes cluster.wait_for_nodes() client_ids = {item["ClientID"] for item in ray.global_state.client_table()} # Try to make sure that the monitor has received at least one heartbeat # from the node. time.sleep(0.5) # Kill both raylets. cluster.list_all_nodes()[1].kill_raylet() cluster.list_all_nodes()[0].kill_raylet() # Check that we get warning messages for both raylets. wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2, timeout=40) # Extract the client IDs from the error messages. This will need to be # changed if the error message changes. warning_client_ids = { item["message"].split(" ")[5] for item in relevant_errors(ray_constants.REMOVED_NODE_ERROR) } assert client_ids == warning_client_ids
def test_failed_function_to_run(ray_start_2_cpus): def f(worker): if ray.worker.global_worker.mode == ray.WORKER_MODE: raise Exception("Function to run failed.") ray.worker.global_worker.run_function_on_all_workers(f) wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2) # Check that the error message is in the task info. errors = relevant_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR) assert len(errors) == 2 assert "Function to run failed." in errors[0]["message"] assert "Function to run failed." in errors[1]["message"]
def test_failed_function_to_run(ray_start_regular): def f(worker): if ray.worker.global_worker.mode == ray.WORKER_MODE: raise Exception("Function to run failed.") ray.worker.global_worker.run_function_on_all_workers(f) wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2) # Check that the error message is in the task info. errors = relevant_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR) assert len(errors) == 2 assert "Function to run failed." in errors[0]["message"] assert "Function to run failed." in errors[1]["message"]
def test_worker_dying(ray_start_regular): # Define a remote function that will kill the worker that runs it. @ray.remote def f(): eval("exit()") with pytest.raises(ray.exceptions.RayWorkerError): ray.get(f.remote()) wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) errors = relevant_errors(ray_constants.WORKER_DIED_PUSH_ERROR) assert len(errors) == 1 assert "died or was killed while executing" in errors[0]["message"]
def test_fail_importing_remote_function(ray_start_2_cpus): # Create the contents of a temporary Python file. temporary_python_file = """ def temporary_helper_function(): return 1 """ f = tempfile.NamedTemporaryFile(suffix=".py") f.write(temporary_python_file.encode("ascii")) f.flush() directory = os.path.dirname(f.name) # Get the module name and strip ".py" from the end. module_name = os.path.basename(f.name)[:-3] sys.path.append(directory) module = __import__(module_name) # Define a function that closes over this temporary module. This should # fail when it is unpickled. @ray.remote def g(): try: module.temporary_python_file() except Exception: # This test is not concerned with the error from running this # function. Only from unpickling the remote function. pass # Invoke the function so that the definition is exported. g.remote() wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2) errors = relevant_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR) assert len(errors) == 2 assert "No module named" in errors[0]["message"] assert "No module named" in errors[1]["message"] # Check that if we try to call the function it throws an exception and # does not hang. for _ in range(10): with pytest.raises(Exception): ray.get(g.remote()) f.close() # Clean up the junk we added to sys.path. sys.path.pop(-1)
def test_fail_importing_remote_function(ray_start_regular): # Create the contents of a temporary Python file. temporary_python_file = """ def temporary_helper_function(): return 1 """ f = tempfile.NamedTemporaryFile(suffix=".py") f.write(temporary_python_file.encode("ascii")) f.flush() directory = os.path.dirname(f.name) # Get the module name and strip ".py" from the end. module_name = os.path.basename(f.name)[:-3] sys.path.append(directory) module = __import__(module_name) # Define a function that closes over this temporary module. This should # fail when it is unpickled. @ray.remote def g(): return module.temporary_python_file() wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2) errors = relevant_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR) assert len(errors) == 2 assert "No module named" in errors[0]["message"] assert "No module named" in errors[1]["message"] # Check that if we try to call the function it throws an exception and # does not hang. for _ in range(10): with pytest.raises(Exception): ray.get(g.remote()) f.close() # Clean up the junk we added to sys.path. sys.path.pop(-1)