def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    total_cpus = ray.cluster_resources()["CPU"]

    # Occupy one CPU with an actor.
    @ray.remote(num_cpus=1)
    class A:
        pass

    a = A.remote()
    print(a)

    @ray.remote(num_cpus=total_cpus)
    def g():
        pass

    pg = placement_group([{"CPU": total_cpus}], strategy="STRICT_PACK")
    g.options(placement_group=pg).remote()

    errors = get_error_message(
        p, 1, ray_constants.INFEASIBLE_TASK_ERROR, timeout=5)
    assert len(errors) == 0, errors


def test_ready_warning_suppressed(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Create an infeasible pg.
    pg = ray.util.placement_group([{"CPU": 2}] * 2, strategy="STRICT_PACK")
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(pg.ready(), timeout=0.5)

    errors = get_error_message(
        p, 1, ray.ray_constants.INFEASIBLE_TASK_ERROR, timeout=0.1)
    assert len(errors) == 0


def test_runtime_env_no_spurious_resource_deadlock_msg(
        runtime_env_local_dev_env_var, ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote(runtime_env={"pip": ["tensorflow", "torch"]})
    def f():
        pass

    # Check that no warning is printed.
    ray.get(f.remote())
    errors = get_error_message(p, 5, ray.ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 0


def test_push_error_to_driver_through_redis(ray_start_regular, error_pubsub):
    address_info = ray_start_regular
    address = address_info["redis_address"]
    redis_client = ray._private.services.create_redis_client(
        address, password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    error_message = "Test error message"
    ray._private.utils.push_error_to_driver_through_redis(
        redis_client, ray_constants.DASHBOARD_AGENT_DIED_ERROR, error_message)
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    assert errors[0].type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert errors[0].error_message == error_message


def test_publish_error_to_driver(ray_start_regular, error_pubsub):
    address_info = ray_start_regular
    gcs_publisher = GcsPublisher(address=address_info["gcs_address"])
    error_message = "Test error message"
    ray._private.utils.publish_error_to_driver(
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        error_message,
        gcs_publisher=gcs_publisher,
    )
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    assert errors[0].type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert errors[0].error_message == error_message


def test_failed_function_to_run(ray_start_2_cpus, error_pubsub):
    p = error_pubsub

    def f(worker):
        if ray.worker.global_worker.mode == ray.WORKER_MODE:
            raise Exception("Function to run failed.")

    ray.worker.global_worker.run_function_on_all_workers(f)
    # Check that the error message is in the task info.
    errors = get_error_message(p, 2, ray_constants.FUNCTION_TO_RUN_PUSH_ERROR)
    assert len(errors) == 2
    assert errors[0].type == ray_constants.FUNCTION_TO_RUN_PUSH_ERROR
    assert "Function to run failed." in errors[0].error_message
    assert "Function to run failed." in errors[1].error_message


def test_version_mismatch(error_pubsub, shutdown_only):
    ray_version = ray.__version__
    ray.__version__ = "fake ray version"

    ray.init(num_cpus=1)
    p = error_pubsub

    errors = get_error_message(p, 1, ray_constants.VERSION_MISMATCH_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.VERSION_MISMATCH_PUSH_ERROR

    # Reset the version.
    ray.__version__ = ray_version


def wait_for_errors(p, error_check):
    # Wait for errors from all the nondeterministic tasks.
    errors = []
    time_left = 100
    while time_left > 0:
        errors.extend(get_error_message(p, 1))
        if error_check(errors):
            break
        time_left -= 1
        time.sleep(1)

    # Make sure that enough errors came through.
    assert error_check(errors)
    return errors


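# A minimal usage sketch for wait_for_errors (a hypothetical call site, not a
# test from this file): poll the pubsub channel until at least two errors have
# arrived, failing once the 100-second budget above is exhausted.
#
#     errors = wait_for_errors(p, lambda errors: len(errors) >= 2)

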
def test_detached_warning(shutdown_only):
    ray.init()

    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    error_pubsub = init_error_pubsub()
    actor = DetachedActor.options(  # noqa: F841
        name="Pinger", lifetime="detached").remote()
    errors = get_error_message(error_pubsub, 1, None)
    error = errors.pop()
    assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR


def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that we receive a warning if we run a workload that requires too
    # many workers to be started.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)
    ray.get([
        remote_wait.locked.remote(),
        nested_wait.locked.remote(),
    ])

    @ray.remote(num_cpus=0.25)
    def f():
        time.sleep(1000)
        return 1

    @ray.remote(num_cpus=0.25)
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote(num_cpus=0.25)
    def g(remote_waits, nested_waits):
        # Block here so that the f tasks all get submitted to the scheduler
        # after the g tasks.
        remote_wait.release.remote()
        # Wait until every lock is released.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Lock the remote tasks until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()


def test_warning_actor_waiting_on_actor(shutdown_only):
    ray.init(
        num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        pass

    a = Actor.remote()  # noqa
    b = Actor.remote()  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_warning_for_dead_autoscaler(ray_start_regular, error_pubsub):
    # Terminate the autoscaler process.
    from ray.worker import _global_node
    autoscaler_process = _global_node.all_processes[
        ray_constants.PROCESS_TYPE_MONITOR][0].process
    autoscaler_process.terminate()

    # Confirm that we receive an autoscaler failure error.
    errors = get_error_message(
        error_pubsub, 1, ray_constants.MONITOR_DIED_ERROR, timeout=5)
    assert len(errors) == 1

    # Confirm that the autoscaler failure error is stored.
    error = _internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    assert error is not None


def test_dashboard_agent_restart(ray_start_cluster_head, error_pubsub):
    """Test that when the agent fails to start many times in a row,
    the error message is suppressed correctly without spamming the driver.
    """
    # Choose a duplicated port for the agent so that it will crash.
    p = error_pubsub
    errors = get_error_message(
        p, 1, ray_constants.DASHBOARD_AGENT_DIED_ERROR, timeout=10)
    for e in errors:
        assert ("There are 2 possible problems "
                "if you see this error." in e.error_message)
    # Make sure the agent process is not started anymore.
    cluster = ray_start_cluster_head
    wait_for_condition(lambda: search_agents(cluster) is None)


def test_worker_dying(ray_start_regular, error_pubsub):
    p = error_pubsub

    # Define a remote function that will kill the worker that runs it.
    @ray.remote(max_retries=0)
    def f():
        eval("exit()")

    with pytest.raises(ray.exceptions.WorkerCrashedError):
        ray.get(f.remote())

    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR
    assert "died or was killed while executing" in errors[0].error_message


def test_worker_raising_exception(ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote(max_calls=2)
    def f():
        # This is the only reasonable variable we can set here that makes the
        # execute_task function fail after the task got executed.
        worker = ray.worker.global_worker
        worker.function_actor_manager.increase_task_counter = None

    # Running this task should cause the worker to raise an exception after
    # the task has successfully completed.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.WORKER_CRASH_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_CRASH_PUSH_ERROR


def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
    # Check that we cannot place an actor on a 0 CPU machine and that we get
    # an infeasibility warning (even though the actor creation task itself
    # requires no CPUs).
    ray.init(num_cpus=0)
    p = init_error_pubsub()

    @ray.remote
    class Foo:
        pass

    # The actor creation should be infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
    p.close()


def test_put_error1(ray_start_object_store_memory, error_pubsub):
    p = error_pubsub
    num_objects = 3
    object_size = 4 * 10**5

    # Define a task with a single dependency, a numpy array, that returns
    # another array.
    @ray.remote
    def single_dependency(i, arg):
        arg = np.copy(arg)
        arg[0] = i
        return arg

    @ray.remote
    def put_arg_task():
        # Launch num_objects instances of the remote task, each dependent
        # on the one before it. The result of the first task should get
        # evicted.
        args = []
        arg = single_dependency.remote(0,
                                       np.zeros(object_size, dtype=np.uint8))
        for i in range(num_objects):
            arg = single_dependency.remote(i, arg)
            args.append(arg)

        # Get the last value to force all tasks to finish.
        value = ray.get(args[-1])
        assert value[0] == i

        # Get the first value (which should have been evicted) to force
        # reconstruction. Currently, since we're not able to reconstruct
        # `ray.put` objects that were evicted and whose originating tasks
        # are still running, this call should hang and push an error to
        # the driver.
        ray.get(args[0])

    put_arg_task.remote()

    # Make sure we receive the correct error message.
    errors = get_error_message(p, 1,
                               ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR


def test_warning_many_actor_tasks_queued(shutdown_only):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            import time
            time.sleep(1)

    a = Foo.remote()
    [a.f.remote() for _ in range(50000)]
    errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING)
    msgs = [e.error_message for e in errors]
    assert ("Warning: More than 5000 tasks are pending submission to actor"
            in msgs[0])
    assert ("Warning: More than 10000 tasks are pending submission to actor"
            in msgs[1])
    assert ("Warning: More than 20000 tasks are pending submission to actor"
            in msgs[2])
    assert ("Warning: More than 40000 tasks are pending submission to actor"
            in msgs[3])


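# The thresholds asserted above double at each successive warning. A minimal
# sketch of the expected sequence, assuming the first warning fires at 5000
# pending tasks (values taken from the assertions, not from Ray internals):
#
#     thresholds = [5000 * 2**i for i in range(4)]
#     assert thresholds == [5000, 10000, 20000, 40000]

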
def test_publish_error_to_driver_redis(ray_start_regular, error_pubsub):
    address_info = ray_start_regular
    address = address_info["redis_address"]
    redis_client = ray._private.services.create_redis_client(
        address, password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_publisher = None
    if gcs_pubsub_enabled():
        gcs_publisher = GcsPublisher(
            address=gcs_utils.get_gcs_address_from_redis(redis_client))
    error_message = "Test error message"
    ray._private.utils.publish_error_to_driver(
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        error_message,
        redis_client=redis_client,
        gcs_publisher=gcs_publisher)
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    assert errors[0].type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert errors[0].error_message == error_message


def test_raylet_and_agent_share_fate(shutdown_only):
    """Test that the raylet and agent share fate."""
    ray.init(include_dashboard=True)
    p = init_error_pubsub()

    node = ray._private.worker._global_node
    all_processes = node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # The agent should be dead if the raylet exits.
    raylet_proc.terminate()
    raylet_proc.wait()
    agent_proc.wait(5)

    # No error should be reported for graceful termination.
    errors = get_error_message(p, 1, ray_constants.RAYLET_DIED_ERROR)
    assert len(errors) == 0, errors

    ray.shutdown()

    ray.init(include_dashboard=True)
    all_processes = ray._private.worker._global_node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)
    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # The raylet should be dead if the agent exits.
    agent_proc.kill()
    agent_proc.wait()
    raylet_proc.wait(5)


# `sync` is assumed to be supplied by pytest parametrization; without the
# decorator below, pytest would report a missing `sync` fixture.
@pytest.mark.parametrize("sync", [True, False])
def test_no_warning_many_actor_tasks_queued_when_sequential(
        shutdown_only, sync: bool):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class SyncFoo:
        def f(self):
            return 1

    @ray.remote(num_cpus=1)
    class AsyncFoo:
        async def f(self):
            return 1

    Foo = SyncFoo if sync else AsyncFoo
    a = Foo.remote()
    for _ in range(10000):
        assert ray.get(a.f.remote()) == 1

    errors = get_error_message(
        p, 1, ray_constants.EXCESS_QUEUEING_WARNING, timeout=1)
    assert len(errors) == 0


def test_failed_actor_method(ray_start_regular, error_pubsub):
    p = error_pubsub
    error_message2 = "actor method failed"

    @ray.remote
    class FailedActor:
        def __init__(self):
            pass

        def fail_method(self):
            raise Exception(error_message2)

    a = FailedActor.remote()

    # Make sure that we get errors from a failed method.
    a.fail_method.remote()
    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.TASK_PUSH_ERROR
    assert error_message2 in errors[0].error_message


def test_actor_worker_dying(ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote
    class Actor:
        def kill(self):
            eval("exit()")

    @ray.remote
    def consume(x):
        pass

    a = Actor.remote()
    [obj], _ = ray.wait([a.kill.remote()], timeout=5)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(obj)
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(consume.remote(obj))
    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR


def test_warning_all_tasks_blocked(shutdown_only):
    ray.init(
        num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            return 0

    @ray.remote
    def f():
        # Creating all of these actors is not possible with only 1 CPU.
        actors = [Foo.remote() for _ in range(3)]
        for a in actors:
            ray.get(a.f.remote())

    # Run in a task to check that we handle the blocked task case correctly.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_actor_scope_or_intentionally_killed_message(ray_start_regular,
                                                     error_pubsub):
    p = error_pubsub

    @ray.remote
    class Actor:
        def __init__(self):
            # This log is added to debug a flaky test issue.
            print(os.getpid())

        def ping(self):
            pass

    a = Actor.remote()
    # Without this wait, there seems to be a race condition happening
    # in the CI. This is not a fundamental fix for that, but it at least
    # makes the test less flaky.
    ray.get(a.ping.remote())
    a = Actor.remote()
    a.__ray_terminate__.remote()
    time.sleep(1)
    errors = get_error_message(p, 1)
    assert len(errors) == 0, "Should not have propagated an error - {}".format(
        errors)


def test_warning_for_dead_node(ray_start_cluster_2_nodes, error_pubsub):
    cluster = ray_start_cluster_2_nodes
    cluster.wait_for_nodes()
    p = error_pubsub

    node_ids = {item["NodeID"] for item in ray.nodes()}

    # Try to make sure that the monitor has received at least one heartbeat
    # from the node.
    time.sleep(0.5)

    # Kill both raylets.
    cluster.list_all_nodes()[1].kill_raylet()
    cluster.list_all_nodes()[0].kill_raylet()

    # Check that we get warning messages for both raylets.
    errors = get_error_message(p, 2, ray_constants.REMOVED_NODE_ERROR, 40)

    # Extract the client IDs from the error messages. This will need to be
    # changed if the error message changes.
    warning_node_ids = {error.error_message.split(" ")[5] for error in errors}

    assert node_ids == warning_node_ids


def test_warning_task_waiting_on_actor(shutdown_only):
    ray.init(
        num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        def hello(self):
            pass

    a = Actor.remote()  # noqa
    ray.get(a.hello.remote())

    @ray.remote(num_cpus=1)
    def f():
        print("f running")
        time.sleep(999)

    ids = [f.remote()]  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_actor_worker_dying_future_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote(max_restarts=0)
    class Actor:
        def getpid(self):
            return os.getpid()

        def sleep(self):
            time.sleep(1)

    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
    tasks1 = [a.sleep.remote() for _ in range(10)]
    os.kill(pid, 9)
    time.sleep(0.1)
    tasks2 = [a.sleep.remote() for _ in range(10)]
    for obj in tasks1 + tasks2:
        with pytest.raises(Exception):
            ray.get(obj)

    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR


def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)

    # If a gRPC call exceeds its timeout, the call is cancelled on the client
    # side but the server may still reply to it, leading to a missed message.
    # Using a sequence number to ensure no message is dropped could be the
    # long-term solution, but its complexity and the fact that Ray subscribers
    # do not use deadlines in production make it less preferred.
    # Therefore, a simpler workaround is used instead: a different subscriber
    # is used for each get_error_message() call.
    subscribers = [init_error_pubsub() for _ in range(3)]

    # There shouldn't be any errors yet.
    errors = get_error_message(subscribers[0], 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(subscribers[1], 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import init_error_pubsub, get_error_message

ray.init(address="{}")
subscribers = [init_error_pubsub() for _ in range(2)]
time.sleep(1)
errors = get_error_message(subscribers[0], 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(subscribers[1], 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(subscribers[2], 1)
    assert len(errors) == 1


def test_threaded_actor_integration_test_stress(ray_start_cluster_head,
                                                log_pubsub, error_pubsub):
    """This is a sanity test that checks that threaded actors work with
    the nightly stress test.
    """
    cluster = ray_start_cluster_head
    p = log_pubsub
    e = error_pubsub

    # Prepare the config.
    num_remote_nodes = 4
    num_parents = 6
    num_children = 6
    death_probability = 0.95
    max_concurrency = 10

    for _ in range(num_remote_nodes):
        cluster.add_node(num_cpus=2)

    @ray.remote
    class Child(object):
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def ping(self):
            # Exit the process with some probability.
            exit_chance = np.random.rand()
            if exit_chance > self.death_probability:
                sys.exit(-1)

    @ray.remote
    class Parent(object):
        def __init__(self, num_children, death_probability=0.95):
            self.death_probability = death_probability
            self.children = [
                Child.options(
                    max_concurrency=max_concurrency).remote(death_probability)
                for _ in range(num_children)
            ]

        def ping(self, num_pings):
            children_outputs = []
            for _ in range(num_pings):
                children_outputs += [
                    child.ping.remote() for child in self.children
                ]
            try:
                ray.get(children_outputs)
            except Exception:
                # Replace the children if one of them died.
                self.__init__(len(self.children), self.death_probability)

        def kill(self):
            # Clean up the children.
            ray.get(
                [child.__ray_terminate__.remote() for child in self.children])

    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]

    start = time.time()
    loop_times = []
    for _ in range(10):
        loop_start = time.time()
        ray.get([parent.ping.remote(10) for parent in parents])

        # Kill a parent actor with some probability.
        exit_chance = np.random.rand()
        if exit_chance > death_probability:
            parent_index = np.random.randint(len(parents))
            parents[parent_index].kill.remote()
            parents[parent_index] = Parent.options(
                max_concurrency=max_concurrency).remote(
                    num_children, death_probability)
        loop_times.append(time.time() - loop_start)

    result = {}
    print("Finished in: {}s".format(time.time() - start))
    print("Average iteration time: {}s".format(
        sum(loop_times) / len(loop_times)))
    print("Max iteration time: {}s".format(max(loop_times)))
    print("Min iteration time: {}s".format(min(loop_times)))
    result["total_time"] = time.time() - start
    result["avg_iteration_time"] = sum(loop_times) / len(loop_times)
    result["max_iteration_time"] = max(loop_times)
    result["min_iteration_time"] = min(loop_times)
    result["success"] = 1
    print(result)
    ensure_cpu_returned(10)
    del parents

    # Make sure parents are still schedulable.
    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]
    ray.get([parent.ping.remote(10) for parent in parents])

    """
    Make sure there are no SIGSEGV, SIGABRT, or other odd check failures.
    """
    # Get all logs for 20 seconds.
    logs = test_utils.get_log_message(p, timeout=20)
    for log in logs:
        assert "SIG" not in log, "A segfault or SIGABRT was reported."
        assert "Check failed" not in log, "A check failure was reported."

    # Get error messages for 10 seconds.
    errors = test_utils.get_error_message(e, timeout=10)
    for error in errors:
        print(error)
        assert "You can ignore this message if" not in error.error_message, (
            "The resource deadlock warning shouldn't be printed, but it was.")