def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")

tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()

def test_driver_dead(shutdown_only):
    """Make sure all ray workers are shut down when the driver is killed."""
    driver = """
import ray
ray.init(_system_config={"ping_gcs_rpc_server_max_retries": 1})

@ray.remote
def f():
    import time
    time.sleep(10)

num_cpus = int(ray.available_resources()["CPU"])
tasks = [f.remote() for _ in range(num_cpus)]
"""

    p = run_string_as_driver_nonblocking(driver)
    # Make sure the driver is running.
    time.sleep(1)
    assert p.poll() is None
    wait_for_condition(lambda: len(get_all_ray_worker_processes()) > 0)

    # Kill the driver process.
    p.kill()
    p.wait()
    time.sleep(0.1)

    wait_for_condition(lambda: len(get_all_ray_worker_processes()) == 0)

def test_job_timestamps(ray_start_regular):
    driver_template = """
import ray
from time import sleep

ray.init(address="{}")

print("My job id: ", str(ray.get_runtime_context().job_id))

{}
ray.shutdown()
"""
    non_hanging = driver_template.format(ray_start_regular["redis_address"],
                                         "sleep(1)")
    hanging_driver = driver_template.format(ray_start_regular["redis_address"],
                                            "sleep(60)")

    out = run_string_as_driver(non_hanging)
    p = run_string_as_driver_nonblocking(hanging_driver)
    # The nonblocking process needs time to connect.
    time.sleep(1)

    jobs = list(ray.state.jobs())
    jobs.sort(key=lambda x: x["JobID"])

    driver = jobs[0]
    finished = jobs[1]
    running = jobs[2]

    # The initial driver timestamp/start time go down a different code path.
    assert driver["Timestamp"] == driver["StartTime"]
    assert finished["Timestamp"] == finished["EndTime"]
    assert running["Timestamp"] == running["StartTime"]

    assert finished["EndTime"] > finished["StartTime"] > 0, out
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 2000, f"Job should've taken ~1s, {finished}"

    assert running["StartTime"] > 0
    assert running["EndTime"] == 0

    p.kill()
    # Give the second job time to clean itself up.
    time.sleep(1)

    jobs = list(ray.state.jobs())
    jobs.sort(key=lambda x: x["JobID"])

    # jobs[0] is the test case driver.
    finished = jobs[1]
    prev_running = jobs[2]

    assert finished["EndTime"] > finished["StartTime"] > 0, f"{finished}"
    assert finished["EndTime"] == finished["Timestamp"]
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 2000, f"Job should've taken ~1s {finished}"

    assert prev_running["EndTime"] > prev_running["StartTime"] > 0

def test_workflow_lifetime_2(call_ray_start):
    # Case 2: driver terminated
    proc = run_string_as_driver_nonblocking(driver_script.format(100))
    time.sleep(10)
    proc.kill()
    time.sleep(1)
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20

def test_recovery_cluster_failure():
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()

def test_recovery_cluster_failure():
    subprocess.run(["ray start --head"], shell=True)
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.run(["ray stop"], shell=True)
    proc.kill()
    time.sleep(1)
    ray.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()

def test_recovery_cluster_failure(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    assert ray.get(workflow.resume("cluster_failure")) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()

def test_drivers_named_actors(call_ray_start):
    # This test will create some drivers that submit some tasks to the same
    # named actor.
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates a named actor then sleeps for a while.
    driver_script1 = """
import ray
import time
ray.init(address="{}")

@ray.remote
class Counter:
    def __init__(self):
        self.count = 0

    def increment(self):
        self.count += 1
        return self.count

counter = Counter.remote()
ray.util.register_actor("Counter", counter)
time.sleep(100)
""".format(address)

    # Define a driver that submits to the named actor and exits.
    driver_script2 = """
import ray
import time
ray.init(address="{}")

while True:
    try:
        counter = ray.util.get_actor("Counter")
        break
    except ValueError:
        time.sleep(1)

assert ray.get(counter.increment.remote()) == {}
print("success")
""".format(address, "{}")

    process_handle = run_string_as_driver_nonblocking(driver_script1)

    for i in range(3):
        driver_script = driver_script2.format(i + 1)
        out = run_string_as_driver(driver_script)
        assert "success" in out

    process_handle.kill()

def test_jobconfig_compatible_3(ray_start_cluster_head, working_dir): # start job_config=something # start job_config=something else cluster = ray_start_cluster_head (address, env, PKG_DIR) = start_client_server(cluster, True) runtime_env = """{ "py_modules": [test_module.__path__[0]] }""" execute_statement = """ sleep(600) """ script = driver_script.format(**locals()) proc = run_string_as_driver_nonblocking(script, env) sleep(5) runtime_env = f"""{{ "working_dir": test_module.__path__[0] }}""" execute_statement = "print('OK')" script = driver_script.format(**locals()) out = run_string_as_driver(script, env) proc.kill() proc.wait() assert out.strip().split()[-1] == "ERROR"
def test_worker_stdout():
    script = """
import ray
import sys

ray.init(num_cpus=2)

@ray.remote
def foo(out_str, err_str):
    print(out_str)
    print(err_str, file=sys.stderr)

ray.get(foo.remote("abc", "def"))
"""

    proc = run_string_as_driver_nonblocking(script)
    out_str = proc.stdout.read().decode("ascii")
    err_str = proc.stderr.read().decode("ascii")

    assert out_str.endswith("abc\n")
    assert err_str.split("\n")[-2].endswith("def")

def test_jobconfig_compatible_1(ray_start_cluster_head, working_dir):
    # start job_config=None
    # start job_config=something
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = None
    execute_statement = """
sleep(600)
"""
    script = driver_script.format(**locals())
    # Have one running with job config = None
    proc = run_string_as_driver_nonblocking(script, env)
    # Wait for it to come up.
    sleep(5)
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "ERROR"
    proc.kill()
    proc.wait()

def test_env_installation_nonblocking(shutdown_only):
    """Test fix for https://github.com/ray-project/ray/issues/16226."""
    env1 = {"pip": ["pip-install-test==0.5"]}
    job_config = ray.job_config.JobConfig(runtime_env=env1)
    ray.init(job_config=job_config)

    @ray.remote
    def f():
        return "hello"

    # Warm up a worker because it takes time to start.
    ray.get(f.remote())

    def assert_tasks_finish_quickly(total_sleep_s=0.1):
        """Call f every 0.01 seconds for total time total_sleep_s."""
        gap_s = 0.01
        for i in range(int(total_sleep_s / gap_s)):
            start = time.time()
            ray.get(f.remote())
            # Env installation takes around 10 to 60 seconds. If we fail the
            # below assert, we can be pretty sure an env installation blocked
            # the task.
            assert time.time() - start < 0.1
            time.sleep(gap_s)

    assert_tasks_finish_quickly()

    env2 = {"pip": ["pip-install-test==0.5", "requests"]}
    f.options(runtime_env=env2).remote()
    # Check that installing env2 above does not block tasks using env1.
    assert_tasks_finish_quickly()

    proc = run_string_as_driver_nonblocking(
        install_env_script.format(env=env1))
    # Check that installing env1 in a new worker in the script above does not
    # block other tasks that use env1.
    assert_tasks_finish_quickly(total_sleep_s=5)
    proc.kill()
    proc.wait()

def test_job_gc(call_ray_start):
    address = call_ray_start

    ray.init(address=address)
    driver = """
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def __init__(self):
        pass

_ = Actor.remote()
""".format(address)

    p = run_string_as_driver_nonblocking(driver)

    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    job_table = ray.jobs()
    assert len(job_table) == 2  # dash

    # Kill the driver process.
    p.kill()
    p.wait()

    def actor_finish():
        actor_table = ray.actors()
        if (len(actor_table) == 0):
            return True
        else:
            return False

    wait_for_condition(actor_finish)

def test_jobconfig_compatible_2(ray_start_cluster_head, working_dir):
    # start job_config=something
    # start job_config=None
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = """{ "py_modules": [test_module.__path__[0]] }"""
    # Make the first driver hang there.
    execute_statement = """
sleep(600)
"""
    script = driver_script.format(**locals())
    proc = run_string_as_driver_nonblocking(script, env)
    sleep(5)
    runtime_env = None
    # Execute the following in the second one, which should succeed.
    execute_statement = "print('OK')"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "OK"
    proc.kill()
    proc.wait()

def test_job_gc_with_detached_actor(call_ray_start): address = call_ray_start ray.init(address=address) driver = """ import ray ray.init(address="{}") @ray.remote class Actor: def __init__(self): pass def value(self): return 1 _ = Actor.options(lifetime="detached", name="DetachedActor").remote() # Make sure the actor is created before the driver exits. ray.get(_.value.remote()) """.format(address) p = run_string_as_driver_nonblocking(driver) # Wait for actor to be created wait_for_num_actors(1, ray.gcs_utils.ActorTableData.ALIVE) actor_table = ray.actors() assert len(actor_table) == 1 job_table = ray.jobs() assert len(job_table) == 2 # dash # Kill the driver process. p.kill() p.wait() detached_actor = ray.get_actor("DetachedActor") assert ray.get(detached_actor.value.remote()) == 1
def test_jobconfig_compatible_3(ray_start_cluster_head, working_dir): # start job_config=something # start job_config=something else cluster = ray_start_cluster_head (address, env, PKG_DIR) = start_client_server(cluster, True) runtime_env = """{ "py_modules": [test_module.__path__[0]] }""" # To make the first one hanging ther execute_statement = """ sleep(600) """ script = driver_script.format(**locals()) proc = run_string_as_driver_nonblocking(script, env) sleep(5) runtime_env = f""" {{ "working_dir": test_module.__path__[0] }}""" # noqa: F541 # Execute the following cmd in the second one and ensure that # it is able to run. execute_statement = "print('OK')" script = driver_script.format(**locals()) out = run_string_as_driver(script, env) proc.kill() proc.wait() assert out.strip().split()[-1] == "OK"
def test_dying_driver_get(ray_start_regular):
    # Start the Ray processes.
    address_info = ray_start_regular

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    x_id = sleep_forever.remote()

    driver = """
import ray
ray.init("{}")
ray.get(ray.ObjectRef(ray._private.utils.hex_to_binary("{}")))
""".format(address_info["redis_address"], x_id.hex())

    p = run_string_as_driver_nonblocking(driver)
    # Make sure the driver is running.
    time.sleep(1)
    assert p.poll() is None

    # Kill the driver process.
    p.kill()
    p.wait()
    time.sleep(0.1)

    # Make sure the original task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0
    # Seal the object so the store attempts to notify the worker that the
    # get has been fulfilled.
    obj = np.ones(200 * 1024, dtype=np.uint8)
    ray.worker.global_worker.put_object(obj, x_id)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray._private.services.remaining_processes_alive()

def test_head_node_down(ray_start_cluster):
    """Make sure all ray workers are killed when the head node is dead."""
    cluster = ray_start_cluster

    # head node.
    head = cluster.add_node(
        num_cpus=2, _system_config={"ping_gcs_rpc_server_max_retries": 1})

    # worker nodes.
    num_worker_nodes = 2
    for _ in range(num_worker_nodes):
        cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Start a driver.
    driver = """
import ray
ray.init(address="{}")

@ray.remote
def f():
    import time
    time.sleep(10)

num_cpus = int(ray.available_resources()["CPU"])
tasks = [f.remote() for _ in range(num_cpus)]

import time
time.sleep(100)
""".format(cluster.address)

    p = run_string_as_driver_nonblocking(driver)

    # Make sure the driver is running.
    time.sleep(1)
    wait_for_condition(lambda: p.poll() is None)
    wait_for_condition(lambda: len(get_all_ray_worker_processes()) > 0)

    cluster.remove_node(head)

    wait_for_condition(lambda: len(get_all_ray_worker_processes()) == 0)

def test_dying_driver_wait(ray_start_regular):
    # Start the Ray processes.
    address_info = ray_start_regular

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    x_id = sleep_forever.remote()

    driver = """
import ray
ray.init("{}")
ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
""".format(address_info["redis_address"], x_id.hex())

    p = run_string_as_driver_nonblocking(driver)
    # Make sure the driver is running.
    time.sleep(1)
    assert p.poll() is None

    # Kill the driver process.
    p.kill()
    p.wait()
    time.sleep(0.1)

    # Make sure the original task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0
    # Seal the object so the store attempts to notify the worker that the
    # wait can return.
    ray.worker.global_worker.put_object(1, x_id.with_plasma_transport_type())
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.remaining_processes_alive()

def test_recovery_cluster_failure_resume_all(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    driver_script = f"""
import time
from ray.experimental import workflow
from filelock import FileLock

@workflow.step
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    workflow.init("{str(workflow_dir)}")
    assert foo.step(0).run(workflow_id="cluster_failure") == 20
"""
    lock = FileLock(lock_file)
    lock.acquire()
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    lock.release()
    workflow.init(str(workflow_dir))
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()

def test_job_gc_with_detached_actor(call_ray_start): address = call_ray_start ray.init(address=address) driver = """ import ray ray.init(address="{}") @ray.remote class Actor: def __init__(self): pass def value(self): return 1 _ = Actor.options(name="DetachedActor").remote() """.format(address) p = run_string_as_driver_nonblocking(driver) # Wait for actor to be created wait_for_num_actors(1) actor_table = ray.actors() assert len(actor_table) == 1 job_table = ray.jobs() assert len(job_table) == 2 # Kill the driver process. p.kill() p.wait() detached_actor = ray.get_actor("DetachedActor") assert ray.get(detached_actor.value.remote()) == 1
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""

        def _setup(self, config):
            self.state = {"hi": 0}

        def _train(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def _save(self, path):
            return self.state

        def _restore(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    global_checkpoint_period=0,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment.register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner(
        resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    ray.shutdown()
    cluster.shutdown()

def test_local_clusters():
    """
    This tests the various behaviors of connecting to local clusters:

    * Using `ray.client("local").connect()` should always create a new
      cluster.
    * Using `ray.client().connect()` should create a new cluster if it
      doesn't connect to an existing one.
    * Using `ray.client().connect()` should only connect to a cluster if it
      was created with `ray start --head`, not from a python program.

    It tests whether two calls are in the same cluster by trying to create an
    actor with the same name in the same namespace, which will error and
    cause the script to have a non-zero exit, which throws an exception.
    """
    driver_template = """
import ray
info = ray.client({address}).namespace("").connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(name="abc", lifetime="detached").remote()
ray.get(a.ping.remote())

import time
while True:
    time.sleep(30)
"""
    blocking_local_script = driver_template.format(
        address="'local'", blocking=True)
    blocking_noaddr_script = driver_template.format(address="", blocking=True)

    # This should start a cluster.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client("local").connect() should start a second cluster.
    p2 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client("local").connect() so it should create a third one.
    p3 = run_string_as_driver_nonblocking(blocking_noaddr_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client().connect() so it should create a fourth one.
    p4 = run_string_as_driver_nonblocking(blocking_noaddr_script)

    wait_for_condition(
        lambda: len(ray._private.services.find_redis_address()) == 4,
        retry_interval_ms=1000)

    p1.kill()
    p2.kill()
    p3.kill()
    p4.kill()
    # Prevent flakiness since fatesharing takes some time.
    subprocess.check_output("ray stop --force", shell=True)

    # Since there's a cluster started with `ray start --head`
    # we should connect to it instead.
    subprocess.check_output("ray start --head", shell=True)
    # The assertion in the driver should cause the script to fail if we start
    # a new cluster instead of connecting.
    run_string_as_driver("""
import ray
ray.client().connect()
assert len(ray._private.services.find_redis_address()) == 1
""")

    # ray.client("local").connect() should always create a new cluster even if
    # there's one running.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    wait_for_condition(
        lambda: len(ray._private.services.find_redis_address()) == 2,
        retry_interval_ms=1000)
    p1.kill()
    subprocess.check_output("ray stop --force", shell=True)

def test_drivers_release_resources(call_ray_start):
    address = call_ray_start

    # Define a driver that creates an actor and exits.
    driver_script1 = """
import time
import ray

ray.init(address="{}")

@ray.remote
def f(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
def g(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
class Foo:
    def __init__(self):
        pass

# Make sure some resources are available for us to run tasks.
ray.get(f.remote(0))
ray.get(g.remote(0))

# Start a bunch of actors and tasks that use resources. These should all be
# cleaned up when this driver exits.
foos = [Foo.remote() for _ in range(100)]
[f.remote(10 ** 6) for _ in range(100)]

print("success")
""".format(address)

    driver_script2 = (driver_script1 +
                      "import sys\nsys.stdout.flush()\ntime.sleep(10 ** 6)\n")

    def wait_for_success_output(process_handle, timeout=10):
        # Wait until the process prints "success" and then return.
        start_time = time.time()
        while time.time() - start_time < timeout:
            output_line = ray.utils.decode(
                process_handle.stdout.readline()).strip()
            print(output_line)
            if output_line == "success":
                return
        raise RayTestTimeoutException(
            "Timed out waiting for process to print success.")

    # Make sure we can run this driver repeatedly, which means that resources
    # are getting released in between.
    for _ in range(5):
        out = run_string_as_driver(driver_script1)
        # Make sure the first driver ran to completion.
        assert "success" in out
        # Also make sure that this works when the driver exits ungracefully.
        process_handle = run_string_as_driver_nonblocking(driver_script2)
        wait_for_success_output(process_handle)
        # Kill the process ungracefully.
        process_handle.kill()

def test_multi_drivers(shutdown_only):
    info = ray.init(num_cpus=10)

    driver_code = """
import os
import sys
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def get_pid(self):
        return os.getpid()

@ray.remote
def get_pid():
    return os.getpid()

pid_objs = []
# Submit some normal tasks and get the PIDs of workers which execute the
# tasks.
pid_objs = pid_objs + [get_pid.remote() for _ in range(2)]
# Create some actors and get the PIDs of actors.
actors = [Actor.remote() for _ in range(2)]
pid_objs = pid_objs + [actor.get_pid.remote() for actor in actors]

pids = set([ray.get(obj) for obj in pid_objs])
# Write pids to stdout
print("PID:" + str.join(",", [str(_) for _ in pids]))

ray.shutdown()
    """.format(info["redis_address"])

    driver_count = 3
    processes = [
        run_string_as_driver_nonblocking(driver_code)
        for _ in range(driver_count)
    ]
    outputs = []
    for p in processes:
        out = p.stdout.read().decode("ascii")
        err = p.stderr.read().decode("ascii")
        p.wait()
        # out, err = p.communicate()
        # out = ray.utils.decode(out)
        # err = ray.utils.decode(err)
        if p.returncode != 0:
            print("Driver with PID {} returned error code {}".format(
                p.pid, p.returncode))
            print("STDOUT:\n{}".format(out))
            print("STDERR:\n{}".format(err))
        outputs.append((p, out))

    all_worker_pids = set()
    for p, out in outputs:
        assert p.returncode == 0
        for line in out.splitlines():
            if line.startswith("PID:"):
                worker_pids = [int(_) for _ in line.split(":")[1].split(",")]
                assert len(worker_pids) > 0
                for worker_pid in worker_pids:
                    assert worker_pid not in all_worker_pids, (
                        ("Worker process with PID {} is shared" +
                         " by multiple drivers.").format(worker_pid))
                    all_worker_pids.add(worker_pid)

def test_multi_driver_logging(ray_start_regular):
    address_info = ray_start_regular
    address = address_info["redis_address"]

    # ray.init(address=address)
    driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0)
    driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0)
    main_wait = Semaphore.options(name="main_wait").remote(value=0)

    # The creation of an actor is asynchronous.
    # We need to wait for the completion of the actor creation,
    # otherwise we can't get the actor by name.
    ray.get(driver1_wait.locked.remote())
    ray.get(driver2_wait.locked.remote())
    ray.get(main_wait.locked.remote())

    # Params are address, semaphore name, output1, output2
    driver_script_template = """
import ray
import sys
from ray.test_utils import Semaphore

@ray.remote(num_cpus=0)
def remote_print(s, file=None):
    print(s, file=file)

ray.init(address="{}")

driver_wait = ray.get_actor("{}")
main_wait = ray.get_actor("main_wait")

ray.get(main_wait.release.remote())
ray.get(driver_wait.acquire.remote())

s1 = "{}"
ray.get(remote_print.remote(s1))

ray.get(main_wait.release.remote())
ray.get(driver_wait.acquire.remote())

s2 = "{}"
ray.get(remote_print.remote(s2))

ray.get(main_wait.release.remote())
"""

    p1 = run_string_as_driver_nonblocking(
        driver_script_template.format(address, "driver1_wait", "1", "2"))
    p2 = run_string_as_driver_nonblocking(
        driver_script_template.format(address, "driver2_wait", "3", "4"))

    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())
    # At this point both of the other drivers are fully initialized.

    ray.get(driver1_wait.release.remote())
    ray.get(driver2_wait.release.remote())

    # At this point driver1 should receive '1' and driver2 '3'
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())

    ray.get(driver1_wait.release.remote())
    ray.get(driver2_wait.release.remote())

    # At this point driver1 should receive '2' and driver2 '4'
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())

    driver1_out = p1.stdout.read().decode("ascii")
    driver2_out = p2.stdout.read().decode("ascii")
    if sys.platform == "win32":
        driver1_out = driver1_out.replace("\r", "")
        driver2_out = driver2_out.replace("\r", "")
    driver1_out_split = driver1_out.split("\n")
    driver2_out_split = driver2_out.split("\n")

    assert driver1_out_split[0][-1] == "1"
    assert driver1_out_split[1][-1] == "2"
    assert driver2_out_split[0][-1] == "3"
    assert driver2_out_split[1][-1] == "4"