def driver_0(redis_address, driver_index):
    """Run the workload for driver 0, then announce that it is done.

    The driver connects to the cluster, starts the long-running tasks,
    creates five ``Actor1`` actors (event indices 0-4) and five ``Actor0``
    actors (event indices 5-9), exercises them repeatedly, and finally
    broadcasts the ``DRIVER_0_DONE`` event.

    Args:
        redis_address: Address of the Redis server, passed to ``ray.init``
            and forwarded to every task, actor and event helper.
        driver_index: Index of this driver, forwarded to the tasks and
            actors it creates.
    """
    ray.init(redis_address=redis_address)

    # Block until every node has joined the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Launch the long-running tasks. Driver 2 will make sure the workers
    # running these tasks have been killed.
    for task_index in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, task_index, redis_address)

    # Actors that require one GPU each.
    actors_one_gpu = []
    for actor_index in range(5):
        actors_one_gpu.append(
            Actor1.remote(driver_index, actor_index, redis_address))

    # Actors that don't require any GPUs; their indices continue at 5.
    actors_no_gpus = []
    for offset in range(5):
        actors_no_gpus.append(
            Actor0.remote(driver_index, 5 + offset, redis_address))

    # Repeatedly call check_ids on every actor, GPU actors first.
    for _ in range(1000):
        for actor_group in (actors_one_gpu, actors_no_gpus):
            ray.get([handle.check_ids.remote() for handle in actor_group])

    # Leave a long-running method executing on one actor and make sure this
    # doesn't affect anything.
    first_cpu_actor = actors_no_gpus[0]
    first_cpu_actor.long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
def long_running_task(driver_index, task_index, redis_address):
    """Announce where this task is running, then block forever.

    Broadcasts an event, named by ``remote_function_event_name``, whose data
    is the ``(node IP address, PID)`` pair of the current process. It then
    sleeps in an endless loop, so the function never returns.

    Args:
        driver_index: Index of the driver that submitted the task.
        task_index: Index of this task within that driver.
        redis_address: Address of the Redis server used for the broadcast.
    """
    event_name = remote_function_event_name(driver_index, task_index)
    process_location = (ray.services.get_node_ip_address(), os.getpid())
    _broadcast_event(event_name, redis_address, data=process_location)

    # Loop forever.
    while True:
        time.sleep(100)
def driver(redis_address, driver_index):
    """Run the workload shared by all drivers, then announce completion.

    The driver connects to the cluster, waits for enough earlier drivers to
    finish, creates ``num_gpus_per_driver`` ``Actor1`` actors, exercises
    them, and broadcasts ``DRIVER_<driver_index>_DONE``.

    Args:
        redis_address: Address of the Redis server, passed to ``ray.init``
            and to the event helpers.
        driver_index: Index of this driver; it determines which earlier
            drivers are waited on and the name of the completion event.

    Raises:
        Exception: If an actor cannot be created within the retry timeout.
    """
    ray.init(redis_address=redis_address)

    # Block until every node has joined the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Limit the number of drivers running concurrently by waiting for the
    # drivers far enough ahead of this one to finish. The range is empty
    # when the count is zero or negative.
    num_drivers_to_wait_for = driver_index - max_concurrent_drivers + 1
    for earlier_index in range(num_drivers_to_wait_for):
        _wait_for_event("DRIVER_{}_DONE".format(earlier_index),
                        redis_address)

    def try_to_create_actor(actor_class, timeout=500):
        """Retry actor creation until it succeeds or ``timeout`` elapses."""
        # Creation is allowed to fail while we wait for the monitor to
        # release the resources of the removed drivers.
        began = time.time()
        while time.time() - began < timeout:
            try:
                return actor_class.remote()
            except Exception:
                time.sleep(0.1)
        # Reaching this point means the retry loop timed out.
        raise Exception("Timed out while trying to create actor.")

    # Create some actors that require one GPU.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        pending_checks = [
            handle.check_ids.remote() for handle in actors_one_gpu
        ]
        ray.get(pending_checks)

    done_event = "DRIVER_{}_DONE".format(driver_index)
    _broadcast_event(done_event, redis_address)
def __init__(self, driver_index, actor_index, redis_address):
    """Announce this actor's process and verify its GPU assignment.

    Broadcasts an event, named by ``actor_event_name``, whose data is the
    ``(node IP address, PID)`` pair of the current process. It then asserts
    that ``ray.get_gpu_ids()`` reports exactly two GPU ids.

    Args:
        driver_index: Index of the driver that created the actor.
        actor_index: Index of this actor within that driver.
        redis_address: Address of the Redis server used for the broadcast.
    """
    event_name = actor_event_name(driver_index, actor_index)
    process_location = (ray.services.get_node_ip_address(), os.getpid())
    _broadcast_event(event_name, redis_address, data=process_location)

    # Exactly two GPU ids must be reported for this actor.
    assigned_gpu_ids = ray.get_gpu_ids()
    assert len(assigned_gpu_ids) == 2
def cleanup_driver(redis_address, driver_index):
    """The script for drivers 2 through 6.

    This driver waits for drivers 0 and 1 to finish, then verifies that the
    worker and actor processes those drivers started on this node have
    exited. Driver 2 additionally creates ten ``Actor0`` actors before
    waiting and ten ``Actor1`` actors afterwards, and exercises both groups
    at the end. Finally the driver broadcasts ``DRIVER_<driver_index>_DONE``.

    Args:
        redis_address: Address of the Redis server, passed to ``ray.init``
            and to the event helpers.
        driver_index: Index of this driver. Only index 2 creates actors.

    Raises:
        Exception: If driver 2 cannot create an actor within the retry
            timeout.
    """
    ray.init(redis_address=redis_address)

    # Only one of the cleanup drivers should create more actors.
    creates_actors = driver_index == 2

    if creates_actors:
        # We go ahead and create some actors that don't require any GPUs. We
        # don't need to wait for the other drivers to finish. We call methods
        # on these actors later to make sure they haven't been killed.
        actors_no_gpus = [
            Actor0.remote(driver_index, i, redis_address) for i in range(10)
        ]

    _wait_for_event("DRIVER_0_DONE", redis_address)
    _wait_for_event("DRIVER_1_DONE", redis_address)

    def try_to_create_actor(actor_class, driver_index, actor_index,
                            timeout=20):
        """Retry actor creation until it succeeds or ``timeout`` elapses."""
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote(driver_index, actor_index,
                                           redis_address)
            except Exception:
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    if creates_actors:
        # Create some actors that require one GPU.
        actors_one_gpu = [
            try_to_create_actor(Actor1, driver_index, 10 + 3 + i)
            for i in range(10)
        ]

    # The address of this node does not change between iterations, so look
    # it up once instead of once per event.
    local_ip_address = ray.services.get_node_ip_address()

    def wait_for_removal(event_name):
        """Return 1 if the announced process ran on this node and exited."""
        node_ip_address, pid = _wait_for_event(event_name, redis_address)
        if node_ip_address != local_ip_address:
            # The process lives on another node; a cleanup driver running
            # there is responsible for checking it.
            return 0
        wait_for_pid_to_exit(pid)
        return 1

    removed_workers = 0

    # Make sure that the PIDs for the long-running tasks from driver 0 and
    # driver 1 have been killed.
    for finished_driver in (0, 1):
        for i in range(num_long_running_tasks_per_driver):
            removed_workers += wait_for_removal(
                remote_function_event_name(finished_driver, i))

    # Make sure that the PIDs for the actors from driver 0 and driver 1 have
    # been killed. Ten actor events are awaited for driver 0 and nine for
    # driver 1.
    for finished_driver, num_actors in ((0, 10), (1, 9)):
        for i in range(num_actors):
            removed_workers += wait_for_removal(
                actor_event_name(finished_driver, i))

    print(
        "{} workers/actors were removed on this node.".format(removed_workers))

    # Only one of the cleanup drivers should create and use more actors.
    if creates_actors:
        for _ in range(1000):
            ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
            ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)