def driver_0(redis_address, driver_index):
    """Run the workload for driver 0.

    Connects to the cluster, launches long-running tasks, creates five
    actors that each need one GPU plus five actors that need none,
    repeatedly checks the actors' IDs, and finally announces completion.

    Args:
        redis_address: Address of the Redis server used to join the cluster
            and to broadcast the completion event.
        driver_index: Index of this driver, forwarded to the tasks and actors.
    """
    ray.init(redis_address=redis_address)

    # Block until every expected node has joined the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Kick off long-running tasks. Driver 2 later verifies that the workers
    # executing them have been killed.
    for task_index in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, task_index, redis_address)

    # Actors that each require one GPU.
    gpu_actors = []
    for actor_index in range(5):
        gpu_actors.append(
            Actor1.remote(driver_index, actor_index, redis_address))

    # Actors that do not require any GPUs; their indices continue from 5.
    cpu_actors = []
    for offset in range(5):
        cpu_actors.append(
            Actor0.remote(driver_index, 5 + offset, redis_address))

    # Check the GPU actors first, then the GPU-free actors, on every round.
    for _ in range(1000):
        for group in (gpu_actors, cpu_actors):
            pending = [member.check_ids.remote() for member in group]
            ray.get(pending)

    # Leave a long-running method executing on one actor; this should not
    # affect anything.
    first_cpu_actor = cpu_actors[0]
    first_cpu_actor.long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
def driver_0(redis_address, driver_index):
    """Entry point for driver 0.

    The driver starts some long-running tasks, creates five one-GPU actors
    and five actors without GPUs, exercises them for a while, and then
    signals that it is done.

    Args:
        redis_address: Redis address of the cluster to connect to; also used
            when broadcasting the completion event.
        driver_index: Index identifying this driver; passed through to every
            task and actor it creates.
    """
    ray.init(redis_address=redis_address)

    # Do nothing until all of the nodes are part of the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # These tasks run for a long time. Driver 2 will make sure the workers
    # running them have been killed.
    for n in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, n, redis_address)

    # Five actors that need one GPU each.
    one_gpu_group = []
    for k in range(5):
        one_gpu_group.append(Actor1.remote(driver_index, k, redis_address))

    # Five actors that need no GPUs, numbered after the GPU actors.
    no_gpu_group = []
    for k in range(5):
        no_gpu_group.append(Actor0.remote(driver_index, 5 + k, redis_address))

    rounds = 1000
    for _ in range(rounds):
        gpu_checks = [a.check_ids.remote() for a in one_gpu_group]
        ray.get(gpu_checks)
        no_gpu_checks = [a.check_ids.remote() for a in no_gpu_group]
        ray.get(no_gpu_checks)

    # Start a long-running method on one actor and do not wait for it; this
    # should not affect anything.
    no_gpu_group[0].long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
def driver(redis_address, driver_index):
    """The script run by every driver.

    The driver waits for enough earlier drivers to finish, creates
    ``num_gpus_per_driver`` actors that each use one GPU, checks their IDs
    for a while, and then broadcasts that it is done.

    Args:
        redis_address: Address of the Redis server used to join the cluster
            and to exchange the "done" events.
        driver_index: Index of this driver; decides which earlier drivers it
            waits for and which event it broadcasts on completion.

    Raises:
        Exception: If an actor cannot be created before the timeout expires.
    """
    ray.init(redis_address=redis_address)

    # Wait for all the nodes to join the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Limit the number of drivers running concurrently. For early drivers the
    # range is empty, so they start immediately.
    for i in range(driver_index - max_concurrent_drivers + 1):
        _wait_for_event("DRIVER_{}_DONE".format(i), redis_address)

    def try_to_create_actor(actor_class, timeout=100):
        """Create an actor, retrying every 0.1s until ``timeout`` seconds."""
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote()
            except Exception:
                # The failure is expected while resources are still held, so
                # the exception is deliberately not inspected.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Create some actors that require one GPU.
    actors_one_gpu = []
    for _ in range(num_gpus_per_driver):
        actors_one_gpu.append(try_to_create_actor(Actor1))

    for _ in range(100):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
def driver(redis_address, driver_index):
    """Run the workload shared by all drivers.

    After joining the cluster and waiting its turn, the driver creates
    ``num_gpus_per_driver`` one-GPU actors, checks their IDs repeatedly, and
    then announces that it has finished.

    Args:
        redis_address: Redis address of the cluster; also used for waiting on
            and broadcasting driver events.
        driver_index: Index of this driver among all drivers.
    """
    ray.init(redis_address=redis_address)

    # Hold off until every node is part of the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Throttle concurrency: wait for the earlier drivers whose completion
    # frees a slot for this one.
    num_predecessors = driver_index - max_concurrent_drivers + 1
    for earlier_index in range(num_predecessors):
        done_event = "DRIVER_{}_DONE".format(earlier_index)
        _wait_for_event(done_event, redis_address)

    def try_to_create_actor(actor_class, timeout=500):
        """Keep attempting to create an actor until ``timeout`` seconds pass."""
        # Creation may fail while the monitor is still releasing the
        # resources of removed drivers, so failures are retried.
        began = time.time()
        while time.time() - began < timeout:
            try:
                return actor_class.remote()
            except Exception:
                time.sleep(0.1)
        # The loop ended without a successful creation.
        raise Exception("Timed out while trying to create actor.")

    # One actor per GPU assigned to this driver.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        id_checks = [member.check_ids.remote() for member in actors_one_gpu]
        ray.get(id_checks)

    finished_event = "DRIVER_{}_DONE".format(driver_index)
    _broadcast_event(finished_event, redis_address)