예제 #1
0
def driver_0(redis_address, driver_index):
    """Run the workload of driver 0.

    The driver connects to the cluster, launches the long-running tasks,
    creates five ``Actor1`` actors and five ``Actor0`` actors (described in
    the comments below as one-GPU and no-GPU actors respectively), calls
    ``check_ids`` on all of them 1000 times, starts a long-running method
    on one actor and finally broadcasts ``DRIVER_0_DONE`` before returning.
    """
    ray.init(redis_address=redis_address)

    # Block until every node has joined the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Launch the long-running tasks without fetching their results. Driver 2
    # is expected to verify that the workers running them were killed.
    for task_index in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, task_index, redis_address)

    # Actors that require one GPU: actor indices 0 through 4.
    actors_one_gpu = []
    for actor_index in range(5):
        actors_one_gpu.append(
            Actor1.remote(driver_index, actor_index, redis_address))

    # Actors that don't require any GPUs: actor indices 5 through 9.
    actors_no_gpus = []
    for actor_index in range(5, 10):
        actors_no_gpus.append(
            Actor0.remote(driver_index, actor_index, redis_address))

    # Repeatedly run check_ids on each group, one-GPU actors first.
    for _ in range(1000):
        for group in (actors_one_gpu, actors_no_gpus):
            ray.get([member.check_ids.remote() for member in group])

    # Fire off a long-running actor method and do not wait for it; this
    # should not affect anything.
    first_no_gpu_actor = actors_no_gpus[0]
    first_no_gpu_actor.long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
예제 #2
0
def long_running_task(driver_index, task_index, redis_address):
    """Announce where this task is running, then block forever.

    An event named by ``remote_function_event_name(driver_index,
    task_index)`` is broadcast with the pair ``(node IP address, PID)`` of
    the executing process as its data. The function then sleeps in an
    endless loop and never returns.
    """
    event_name = remote_function_event_name(driver_index, task_index)
    process_location = (ray.services.get_node_ip_address(), os.getpid())
    _broadcast_event(event_name, redis_address, data=process_location)

    # Never return: sleep in 100-second slices indefinitely.
    while True:
        time.sleep(100)
예제 #3
0
def driver_0(redis_address, driver_index):
    """Entry point for driver 0.

    After connecting and waiting for the cluster to assemble, this driver
    starts the long-running tasks, instantiates five ``Actor1`` and five
    ``Actor0`` actors (one-GPU and no-GPU actors according to the comments
    below), exercises ``check_ids`` on every actor 1000 times, starts one
    long-running actor method and announces ``DRIVER_0_DONE``.
    """
    ray.init(redis_address=redis_address)

    # Every node must be part of the cluster before the work begins.
    _wait_for_nodes_to_join(total_num_nodes)

    # Submit the long-running tasks; their results are never retrieved.
    # Driver 2 is expected to check that their workers have been killed.
    for task_number in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, task_number, redis_address)

    # Actors that require one GPU (actor indices 0-4).
    gpu_actors = [
        Actor1.remote(driver_index, n, redis_address) for n in range(5)
    ]
    # Actors that don't require any GPUs (actor indices 5-9).
    no_gpu_actors = [
        Actor0.remote(driver_index, n, redis_address) for n in range(5, 10)
    ]

    def check_ids_on(group):
        # Invoke check_ids on every actor in the group and wait for all.
        ray.get([member.check_ids.remote() for member in group])

    for _ in range(1000):
        check_ids_on(gpu_actors)
        check_ids_on(no_gpu_actors)

    # Kick off a long-running method on one actor without waiting for it;
    # this should not affect anything.
    no_gpu_actors[0].long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
예제 #4
0
def long_running_task(driver_index, task_index, redis_address):
    """Publish this task's host and PID, then sleep indefinitely.

    The broadcast event is named via
    ``remote_function_event_name(driver_index, task_index)`` and carries
    ``(node IP address, process ID)`` as its data. The function does not
    return.
    """
    name = remote_function_event_name(driver_index, task_index)
    host_and_pid = (ray.services.get_node_ip_address(), os.getpid())
    _broadcast_event(name, redis_address, data=host_and_pid)

    # Loop forever, waking up every 100 seconds only to sleep again.
    while True:
        time.sleep(100)
예제 #5
0
def driver(redis_address, driver_index):
    """The script run by each driver.

    The driver waits for the cluster to assemble and for enough earlier
    drivers to finish, creates ``num_gpus_per_driver`` ``Actor1`` actors
    (retrying while creation fails), calls ``check_ids`` on all of them 100
    times and then broadcasts ``DRIVER_<driver_index>_DONE``.

    Args:
        redis_address: Address passed to ``ray.init`` and to the event
            helpers.
        driver_index: Index of this driver; it determines which earlier
            drivers are waited for and the name of the completion event.

    Raises:
        Exception: If an actor could not be created within the timeout.
    """
    ray.init(redis_address=redis_address)

    # Wait for all the nodes to join the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Limit the number of drivers running concurrently: wait for drivers
    # 0 .. driver_index - max_concurrent_drivers to finish. The range is
    # empty for the first max_concurrent_drivers drivers.
    for i in range(driver_index - max_concurrent_drivers + 1):
        _wait_for_event("DRIVER_{}_DONE".format(i), redis_address)

    def try_to_create_actor(actor_class, timeout=100):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote()
            except Exception:
                # Creation failed; back off briefly and retry.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Create some actors that require one GPU.
    actors_one_gpu = []
    for _ in range(num_gpus_per_driver):
        actors_one_gpu.append(try_to_create_actor(Actor1))

    for _ in range(100):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
예제 #6
0
def driver(redis_address, driver_index):
    """Script shared by all drivers.

    Once the cluster is complete and enough earlier drivers have finished,
    the driver creates ``num_gpus_per_driver`` ``Actor1`` actors, runs
    ``check_ids`` on them 100 times and announces its own completion by
    broadcasting ``DRIVER_<driver_index>_DONE``.
    """
    ray.init(redis_address=redis_address)

    # All nodes must have joined before proceeding.
    _wait_for_nodes_to_join(total_num_nodes)

    # Throttle concurrency by waiting for the completion events of the
    # earlier drivers; the range is empty for the first few drivers.
    num_drivers_to_await = driver_index - max_concurrent_drivers + 1
    for earlier_index in range(num_drivers_to_await):
        _wait_for_event("DRIVER_{}_DONE".format(earlier_index),
                        redis_address)

    def try_to_create_actor(actor_class, timeout=500):
        # Keep retrying actor creation, tolerating failures while the
        # monitor releases the resources of the removed drivers.
        began_at = time.time()
        while time.time() - began_at < timeout:
            try:
                return actor_class.remote()
            except Exception:
                time.sleep(0.1)
        # Leaving the loop means the timeout elapsed without success.
        raise Exception("Timed out while trying to create actor.")

    # Create the actors that require one GPU.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        pending = [member.check_ids.remote() for member in actors_one_gpu]
        ray.get(pending)

    done_event = "DRIVER_{}_DONE".format(driver_index)
    _broadcast_event(done_event, redis_address)
예제 #7
0
 def __init__(self, driver_index, actor_index, redis_address):
     """Broadcast where this actor runs and check its GPU assignment.

     The event name comes from ``actor_event_name(driver_index,
     actor_index)`` and the data is ``(node IP address, PID)``. The
     constructor then asserts that exactly two GPU IDs are reported by
     ``ray.get_gpu_ids()``.
     """
     event_name = actor_event_name(driver_index, actor_index)
     process_location = (ray.services.get_node_ip_address(), os.getpid())
     _broadcast_event(event_name, redis_address, data=process_location)

     assigned_gpu_ids = ray.get_gpu_ids()
     assert len(assigned_gpu_ids) == 2
예제 #8
0
def cleanup_driver(redis_address, driver_index):
    """The script for drivers 2 through 6.

    This driver should wait for the first two drivers to finish. Then, if it
    is driver 2, it creates three ``Actor2`` and four ``Actor1`` actors (a
    total of ten GPUs per the original description). Every cleanup driver
    then waits for the local worker and actor processes of drivers 0 and 1
    to exit, and finally broadcasts ``DRIVER_<driver_index>_DONE``.

    Args:
        redis_address: Address passed to ``ray.init`` and to the event
            helpers.
        driver_index: Index of this driver; only index 2 creates and uses
            additional actors.

    Raises:
        Exception: If an actor could not be created within the timeout.
    """
    ray.init(redis_address=redis_address)

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # We go ahead and create some actors that don't require any GPUs. We
        # don't need to wait for the other drivers to finish. We call methods
        # on these actors later to make sure they haven't been killed.
        actors_no_gpus = [Actor0.remote(driver_index, i, redis_address)
                          for i in range(10)]

    _wait_for_event("DRIVER_0_DONE", redis_address)
    _wait_for_event("DRIVER_1_DONE", redis_address)

    def try_to_create_actor(actor_class, driver_index, actor_index,
                            timeout=20):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote(driver_index, actor_index,
                                           redis_address)
            except Exception:
                # Creation failed; back off briefly and retry.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    def count_local_exits(event_name_fn, source_driver, num_events):
        # Wait for each of the source driver's events in index order. When
        # an event was reported from this node, also wait for the reported
        # PID to exit. Returns how many local processes were waited for.
        exited = 0
        for i in range(num_events):
            node_ip_address, pid = _wait_for_event(
                event_name_fn(source_driver, i), redis_address)
            if node_ip_address == ray.services.get_node_ip_address():
                wait_for_pid_to_exit(pid)
                exited += 1
        return exited

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # Create some actors that require two GPUs.
        actors_two_gpus = []
        for i in range(3):
            actors_two_gpus.append(try_to_create_actor(Actor2, driver_index,
                                                       10 + i))
        # Create some actors that require one GPU.
        actors_one_gpu = []
        for i in range(4):
            actors_one_gpu.append(try_to_create_actor(Actor1, driver_index,
                                                      10 + 3 + i))

    removed_workers = 0

    # Make sure that the PIDs for the long-running tasks from driver 0 and
    # driver 1 have been killed.
    removed_workers += count_local_exits(
        remote_function_event_name, 0, num_long_running_tasks_per_driver)
    removed_workers += count_local_exits(
        remote_function_event_name, 1, num_long_running_tasks_per_driver)
    # Make sure that the PIDs for the actors from driver 0 and driver 1 have
    # been killed.
    # NOTE(review): 10 actor events are awaited for driver 0 but only 9 for
    # driver 1 -- confirm this matches the number of actors driver 1 creates.
    removed_workers += count_local_exits(actor_event_name, 0, 10)
    removed_workers += count_local_exits(actor_event_name, 1, 9)

    print("{} workers/actors were removed on this node."
          .format(removed_workers))

    # Only one of the cleanup drivers should create and use more actors.
    if driver_index == 2:
        for _ in range(1000):
            ray.get([actor.check_ids.remote() for actor in actors_two_gpus])
            ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
            ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
예제 #9
0
 def __init__(self, driver_index, actor_index, redis_address):
     """Report this actor's host and PID, then verify its GPU count.

     Broadcasts the event named by ``actor_event_name(driver_index,
     actor_index)`` with ``(node IP address, process ID)`` as data and
     asserts that ``ray.get_gpu_ids()`` returns exactly two IDs.
     """
     name = actor_event_name(driver_index, actor_index)
     host_and_pid = (ray.services.get_node_ip_address(), os.getpid())
     _broadcast_event(name, redis_address, data=host_and_pid)

     gpu_ids = ray.get_gpu_ids()
     assert len(gpu_ids) == 2
예제 #10
0
def cleanup_driver(redis_address, driver_index):
    """The script for drivers 2 through 6.

    This driver should wait for the first two drivers to finish. Then, if it
    is driver 2, it creates ten ``Actor1`` actors (a total of ten GPUs per
    the original description). Every cleanup driver then waits for the local
    worker and actor processes of drivers 0 and 1 to exit, and finally
    broadcasts ``DRIVER_<driver_index>_DONE``.

    Args:
        redis_address: Address passed to ``ray.init`` and to the event
            helpers.
        driver_index: Index of this driver; only index 2 creates and uses
            additional actors.

    Raises:
        Exception: If an actor could not be created within the timeout.
    """
    ray.init(redis_address=redis_address)

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # We go ahead and create some actors that don't require any GPUs. We
        # don't need to wait for the other drivers to finish. We call methods
        # on these actors later to make sure they haven't been killed.
        actors_no_gpus = [
            Actor0.remote(driver_index, i, redis_address) for i in range(10)
        ]

    _wait_for_event("DRIVER_0_DONE", redis_address)
    _wait_for_event("DRIVER_1_DONE", redis_address)

    def try_to_create_actor(actor_class,
                            driver_index,
                            actor_index,
                            timeout=20):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote(driver_index, actor_index,
                                           redis_address)
            except Exception:
                # Creation failed; back off briefly and retry.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    def count_local_exits(event_name_fn, source_driver, num_events):
        # Wait for each of the source driver's events in index order. When
        # an event was reported from this node, also wait for the reported
        # PID to exit. Returns how many local processes were waited for.
        exited = 0
        for i in range(num_events):
            node_ip_address, pid = _wait_for_event(
                event_name_fn(source_driver, i), redis_address)
            if node_ip_address == ray.services.get_node_ip_address():
                wait_for_pid_to_exit(pid)
                exited += 1
        return exited

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # Create some actors that require one GPU.
        # NOTE(review): the actor indices start at 10 + 3 = 13; the reason
        # for the extra offset of 3 is not visible here -- confirm.
        actors_one_gpu = []
        for i in range(10):
            actors_one_gpu.append(
                try_to_create_actor(Actor1, driver_index, 10 + 3 + i))

    removed_workers = 0

    # Make sure that the PIDs for the long-running tasks from driver 0 and
    # driver 1 have been killed.
    removed_workers += count_local_exits(
        remote_function_event_name, 0, num_long_running_tasks_per_driver)
    removed_workers += count_local_exits(
        remote_function_event_name, 1, num_long_running_tasks_per_driver)
    # Make sure that the PIDs for the actors from driver 0 and driver 1 have
    # been killed.
    # NOTE(review): 10 actor events are awaited for driver 0 but only 9 for
    # driver 1 -- confirm this matches the number of actors driver 1 creates.
    removed_workers += count_local_exits(actor_event_name, 0, 10)
    removed_workers += count_local_exits(actor_event_name, 1, 9)

    print(
        "{} workers/actors were removed on this node.".format(removed_workers))

    # Only one of the cleanup drivers should create and use more actors.
    if driver_index == 2:
        for _ in range(1000):
            ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
            ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)