Exemplo n.º 1
0
def driver_0(redis_address, driver_index):
    """Run the workload of driver 0.

    The driver connects to the cluster, launches long running tasks, creates
    five ``Actor1`` actors (one GPU each) and five ``Actor0`` actors (no GPU),
    exercises ``check_ids`` on all of them and then announces that it is done.

    Args:
        redis_address: Address handed to ``ray.init`` and forwarded to the
            tasks, the actors and the event broadcast.
        driver_index: Index of this driver, forwarded to the tasks and actors.
    """
    ray.init(redis_address=redis_address)

    # Do nothing until the whole cluster is up.
    _wait_for_nodes_to_join(total_num_nodes)

    # Launch the long running tasks. Driver 2 later verifies that the workers
    # executing them were killed.
    for task_index in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, task_index, redis_address)

    num_actors = 5

    # Actors that each need one GPU.
    gpu_actors = []
    for actor_index in range(num_actors):
        gpu_actors.append(
            Actor1.remote(driver_index, actor_index, redis_address))

    # Actors that need no GPU; their indices continue after the GPU actors.
    cpu_actors = []
    for actor_index in range(num_actors):
        cpu_actors.append(
            Actor0.remote(driver_index, num_actors + actor_index,
                          redis_address))

    # Each round finishes the GPU actors' checks before submitting the checks
    # for the GPU-less actors.
    for _ in range(1000):
        for group in (gpu_actors, cpu_actors):
            ray.get([member.check_ids.remote() for member in group])

    # Leave a long-running method in flight on one actor; this must not
    # affect anything.
    first_cpu_actor = cpu_actors[0]
    first_cpu_actor.long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
Exemplo n.º 2
0
def driver_0(redis_address, driver_index):
    """Entry point executed by driver 0.

    Connects to the cluster, starts long running tasks, creates five actors
    that each use one GPU plus five actors that use no GPU, runs ``check_ids``
    on them many times and finally broadcasts ``DRIVER_0_DONE``.

    Args:
        redis_address: Address passed to ``ray.init``, to every task and actor
            and to the event broadcast.
        driver_index: Index of this driver, passed to every task and actor.
    """
    actors_per_kind = 5

    def run_check_round(actors):
        # Submit check_ids on every actor and block until all have returned.
        pending = [handle.check_ids.remote() for handle in actors]
        ray.get(pending)

    ray.init(redis_address=redis_address)

    # The test only starts once every node is part of the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Fire off the long running tasks. Driver 2 will make sure the workers
    # running them have been killed.
    for n in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, n, redis_address)

    # Five actors requiring one GPU each, numbered 0..4.
    one_gpu = []
    for n in range(actors_per_kind):
        one_gpu.append(Actor1.remote(driver_index, n, redis_address))

    # Five actors requiring no GPU, numbered 5..9.
    no_gpu = []
    for n in range(actors_per_kind):
        no_gpu.append(
            Actor0.remote(driver_index, actors_per_kind + n, redis_address))

    for _ in range(1000):
        run_check_round(one_gpu)
        run_check_round(no_gpu)

    # Start a long-running method on one actor without waiting for it; it
    # should not affect anything.
    no_gpu[0].long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
Exemplo n.º 3
0
def driver(redis_address, driver_index):
    """Run the workload for the driver with index ``driver_index``.

    The driver connects to the cluster, waits for enough earlier drivers to
    finish, creates ``num_gpus_per_driver`` ``Actor1`` actors (retrying while
    creation fails), calls ``check_ids`` on them repeatedly and finally
    broadcasts its own ``DRIVER_<driver_index>_DONE`` event.

    Args:
        redis_address: Address passed to ``ray.init`` and to the event
            helpers.
        driver_index: Index of this driver. It decides which earlier drivers
            are waited for and names the event broadcast at the end.

    Raises:
        Exception: If an actor could not be created before the retry timeout
            expired.
    """
    ray.init(redis_address=redis_address)

    # Wait for all the nodes to join the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Limit the number of drivers running concurrently: block until every
    # driver with index below driver_index - max_concurrent_drivers + 1 has
    # broadcast its DONE event.
    for i in range(driver_index - max_concurrent_drivers + 1):
        _wait_for_event("DRIVER_{}_DONE".format(i), redis_address)

    def try_to_create_actor(actor_class, timeout=100):
        """Create an actor, retrying on failure for up to ``timeout`` seconds.

        Failures are tolerated while we wait for the monitor to release the
        resources of the removed drivers.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote()
            except Exception:
                # Deliberately broad: any creation failure is retried after a
                # short pause until the timeout is reached.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Create the actors that require one GPU each.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
Exemplo n.º 4
0
def driver(redis_address, driver_index):
    """Script shared by every driver.

    Each driver joins the cluster, waits its turn so that only a limited
    number of drivers run at once, creates ``num_gpus_per_driver`` actors that
    each use one GPU, checks them repeatedly and then signals completion.

    Args:
        redis_address: Address passed to ``ray.init`` and to the event
            helpers.
        driver_index: Index of this driver; used to pick the drivers to wait
            for and to name the completion event.
    """
    ray.init(redis_address=redis_address)

    # Hold off until the cluster is complete.
    _wait_for_nodes_to_join(total_num_nodes)

    # Cap the number of concurrently running drivers by waiting for the
    # earliest ones to report that they are done.
    num_to_wait_for = driver_index - max_concurrent_drivers + 1
    for earlier_index in range(num_to_wait_for):
        done_event = "DRIVER_{}_DONE".format(earlier_index)
        _wait_for_event(done_event, redis_address)

    def try_to_create_actor(actor_class, timeout=500):
        # Keep retrying actor creation: it may fail while the monitor is
        # still releasing the resources of the removed drivers.
        began = time.time()
        while True:
            if time.time() - began >= timeout:
                # The retry window is exhausted.
                raise Exception("Timed out while trying to create actor.")
            try:
                return actor_class.remote()
            except Exception:
                time.sleep(0.1)

    # One actor per GPU assigned to this driver.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        pending_checks = [a.check_ids.remote() for a in actors_one_gpu]
        ray.get(pending_checks)

    own_event = "DRIVER_{}_DONE".format(driver_index)
    _broadcast_event(own_event, redis_address)