예제 #1
0
def test_cluster():
    """Basic test for adding and removing nodes in cluster."""
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    assert node.remaining_processes_alive()
    assert node2.remaining_processes_alive()
    g.remove_node(node2)
    g.remove_node(node)
    assert not any(n.any_processes_alive() for n in [node, node2])
예제 #2
0
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout":
        10,
        "raylet_heartbeat_timeout_milliseconds":
        100,
        "lineage_pinning_enabled":
        1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds":
        -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return x

    obj = ray.put(np.zeros(10**7, dtype=np.uint8))
    result = dependent_task.options(resources={"node1": 1}).remote(obj)
    ray.get(result)
    del obj

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)

    for _ in range(20):
        ray.put(np.zeros(10**7, dtype=np.uint8))

    if reconstruction_enabled:
        ray.get(result)
    else:
        with pytest.raises(ray.exceptions.UnreconstructableError):
            ray.get(result)
예제 #3
0
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout":
        10,
        "raylet_heartbeat_timeout_milliseconds":
        100,
        "lineage_pinning_enabled":
        1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds":
        -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0,
                     _internal_config=config,
                     object_store_memory=10**8)
    node_to_kill = cluster.add_node(num_cpus=1,
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return x

    obj = large_object.remote()
    for _ in range(20):
        obj = chain.remote(obj)
    ray.get(dependent_task.remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     object_store_memory=10**8,
                     _internal_config=config)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()
예제 #4
0
def test_cached_object(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8)
    assert wait_for_condition(lambda: not all(node["Alive"]
                                              for node in ray.nodes()),
                              timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
예제 #5
0
def test_connect_with_disconnected_node(shutdown_only):
    config = json.dumps({
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    info = relevant_errors(ray_constants.REMOVED_NODE_ERROR)
    assert len(info) == 0
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(dead_node, allow_graceful=False)
    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 1)
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(dead_node, allow_graceful=False)
    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2)
    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(removing_node, allow_graceful=True)
    with pytest.raises(RayTestTimeoutException):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)
    # There is no connection error to a dead node.
    info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)
    assert len(info) == 0
예제 #6
0
def test_connect_with_disconnected_node(shutdown_only):
    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_timeout_milliseconds": 10,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    p = init_error_pubsub()
    errors = get_error_message(p, 1, timeout=5)
    assert len(errors) == 0
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    # There is no connection error to a dead node.
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    p.close()
예제 #7
0
def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    cluster.add_node(
        num_cpus=0,
        object_store_memory=75 * 1024 * 1024,
        _system_config={"object_spilling_config": object_spilling_config},
    )
    ray.init(address=cluster.address)
    node2 = cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # This task will run on node 2 because node 1 has no CPU resource
    @ray.remote(num_cpus=1)
    def run_workload():
        ids = []
        for _ in range(2):
            arr = np.random.rand(5 * 1024 * 1024)  # 40 MB
            ids.append(ray.put(arr))
        return ids

    ids = ray.get(run_workload.remote())
    assert not is_dir_empty(temp_folder)

    # Kill node 2
    cluster.remove_node(node2)

    # Verify that the spill folder is not empty
    assert not is_dir_empty(temp_folder)

    # Start a new node
    cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # Verify that the spill folder is now cleaned up
    assert is_dir_empty(temp_folder)

    # We hold the object refs to prevent them from being deleted
    del ids
    ray.shutdown()
    cluster.shutdown()
예제 #8
0
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    client = serve.start(http_port=8005)  # noqa: F841

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexErrors covers when cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
예제 #9
0
iteration = 0
previous_ids = [1 for _ in range(100)]
start_time = time.time()
previous_time = start_time
while True:
    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]

    ray.get(previous_ids)

    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]
    node_to_kill = get_other_nodes(cluster, exclude_head=True)[0]

    # Remove the first non-head node.
    cluster.remove_node(node_to_kill)
    cluster.add_node()

    new_time = time.time()
    print("Iteration {}:\n"
          "  - Iteration time: {}.\n"
          "  - Absolute time: {}.\n"
          "  - Total elapsed time: {}.".format(iteration,
                                               new_time - previous_time,
                                               new_time,
                                               new_time - start_time))
    update_progress({
        "iteration": iteration,
        "iteration_time": new_time - previous_time,
        "absolute_time": new_time,
        "elapsed_time": new_time - start_time,
예제 #10
0
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout":
        10,
        "raylet_heartbeat_timeout_milliseconds":
        100,
        "lineage_pinning_enabled":
        1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds":
        -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    assert wait_for_condition(lambda: not all(node["Alive"]
                                              for node in ray.nodes()),
                              timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()
예제 #11
0
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.init(http_port=8005)

    def actor_name(index):
        return SERVE_PROXY_NAME + "-{}-{}".format(node_ids[0], index)

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(actor_name(0))
            ray.get_actor(actor_name(1))
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(actor_name(0)), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    def get_third_actor():
        try:
            ray.get_actor(actor_name(2))
            return True
        except ValueError:
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(actor_name(2))
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()