示例#1
0
def test_dashboard_agent_restart(set_agent_failure_env_var,
                                 ray_start_cluster_head, error_pubsub,
                                 log_pubsub):
    """Check that repeated agent start failures do not spam the driver.

    Exactly one DASHBOARD_AGENT_DIED_ERROR message is expected on the error
    channel. Afterwards no agent may be found for the cluster, and no log
    batch other than the autoscaler's may arrive within 5 seconds.
    """
    head_cluster = ray_start_cluster_head
    expected_hint = ("There are 3 possible problems "
                     "if you see this error.")

    # NOTE(review): the agent crash is presumably provoked by the
    # set_agent_failure_env_var fixture -- confirm against its definition.
    died_errors = get_error_message(
        error_pubsub,
        1,
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        timeout=10)
    assert len(died_errors) == 1
    for died_error in died_errors:
        assert expected_hint in died_error.error_message

    # The agent process must not be started again.
    wait_for_condition(lambda: search_agents(head_cluster) is None)

    # Anything that is not an autoscaler batch counts as spam here.
    def not_from_autoscaler(log_batch):
        return log_batch["pid"] != "autoscaler"

    # Listen for 5 seconds; nothing matching may show up.
    match = get_log_batch(
        log_pubsub, 1, timeout=5, matcher=not_from_autoscaler)
    assert len(match) == 0, \
        "There are spammy logs during Ray agent restart process. "\
        f"Logs: {match}"
示例#2
0
def test_gcs_server_failiure_report(ray_start_regular, log_pubsub):
    """Kill the GCS server and check that its failure shows up in the logs.

    The GCS server process is terminated with a signal, then the test waits
    for the first log batch. The process must have exited; on non-Windows
    platforms the single received batch must come from "gcs_server".
    """
    # Imported locally so the test does not rely on a module-level import.
    import sys

    # Get gcs server pid to send a signal.
    all_processes = ray.worker._global_node.all_processes
    gcs_server_process = all_processes["gcs_server"][0].process
    gcs_server_pid = gcs_server_process.pid

    # signal.SIGBUS is only defined on Unix; referencing it on Windows
    # raises AttributeError. There, any other signal number makes os.kill
    # terminate the process unconditionally, so a plain 9 is used instead.
    if sys.platform == "win32":
        sig = 9
    else:
        sig = signal.SIGBUS
    os.kill(gcs_server_pid, sig)

    # Wait up to 30 seconds for the 1st batch of logs.
    batches = get_log_batch(log_pubsub, 1, timeout=30)
    # The server must really be gone, otherwise the log check is meaningless.
    assert gcs_server_process.poll() is not None
    if sys.platform != "win32":
        # Windows signal handler does not run when process is terminated,
        # so the log batch is only expected on other platforms.
        assert len(batches) == 1
        assert batches[0]["pid"] == "gcs_server", batches
def test_runtime_env_logging_to_driver(ray_start_regular_shared, log_pubsub):
    """Check that a log batch with pid "runtime_env" reaches the log channel.

    A remote task with a pip runtime environment is run to completion, then
    the test waits up to 5 seconds for a matching log batch.
    """
    pip_requirement = f"requests=={REQUEST_VERSIONS[0]}"

    @ray.remote(runtime_env={"pip": [pip_requirement]})
    def func():
        pass

    ray.get(func.remote())

    # Only batches published under the "runtime_env" pid are of interest.
    match = get_log_batch(
        log_pubsub,
        1,
        timeout=5,
        matcher=lambda log_batch: log_batch["pid"] == "runtime_env")
    assert len(match) > 0
示例#4
0
def test_raylet_node_manager_server_failure(ray_start_cluster_head,
                                            log_pubsub):
    """Check that a raylet grpc server start failure is logged.

    A node is added with its node manager port set to the port taken from
    the cluster address, which is expected to raise. The test then waits up
    to 10 seconds for a raylet log batch containing the failure message.
    """
    head_cluster = ray_start_cluster_head
    address_parts = head_cluster.address.split(":")
    occupied_port = int(address_parts[1])

    # Reuse redis port to make node manager grpc server fail to start.
    with pytest.raises(Exception):
        head_cluster.add_node(wait=False, node_manager_port=occupied_port)

    def reports_grpc_failure(log_batch):
        """Return True for a raylet batch that mentions the grpc failure."""
        if log_batch["pid"] != "raylet":
            return False
        for line in log_batch["lines"]:
            if "Failed to start the grpc server." in line:
                return True
        return False

    # Wait for at most 10 seconds.
    match = get_log_batch(
        log_pubsub, 1, timeout=10, matcher=reports_grpc_failure)
    assert len(match) > 0
示例#5
0
def test_metrics_override_shouldnt_warn(ray_start_regular, log_pubsub):
    """Check that registering the same Counter twice logs no warning.

    A remote task creates two Counters with the same name and increments
    both. No log batch containing "Attempt to register measure" may arrive
    within 5 seconds.
    """
    # https://github.com/ray-project/ray/issues/12859

    @ray.remote
    def override():
        first = Counter("num_count", description="")
        second = Counter("num_count", description="")
        first.inc(1)
        second.inc(1)

    ray.get(override.remote())

    def mentions_register_warning(log_batch):
        """Return True when any line of the batch carries the warning."""
        for line in log_batch["lines"]:
            if "Attempt to register measure" in line:
                return True
        return False

    match = get_log_batch(
        log_pubsub, 1, timeout=5, matcher=mentions_register_warning)
    assert len(match) == 0, match
示例#6
0
def test_gcs_server_failiure_report(ray_start_regular, log_pubsub):
    """Kill the GCS server and check that its failure shows up in the logs.

    The process must have exited after the wait for the first log batch; on
    non-Windows platforms the single received batch must come from
    "gcs_server".
    """
    on_windows = sys.platform == "win32"

    # Look up the gcs server process so a signal can be sent to its pid.
    gcs_server_process = (
        ray.worker._global_node.all_processes["gcs_server"][0].process)
    gcs_server_pid = gcs_server_process.pid

    # TODO(mwtian): make sure logs are delivered after GCS is restarted.
    # signal.SIGBUS is only evaluated off Windows; Windows gets a plain 9.
    sig = 9 if on_windows else signal.SIGBUS
    os.kill(gcs_server_pid, sig)

    # Wait up to 30 seconds for the 1st batch of logs.
    batches = get_log_batch(log_pubsub, 1, timeout=30)
    assert gcs_server_process.poll() is not None
    if not on_windows:
        # Windows signal handler does not run when process is terminated
        assert len(batches) == 1
        assert batches[0]["pid"] == "gcs_server", batches
示例#7
0
    def submit_job():
        """Run one remote task as a driver and check its log batch has a pid.

        Connects to the cluster from the enclosing scope, subscribes to the
        log channel, runs remote function ``f`` and asserts that the batch
        whose task name is "f" carries a non-None pid.
        """
        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address, ignore_reinit_error=True)
        subscriber = init_log_pubsub()
        # It always prints the monitor messages; read one and discard it.
        get_log_message(subscriber, 1)

        @ray.remote
        def f():
            print("remote func")

        ray.get(f.remote())

        # Select the batch produced by the remote function above.
        task_logs = get_log_batch(
            subscriber,
            1,
            matcher=lambda log_batch: log_batch["task_name"] == "f")
        # The batch should carry the hex job id as pid instead of None.
        assert task_logs[0]["pid"] is not None
        ray.shutdown()