Exemplo n.º 1
0
def test_ignore_windows_access_violation(ray_start_regular_shared):
    """Expect only "done" to be received when a task prints the Windows
    access-violation line.

    One remote task prints "Windows fatal exception: access violation" and a
    second task, which takes the first task's result as its argument, prints
    "done". The test asserts that exactly one message batch is received for
    the current job and that its first line is "done".
    """

    @ray.remote
    def print_msg():
        print("Windows fatal exception: access violation\n")

    @ray.remote
    def print_after(_obj):
        print("done")

    subscriber = init_log_pubsub()
    # print_after receives print_msg's result as its argument.
    print_after.remote(print_msg.remote())

    # Only look at messages that belong to the current job.
    current_job_id = ray.get_runtime_context().job_id.hex()
    received = get_log_message(
        subscriber, num=3, timeout=1, job_id=current_job_id
    )

    assert len(received) == 1, received
    first_batch = received[0]
    assert first_batch[0] == "done"
Exemplo n.º 2
0
    def submit_job():
        """Connect a driver, run one remote task and check its log batch has a pid.

        The driver is disconnected with ``ray.shutdown()`` on every exit path,
        including a failing task or assertion, so a failure does not leave a
        connected driver behind.
        """
        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address, ignore_reinit_error=True)
        try:
            p = init_log_pubsub()
            # It always prints the monitor messages; read one message first
            # and discard it.
            get_log_message(p, 1)

            @ray.remote
            def f():
                print("remote func")

            ray.get(f.remote())

            def matcher(log_batch):
                # Keep only the batch whose task name is the remote function "f".
                return log_batch["task_name"] == "f"

            logs = get_log_batch(p, 1, matcher=matcher)
            # It should log with the pid of the hex job id instead of None.
            assert logs[0]["pid"] is not None
        finally:
            # Always disconnect, even when the checks above raise.
            ray.shutdown()
Exemplo n.º 3
0
def test_log_monitor_backpressure(ray_start_cluster, monkeypatch):
    """Check that log messages still arrive under log monitor backpressure.

    ``LOG_NAME_UPDATE_INTERVAL_S`` is set to ``update_interval`` and
    ``RAY_LOG_MONITOR_MANY_FILES_THRESHOLD`` is set to "1" before the node is
    added. Two rounds are run; each creates a new actor that prints
    ``expected_str`` and then asserts that the line is received and that at
    least ``update_interval`` seconds passed since the round started.
    """
    update_interval = 3
    monkeypatch.setenv("LOG_NAME_UPDATE_INTERVAL_S", str(update_interval))
    # Intentionally set low to trigger the backpressure condition.
    monkeypatch.setenv("RAY_LOG_MONITOR_MANY_FILES_THRESHOLD", "1")
    expected_str = "abcxyz"

    def matcher(line):
        return line == expected_str

    # Test log monitor still works with backpressure.
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address)
    p = init_log_pubsub()

    @ray.remote
    class Actor:
        def print(self):
            print(expected_str)

    # Run the identical check twice, each time with a newly created actor.
    for _ in range(2):
        started_at = datetime.now()
        a = Actor.remote()
        ray.get(a.print.remote())
        logs = get_log_message(p, 1, matcher=matcher)
        assert logs[0][0] == expected_str
        # Since the log file update is delayed,
        # it should take more than update_interval
        # to publish a message for a new worker.
        # total_seconds() is used instead of .seconds: .seconds ignores the
        # days component, so a negative delta would look like a large
        # positive one and pass the check by accident.
        elapsed = datetime.now() - started_at
        assert elapsed.total_seconds() >= update_interval
Exemplo n.º 4
0
def test_log_monitor_backpressure(ray_start_cluster):
    """Check that log messages still arrive under log monitor backpressure.

    ``LOG_NAME_UPDATE_INTERVAL_S`` and ``RAY_LOG_MONITOR_MANY_FILES_THRESHOLD``
    are set in ``os.environ`` before the node is added and are restored to
    their previous state when the test ends, so the settings do not leak into
    other tests running in the same process. Two rounds are run; each creates
    a new actor that prints ``expected_str`` and then asserts that the message
    is received and that at least ``update_interval`` seconds passed since the
    round started.
    """
    update_interval = 3
    env_overrides = {
        "LOG_NAME_UPDATE_INTERVAL_S": str(update_interval),
        # Intentionally set low to trigger the backpressure condition.
        "RAY_LOG_MONITOR_MANY_FILES_THRESHOLD": "1",
    }
    # Remember what was there before (None means "not set") for the restore.
    previous_env = {name: os.environ.get(name) for name in env_overrides}
    os.environ.update(env_overrides)
    try:
        expected_str = "abc"

        # Test log monitor still works with backpressure.
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=4)
        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address)
        p = init_log_pubsub()
        # It always prints the monitor messages; read one message first and
        # discard it.
        get_log_message(p, 1)

        @ray.remote
        class Actor:
            def print(self):
                print(expected_str)

        # Run the identical check twice, each time with a newly created actor.
        for _ in range(2):
            started_at = datetime.now()
            a = Actor.remote()
            a.print.remote()
            logs = get_log_message(p, 1)
            assert logs[0] == expected_str
            # Since the log file update is delayed,
            # it should take more than update_interval
            # to publish a message for a new worker.
            # total_seconds() is used instead of .seconds: .seconds ignores
            # the days component, so a negative delta would look like a large
            # positive one and pass the check by accident.
            elapsed = datetime.now() - started_at
            assert elapsed.total_seconds() >= update_interval
    finally:
        # Put the process environment back the way it was found.
        for name, old_value in previous_env.items():
            if old_value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value
Exemplo n.º 5
0
def log_pubsub():
    """Yield the object returned by ``init_log_pubsub()`` and close it afterwards.

    The ``close()`` call sits in a ``finally`` clause so it also runs when the
    generator is closed early or an exception is thrown in at the ``yield``.
    """
    p = init_log_pubsub()
    try:
        yield p
    finally:
        p.close()