示例#1
0
def assert_no_system_failure(p, timeout):
    """Fail if any collected log entry reports a signal or a check failure.

    Args:
        p: Handle passed through unchanged to ``get_log_message``.
        timeout: Timeout forwarded to ``get_log_message`` (presumably
            seconds -- confirm against that helper).

    Raises:
        AssertionError: If a log entry contains ``"SIG"`` or
            ``"Check failed"``.
    """
    # Collect logs for as long as the caller-supplied timeout allows.
    logs = get_log_message(p, timeout=timeout)
    for log in logs:
        # The bare "SIG" substring matches any signal name
        # (SIGSEGV, SIGABRT, ...), not just one specific signal.
        assert "SIG" not in log, "There's the segfault or SIGABRT reported."
        assert "Check failed" not in log, (
            "There's the check failure reported.")
示例#2
0
def test_log_monitor_backpressure(ray_start_cluster, monkeypatch):
    """Check that worker output is still published under backpressure.

    The many-files threshold is forced down to 1 so the backpressure
    condition is triggered, and the log-name update interval is set to
    ``update_interval`` seconds. Two actors are started one after the other;
    for each, the printed line must arrive and must take at least
    ``update_interval`` seconds to do so.

    Args:
        ray_start_cluster: Cluster fixture; a 4-CPU node is added to it.
        monkeypatch: pytest fixture used to set environment variables for
            the duration of the test only.
    """
    update_interval = 3
    monkeypatch.setenv("LOG_NAME_UPDATE_INTERVAL_S", str(update_interval))
    # Intentionally set low to trigger the backpressure condition.
    monkeypatch.setenv("RAY_LOG_MONITOR_MANY_FILES_THRESHOLD", "1")
    expected_str = "abcxyz"

    def matcher(line):
        # Only the actor's own output is of interest.
        return line == expected_str

    # Test log monitor still works with backpressure.
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address)
    p = init_log_pubsub()

    @ray.remote
    class Actor:
        def print(self):
            print(expected_str)

    # Run the scenario twice: each round creates a fresh actor, so each
    # round has to wait for the delayed log file update again.
    for attempt in range(2):
        now = datetime.now()
        a = Actor.remote()
        ray.get(a.print.remote())
        logs = get_log_message(p, 1, matcher=matcher)
        assert logs[0][0] == expected_str, (attempt, logs)
        # Since the log file update is delayed, it should take more than
        # update_interval to publish a message for a new worker.
        # total_seconds() is used because timedelta.seconds drops the
        # days component of the difference.
        elapsed = (datetime.now() - now).total_seconds()
        assert elapsed >= update_interval, (attempt, elapsed)
示例#3
0
def test_log_monitor_backpressure(ray_start_cluster):
    """Check that worker output is still published under backpressure.

    The many-files threshold is forced down to 1 so the backpressure
    condition is triggered, and the log-name update interval is set to
    ``update_interval`` seconds. Two actors are started one after the other;
    for each, the next log message must be the printed line and must take
    at least ``update_interval`` seconds to arrive.

    The environment variables are restored on exit so the overrides do not
    leak into tests that run later in the same process.

    Args:
        ray_start_cluster: Cluster fixture; a 4-CPU node is added to it.
    """
    update_interval = 3
    env_overrides = {
        "LOG_NAME_UPDATE_INTERVAL_S": str(update_interval),
        # Intentionally set low to trigger the backpressure condition.
        "RAY_LOG_MONITOR_MANY_FILES_THRESHOLD": "1",
    }
    # Remember the previous values (None means "was not set").
    saved_env = {name: os.environ.get(name) for name in env_overrides}
    os.environ.update(env_overrides)
    expected_str = "abc"

    try:
        # Test log monitor still works with backpressure.
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=4)
        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address)
        p = init_log_pubsub()
        # Consume one message up front: per the original note, the monitor
        # messages are always printed first.
        get_log_message(p, 1)

        @ray.remote
        class Actor:
            def print(self):
                print(expected_str)

        # Run the scenario twice: each round creates a fresh actor, so each
        # round has to wait for the delayed log file update again.
        for attempt in range(2):
            now = datetime.now()
            a = Actor.remote()
            a.print.remote()
            logs = get_log_message(p, 1)
            assert logs[0] == expected_str, (attempt, logs)
            # Since the log file update is delayed, it should take more
            # than update_interval to publish a message for a new worker.
            # total_seconds() is used because timedelta.seconds drops the
            # days component of the difference.
            elapsed = (datetime.now() - now).total_seconds()
            assert elapsed >= update_interval, (attempt, elapsed)
    finally:
        # Put the environment back exactly as it was found.
        for name, previous in saved_env.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
示例#4
0
def test_ignore_windows_access_violation(ray_start_regular_shared):
    """Expect only "done" to be received when a task prints the Windows
    access-violation banner before another task prints "done".

    Up to three messages for the current job are requested with a timeout
    of 1; exactly one must come back and its first line must be "done".
    """

    @ray.remote
    def print_msg():
        print("Windows fatal exception: access violation\n")

    @ray.remote
    def print_after(_obj):
        print("done")

    subscriber = init_log_pubsub()
    # print_after receives print_msg's result as its argument, which chains
    # the second task onto the first.
    print_after.remote(print_msg.remote())
    current_job_id = ray.get_runtime_context().job_id.hex()
    received = get_log_message(
        subscriber, num=3, timeout=1, job_id=current_job_id
    )

    assert len(received) == 1, received
    only_batch = received[0]
    assert only_batch[0] == "done"
示例#5
0
    def submit_job():
        """Run one driver session and check its task log batch has a pid.

        Connects to ``cluster`` (taken from the enclosing scope), runs a
        remote function that prints one line, then fetches the log batch
        whose ``task_name`` is ``"f"`` and asserts its ``pid`` is set.
        The driver is always disconnected afterwards, even when an
        assertion or the remote call fails, so a failure here does not
        leave a live connection behind.
        """
        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address, ignore_reinit_error=True)
        try:
            p = init_log_pubsub()
            # Consume one message up front: per the original note, the
            # monitor messages are always printed first.
            get_log_message(p, 1)

            @ray.remote
            def f():
                print("remote func")

            ray.get(f.remote())

            def matcher(log_batch):
                # Select the batch produced by the remote function above.
                return log_batch["task_name"] == "f"

            logs = get_log_batch(p, 1, matcher=matcher)
            # The batch should carry a pid instead of None.
            assert logs[0]["pid"] is not None
        finally:
            ray.shutdown()
示例#6
0
def test_threaded_actor_integration_test_stress(ray_start_cluster_head,
                                                log_pubsub, error_pubsub):
    """Sanity-check that threaded actors work with the nightly stress test.

    Adds four 2-CPU nodes, starts parent actors that each own several child
    actors (all created with ``max_concurrency`` threads), and pings them
    for ten rounds while children and parents randomly exit and get
    replaced. Afterwards it verifies that fresh parents can still be
    scheduled, that the collected logs contain no signal or check-failure
    reports, and that no resource deadlock warning was published.

    Args:
        ray_start_cluster_head: Cluster fixture; only ``add_node`` is used.
        log_pubsub: Handle passed to ``test_utils.get_log_message``.
        error_pubsub: Handle passed to ``test_utils.get_error_message``.
    """
    cluster = ray_start_cluster_head
    p = log_pubsub
    e = error_pubsub

    # Prepare the config
    num_remote_nodes = 4
    num_parents = 6
    num_children = 6
    # NOTE: despite the name, this is used as a survival threshold: an
    # actor exits only when a uniform draw exceeds it (about 5% of draws).
    death_probability = 0.95
    max_concurrency = 10

    for _ in range(num_remote_nodes):
        cluster.add_node(num_cpus=2)

    @ray.remote
    class Child(object):
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def ping(self):
            # Exit the process when the draw exceeds the threshold.
            exit_chance = np.random.rand()
            if exit_chance > self.death_probability:
                sys.exit(-1)

    @ray.remote
    class Parent(object):
        def __init__(self, num_children, death_probability=0.95):
            self.death_probability = death_probability
            self.children = [
                Child.options(
                    max_concurrency=max_concurrency).remote(death_probability)
                for _ in range(num_children)
            ]

        def ping(self, num_pings):
            # Fan out num_pings pings to every child, then wait for all.
            children_outputs = []
            for _ in range(num_pings):
                children_outputs += [
                    child.ping.remote() for child in self.children
                ]
            try:
                ray.get(children_outputs)
            except Exception:
                # Replace the children if one of them died. Re-running
                # __init__ rebuilds the whole child list.
                self.__init__(len(self.children), self.death_probability)

        def kill(self):
            # Clean up children.
            ray.get(
                [child.__ray_terminate__.remote() for child in self.children])

    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]

    start = time.time()
    loop_times = []
    for _ in range(10):
        loop_start = time.time()
        ray.get([parent.ping.remote(10) for parent in parents])

        # Replace a random parent actor when the draw exceeds the threshold.
        exit_chance = np.random.rand()
        if exit_chance > death_probability:
            parent_index = np.random.randint(len(parents))
            # kill() is fired without waiting for its result.
            parents[parent_index].kill.remote()
            parents[parent_index] = Parent.options(
                max_concurrency=max_concurrency).remote(
                    num_children, death_probability)
        loop_times.append(time.time() - loop_start)

    # Measure once so the printed and the recorded numbers agree.
    total_time = time.time() - start
    avg_iteration_time = sum(loop_times) / len(loop_times)
    max_iteration_time = max(loop_times)
    min_iteration_time = min(loop_times)
    print("Finished in: {}s".format(total_time))
    print("Average iteration time: {}s".format(avg_iteration_time))
    print("Max iteration time: {}s".format(max_iteration_time))
    print("Min iteration time: {}s".format(min_iteration_time))
    result = {
        "total_time": total_time,
        "avg_iteration_time": avg_iteration_time,
        "max_iteration_time": max_iteration_time,
        "min_iteration_time": min_iteration_time,
        "success": 1,
    }
    print(result)
    # Helper defined elsewhere; presumably waits until the CPUs held by the
    # actors are released -- confirm against its definition.
    ensure_cpu_returned(10)
    del parents

    # Make sure parents are still schedulable.
    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]
    ray.get([parent.ping.remote(10) for parent in parents])

    # Make sure there are no SIGSEGV, SIGABRT, or other odd check failures.
    # Logs are collected with a timeout of 20 (presumably seconds).
    logs = test_utils.get_log_message(p, timeout=20)
    for log in logs:
        # The bare "SIG" substring matches any signal name.
        assert "SIG" not in log, "There's the segfault or SIGABRT reported."
        assert "Check failed" not in log, (
            "There's the check failure reported.")

    # Error messages are collected with a timeout of 10.
    errors = test_utils.get_error_message(e, timeout=10)
    for error in errors:
        print(error)
        assert "You can ignore this message if" not in error.error_message, (
            "Resource deadlock warning shouldn't be printed, but it did.")