def test_ignore_windows_access_violation(ray_start_regular_shared):
    """Check that the Windows access-violation line is not delivered as a log.

    One task prints ``Windows fatal exception: access violation`` and a second
    task, which takes the first task's result as its argument, prints
    ``done``. Up to three messages are requested for the current job, and
    exactly one is expected: ``done``.
    """

    @ray.remote
    def print_msg():
        print("Windows fatal exception: access violation\n")

    @ray.remote
    def print_after(_obj):
        print("done")

    subscriber = init_log_pubsub()
    # Passing print_msg's result as the argument chains the two tasks.
    print_after.remote(print_msg.remote())

    current_job_id = ray.get_runtime_context().job_id.hex()
    # Ask for more messages than expected (with a short timeout) so that any
    # extra line would show up in the result.
    messages = get_log_message(
        subscriber, num=3, timeout=1, job_id=current_job_id
    )

    assert len(messages) == 1, messages
    assert messages[0][0] == "done"
def submit_job():
    """Run one driver session and check that task log batches carry a pid.

    Connects a driver to ``cluster`` (a name taken from the enclosing scope),
    runs a single remote task that prints a line, and asserts that the log
    batch published for that task has a non-``None`` ``pid`` field.

    The log subscriber is closed and the driver is disconnected even when a
    call or an assertion fails, so a failure here does not leave a connected
    driver behind.

    Raises:
        AssertionError: If the matched log batch has ``pid`` set to ``None``.
    """
    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address, ignore_reinit_error=True)
    try:
        p = init_log_pubsub()
        try:
            # The first message is expected to be log monitor output; consume
            # it so it is out of the way. Its content is not inspected.
            get_log_message(p, 1)

            @ray.remote
            def f():
                print("remote func")

            ray.get(f.remote())

            def matcher(log_batch):
                # Only accept batches produced by the task defined above.
                return log_batch["task_name"] == "f"

            logs = get_log_batch(p, 1, matcher=matcher)
            # The batch must be published with a populated pid, not None.
            assert logs[0]["pid"] is not None
        finally:
            p.close()
    finally:
        ray.shutdown()
def test_log_monitor_backpressure(ray_start_cluster, monkeypatch):
    """Check that actor logs still arrive when log-monitor backpressure is on.

    The many-files threshold is set to ``1`` to trigger the backpressure
    condition, and the log-name update interval is set to three seconds.
    Two rounds are run; each creates a new actor, has it print a marker
    string, and asserts that the marker is published and that at least the
    update interval elapsed before it was received.
    """
    update_interval = 3
    expected_str = "abcxyz"

    monkeypatch.setenv("LOG_NAME_UPDATE_INTERVAL_S", str(update_interval))
    # Intentionally set low to trigger the backpressure condition.
    monkeypatch.setenv("RAY_LOG_MONITOR_MANY_FILES_THRESHOLD", "1")

    def is_expected_line(line):
        return line == expected_str

    # Test log monitor still works with backpressure.
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)

    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address)
    subscriber = init_log_pubsub()

    @ray.remote
    class Actor:
        def print(self):
            print(expected_str)

    for _ in range(2):
        started_at = datetime.now()
        actor = Actor.remote()
        ray.get(actor.print.remote())

        received = get_log_message(subscriber, 1, matcher=is_expected_line)
        assert received[0][0] == expected_str

        # Since the log file update is delayed, it should take more than
        # update_interval to publish a message for a new worker.
        elapsed = datetime.now() - started_at
        assert elapsed.seconds >= update_interval
def test_log_monitor_backpressure(ray_start_cluster):
    """Check that actor logs still arrive when log-monitor backpressure is on.

    The many-files threshold is set to ``1`` to trigger the backpressure
    condition, and the log-name update interval is set to three seconds.
    Two rounds are run; each creates a new actor, has it print a marker
    string, and asserts that the marker is the next message received and
    that at least the update interval elapsed before it arrived.

    The two environment variables are restored to their previous state when
    the test finishes, whether it passes or fails, so the overrides do not
    leak into other tests running in the same process.
    """
    update_interval = 3
    env_overrides = {
        "LOG_NAME_UPDATE_INTERVAL_S": str(update_interval),
        # Intentionally set low to trigger the backpressure condition.
        "RAY_LOG_MONITOR_MANY_FILES_THRESHOLD": "1",
    }
    # Remember the prior values (None means "was not set") for the restore.
    previous_env = {name: os.environ.get(name) for name in env_overrides}
    os.environ.update(env_overrides)

    try:
        expected_str = "abc"

        # Test log monitor still works with backpressure.
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=4)

        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address)
        p = init_log_pubsub()

        # The first message is expected to be log monitor output; consume it
        # so the next message read is the actor's. Its content is not checked.
        get_log_message(p, 1)

        @ray.remote
        class Actor:
            def print(self):
                print(expected_str)

        for _ in range(2):
            now = datetime.now()
            a = Actor.remote()
            a.print.remote()
            logs = get_log_message(p, 1)
            assert logs[0] == expected_str
            # Since the log file update is delayed, it should take more than
            # update_interval to publish a message for a new worker.
            assert (datetime.now() - now).seconds >= update_interval
    finally:
        for name, old_value in previous_env.items():
            if old_value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value
def log_pubsub():
    """Yield a log subscriber created by ``init_log_pubsub``, then close it.

    Yields:
        The object returned by ``init_log_pubsub()``.
    """
    subscriber = init_log_pubsub()
    yield subscriber
    # Runs once the generator is resumed after the yield.
    subscriber.close()