async def test_aio_publish_and_subscribe_logs(ray_start_regular):
    """Round-trip one log batch through the asyncio GCS publisher/subscriber.

    The GCS address is looked up through Redis, a log subscription is set up,
    one batch is published, and the polled batch must equal the published one.
    """
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    # The subscription is established before anything is published.
    log_subscriber = GcsAioSubscriber(address=gcs_address)
    await log_subscriber.subscribe_logs()

    log_publisher = GcsAioPublisher(address=gcs_address)
    published_batch = {
        "ip": "127.0.0.1",
        "pid": "gcs",
        "job": "0001",
        "is_err": False,
        "lines": ["line 1", "line 2"],
        "actor_name": "test actor",
        "task_name": "test task",
    }
    await log_publisher.publish_logs(published_batch)

    received_batch = await log_subscriber.poll_logs()
    assert received_batch == published_batch

    await log_subscriber.close()
def _handle_failure(self, error):
    """Log a monitor-loop failure and report it to drivers.

    When an autoscaler exists and ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is
    ``"1"``, the autoscaler's workers are killed first. The failure message
    is then stored in the internal KV (if it is initialized) and published
    as a ``MONITOR_DIED_ERROR`` through ``publish_error_to_driver``.

    Args:
        error: The failure being reported; it is interpolated into the
            message pushed to drivers.
    """
    logger.exception("Error in monitor loop")

    if self.autoscaler is not None:
        fate_share = os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "")
        if fate_share == "1":
            self.autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    message = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

    redis_client = ray._private.services.create_redis_client(
        self.redis_address, password=self.redis_password)

    # Prefer an explicitly configured GCS address; otherwise resolve it via
    # Redis when GCS pubsub is enabled. With neither, no publisher is used.
    # NOTE(review): `args` is not a parameter or local here -- presumably a
    # module-level name; confirm it is always defined when this runs.
    if args.gcs_address:
        gcs_publisher = GcsPublisher(address=args.gcs_address)
    elif gcs_pubsub_enabled():
        resolved_gcs_address = get_gcs_address_from_redis(redis_client)
        gcs_publisher = GcsPublisher(address=resolved_gcs_address)
    else:
        gcs_publisher = None

    from ray._private.utils import publish_error_to_driver
    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        message,
        redis_client=redis_client,
        gcs_publisher=gcs_publisher)
def test_publish_and_subscribe_logs(ray_start_regular):
    """Publish one log batch synchronously and read it back.

    The batch is published with an integer ``pid``; the batch returned by the
    subscriber is expected to carry that pid as a string.
    """
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    log_subscriber = GcsLogSubscriber(address=gcs_address)
    log_subscriber.subscribe()

    log_publisher = GcsPublisher(address=gcs_address)
    published_batch = {
        "ip": "127.0.0.1",
        "pid": 1234,
        "job": "0001",
        "is_err": False,
        "lines": ["line 1", "line 2"],
        "actor_name": "test actor",
        "task_name": "test task",
    }
    log_publisher.publish_logs(published_batch)

    # PID is treated as string.
    expected_batch = dict(published_batch, pid="1234")
    assert log_subscriber.poll() == expected_batch

    log_subscriber.close()
def __init__(self, logs_dir, redis_address, redis_password=None):
    """Initialize the log monitor object.

    Args:
        logs_dir: Stored as ``self.logs_dir``.
        redis_address: Address used to create the Redis client.
        redis_password: Optional password for the Redis client.
    """
    self.ip = services.get_node_ip_address()
    self.logs_dir = logs_dir
    self.redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)

    # A GCS publisher is only created when GCS pubsub is enabled; its
    # address is resolved through the Redis client created above.
    self.publisher = None
    if gcs_pubsub.gcs_pubsub_enabled():
        self.publisher = gcs_pubsub.GcsPublisher(
            address=gcs_utils.get_gcs_address_from_redis(self.redis_client))

    # File-tracking state, all empty at start.
    self.log_filenames = set()
    self.open_file_infos = []
    self.closed_file_infos = []
    self.can_open_more_files = True
def test_publish_and_subscribe_function_keys(ray_start_regular):
    """Publish two function keys and check they are polled in order."""
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    key_subscriber = GcsFunctionKeySubscriber(address=gcs_address)
    key_subscriber.subscribe()

    key_publisher = GcsPublisher(address=gcs_address)
    function_keys = (b"111", b"222")

    # Publish everything first, then poll, exactly one poll per key.
    for function_key in function_keys:
        key_publisher.publish_function_key(function_key)
    for expected_key in function_keys:
        assert key_subscriber.poll() == expected_key

    key_subscriber.close()
def test_publish_error_to_driver(ray_start_regular, error_pubsub):
    """Push an error via ``publish_error_to_driver`` and read it back.

    A GCS publisher is passed along only when GCS pubsub is enabled; the
    Redis client is always passed.
    """
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)

    if gcs_pubsub_enabled():
        gcs_publisher = GcsPublisher(
            address=gcs_utils.get_gcs_address_from_redis(redis_client))
    else:
        gcs_publisher = None

    error_type = ray_constants.DASHBOARD_AGENT_DIED_ERROR
    error_message = "Test error message"
    ray._private.utils.publish_error_to_driver(
        error_type,
        error_message,
        redis_client=redis_client,
        gcs_publisher=gcs_publisher)

    errors = get_error_message(error_pubsub, 1, error_type)
    assert errors[0].type == error_type
    assert errors[0].error_message == error_message
async def test_aio_publish_and_subscribe_error_info(ray_start_regular):
    """Publish two errors through the asyncio API and poll them in order."""
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    error_subscriber = GcsAioSubscriber(address=gcs_address)
    await error_subscriber.subscribe_error()

    error_publisher = GcsAioPublisher(address=gcs_address)
    published = [
        (b"aaa_id", ErrorTableData(error_message="test error message 1")),
        (b"bbb_id", ErrorTableData(error_message="test error message 2")),
    ]

    # Both errors are published before the first poll.
    for error_id, error_data in published:
        await error_publisher.publish_error(error_id, error_data)
    for expected in published:
        assert await error_subscriber.poll_error() == expected

    await error_subscriber.close()
raylet_pid = os.environ["RAY_RAYLET_PID"] node_ip = args.node_ip_address if restart_count >= max_restart_count: # Agent is failed to be started many times. # Push an error to all drivers, so that users can know the # impact of the issue. redis_client = None gcs_publisher = None if gcs_pubsub_enabled(): if use_gcs_for_bootstrap(): gcs_publisher = GcsPublisher(args.gcs_address) else: redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) gcs_publisher = GcsPublisher( address=get_gcs_address_from_redis(redis_client)) else: redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) traceback_str = ray._private.utils.format_error_message( traceback.format_exc()) message = ( f"(ip={node_ip}) " f"The agent on node {platform.uname()[1]} failed to " f"be restarted {max_restart_count} " "times. There are 3 possible problems if you see this error." "\n 1. The dashboard might not display correct " "information on this node." "\n 2. Metrics on this node won't be reported." "\n 3. runtime_env APIs won't work."
service_discovery.start() loop = asyncio.get_event_loop() loop.run_until_complete(dashboard.run()) except Exception as e: traceback_str = ray._private.utils.format_error_message( traceback.format_exc()) message = f"The dashboard on node {platform.uname()[1]} " \ f"failed with the following " \ f"error:\n{traceback_str}" if isinstance(e, FrontendNotFoundError): logger.warning(message) else: logger.error(message) raise e # Something went wrong, so push an error to all drivers. redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) gcs_publisher = None if args.gcs_address: gcs_publisher = GcsPublisher(address=args.gcs_address) elif gcs_pubsub_enabled(): gcs_publisher = GcsPublisher( address=gcs_utils.get_gcs_address_from_redis(redis_client)) ray._private.utils.publish_error_to_driver( redis_client, ray_constants.DASHBOARD_DIED_ERROR, message, redis_client=redis_client, gcs_publisher=gcs_publisher)
def test_subscribe_two_channels(ray_start_regular):
    """Tests concurrently subscribing to two channels work.

    One thread consumes the error channel and another the log channel while
    the main thread publishes ``num_messages`` messages to each. Both threads
    must finish within their join timeout and see every message in order.
    """
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_server_addr = gcs_utils.get_gcs_address_from_redis(redis_client)

    num_messages = 100
    errors = []
    logs = []

    def receive_errors():
        # Collect error payloads until the expected count is reached.
        error_subscriber = GcsErrorSubscriber(address=gcs_server_addr)
        error_subscriber.subscribe()
        while len(errors) < num_messages:
            _, error_data = error_subscriber.poll()
            errors.append(error_data)

    def receive_logs():
        # Collect log batches until the expected count is reached.
        log_subscriber = GcsLogSubscriber(address=gcs_server_addr)
        log_subscriber.subscribe()
        while len(logs) < num_messages:
            logs.append(log_subscriber.poll())

    t1 = threading.Thread(target=receive_errors)
    t1.start()
    t2 = threading.Thread(target=receive_logs)
    t2.start()

    publisher = GcsPublisher(address=gcs_server_addr)
    for i in range(num_messages):
        error_data = ErrorTableData(error_message=f"error {i}")
        publisher.publish_error(b"msg_id", error_data)
        log_batch = {
            "ip": "127.0.0.1",
            "pid": "gcs",
            "job": "0001",
            "is_err": False,
            "lines": [f"line {i}"],
            "actor_name": "test actor",
            "task_name": "test task",
        }
        publisher.publish_logs(log_batch)

    t1.join(timeout=10)
    assert not t1.is_alive(), len(errors)
    assert len(errors) == num_messages, len(errors)

    t2.join(timeout=10)
    assert not t2.is_alive(), len(logs)
    assert len(logs) == num_messages, len(logs)

    # Both lists hold exactly num_messages entries here, so zipping them
    # visits every index once.
    for i, (received_error, received_log) in enumerate(zip(errors, logs)):
        assert received_error.error_message == f"error {i}"
        assert received_log["lines"][0] == f"line {i}"
def __init__(
        self,
        address,
        autoscaling_config,
        redis_password=None,
        prefix_cluster_info=False,
        monitor_ip=None,
        stop_event: Optional[Event] = None,
):
    """Set up GCS/Redis connections, internal KV and monitor state.

    Args:
        address: The GCS address when ``use_gcs_for_bootstrap()`` is true,
            otherwise the Redis address (the GCS address is then looked up
            through Redis).
        autoscaling_config: Stored as ``self.autoscaling_config``.
        redis_password: Password for the Redis client; also stored as
            ``self.redis_password``.
        prefix_cluster_info: Stored as ``self.prefix_cluster_info``.
        monitor_ip: When set, ``"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"`` is
            recorded under the ``AutoscalerMetricsAddress`` key and, if
            ``prometheus_client`` is available, a metrics HTTP server is
            started.
        stop_event: Can be used to signal graceful exit from monitor loop.
    """
    if not use_gcs_for_bootstrap():
        # Initialize the Redis clients.
        redis_address = address
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        # The unpacked values are unused, but the two-name unpack raises if
        # `address` does not split into exactly two parts on ":".
        (ip, port) = address.split(":")
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = get_gcs_address_from_redis(self.redis)
    else:
        gcs_address = address
        redis_address = None

    options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
    self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
        gcs_channel)

    # Set the redis client and mode so _internal_kv works for autoscaler.
    worker = ray.worker.global_worker
    if use_gcs_for_bootstrap():
        gcs_client = GcsClient(address=gcs_address)
    else:
        worker.redis_client = self.redis
        gcs_client = GcsClient.create_from_redis(self.redis)

    _initialize_internal_kv(gcs_client)

    # Record where the autoscaler metrics are served. This used to be
    # written twice with identical key and value (once before and once
    # after initializing the internal KV); a single write is enough.
    if monitor_ip:
        monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        if use_gcs_for_bootstrap():
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        else:
            self.redis.set("AutoscalerMetricsAddress", monitor_addr)

    worker.mode = 0

    # The head node IP is the host part of whichever address was used to
    # bootstrap.
    if use_gcs_for_bootstrap():
        head_node_ip = gcs_address.split(":")[0]
    else:
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password

    self.load_metrics = LoadMetrics()
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                registry=self.prom_metrics.registry,
            )
        except Exception:
            # Best effort: a metrics-server failure is logged, not raised.
            logger.exception(
                "An exception occurred while starting the metrics server.")
    elif not prometheus_client:
        logger.warning("`prometheus_client` not found, so metrics will "
                       "not be exported.")

    logger.info("Monitor: Started")