class Monitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    Attributes:
        redis: A connection to the Redis server.
    """

    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        """Connect to Redis and the GCS and set up the monitoring state.

        Args:
            redis_address: Address of the Redis server. The text before the
                first ":" is used as the head node ip.
            autoscaling_config: Config handed to StandardAutoscaler and to
                teardown_cluster. When falsy, a read-only config mirroring
                the GCS node list is used instead.
            redis_password: Password used for every Redis connection.
            prefix_cluster_info: Forwarded to StandardAutoscaler.
            monitor_ip: When set, "<monitor_ip>:<AUTOSCALER_METRIC_PORT>" is
                stored in Redis under "AutoscalerMetricsAddress" and the
                Prometheus metrics server is started.
            stop_event: Optional event; once set, the monitor loop exits at
                the start of its next iteration.
        """
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
        self.gcs_node_info_stub = \
            gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        gcs_client = GcsClient.create_from_redis(self.redis)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        # Kept so the failure handler can open a fresh Redis connection.
        self.redis_address = redis_address
        self.redis_password = redis_password
        if os.environ.get("RAY_FAKE_CLUSTER"):
            self.load_metrics = LoadMetrics(local_ip=FAKE_HEAD_NODE_ID)
        else:
            self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry)
            except Exception:
                # Metrics are best-effort; the monitor keeps running.
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning("`prometheus_client` not found, so metrics will "
                           "not be exported.")

        logger.info("Monitor: Started")

    def _initialize_autoscaler(self):
        """Create the StandardAutoscaler for the configured or mirrored cluster."""
        if self.autoscaling_config:
            autoscaling_config = self.autoscaling_config
        else:
            # This config mirrors the current setup of the manually created
            # cluster. Each node gets its own unique node type.
            # NOTE(review): this aliases BASE_READONLY_CONFIG, which
            # update_load_metrics later mutates in place -- confirm sharing
            # the module-level object is intended.
            self.readonly_config = BASE_READONLY_CONFIG

            # Note that the "available_node_types" of the config can change.
            def get_latest_readonly_config():
                return self.readonly_config

            autoscaling_config = get_latest_readonly_config
        self.autoscaler = StandardAutoscaler(
            autoscaling_config,
            self.load_metrics,
            self.gcs_node_info_stub,
            prefix_cluster_info=self.prefix_cluster_info,
            event_summarizer=self.event_summarizer,
            prom_metrics=self.prom_metrics)

    def update_load_metrics(self):
        """Fetches resource usage data from GCS and updates load metrics."""
        request = gcs_service_pb2.GetAllResourceUsageRequest()
        response = self.gcs_node_resources_stub.GetAllResourceUsage(
            request, timeout=60)
        resources_batch_data = response.resource_usage_data

        # Tell the readonly node provider what nodes to report.
        if self.readonly_config:
            new_nodes = [(msg.node_id.hex(), msg.node_manager_address)
                         for msg in resources_batch_data.batch]
            self.autoscaler.provider._set_nodes(new_nodes)

        mirror_node_types = {}
        cluster_full = False
        for resource_message in resources_batch_data.batch:
            node_id = resource_message.node_id
            # Generate node type config based on GCS reported node list.
            if self.readonly_config:
                # Keep prefix in sync with ReadonlyNodeProvider.
                node_type = format_readonly_node_type(node_id.hex())
                mirror_node_types[node_type] = {
                    "resources": dict(resource_message.resources_total),
                    "node_config": {},
                    "max_workers": 1,
                }
            if (hasattr(resource_message, "cluster_full_of_actors_detected")
                    and resource_message.cluster_full_of_actors_detected):
                # Aggregate this flag across all batches.
                cluster_full = True
            resource_load = dict(resource_message.resource_load)
            total_resources = dict(resource_message.resources_total)
            available_resources = dict(resource_message.resources_available)

            waiting_bundles, infeasible_bundles = parse_resource_demands(
                resources_batch_data.resource_load_by_shape)

            pending_placement_groups = list(
                resources_batch_data.placement_group_load.placement_group_data)

            use_node_id_as_ip = (self.autoscaler is not None
                                 and self.autoscaler.config["provider"].get(
                                     "use_node_id_as_ip", False))

            # "use_node_id_as_ip" is a hack meant to address situations in
            # which there's more than one Ray node residing at a given ip.
            # TODO (Dmitri): Stop using ips as node identifiers.
            # https://github.com/ray-project/ray/issues/19086
            if use_node_id_as_ip:
                peloton_id = total_resources.get("NODE_ID_AS_RESOURCE")
                # Legacy support https://github.com/ray-project/ray/pull/17312
                if peloton_id is not None:
                    ip = str(int(peloton_id))
                else:
                    ip = node_id.hex()
            else:
                ip = resource_message.node_manager_address
            self.load_metrics.update(ip, node_id, total_resources,
                                     available_resources, resource_load,
                                     waiting_bundles, infeasible_bundles,
                                     pending_placement_groups, cluster_full)
        if self.readonly_config:
            self.readonly_config["available_node_types"].update(
                mirror_node_types)

    def update_resource_requests(self):
        """Fetches resource requests from the internal KV and updates load."""
        if not _internal_kv_initialized():
            return
        data = _internal_kv_get(
            ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)
        if data:
            try:
                resource_request = json.loads(data)
                self.load_metrics.set_resource_requests(resource_request)
            except Exception:
                # A malformed request must not take down the monitor loop.
                logger.exception("Error parsing resource requests")

    def _run(self):
        """Run the monitor loop."""
        while True:
            if self.stop_event and self.stop_event.is_set():
                break
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()
            status = {
                "load_metrics_report": self.load_metrics.summary()._asdict(),
                "time": time.time(),
                "monitor_pid": os.getpid()
            }

            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.autoscaler.update()
                status[
                    "autoscaler_report"] = self.autoscaler.summary()._asdict()

                for msg in self.event_summarizer.summary():
                    logger.info("{}{}".format(
                        ray_constants.LOG_PREFIX_EVENT_SUMMARY, msg))
                self.event_summarizer.clear()

            as_json = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True)

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def update_event_summary(self):
        """Report the current size of the cluster.

        To avoid log spam, only cluster size changes (CPU or GPU count change)
        are reported to the event summarizer. The event summarizer will report
        only the latest cluster size per batch.
        """
        avail_resources = self.load_metrics.resources_avail_summary()
        if (not self.readonly_config
                and avail_resources != self.last_avail_resources):
            self.event_summarizer.add(
                "Resized to {}.",  # e.g., Resized to 100 CPUs, 4 GPUs.
                quantity=avail_resources,
                aggregate=lambda old, new: new)
            self.last_avail_resources = avail_resources

    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable."""

        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        clean = False
        # Retry until the teardown succeeds.
        while not clean:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
                clean = True
                logger.info("Monitor: Workers taken down.")
            except Exception:
                # Log the traceback so a permanently failing teardown can be
                # diagnosed instead of retrying silently.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)

    def _handle_failure(self, error):
        """Log a failure, optionally kill workers, and notify drivers.

        Args:
            error: Text describing the failure; embedded in the message
                published to the drivers.
        """
        logger.exception("Error in monitor loop")
        if self.autoscaler is not None and \
           os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
            self.autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

        # Something went wrong, so push an error to all current and future
        # drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)
        redis_client = ray._private.services.create_redis_client(
            self.redis_address, password=self.redis_password)
        gcs_publisher = None
        # NOTE(review): `args` is not defined in this class; presumably a
        # module-level global set when the file runs as a script -- confirm
        # it exists when Monitor is used as a library.
        if args.gcs_address:
            gcs_publisher = GcsPublisher(address=args.gcs_address)
        elif gcs_pubsub_enabled():
            gcs_publisher = GcsPublisher(
                address=get_gcs_address_from_redis(redis_client))
        from ray._private.utils import publish_error_to_driver
        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher)

    def _signal_handler(self, sig, frame):
        """Report termination by signal `sig`, then exit with 128 + sig."""
        self._handle_failure(f"Terminated with signal {sig}\n" +
                             "".join(traceback.format_stack(frame)))
        sys.exit(sig + 128)

    def run(self):
        """Install signal handlers and run the monitor loop.

        Raises:
            Exception: Any exception from autoscaler initialization or the
                loop is re-raised after the failure handler has run.
        """
        # Register signal handlers for autoscaler termination.
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)
        try:
            if _internal_kv_initialized():
                # Delete any previous autoscaling errors.
                _internal_kv_del(DEBUG_AUTOSCALING_ERROR)
            self._initialize_autoscaler()
            self._run()
        except Exception:
            self._handle_failure(traceback.format_exc())
            raise
class Monitor:
    """A monitor for Ray processes.

    The monitor is in charge of cleaning up the tables in the global state
    after processes have died. The monitor is currently not responsible for
    detecting component failures.

    Attributes:
        redis: A connection to the Redis server.
        primary_subscribe_client: A pubsub client for the Redis server.
            This is used to receive notifications about failed components.
    """

    def __init__(self, redis_address, autoscaling_config, redis_password=None):
        """Connect to Redis and the global state and set up the autoscaler.

        Args:
            redis_address: Address of the Redis server. The text before the
                first ":" is used as the head node ip.
            autoscaling_config: Config handed to StandardAutoscaler; when
                falsy, no autoscaler is created.
            redis_password: Password used for the Redis connections.
        """
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        self.global_state_accessor = GlobalStateAccessor(
            redis_address, redis_password, False)
        self.global_state_accessor.connect()
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.mode = 0
        # Setup subscriptions to the primary Redis server and the Redis shards.
        self.primary_subscribe_client = self.redis.pubsub(
            ignore_subscribe_messages=True)
        # Keep a mapping from raylet client ID to IP address to use
        # for updating the load metrics.
        self.raylet_id_to_ip_map = {}
        head_node_ip = redis_address.split(":")[0]
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        if autoscaling_config:
            self.autoscaler = StandardAutoscaler(autoscaling_config,
                                                 self.load_metrics)
            self.autoscaling_config = autoscaling_config
        else:
            self.autoscaler = None
            self.autoscaling_config = None

    def __del__(self):
        """Destruct the monitor object."""
        # The finalizer also runs for instances whose __init__ failed part
        # way through, so neither attribute is assumed to exist.
        # We close the pubsub client to avoid leaking file descriptors.
        primary_subscribe_client = getattr(self, "primary_subscribe_client",
                                           None)
        if primary_subscribe_client is not None:
            primary_subscribe_client.close()
        global_state_accessor = getattr(self, "global_state_accessor", None)
        if global_state_accessor is not None:
            global_state_accessor.disconnect()
            self.global_state_accessor = None

    def subscribe(self, channel):
        """Subscribe to the given channel on the primary Redis shard.

        Args:
            channel (str): The channel to subscribe to.

        Raises:
            Exception: An exception is raised if the subscription fails.
        """
        self.primary_subscribe_client.subscribe(channel)

    def update_load_metrics(self):
        """Fetches heartbeat data from GCS and updates load metrics."""
        all_heartbeat = self.global_state_accessor.get_all_heartbeat()
        heartbeat_batch_data = \
            ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
        for heartbeat_message in heartbeat_batch_data.batch:
            resource_load = dict(heartbeat_message.resource_load)
            total_resources = dict(heartbeat_message.resources_total)
            available_resources = dict(heartbeat_message.resources_available)

            waiting_bundles, infeasible_bundles = parse_resource_demands(
                heartbeat_batch_data.resource_load_by_shape)

            pending_placement_groups = list(
                heartbeat_batch_data.placement_group_load.placement_group_data)

            # Update the load metrics for this raylet.
            node_id = ray.utils.binary_to_hex(heartbeat_message.node_id)
            ip = self.raylet_id_to_ip_map.get(node_id)
            if ip:
                self.load_metrics.update(ip, total_resources,
                                         available_resources, resource_load,
                                         waiting_bundles, infeasible_bundles,
                                         pending_placement_groups)
            else:
                logger.warning(
                    f"Monitor: could not find ip for node {node_id}")

    def autoscaler_resource_request_handler(self, _, data):
        """Handle a notification of a resource request for the autoscaler. This
        channel and method are only used by the manual
        `ray.autoscaler.sdk.request_resources` api.

        Args:
            channel: unused
            data: a resource request as JSON, e.g. {"CPU": 1}
        """

        if not self.autoscaler:
            return

        try:
            self.autoscaler.request_resources(json.loads(data))
        except Exception:
            # We don't want this to kill the monitor.
            traceback.print_exc()

    def process_messages(self, max_messages=10000):
        """Process all messages ready in the subscription channels.

        This reads messages from the subscription channels and calls the
        appropriate handlers until there are no messages left.

        Args:
            max_messages: The maximum number of messages to process before
                returning.

        Raises:
            AssertionError: If a message arrives on an unexpected channel.
        """
        subscribe_clients = [self.primary_subscribe_client]
        for subscribe_client in subscribe_clients:
            for _ in range(max_messages):
                message = None
                try:
                    message = subscribe_client.get_message()
                except redis.exceptions.ConnectionError:
                    # Treated like "no message": move on to the next client.
                    pass
                if message is None:
                    # Continue on to the next subscribe client.
                    break

                # Parse the message.
                channel = message["channel"]
                data = message["data"]

                if (channel ==
                        ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL):
                    message_handler = self.autoscaler_resource_request_handler
                else:
                    # Raised explicitly so the check survives `python -O`,
                    # which strips assert statements.
                    raise AssertionError("This code should be unreachable.")

                # Call the handler.
                message_handler(channel, data)

    def update_raylet_map(self, _append_port=False):
        """Updates internal raylet map.

        Args:
            _append_port (bool): Defaults to False. Appending the port is
                useful in testing, as mock clusters have many nodes with
                the same IP and cannot be uniquely identified.
        """
        all_raylet_nodes = ray.nodes()
        self.raylet_id_to_ip_map = {}
        for raylet_info in all_raylet_nodes:
            node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"])
            ip_address = (raylet_info.get("AuxAddress")
                          or raylet_info["NodeManagerAddress"]).split(":")[0]
            if _append_port:
                ip_address += ":" + str(raylet_info["NodeManagerPort"])
            self.raylet_id_to_ip_map[node_id] = ip_address

    def _run(self):
        """Run the monitor.

        This function loops forever, checking for messages about dead database
        clients and cleaning up state accordingly.
        """

        self.subscribe(ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)

        # Handle messages from the subscription channels.
        while True:
            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.update_raylet_map()
                self.update_load_metrics()
                self.autoscaler.update()

            # Process a round of messages.
            self.process_messages()

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable."""

        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        clean = False
        # Retry until the teardown succeeds.
        while not clean:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
                clean = True
                logger.info("Monitor: Workers taken down.")
            except Exception:
                # Log the traceback so a permanently failing teardown can be
                # diagnosed instead of retrying silently.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)

    def run(self):
        """Run the monitor loop; on error, kill workers and re-raise."""
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            if self.autoscaler:
                self.autoscaler.kill_workers()
            raise
class Monitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    Attributes:
        redis: A connection to the Redis server.
    """

    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        """Connect to Redis and the GCS and set up the monitoring state.

        Args:
            redis_address: Redis address as a "host:port" string; it is
                split on ":" into exactly an ip and a port.
            autoscaling_config: Config handed to StandardAutoscaler and to
                teardown_cluster; when falsy, no autoscaler is created.
            redis_password: Password used for the Redis and GCS connections.
            prefix_cluster_info: Forwarded to StandardAutoscaler.
            monitor_ip: When set, "<monitor_ip>:<AUTOSCALER_METRIC_PORT>" is
                stored in Redis under "AutoscalerMetricsAddress" and the
                Prometheus metrics server is started.
            stop_event: Optional event; once set, the monitor loop exits at
                the start of its next iteration.
        """
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        self.gcs_client = connect_to_gcs(ip, int(port), redis_password)
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")

        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = grpc.insecure_channel(gcs_address, options=options)
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.gcs_client = self.gcs_client
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        # Kept so the failure handler can open a fresh Redis connection.
        self.redis_address = redis_address
        self.redis_password = redis_password
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    AUTOSCALER_METRIC_PORT,
                    registry=self.prom_metrics.registry)
            except Exception:
                # Metrics are best-effort; the monitor keeps running.
                logger.exception(
                    "An exception occurred while starting the metrics server.")

        logger.info("Monitor: Started")

    def __del__(self):
        """Disconnect the GCS client, if one was ever created."""
        # The finalizer also runs for instances whose __init__ failed before
        # `gcs_client` was assigned, so the attribute may be missing.
        gcs_client = getattr(self, "gcs_client", None)
        if gcs_client is not None:
            disconnect_from_gcs(gcs_client)

    def _initialize_autoscaler(self):
        """Create the StandardAutoscaler when an autoscaling config is set."""
        if self.autoscaling_config:
            self.autoscaler = StandardAutoscaler(
                self.autoscaling_config,
                self.load_metrics,
                prefix_cluster_info=self.prefix_cluster_info,
                event_summarizer=self.event_summarizer,
                prom_metrics=self.prom_metrics)

    def update_load_metrics(self):
        """Fetches resource usage data from GCS and updates load metrics."""

        request = gcs_service_pb2.GetAllResourceUsageRequest()
        response = self.gcs_node_resources_stub.GetAllResourceUsage(
            request, timeout=4)
        resources_batch_data = response.resource_usage_data

        for resource_message in resources_batch_data.batch:
            resource_load = dict(resource_message.resource_load)
            total_resources = dict(resource_message.resources_total)
            available_resources = dict(resource_message.resources_available)

            waiting_bundles, infeasible_bundles = parse_resource_demands(
                resources_batch_data.resource_load_by_shape)

            pending_placement_groups = list(
                resources_batch_data.placement_group_load.placement_group_data)

            ip = resource_message.node_manager_address
            self.load_metrics.update(ip, total_resources, available_resources,
                                     resource_load, waiting_bundles,
                                     infeasible_bundles,
                                     pending_placement_groups)

    def update_resource_requests(self):
        """Fetches resource requests from the internal KV and updates load."""
        if not _internal_kv_initialized():
            return
        data = _internal_kv_get(
            ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)
        if data:
            try:
                resource_request = json.loads(data)
                self.load_metrics.set_resource_requests(resource_request)
            except Exception:
                # A malformed request must not take down the monitor loop.
                logger.exception("Error parsing resource requests")

    def _run(self):
        """Run the monitor loop."""
        while True:
            if self.stop_event and self.stop_event.is_set():
                break
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()
            status = {
                "load_metrics_report": self.load_metrics.summary()._asdict(),
                "time": time.time(),
                "monitor_pid": os.getpid()
            }

            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.autoscaler.update()
                status["autoscaler_report"] = self.autoscaler.summary(
                )._asdict()

                for msg in self.event_summarizer.summary():
                    logger.info(":event_summary:{}".format(msg))
                self.event_summarizer.clear()

            as_json = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True)

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def update_event_summary(self):
        """Report the current size of the cluster.

        To avoid log spam, only cluster size changes (CPU or GPU count change)
        are reported to the event summarizer. The event summarizer will report
        only the latest cluster size per batch.
        """
        avail_resources = self.load_metrics.resources_avail_summary()
        if avail_resources != self.last_avail_resources:
            self.event_summarizer.add(
                "Resized to {}.",  # e.g., Resized to 100 CPUs, 4 GPUs.
                quantity=avail_resources,
                aggregate=lambda old, new: new)
            self.last_avail_resources = avail_resources

    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable."""

        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        clean = False
        # Retry until the teardown succeeds.
        while not clean:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
                clean = True
                logger.info("Monitor: Workers taken down.")
            except Exception:
                # Log the traceback so a permanently failing teardown can be
                # diagnosed instead of retrying silently.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)

    def _handle_failure(self, error):
        """Log a failure, optionally kill workers, and notify drivers.

        Args:
            error: Text describing the failure; embedded in the message
                pushed to the drivers.
        """
        logger.exception("Error in monitor loop")
        if self.autoscaler is not None and \
           os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
            self.autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

        # Something went wrong, so push an error to all current and future
        # drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)
        redis_client = ray._private.services.create_redis_client(
            self.redis_address, password=self.redis_password)
        from ray._private.utils import push_error_to_driver_through_redis
        push_error_to_driver_through_redis(
            redis_client, ray_constants.MONITOR_DIED_ERROR, message)

    def _signal_handler(self, sig, frame):
        """Report termination by signal `sig`, then exit with 128 + sig."""
        self._handle_failure(f"Terminated with signal {sig}\n" +
                             "".join(traceback.format_stack(frame)))
        sys.exit(sig + 128)

    def run(self):
        """Install signal handlers and run the monitor loop.

        Raises:
            Exception: Any exception from autoscaler initialization or the
                loop is re-raised after the failure handler has run.
        """
        # Register signal handlers for autoscaler termination.
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)
        try:
            if _internal_kv_initialized():
                # Delete any previous autoscaling errors.
                _internal_kv_del(DEBUG_AUTOSCALING_ERROR)
            self._initialize_autoscaler()
            self._run()
        except Exception:
            self._handle_failure(traceback.format_exc())
            raise
class Monitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    Attributes:
        redis: A connection to the Redis server.
    """

    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False):
        """Connect to Redis and the global state and set up the autoscaler.

        Args:
            redis_address: Address of the Redis server. The text before the
                first ":" is used as the head node ip.
            autoscaling_config: Config handed to StandardAutoscaler; when
                falsy, no autoscaler is created.
            redis_password: Password used for the Redis connections.
            prefix_cluster_info: Forwarded to StandardAutoscaler.
        """
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        self.global_state_accessor = GlobalStateAccessor(
            redis_address, redis_password, False)
        self.global_state_accessor.connect()
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.mode = 0
        # Keep a mapping from raylet client ID to IP address to use
        # for updating the load metrics.
        self.raylet_id_to_ip_map = {}
        head_node_ip = redis_address.split(":")[0]
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        if autoscaling_config:
            self.autoscaler = StandardAutoscaler(
                autoscaling_config,
                self.load_metrics,
                prefix_cluster_info=prefix_cluster_info)
            self.autoscaling_config = autoscaling_config
        else:
            self.autoscaler = None
            self.autoscaling_config = None

    def __del__(self):
        """Destruct the monitor object."""
        # The finalizer also runs for instances whose __init__ failed before
        # the accessor was assigned, so the attribute may be missing.
        global_state_accessor = getattr(self, "global_state_accessor", None)
        if global_state_accessor is not None:
            # Disconnect from the global state before the object goes away.
            global_state_accessor.disconnect()
            self.global_state_accessor = None

    def update_load_metrics(self):
        """Fetches resource usage data from GCS and updates load metrics."""

        all_resources = self.global_state_accessor.get_all_resource_usage()
        resources_batch_data = \
            ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources)
        for resource_message in resources_batch_data.batch:
            resource_load = dict(resource_message.resource_load)
            total_resources = dict(resource_message.resources_total)
            available_resources = dict(resource_message.resources_available)

            waiting_bundles, infeasible_bundles = parse_resource_demands(
                resources_batch_data.resource_load_by_shape)

            pending_placement_groups = list(
                resources_batch_data.placement_group_load.placement_group_data)

            # Update the load metrics for this raylet.
            node_id = ray.utils.binary_to_hex(resource_message.node_id)
            ip = self.raylet_id_to_ip_map.get(node_id)
            if ip:
                self.load_metrics.update(ip, total_resources,
                                         available_resources, resource_load,
                                         waiting_bundles, infeasible_bundles,
                                         pending_placement_groups)
            else:
                logger.warning(
                    f"Monitor: could not find ip for node {node_id}")

    def update_resource_requests(self):
        """Fetches resource requests from the internal KV and updates load."""
        if not _internal_kv_initialized():
            return
        data = _internal_kv_get(
            ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)
        if data:
            try:
                resource_request = json.loads(data)
                self.load_metrics.set_resource_requests(resource_request)
            except Exception:
                # A malformed request must not take down the monitor loop.
                logger.exception("Error parsing resource requests")

    def autoscaler_resource_request_handler(self, _, data):
        """Handle a notification of a resource request for the autoscaler. This
        channel and method are only used by the manual
        `ray.autoscaler.sdk.request_resources` api.

        Args:
            channel: unused
            data: a resource request as JSON, e.g. {"CPU": 1}
        """

        resource_request = json.loads(data)
        self.load_metrics.set_resource_requests(resource_request)

    def update_raylet_map(self, _append_port=False):
        """Updates internal raylet map.

        Args:
            _append_port (bool): Defaults to False. Appending the port is
                useful in testing, as mock clusters have many nodes with
                the same IP and cannot be uniquely identified.
        """
        all_raylet_nodes = ray.nodes()
        self.raylet_id_to_ip_map = {}
        for raylet_info in all_raylet_nodes:
            node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"])
            ip_address = (raylet_info.get("AuxAddress")
                          or raylet_info["NodeManagerAddress"]).split(":")[0]
            if _append_port:
                ip_address += ":" + str(raylet_info["NodeManagerPort"])
            self.raylet_id_to_ip_map[node_id] = ip_address

    def _run(self):
        """Run the monitor loop."""

        while True:
            self.update_raylet_map()
            self.update_load_metrics()
            self.update_resource_requests()
            status = {
                "load_metrics_report": self.load_metrics.summary()._asdict()
            }

            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.autoscaler.update()
                status["autoscaler_report"] = self.autoscaler.summary(
                )._asdict()

            as_json = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True)

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable."""

        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        clean = False
        # Retry until the teardown succeeds.
        while not clean:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
                clean = True
                logger.info("Monitor: Workers taken down.")
            except Exception:
                # Log the traceback so a permanently failing teardown can be
                # diagnosed instead of retrying silently.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)

    def run(self):
        """Run the monitor loop; on error, kill workers and re-raise."""
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            if self.autoscaler:
                self.autoscaler.kill_workers()
            raise
class Monitor:
    """A monitor for Ray processes.

    The monitor is in charge of cleaning up the tables in the global state
    after processes have died. The monitor is currently not responsible for
    detecting component failures.

    Attributes:
        redis: A connection to the Redis server.
        primary_subscribe_client: A pubsub client for the Redis server.
            This is used to receive notifications about failed components.
        raylet_id_to_ip_map: Mapping from raylet client ID to IP address,
            rebuilt by `update_raylet_map`.
        light_heartbeat_enabled: Value of
            `ray._config.light_heartbeat_enabled()` read at construction.
        load_metrics: The `LoadMetrics` instance updated from heartbeats.
        autoscaler: A `StandardAutoscaler`, or None when no autoscaling
            config was given.
        autoscaling_config: The autoscaling config passed in, or None.
    """

    def __init__(self, redis_address, autoscaling_config,
                 redis_password=None):
        """Connect to Redis and optionally create the autoscaler.

        Args:
            redis_address: Address of the Redis server.
            autoscaling_config: Autoscaling config handed to
                `StandardAutoscaler`. If falsy, no autoscaler is created.
            redis_password: Optional password for the Redis server.
        """
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.mode = 0
        # Setup subscriptions to the primary Redis server and the Redis shards.
        self.primary_subscribe_client = self.redis.pubsub(
            ignore_subscribe_messages=True)
        # Keep a mapping from raylet client ID to IP address to use
        # for updating the load metrics.
        self.raylet_id_to_ip_map = {}
        self.light_heartbeat_enabled = ray._config.light_heartbeat_enabled()
        self.load_metrics = LoadMetrics()
        if autoscaling_config:
            self.autoscaler = StandardAutoscaler(autoscaling_config,
                                                 self.load_metrics)
            self.autoscaling_config = autoscaling_config
        else:
            self.autoscaler = None
            self.autoscaling_config = None

    def __del__(self):
        """Destruct the monitor object."""
        # We close the pubsub client to avoid leaking file descriptors.
        # __init__ may have failed before the attribute was set, hence the
        # getattr with a default.
        primary_subscribe_client = getattr(self, "primary_subscribe_client",
                                           None)
        if primary_subscribe_client is not None:
            primary_subscribe_client.close()

    def subscribe(self, channel):
        """Subscribe to the given channel on the primary Redis shard.

        Args:
            channel (str): The channel to subscribe to.

        Raises:
            Exception: An exception is raised if the subscription fails.
        """
        self.primary_subscribe_client.subscribe(channel)

    def psubscribe(self, pattern):
        """Subscribe to the given pattern on the primary Redis shard.

        Args:
            pattern (str): The pattern to subscribe to.

        Raises:
            Exception: An exception is raised if the subscription fails.
        """
        self.primary_subscribe_client.psubscribe(pattern)

    def parse_resource_demands(self, resource_load_by_shape):
        """Handle the message.resource_load_by_shape protobuf for the demand
        based autoscaling. Catch and log all exceptions so this doesn't
        interfere with the utilization based autoscaler until we're confident
        this is stable.

        Args:
            resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
                in protobuf form or None.

        Returns:
            A `(waiting_bundles, infeasible_bundles)` tuple of lists. Each
            demand's shape dict appears once per queued ready request in the
            first list and once per queued infeasible request in the second.
            Both lists are empty when there is no autoscaler; if parsing
            fails, whatever was collected before the failure is returned.
        """
        waiting_bundles, infeasible_bundles = [], []
        try:
            if self.autoscaler:
                for resource_demand_pb in (
                        resource_load_by_shape.resource_demands):
                    request_shape = dict(resource_demand_pb.shape)
                    # The same dict object is repeated, one entry per queued
                    # request.
                    waiting_bundles.extend(
                        [request_shape] *
                        resource_demand_pb.num_ready_requests_queued)
                    infeasible_bundles.extend(
                        [request_shape] *
                        resource_demand_pb.num_infeasible_requests_queued)
        except Exception as e:
            logger.exception(e)
        return waiting_bundles, infeasible_bundles

    def xray_heartbeat_batch_handler(self, unused_channel, data):
        """Handle an xray heartbeat batch message from Redis.

        Args:
            unused_channel: The message channel (ignored).
            data: A serialized `PubSubMessage` whose payload is a serialized
                `HeartbeatBatchTableData`.
        """
        pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
        heartbeat_data = pub_message.data
        message = ray.gcs_utils.HeartbeatBatchTableData.FromString(
            heartbeat_data)
        for heartbeat_message in message.batch:
            resource_load = dict(heartbeat_message.resource_load)
            total_resources = dict(heartbeat_message.resources_total)
            available_resources = dict(heartbeat_message.resources_available)

            waiting_bundles, infeasible_bundles = \
                self.parse_resource_demands(message.resource_load_by_shape)

            # Update the load metrics for this raylet.
            client_id = ray.utils.binary_to_hex(heartbeat_message.client_id)
            ip = self.raylet_id_to_ip_map.get(client_id)
            if ip:
                # With light heartbeats disabled every field is always
                # refreshed; otherwise only the parts flagged as changed.
                # NOTE(review): the *_changed attributes are invoked as
                # callables here -- confirm against the heartbeat message
                # definition.
                update_available_resources = \
                    not self.light_heartbeat_enabled \
                    or heartbeat_message.resources_available_changed()
                update_resource_load = not self.light_heartbeat_enabled \
                    or heartbeat_message.resource_load_changed()
                self.load_metrics.update(ip, total_resources,
                                         update_available_resources,
                                         available_resources,
                                         update_resource_load, resource_load,
                                         waiting_bundles, infeasible_bundles)
            else:
                logger.warning(
                    f"Monitor: could not find ip for client {client_id}")

    def xray_job_notification_handler(self, unused_channel, data):
        """Handle a notification that a job has been added or removed.

        Args:
            unused_channel: The message channel.
            data: The message data.
        """
        pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
        job_data = pub_message.data
        message = ray.gcs_utils.JobTableData.FromString(job_data)
        job_id = message.job_id
        if message.is_dead:
            logger.info("Monitor: "
                        "XRay Driver {} has been removed.".format(
                            binary_to_hex(job_id)))

    def autoscaler_resource_request_handler(self, _, data):
        """Handle a notification of a resource request for the autoscaler.

        This channel and method are only used by the manual
        `ray.autoscaler.sdk.request_resources` api.

        Args:
            channel: unused
            data: a resource request as JSON, e.g. {"CPU": 1}
        """
        if not self.autoscaler:
            return

        try:
            self.autoscaler.request_resources(json.loads(data))
        except Exception:
            # We don't want this to kill the monitor.
            traceback.print_exc()

    def process_messages(self, max_messages=10000):
        """Process all messages ready in the subscription channels.

        This reads messages from the subscription channels and calls the
        appropriate handlers until there are no messages left.

        Args:
            max_messages: The maximum number of messages to process before
                returning.

        Raises:
            AssertionError: If a message matches none of the known patterns
                or channels.
        """
        subscribe_clients = [self.primary_subscribe_client]
        for subscribe_client in subscribe_clients:
            for _ in range(max_messages):
                try:
                    message = subscribe_client.get_message()
                except redis.exceptions.ConnectionError:
                    # Best effort: a connection error is treated like "no
                    # message ready".
                    message = None
                if message is None:
                    # Continue on to the next subscribe client.
                    break

                # Parse the message.
                pattern = message["pattern"]
                channel = message["channel"]
                data = message["data"]

                # Determine the appropriate message handler.
                if pattern == ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN:
                    # Similar functionality as raylet info channel
                    message_handler = self.xray_heartbeat_batch_handler
                elif pattern == ray.gcs_utils.XRAY_JOB_PATTERN:
                    # Handles driver death.
                    message_handler = self.xray_job_notification_handler
                elif (channel ==
                      ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL):
                    message_handler = self.autoscaler_resource_request_handler
                else:
                    # Raised explicitly rather than via `assert False`, which
                    # is stripped under `python -O` and would let an unknown
                    # message fall through to an unbound or stale handler.
                    raise AssertionError("This code should be unreachable.")

                # Call the handler.
                message_handler(channel, data)

    def update_raylet_map(self, _append_port=False):
        """Updates internal raylet map.

        Args:
            _append_port (bool): Defaults to False. Appending the port is
                useful in testing, as mock clusters have many nodes with
                the same IP and cannot be uniquely identified.
        """
        all_raylet_nodes = ray.nodes()
        self.raylet_id_to_ip_map = {}
        for raylet_info in all_raylet_nodes:
            # Prefer the "DBClientID" / "AuxAddress" entries when present and
            # non-empty, otherwise fall back to the node's own ID / address.
            node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"])
            ip_address = (raylet_info.get("AuxAddress")
                          or raylet_info["NodeManagerAddress"]).split(":")[0]
            if _append_port:
                ip_address += ":" + str(raylet_info["NodeManagerPort"])
            self.raylet_id_to_ip_map[node_id] = ip_address

    def _run(self):
        """Run the monitor.

        This function loops forever, checking for messages about dead database
        clients and cleaning up state accordingly.
        """
        # Initialize the mapping from raylet client ID to IP address.
        self.update_raylet_map()

        # Initialize the subscription channel.
        self.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)
        self.psubscribe(ray.gcs_utils.XRAY_JOB_PATTERN)

        if self.autoscaler:
            self.subscribe(
                ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)

        # TODO(rkn): If there were any dead clients at startup, we should clean
        # up the associated state in the state tables.

        # Handle messages from the subscription channels.
        while True:
            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.update_raylet_map()
                self.autoscaler.update()

            # Process a round of messages.
            self.process_messages()

            # Wait for a heartbeat interval before processing the next round of
            # messages.
            time.sleep(
                ray._config.raylet_heartbeat_timeout_milliseconds() * 1e-3)

    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable.

        The teardown is retried every two seconds until it succeeds.
        """
        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        clean = False
        while not clean:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
                clean = True
                logger.info("Monitor: Workers taken down.")
            except Exception:
                # logger.exception keeps the traceback, so a teardown that
                # keeps failing can be diagnosed from the log.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)

    def run(self):
        """Run the monitor loop, killing autoscaler workers on failure.

        Raises:
            Exception: Any exception escaping `_run` is logged and re-raised
                after `autoscaler.kill_workers()` has been called (when an
                autoscaler exists).
        """
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            if self.autoscaler:
                self.autoscaler.kill_workers()
            raise
class Monitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    Attributes:
        redis: A connection to the Redis server.
        gcs_node_resources_stub: gRPC stub used to fetch resource usage of
            all nodes from the GCS.
        load_metrics: The `LoadMetrics` instance updated on every loop
            iteration.
        last_avail_resources: The last available-resources summary that was
            reported to the event summarizer (None until the first report).
        event_summarizer: The `EventSummarizer` shared with the autoscaler.
        autoscaler: A `StandardAutoscaler`, or None when no autoscaling
            config was given.
        autoscaling_config: The autoscaling config passed in, or None.
    """

    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False):
        """Connect to Redis and the GCS and optionally create the autoscaler.

        Args:
            redis_address: Address of the Redis server. The part before the
                first ":" is used as the head node IP for the load metrics.
            autoscaling_config: Autoscaling config handed to
                `StandardAutoscaler`. If falsy, no autoscaler is created.
            redis_password: Optional password for the Redis server.
            prefix_cluster_info: Forwarded to `StandardAutoscaler`.
        """
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
        gcs_channel = grpc.insecure_channel(gcs_address)
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        if autoscaling_config:
            self.autoscaler = StandardAutoscaler(
                autoscaling_config,
                self.load_metrics,
                prefix_cluster_info=prefix_cluster_info,
                event_summarizer=self.event_summarizer)
            self.autoscaling_config = autoscaling_config
        else:
            self.autoscaler = None
            self.autoscaling_config = None

        logger.info("Monitor: Started")

    def update_load_metrics(self):
        """Fetches resource usage data from GCS and updates load metrics."""
        request = gcs_service_pb2.GetAllResourceUsageRequest()
        response = self.gcs_node_resources_stub.GetAllResourceUsage(
            request, timeout=4)
        resources_batch_data = response.resource_usage_data

        # One load-metrics update per node entry in the batch. The demand and
        # placement-group data come from the batch as a whole and are passed
        # along with every node.
        for resource_message in resources_batch_data.batch:
            resource_load = dict(resource_message.resource_load)
            total_resources = dict(resource_message.resources_total)
            available_resources = dict(resource_message.resources_available)

            waiting_bundles, infeasible_bundles = parse_resource_demands(
                resources_batch_data.resource_load_by_shape)

            pending_placement_groups = list(
                resources_batch_data.placement_group_load.placement_group_data)

            ip = resource_message.node_manager_address
            self.load_metrics.update(ip, total_resources, available_resources,
                                     resource_load, waiting_bundles,
                                     infeasible_bundles,
                                     pending_placement_groups)

    def update_resource_requests(self):
        """Fetches resource requests from the internal KV and updates load.

        Does nothing when the internal KV is not initialized or holds no
        value for the request key. Parse/update errors are logged, not raised.
        """
        if not _internal_kv_initialized():
            return
        data = _internal_kv_get(
            ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)
        if data:
            try:
                resource_request = json.loads(data)
                self.load_metrics.set_resource_requests(resource_request)
            except Exception:
                logger.exception("Error parsing resource requests")

    def _run(self):
        """Run the monitor loop."""
        while True:
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()
            status = {
                "load_metrics_report": self.load_metrics.summary()._asdict()
            }

            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.autoscaler.update()
                status["autoscaler_report"] = self.autoscaler.summary(
                )._asdict()

                # Flush the summarized events to the log, then reset.
                for msg in self.event_summarizer.summary():
                    logger.info(":event_summary:{}".format(msg))
                self.event_summarizer.clear()

            # Publish the status report to the internal KV, when available.
            as_json = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True)

            # Wait for an autoscaler update interval before processing the
            # next round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def update_event_summary(self):
        """Report the current size of the cluster.

        To avoid log spam, only cluster size changes (CPU or GPU count change)
        are reported to the event summarizer. The event summarizer will report
        only the latest cluster size per batch.
        """
        avail_resources = self.load_metrics.resources_avail_summary()
        if avail_resources != self.last_avail_resources:
            self.event_summarizer.add(
                "Resized to {}.",  # e.g., Resized to 100 CPUs, 4 GPUs.
                quantity=avail_resources,
                aggregate=lambda old, new: new)
            self.last_avail_resources = avail_resources

    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable.

        The teardown is retried every two seconds until it succeeds.
        """
        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        clean = False
        while not clean:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
                clean = True
                logger.info("Monitor: Workers taken down.")
            except Exception:
                # logger.exception keeps the traceback, so a teardown that
                # keeps failing can be diagnosed from the log.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)

    def run(self):
        """Run the monitor loop, killing autoscaler workers on failure.

        Raises:
            Exception: Any exception escaping `_run` is logged and re-raised
                after `autoscaler.kill_workers()` has been called (when an
                autoscaler exists).
        """
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            if self.autoscaler:
                self.autoscaler.kill_workers()
            raise