def _update(self):
    now = time.time()
    # Throttle autoscaling updates to this interval to avoid exceeding
    # rate limits on API calls.
    if now - self.last_update_time < self.update_interval_s:
        return
    self.last_update_time = now

    self.update_worker_list()

    self.load_metrics.prune_active_ips([
        self.provider.internal_ip(node_id) for node_id in self.all_workers
    ])

    self.terminate_nodes_to_enforce_config_constraints(now)

    self.launch_required_nodes()

    if self.disable_node_updaters:
        self.terminate_unhealthy_nodes(now)
    else:
        self.process_completed_updates()
        self.update_nodes()
        self.attempt_to_recover_unhealthy_nodes(now)
        self.set_prometheus_updater_data()

    logger.info(self.info_string())
    legacy_log_info_string(self, self.workers)
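# The guard at the top of `_update` is a plain time-based throttle. Below is a
# minimal standalone sketch of the same pattern; the class and method names
# are illustrative only and are not part of the autoscaler API.
import time


class ThrottledUpdater:
    def __init__(self, update_interval_s: float):
        self.update_interval_s = update_interval_s
        self.last_update_time = 0.0

    def update(self):
        now = time.time()
        # Skip this round entirely if the previous update ran too recently,
        # so frequent calls cannot exceed cloud provider API rate limits.
        if now - self.last_update_time < self.update_interval_s:
            return
        self.last_update_time = now
        self._do_update()

    def _do_update(self):
        # Placeholder for the real autoscaling work.
        pass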
def _update(self):
    now = time.time()
    # Throttle autoscaling updates to this interval to avoid exceeding
    # rate limits on API calls.
    if now - self.last_update_time < self.update_interval_s:
        return
    self.last_update_time = now

    self.update_worker_list()

    self.load_metrics.prune_active_ips([
        self.provider.internal_ip(node_id) for node_id in self.all_workers
    ])

    if not self.provider.is_readonly():
        self.terminate_nodes_to_enforce_config_constraints(now)

    # Dict[NodeType, int], List[ResourceDict]
    to_launch, unfulfilled = (
        self.resource_demand_scheduler.get_nodes_to_launch(
            self.provider.non_terminated_nodes(tag_filters={}),
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_demand_vector(),
            self.load_metrics.get_resource_utilization(),
            self.load_metrics.get_pending_placement_groups(),
            self.load_metrics.get_static_node_resources_by_ip(),
            ensure_min_cluster_size=self.load_metrics.get_resource_requests()))
    self._report_pending_infeasible(unfulfilled)

    if not self.provider.is_readonly():
        self.launch_required_nodes(to_launch)

    if self.disable_node_updaters:
        self.terminate_unhealthy_nodes(now)
    else:
        self.process_completed_updates()
        self.update_nodes()
        self.attempt_to_recover_unhealthy_nodes(now)
        self.set_prometheus_updater_data()

    logger.info(self.info_string())
    legacy_log_info_string(self, self.workers)
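# For reference, the two values returned by get_nodes_to_launch above have the
# shapes noted in the inline comment:
#   to_launch: Dict[NodeType, int] -- how many nodes of each node type to add.
#   unfulfilled: List[ResourceDict] -- demands that still cannot be satisfied.
# A hypothetical example (the node type names and resource amounts below are
# made up for illustration, not taken from any real configuration):
example_to_launch = {"cpu-worker": 3, "gpu-worker": 1}
example_unfulfilled = [{"GPU": 8}, {"CPU": 64}]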
def _update(self):
    now = time.time()
    # Throttle autoscaling updates to this interval to avoid exceeding
    # rate limits on API calls.
    if now - self.last_update_time < self.update_interval_s:
        return
    self.last_update_time = now

    nodes = self.workers()

    self.load_metrics.prune_active_ips([
        self.provider.internal_ip(node_id) for node_id in self.all_workers()
    ])

    # Terminate any idle or out-of-date nodes.
    last_used = self.load_metrics.last_used_time_by_ip
    horizon = now - (60 * self.config["idle_timeout_minutes"])

    nodes_to_terminate: List[NodeID] = []
    node_type_counts = collections.defaultdict(int)
    # Sort based on last used to make sure to keep min_workers that
    # were most recently used. Otherwise, _keep_min_worker_of_node_type
    # might keep a node that should be terminated.
    sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)
    # Don't terminate nodes needed by request_resources().
    nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
    if self.load_metrics.get_resource_requests():
        nodes_allowed_to_terminate = self._get_nodes_allowed_to_terminate(
            sorted_node_ids)

    for node_id in sorted_node_ids:
        # Make sure not to kill an idle node if the number of workers of
        # its type is less than or equal to that type's min_workers, or if
        # the node is needed for request_resources().
        if (self._keep_min_worker_of_node_type(node_id, node_type_counts)
                or not nodes_allowed_to_terminate.get(
                    node_id, True)) and self.launch_config_ok(node_id):
            continue

        node_ip = self.provider.internal_ip(node_id)
        if node_ip in last_used and last_used[node_ip] < horizon:
            logger.info("StandardAutoscaler: "
                        "{}: Terminating idle node.".format(node_id))
            self.event_summarizer.add(
                "Removing {} nodes of type " + self._get_node_type(node_id) +
                " (idle).",
                quantity=1,
                aggregate=operator.add)
            nodes_to_terminate.append(node_id)
        elif not self.launch_config_ok(node_id):
            logger.info("StandardAutoscaler: "
                        "{}: Terminating outdated node.".format(node_id))
            self.event_summarizer.add(
                "Removing {} nodes of type " + self._get_node_type(node_id) +
                " (outdated).",
                quantity=1,
                aggregate=operator.add)
            nodes_to_terminate.append(node_id)

    if nodes_to_terminate:
        self.provider.terminate_nodes(nodes_to_terminate)
        nodes = self.workers()

    # Terminate nodes if there are too many.
    nodes_to_terminate = []
    while (len(nodes) -
           len(nodes_to_terminate)) > self.config["max_workers"] and nodes:
        to_terminate = nodes.pop()
        logger.info("StandardAutoscaler: "
                    "{}: Terminating unneeded node.".format(to_terminate))
        self.event_summarizer.add(
            "Removing {} nodes of type " + self._get_node_type(to_terminate) +
            " (max workers).",
            quantity=1,
            aggregate=operator.add)
        nodes_to_terminate.append(to_terminate)

    if nodes_to_terminate:
        self.provider.terminate_nodes(nodes_to_terminate)
        nodes = self.workers()

    to_launch = self.resource_demand_scheduler.get_nodes_to_launch(
        self.provider.non_terminated_nodes(tag_filters={}),
        self.pending_launches.breakdown(),
        self.load_metrics.get_resource_demand_vector(),
        self.load_metrics.get_resource_utilization(),
        self.load_metrics.get_pending_placement_groups(),
        self.load_metrics.get_static_node_resources_by_ip(),
        ensure_min_cluster_size=self.load_metrics.get_resource_requests())
    for node_type, count in to_launch.items():
        self.launch_new_node(count, node_type=node_type)

    nodes = self.workers()

    # Process any completed updates.
    completed = []
    for node_id, updater in self.updaters.items():
        if not updater.is_alive():
            completed.append(node_id)
    if completed:
        nodes_to_terminate: List[NodeID] = []
        for node_id in completed:
            if self.updaters[node_id].exitcode == 0:
                self.num_successful_updates[node_id] += 1
                # Mark the node as active to prevent the node recovery
                # logic immediately trying to restart Ray on the new node.
                self.load_metrics.mark_active(
                    self.provider.internal_ip(node_id))
            else:
                logger.error(f"StandardAutoscaler: {node_id}: Terminating."
                             " Failed to setup/initialize node.")
                self.event_summarizer.add(
                    "Removing {} nodes of type " +
                    self._get_node_type(node_id) + " (launch failed).",
                    quantity=1,
                    aggregate=operator.add)
                nodes_to_terminate.append(node_id)
                self.num_failed_updates[node_id] += 1
            del self.updaters[node_id]

        if nodes_to_terminate:
            self.provider.terminate_nodes(nodes_to_terminate)
            nodes = self.workers()

    # Update nodes with out-of-date files.
    # TODO(edoakes): Spawning these threads directly seems to cause
    # problems. They should at a minimum be spawned as daemon threads.
    # See https://github.com/ray-project/ray/pull/5903 for more info.
    T = []
    for node_id, commands, ray_start, docker_config in (
            self.should_update(node_id) for node_id in nodes):
        if node_id is not None:
            resources = self._node_resources(node_id)
            logger.debug(f"{node_id}: Starting new thread runner.")
            T.append(
                threading.Thread(
                    target=self.spawn_updater,
                    args=(node_id, commands, ray_start, resources,
                          docker_config)))
    for t in T:
        t.start()
    for t in T:
        t.join()

    # Attempt to recover unhealthy nodes.
    for node_id in nodes:
        self.recover_if_needed(node_id, now)

    logger.info(self.info_string())
    legacy_log_info_string(self, nodes)
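# The TODO above notes that the updater threads should at least be spawned as
# daemon threads so a wedged updater cannot keep the process alive. Below is a
# minimal sketch of that variant, assuming each task is a no-argument callable
# that runs one node update; the helper name is made up for illustration.
import threading


def run_updaters_as_daemons(tasks):
    # daemon=True lets the interpreter exit even if an updater thread hangs.
    threads = [threading.Thread(target=task, daemon=True) for task in tasks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()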
def _update(self):
    now = time.time()
    # Throttle autoscaling updates to this interval to avoid exceeding
    # rate limits on API calls.
    if now - self.last_update_time < self.update_interval_s:
        return
    self.last_update_time = now

    nodes = self.workers()

    self.load_metrics.prune_active_ips([
        self.provider.internal_ip(node_id) for node_id in self.all_workers()
    ])

    # Terminate any idle or out-of-date nodes.
    last_used = self.load_metrics.last_used_time_by_ip
    horizon = now - (60 * self.config["idle_timeout_minutes"])

    nodes_to_terminate: List[NodeID] = []
    node_type_counts = defaultdict(int)
    # Sort based on last used to make sure to keep min_workers that
    # were most recently used. Otherwise, _keep_worker_of_node_type
    # might keep a node that should be terminated.
    sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)
    # Don't terminate nodes needed by request_resources().
    nodes_not_allowed_to_terminate: FrozenSet[NodeID] = frozenset()
    if self.load_metrics.get_resource_requests():
        nodes_not_allowed_to_terminate = \
            self._get_nodes_needed_for_request_resources(sorted_node_ids)

    def keep_node(node_id: NodeID) -> None:
        # Update the per-type counts for nodes we are keeping.
        tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in tags:
            node_type = tags[TAG_RAY_USER_NODE_TYPE]
            node_type_counts[node_type] += 1

    def schedule_node_termination(node_id: NodeID,
                                  reason_opt: Optional[str]) -> None:
        if reason_opt is None:
            raise Exception("reason should not be None.")
        reason: str = reason_opt
        # Log, record an event, and add node_id to nodes_to_terminate.
        logger.info("StandardAutoscaler: "
                    "{}: Terminating {} node.".format(node_id, reason))
        self.event_summarizer.add(
            "Removing {} nodes of type " + self._get_node_type(node_id) +
            " ({}).".format(reason),
            quantity=1,
            aggregate=operator.add)
        nodes_to_terminate.append(node_id)

    # Nodes that we could terminate, if needed.
    nodes_we_could_terminate: List[NodeID] = []

    for node_id in sorted_node_ids:
        # Make sure not to kill an idle node if the number of workers of
        # its type is less than or equal to that type's min_workers, or if
        # the node is needed for request_resources().
        should_keep_or_terminate, reason = self._keep_worker_of_node_type(
            node_id, node_type_counts)
        if should_keep_or_terminate == KeepOrTerminate.terminate:
            schedule_node_termination(node_id, reason)
            continue
        if ((should_keep_or_terminate == KeepOrTerminate.keep
             or node_id in nodes_not_allowed_to_terminate)
                and self.launch_config_ok(node_id)):
            keep_node(node_id)
            continue

        node_ip = self.provider.internal_ip(node_id)
        if node_ip in last_used and last_used[node_ip] < horizon:
            schedule_node_termination(node_id, "idle")
        elif not self.launch_config_ok(node_id):
            schedule_node_termination(node_id, "outdated")
        else:
            keep_node(node_id)
            nodes_we_could_terminate.append(node_id)

    # Terminate nodes if there are too many.
    num_extra_nodes_to_terminate = (
        len(nodes) - len(nodes_to_terminate) - self.config["max_workers"])

    if num_extra_nodes_to_terminate > len(nodes_we_could_terminate):
        logger.warning(
            "StandardAutoscaler: trying to terminate "
            f"{num_extra_nodes_to_terminate} nodes, while only "
            f"{len(nodes_we_could_terminate)} are safe to terminate."
            " Inconsistent config is likely.")
        num_extra_nodes_to_terminate = len(nodes_we_could_terminate)

    # If num_extra_nodes_to_terminate is negative or zero,
    # we would have fewer than max_workers nodes after terminating
    # nodes_to_terminate and we do not need to terminate anything else.
    if num_extra_nodes_to_terminate > 0:
        extra_nodes_to_terminate = nodes_we_could_terminate[
            -num_extra_nodes_to_terminate:]
        for node_id in extra_nodes_to_terminate:
            schedule_node_termination(node_id, "max workers")

    if nodes_to_terminate:
        self._terminate_nodes_and_cleanup(nodes_to_terminate)
        nodes = self.workers()

    to_launch = self.resource_demand_scheduler.get_nodes_to_launch(
        self.provider.non_terminated_nodes(tag_filters={}),
        self.pending_launches.breakdown(),
        self.load_metrics.get_resource_demand_vector(),
        self.load_metrics.get_resource_utilization(),
        self.load_metrics.get_pending_placement_groups(),
        self.load_metrics.get_static_node_resources_by_ip(),
        ensure_min_cluster_size=self.load_metrics.get_resource_requests())
    for node_type, count in to_launch.items():
        self.launch_new_node(count, node_type=node_type)

    if to_launch:
        nodes = self.workers()

    # Process any completed updates.
    completed_nodes = []
    for node_id, updater in self.updaters.items():
        if not updater.is_alive():
            completed_nodes.append(node_id)
    if completed_nodes:
        failed_nodes = []
        for node_id in completed_nodes:
            updater = self.updaters[node_id]
            if updater.exitcode == 0:
                self.num_successful_updates[node_id] += 1
                self.prom_metrics.successful_updates.inc()
                if updater.for_recovery:
                    self.prom_metrics.successful_recoveries.inc()
                if updater.update_time:
                    self.prom_metrics.worker_update_time.observe(
                        updater.update_time)
                # Mark the node as active to prevent the node recovery
                # logic immediately trying to restart Ray on the new node.
                self.load_metrics.mark_active(
                    self.provider.internal_ip(node_id))
            else:
                failed_nodes.append(node_id)
                self.num_failed_updates[node_id] += 1
                self.prom_metrics.failed_updates.inc()
                if updater.for_recovery:
                    self.prom_metrics.failed_recoveries.inc()
                self.node_tracker.untrack(node_id)
            del self.updaters[node_id]

        if failed_nodes:
            # Some nodes in failed_nodes may have been terminated
            # during an update (for being idle after missing a heartbeat).
            # Only terminate currently non-terminated nodes.
            non_terminated_nodes = self.workers()
            nodes_to_terminate: List[NodeID] = []
            for node_id in failed_nodes:
                if node_id in non_terminated_nodes:
                    nodes_to_terminate.append(node_id)
                    logger.error(f"StandardAutoscaler: {node_id}:"
                                 " Terminating. Failed to setup/initialize"
                                 " node.")
                    self.event_summarizer.add(
                        "Removing {} nodes of type " +
                        self._get_node_type(node_id) + " (launch failed).",
                        quantity=1,
                        aggregate=operator.add)
                else:
                    logger.warning(f"StandardAutoscaler: {node_id}:"
                                   " Failed to update node."
                                   " Node has already been terminated.")
            if nodes_to_terminate:
                self._terminate_nodes_and_cleanup(nodes_to_terminate)
                nodes = self.workers()

    # Update nodes with out-of-date files.
    # TODO(edoakes): Spawning these threads directly seems to cause
    # problems. They should at a minimum be spawned as daemon threads.
    # See https://github.com/ray-project/ray/pull/5903 for more info.
    T = []
    for node_id, setup_commands, ray_start_commands, docker_config in (
            self.should_update(node_id) for node_id in nodes):
        if node_id is not None:
            resources = self._node_resources(node_id)
            logger.debug(f"{node_id}: Starting new thread runner.")
            T.append(
                threading.Thread(
                    target=self.spawn_updater,
                    args=(node_id, setup_commands, ray_start_commands,
                          resources, docker_config)))
    for t in T:
        t.start()
    for t in T:
        t.join()

    if self.disable_node_updaters:
        # If updaters are unavailable, terminate unhealthy nodes.
        nodes_to_terminate = self.get_unhealthy_nodes(nodes, now)
        if nodes_to_terminate:
            self._terminate_nodes_and_cleanup(nodes_to_terminate)
            nodes = self.workers()
    else:
        # Attempt to recover unhealthy nodes.
        for node_id in nodes:
            self.recover_if_needed(node_id, now)

    self.prom_metrics.updating_nodes.set(len(self.updaters))
    num_recovering = 0
    for updater in self.updaters.values():
        if updater.for_recovery:
            num_recovering += 1
    self.prom_metrics.recovering_nodes.set(num_recovering)

    logger.info(self.info_string())
    legacy_log_info_string(self, nodes)