def _setup_autoscaler(self):
    """Build an autoscaler wired to this test's mock provider and runner.

    Loads the cluster config, creates the head node through the provider,
    builds LoadMetrics from the head IP, and constructs a StandardAutoscaler
    that starts no launcher threads of its own (max_concurrent_launches=0).
    A single NodeLauncher is created manually — and deliberately not started
    as a thread — so the test controls exactly when queued launches run.
    """
    self.runner = MockProcessRunner()
    # Use a context manager so the config file handle is closed promptly;
    # the previous `open(...).read()` left the handle open until GC.
    with open(self.config_path) as config_file:
        self.config = yaml.safe_load(config_file.read())
    self.provider.create_node(
        {},
        {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
        },
        1,
    )
    self.head_ip = self.provider.non_terminated_node_ips({})[0]
    self.load_metrics = LoadMetrics(local_ip=self.head_ip)
    self.autoscaler = StandardAutoscaler(
        self.config_path,
        self.load_metrics,
        # Don't let the autoscaler start any node launchers. Instead, we
        # will launch nodes ourself after every update call.
        max_concurrent_launches=0,
        max_failures=0,
        process_runner=self.runner,
        update_interval_s=0,
    )
    # Manually create a node launcher. Note that we won't start it as a
    # separate thread.
    self.node_launcher = NodeLauncher(
        provider=self.autoscaler.provider,
        queue=self.autoscaler.launch_queue,
        index=0,
        pending=self.autoscaler.pending_launches,
        node_types=self.autoscaler.available_node_types,
    )
def __init__(self,
             config_path,
             load_metrics,
             max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
    """Create a StandardAutoscaler.

    Args:
        config_path: Path to a Ray Autoscaler YAML.
        load_metrics: Provides metrics for the Ray cluster.
        max_launch_batch: Max number of nodes to launch in one request.
        max_concurrent_launches: Max number of nodes that can be
            concurrently launched. This value and `max_launch_batch`
            determine the number of launcher threads that are started.
        max_failures: Number of failures that the autoscaler will
            tolerate before exiting.
        process_runner: Subprocess-like interface used by the
            CommandRunner.
        update_interval_s: Seconds between running the autoscaling loop.

    Raises:
        ValueError: If a local file-mount path does not exist.
    """
    self.config_path = config_path
    # Keep this before self.reset (self.provider needs to be created
    # exactly once).
    self.provider = None
    self.resource_demand_scheduler = None
    # reset() loads the config and creates self.provider / self.config.
    self.reset(errors_fatal=True)
    self.load_metrics = load_metrics
    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner

    # Map from node_id to NodeUpdater processes
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s

    # Node launchers: enough daemon threads so that launching in batches
    # of max_launch_batch can saturate max_concurrent_launches.
    self.launch_queue = queue.Queue()
    self.pending_launches = ConcurrentCounter()
    max_batches = math.ceil(
        max_concurrent_launches / float(max_launch_batch))
    for i in range(int(max_batches)):
        node_launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            index=i,
            pending=self.pending_launches,
            node_types=self.available_node_types,
        )
        node_launcher.daemon = True
        node_launcher.start()

    # Expand local file_mounts to allow ~ in the paths. This can't be done
    # earlier when the config is written since we might be on different
    # platform and the expansion would result in wrong path.
    self.config["file_mounts"] = {
        remote: os.path.expanduser(local)
        for remote, local in self.config["file_mounts"].items()
    }
    for local_path in self.config["file_mounts"].values():
        # Validate explicitly rather than with `assert`, which would be
        # silently stripped under `python -O`.
        if not os.path.exists(local_path):
            raise ValueError(
                "file_mounts source path does not exist: {}".format(
                    local_path))

    # List of resource bundles the user is requesting of the cluster.
    self.resource_demand_vector = []

    logger.info("StandardAutoscaler: {}".format(self.config))
def __init__(
        self,
        config_path: str,
        load_metrics: LoadMetrics,
        max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
        max_failures: int = AUTOSCALER_MAX_NUM_FAILURES,
        process_runner: Any = subprocess,
        update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S,
        prefix_cluster_info: bool = False,
        event_summarizer: Optional[EventSummarizer] = None,
        prom_metrics: Optional[AutoscalerPrometheusMetrics] = None):
    """Create a StandardAutoscaler.

    Args:
        config_path: Path to a Ray Autoscaler YAML.
        load_metrics: Provides metrics for the Ray cluster.
        max_launch_batch: Max number of nodes to launch in one request.
        max_concurrent_launches: Max number of nodes that can be
            concurrently launched. This value and `max_launch_batch`
            determine the number of batches that are used to launch nodes.
        max_failures: Number of failures that the autoscaler will
            tolerate before exiting.
        process_runner: Subprocess-like interface used by the
            CommandRunner.
        update_interval_s: Seconds between running the autoscaling loop.
        prefix_cluster_info: Whether to add the cluster name to info
            strings.
        event_summarizer: Utility to consolidate duplicated messages.
        prom_metrics: Prometheus metrics for autoscaler-related operations.

    Raises:
        ValueError: If a local file-mount path does not exist.
    """
    self.config_path = config_path
    # Prefix each line of info string with cluster name if True
    self.prefix_cluster_info = prefix_cluster_info
    # Keep this before self.reset (self.provider needs to be created
    # exactly once).
    self.provider = None
    # Keep this before self.reset (if an exception occurs in reset
    # then prom_metrics must be instantiated to increment the
    # exception counter)
    self.prom_metrics = prom_metrics or \
        AutoscalerPrometheusMetrics()
    self.resource_demand_scheduler = None
    # reset() loads the config and creates self.provider / self.config.
    self.reset(errors_fatal=True)
    self.head_node_ip = load_metrics.local_ip
    self.load_metrics = load_metrics
    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner
    self.event_summarizer = event_summarizer or EventSummarizer()

    # Map from node_id to NodeUpdater threads
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s

    # Tracks active worker nodes
    self.workers = []
    # Tracks nodes scheduled for termination
    self.nodes_to_terminate = []

    # Disable NodeUpdater threads if true.
    # Should be set to true in situations where another component, such as
    # a Kubernetes operator, is responsible for Ray setup on nodes.
    self.disable_node_updaters = self.config["provider"].get(
        "disable_node_updaters", False)

    # Node launchers: enough daemon threads so that launching in batches
    # of max_launch_batch can saturate max_concurrent_launches.
    self.launch_queue = queue.Queue()
    self.pending_launches = ConcurrentCounter()
    max_batches = math.ceil(
        max_concurrent_launches / float(max_launch_batch))
    for i in range(int(max_batches)):
        node_launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            index=i,
            pending=self.pending_launches,
            node_types=self.available_node_types,
            prom_metrics=self.prom_metrics)
        node_launcher.daemon = True
        node_launcher.start()

    # NodeTracker maintains soft state to track the number of recently
    # failed nodes. It is best effort only.
    self.node_tracker = NodeTracker()

    # Expand local file_mounts to allow ~ in the paths. This can't be done
    # earlier when the config is written since we might be on different
    # platform and the expansion would result in wrong path.
    self.config["file_mounts"] = {
        remote: os.path.expanduser(local)
        for remote, local in self.config["file_mounts"].items()
    }
    for local_path in self.config["file_mounts"].values():
        # Validate explicitly rather than with `assert`, which would be
        # silently stripped under `python -O`.
        if not os.path.exists(local_path):
            raise ValueError(
                "file_mounts source path does not exist: {}".format(
                    local_path))

    logger.info("StandardAutoscaler: {}".format(self.config))
def __init__(self,
             config_path,
             load_metrics,
             max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
             prefix_cluster_info=False,
             event_summarizer=None,
             prom_metrics=None):
    """Create a StandardAutoscaler.

    Args:
        config_path: Path to a Ray Autoscaler YAML.
        load_metrics: Provides metrics for the Ray cluster.
        max_launch_batch: Max number of nodes to launch in one request.
        max_concurrent_launches: Max number of nodes that can be
            concurrently launched. This value and `max_launch_batch`
            determine the number of batches that are used to launch nodes.
        max_failures: Number of failures that the autoscaler will
            tolerate before exiting.
        process_runner: Subprocess-like interface used by the
            CommandRunner.
        update_interval_s: Seconds between running the autoscaling loop.
        prefix_cluster_info: Whether to add the cluster name to info
            strings.
        event_summarizer: Utility to consolidate duplicated messages.
        prom_metrics: Prometheus metrics for autoscaler-related operations.

    Raises:
        ValueError: If a local file-mount path does not exist.
    """
    self.config_path = config_path
    # Prefix each line of info string with cluster name if True
    self.prefix_cluster_info = prefix_cluster_info
    # Keep this before self.reset (self.provider needs to be created
    # exactly once).
    self.provider = None
    # Keep this before self.reset (if an exception occurs in reset
    # then prom_metrics must be instantiated to increment the
    # exception counter)
    self.prom_metrics = prom_metrics or \
        AutoscalerPrometheusMetrics()
    self.resource_demand_scheduler = None
    # reset() loads the config and creates self.provider / self.config.
    self.reset(errors_fatal=True)
    self.head_node_ip = load_metrics.local_ip
    self.load_metrics = load_metrics
    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner
    self.event_summarizer = event_summarizer or EventSummarizer()

    # Map from node_id to NodeUpdater processes
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s

    # Node launchers: enough daemon threads so that launching in batches
    # of max_launch_batch can saturate max_concurrent_launches.
    self.launch_queue = queue.Queue()
    self.pending_launches = ConcurrentCounter()
    max_batches = math.ceil(
        max_concurrent_launches / float(max_launch_batch))
    for i in range(int(max_batches)):
        node_launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            index=i,
            pending=self.pending_launches,
            node_types=self.available_node_types,
            prom_metrics=self.prom_metrics)
        node_launcher.daemon = True
        node_launcher.start()

    # NodeTracker maintains soft state to track the number of recently
    # failed nodes. It is best effort only.
    self.node_tracker = NodeTracker()

    # Expand local file_mounts to allow ~ in the paths. This can't be done
    # earlier when the config is written since we might be on different
    # platform and the expansion would result in wrong path.
    self.config["file_mounts"] = {
        remote: os.path.expanduser(local)
        for remote, local in self.config["file_mounts"].items()
    }
    for local_path in self.config["file_mounts"].values():
        # Validate explicitly rather than with `assert`, which would be
        # silently stripped under `python -O`.
        if not os.path.exists(local_path):
            raise ValueError(
                "file_mounts source path does not exist: {}".format(
                    local_path))

    logger.info("StandardAutoscaler: {}".format(self.config))