def destroy_autoscaler_workers(self):
    """Take down the autoscaler's worker nodes after a failure in run().

    Only the worker nodes are killed; the head node is retained in order
    to keep logs around while keeping costs minimal. This monitor process
    runs on the head node anyway, so cleaning up from here is more
    reliable.

    The method returns immediately when there is no autoscaler, or when
    no autoscaling config is available (the latter is logged as an
    error). Otherwise the teardown is retried every two seconds until it
    succeeds.
    """
    if self.autoscaler is None:
        return  # Nothing to clean up.
    if self.autoscaling_config is None:
        # This is a logic error in the program. Can't do anything.
        logger.error(
            "Monitor: Cleanup failed due to lack of autoscaler config.")
        return

    logger.info("Monitor: Exception caught. Taking down workers...")
    while True:
        try:
            teardown_cluster(
                config_file=self.autoscaling_config,
                yes=True,  # Non-interactive.
                workers_only=True,  # Retain head node for logs.
                override_cluster_name=None,
                keep_min_workers=True,  # Retain minimal amount of workers.
            )
        except Exception:
            # Record the traceback as well as the message: this loop
            # retries indefinitely, so without the cause a persistent
            # failure would be impossible to diagnose from the logs.
            logger.exception("Monitor: Cleanup exception. Trying again...")
            time.sleep(2)
        else:
            logger.info("Monitor: Workers taken down.")
            break
def down(cluster_config_file, yes, workers_only, cluster_name,
         keep_min_workers, log_style, log_color, verbose):
    """Tear down a Ray cluster."""
    # NOTE(review): the docstring above is intentionally left untouched;
    # if this function is registered as a CLI command elsewhere, that text
    # may be what users see as help output -- confirm before rewording.

    # Configure the CLI logger before starting the teardown.
    cli_logger.configure(log_style, log_color, verbose)

    # Forward the cluster-related options positionally, in the same order
    # they were received; the logging options are not passed on.
    teardown_args = (
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
        keep_min_workers,
    )
    teardown_cluster(*teardown_args)
def _teardown(self) -> None:
    """Tear down the cluster described by ``self.config_path``.

    Delegates to ``commands.teardown_cluster`` with confirmation
    pre-answered (``yes=True``), without restricting the teardown to
    workers (``workers_only=False``), without overriding the cluster name,
    and without keeping the minimum number of workers
    (``keep_min_workers=False``).
    """
    teardown_options = {
        "yes": True,
        "workers_only": False,
        "override_cluster_name": None,
        "keep_min_workers": False,
    }
    commands.teardown_cluster(self.config_path, **teardown_options)
def teardown_cluster(cluster_config: Union[dict, str],
                     workers_only: bool = False,
                     keep_min_workers: bool = False) -> None:
    """Destroys nodes of a Ray cluster described by a config json.

    With the default arguments every node of the cluster is destroyed,
    which is what this function did before the optional flags existed.

    Args:
        cluster_config (Union[str, dict]): Either the config dict of the
            cluster, or a path pointing to a file containing the config.
        workers_only (bool): Whether to keep the head node running and
            only teardown worker nodes. Defaults to False.
        keep_min_workers (bool): Whether to keep min_workers (as specified
            in the YAML) still running. Defaults to False.
    """
    # NOTE(review): another definition of teardown_cluster in this file
    # enters _as_config_file(...) as a context manager, whereas here its
    # result is passed straight through (unchanged from before). Confirm
    # which contract _as_config_file actually has.
    return commands.teardown_cluster(
        config_file=_as_config_file(cluster_config),
        yes=True,  # Never prompt; this is a programmatic entry point.
        workers_only=workers_only,
        override_cluster_name=None,
        keep_min_workers=keep_min_workers)
def teardown_cluster(cluster_config: Union[dict, str],
                     workers_only: bool = False,
                     keep_min_workers: bool = False) -> None:
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        cluster_config (Union[str, dict]): The cluster's config, given
            either as a dict or as a path to a file holding the config.
        workers_only (bool): If True, leave the head node running and
            tear down only the worker nodes.
        keep_min_workers (bool): If True, leave min_workers (as specified
            in the YAML) still running.
    """
    # The config file handle is only valid inside the ``with`` block, so
    # the teardown call (and its return) must stay within it.
    with _as_config_file(cluster_config) as config_path:
        return commands.teardown_cluster(
            config_file=config_path,
            yes=True,
            workers_only=workers_only,
            override_cluster_name=None,
            keep_min_workers=keep_min_workers,
        )