Пример #1
0
    def destroy_autoscaler_workers(self):
        """Clean up the autoscaler's workers after an exception in run().

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable.

        Returns immediately when ``self.autoscaler`` is None, and only logs an
        error when ``self.autoscaling_config`` is None. Otherwise the teardown
        is retried, with a two second pause between attempts, until a call to
        ``teardown_cluster`` completes without raising.
        """
        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        while True:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
            except Exception:
                # Record the traceback as well as the message: this loop
                # retries without bound, so a persistent failure must be
                # diagnosable from the logs alone.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)
            else:
                logger.info("Monitor: Workers taken down.")
                break
Пример #2
0
def stop():
    """Tear down the cluster referenced by the current project definition.

    The project definition is loaded with ``load_project_or_throw`` and its
    ``"cluster"`` entry is handed to ``teardown_cluster`` together with
    ``yes=True``, ``workers_only=False`` and no cluster name override.
    """
    cluster_config = load_project_or_throw()["cluster"]
    teardown_cluster(
        cluster_config,
        yes=True,
        workers_only=False,
        override_cluster_name=None,
    )
Пример #3
0
def stop(name):
    """Tear down the current project's cluster, overriding its name.

    The project definition is loaded with ``load_project_or_throw``; the
    result of its ``cluster_yaml()`` call is handed to ``teardown_cluster``
    together with ``yes=True`` and ``workers_only=False``. ``name`` is
    forwarded unchanged as ``override_cluster_name``.
    """
    cluster_yaml = load_project_or_throw().cluster_yaml()
    teardown_cluster(
        cluster_yaml,
        yes=True,
        workers_only=False,
        override_cluster_name=name,
    )
Пример #4
0
 def __do_destroy(self):
     """Tear the cluster down, recording any failure on ``self.destroyer``.

     On success ``self.ready`` is set to False and ``self.config`` to None.
     Nothing is propagated to the caller: any ``BaseException`` is wrapped in
     ``CannotDestroyCluster`` and stored in ``self.destroyer.exc``, and the
     traceback is written to stderr unless ``self.destroyer.silent`` is set.
     """
     try:
         teardown_cluster(
             self.config_file,
             yes=True,
             workers_only=False,
             override_cluster_name=None,
             keep_min_workers=0,
         )
         self.ready = False
         self.config = None
     except BaseException as error:
         failure = CannotDestroyCluster(
             "Cannot destroy cluster",
             cause=error,
             traceback=traceback.format_exc(),
         )
         self.destroyer.exc = failure
         if self.destroyer.silent:
             return
         sys.stderr.write(f"Cannot destroy cluster:\n{traceback.format_exc()}\n")
Пример #5
0
def teardown(cluster_config_file, yes, workers_only, cluster_name,
             keep_min_workers):
    """Tear down a Ray cluster.

    Thin wrapper: all five arguments are forwarded positionally, unchanged
    and in the same order, to ``teardown_cluster``.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
        keep_min_workers,
    )
Пример #6
0
def teardown(cluster_config_file, yes, workers_only, cluster_name):
    """Tear down the Ray cluster.

    Thin wrapper: the four arguments are forwarded positionally, unchanged
    and in the same order, to ``teardown_cluster``.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
    )
Пример #7
0
def teardown(cluster_config_file, yes, workers_only, cluster_name):
    """Forward all arguments, in order, to ``teardown_cluster``."""
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
    )
Пример #8
0
def teardown(cluster_config_file, yes):
    """Forward the config file and ``yes`` flag to ``teardown_cluster``."""
    teardown_cluster(
        cluster_config_file,
        yes,
    )
Пример #9
0
def teardown(cluster_config_file, yes):
    """Forward the config file and ``yes`` flag to ``teardown_cluster``."""
    teardown_cluster(
        cluster_config_file,
        yes,
    )
Пример #10
0
def down(cluster_config_file, yes, workers_only, cluster_name,
         keep_min_workers, log_old_style, log_color, verbose):
    """Tear down a Ray cluster.

    Thin wrapper: all eight arguments, including the three logging-related
    ones, are forwarded positionally, unchanged and in the same order, to
    ``teardown_cluster``.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
        keep_min_workers,
        log_old_style,
        log_color,
        verbose,
    )
Пример #11
0
def teardown(cluster_config_file, yes, workers_only, cluster_name):
    """Tear down the Ray cluster.

    Thin wrapper: the four arguments are forwarded positionally, unchanged
    and in the same order, to ``teardown_cluster``.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
    )