def run_docker_stop(node, container_name):
    try:
        updater = NodeUpdaterThread(
            node_id=node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            file_mounts_contents_hash="",
            is_head_node=False,
            docker_config=config.get("docker"))
        _exec(
            updater,
            f"docker stop {container_name}",
            False,
            False,
            run_env="host")
    except Exception:
        cli_logger.warning(f"Docker stop failed on {node}")
        cli_logger.old_warning(logger, f"Docker stop failed on {node}")
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))
            cli_logger.verbose("Loaded cached config from " + cf.bold("{}"),
                               cache_key)

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if the cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    validate_config(config)

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    with cli_logger.timed(  # todo: better message
            "Bootstrapping {} config",
            PROVIDER_PRETTY_NAMES.get(config["provider"]["type"])):
        resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
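# Illustrative sketch (not part of the original module): the cache read above
# is keyed by a SHA-1 of the prepared config, so the same path can be
# recomputed to inspect or delete a stale cache entry; passing
# no_config_cache=True likewise forces a fresh provider bootstrap. The helper
# name below is hypothetical.
def _example_config_cache_path(config):
    prepared = prepare_config(config)
    hasher = hashlib.sha1()
    hasher.update(json.dumps([prepared], sort_keys=True).encode("utf-8"))
    return os.path.join(tempfile.gettempdir(),
                        "ray-config-{}".format(hasher.hexdigest()))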
def handle_cli_override(key, override):
    if override is not None:
        if key in config:
            nonlocal printed_overrides
            printed_overrides = True
            cli_logger.warning(
                "`{}` override provided on the command line.\n"
                "  Using " + cf.bold("{}") + cf.dimmed(
                    " [configuration file has " + cf.bold("{}") + "]"),
                key, override, config[key])
        config[key] = override
def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time: The time (as reported by this function) or
            None, indicating the last time a CONN_REFUSED error was caught.

            After exceeding a patience value, the program will be aborted
            since SSH will likely never recover.
        retry_interval: The interval after which the command will be retried,
            used here just to inform the user.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if first_conn_refused_time is not None and \
                time.time() - first_conn_refused_time > \
                CONN_REFUSED_PATIENCE:
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.", cf.bold(str(CONN_REFUSED_PATIENCE)))
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessible (e.g. due to "
                           "a firewall).")

        return time.time()

    if e.special_case in ["ssh_timeout", "ssh_conn_refused"]:
        cli_logger.print("SSH still not available, "
                         "retrying in {} seconds.",
                         cf.bold(str(retry_interval)))
    else:
        raise e

    return first_conn_refused_time
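# Illustrative sketch (not part of the original module): handle_ssh_fails() is
# meant to be called from a retry loop that threads its return value back in
# as first_conn_refused_time on the next attempt. The attempt_ssh_command
# callable below is hypothetical; ProcessRunnerException is the exception
# type referenced in the docstring above.
def _example_ssh_retry_loop(attempt_ssh_command, retry_interval=5):
    first_conn_refused_time = None
    while True:
        try:
            return attempt_ssh_command()
        except ProcessRunnerException as e:
            # Aborts after CONN_REFUSED_PATIENCE seconds of refusals,
            # re-raises unrelated failures, and otherwise records when the
            # refusals started before retrying.
            first_conn_refused_time = handle_ssh_fails(
                e, first_conn_refused_time, retry_interval)
            time.sleep(retry_interval)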
def warn_about_bad_start_command(start_commands):
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger,
            "Ray start is not included in the head_start_ray_commands "
            "section.")
    if not any("autoscaling-config" in x for x in ray_start_cmd):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger, "Ray start on the head node does not have the flag "
            "--autoscaling-config set. The head node will not launch "
            "workers. Add --autoscaling-config=~/ray_bootstrap_config.yaml "
            "to ray start in the head_start_ray_commands section.")
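# Illustrative sketch (not part of the original module): the checks above look
# for a literal "ray start" entry and an "autoscaling-config" flag inside it.
# A head_start_ray_commands list like the following would pass both checks
# without emitting warnings.
example_head_start_ray_commands = [
    "ray stop",
    "ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml",
]
warn_about_bad_start_command(example_head_start_ray_commands)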
def _set_ssh_ip_if_required(self):
    if self.ssh_ip is not None:
        return

    # We assume that this never changes.
    # I think that's reasonable.
    deadline = time.time() + NODE_START_WAIT_S
    with LogTimer(self.log_prefix + "Got IP"):
        ip = self.wait_for_ip(deadline)

        cli_logger.doassert(ip is not None,
                            "Could not get node IP.")  # todo: msg
        assert ip is not None, "Unable to find IP of node"

    self.ssh_ip = ip

    # This should run before any SSH commands and therefore ensure that
    # the ControlPath directory exists, allowing SSH to maintain
    # persistent sessions later on.
    try:
        os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
    except OSError as e:
        cli_logger.warning("{}", str(e))  # todo: msg
        cli_logger.old_warning(logger, "{}", str(e))
def create_or_update_cluster(config_file: str,
                             override_min_workers: Optional[int],
                             override_max_workers: Optional[int],
                             no_restart: bool,
                             restart_only: bool,
                             yes: bool,
                             override_cluster_name: Optional[str],
                             no_config_cache: bool = False,
                             redirect_command_output: bool = False,
                             use_login_shells: bool = True) -> None:
    """Creates or updates an autoscaling Ray cluster from a config json."""
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    if use_login_shells:
        cli_logger.warning(
            "Commands running under a login shell can produce more "
            "output than special processing can handle.")
        cli_logger.warning(
            "Thus, the output from subcommands will be logged as is.")
        cli_logger.warning(
            "Consider using {}, {}.", cf.bold("--use-normal-shells"),
            cf.underlined("if you tested your workflow and it is compatible"))
        cli_logger.newline()

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        config = yaml.safe_load(open(config_file).read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        raise
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise

    # todo: validate file_mounts, ssh keys, etc.

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in NODE_PROVIDERS.keys()
                if NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    cli_logger.success("Cluster configuration valid")

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    "  Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    # Disable the cli_logger here if needed
    # because it only supports AWS.
    if config["provider"]["type"] != "aws":
        cli_logger.old_style = True
    cli_logger.newline()

    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name)
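# Illustrative sketch (not part of the original module): a programmatic call
# roughly equivalent to `ray up cluster.yaml --yes`; the "cluster.yaml" path
# is assumed to exist locally.
create_or_update_cluster(
    config_file="cluster.yaml",
    override_min_workers=None,
    override_max_workers=None,
    no_restart=False,
    restart_only=False,
    yes=True,
    override_cluster_name=None)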
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")
            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger,
                                       f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()

                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
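# Illustrative sketch (not part of the original module): a programmatic call
# roughly equivalent to `ray down cluster.yaml --yes`, matching the signature
# of the teardown_cluster defined directly above and again assuming a local
# "cluster.yaml".
teardown_cluster(
    config_file="cluster.yaml",
    yes=True,
    workers_only=False,
    override_cluster_name=None,
    keep_min_workers=False)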
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool, log_old_style: bool,
                     log_color: str, verbose: int):
    """Destroys all nodes of a Ray cluster described by a config json."""
    cli_logger.old_style = log_old_style
    cli_logger.color_mode = log_color
    cli_logger.verbosity = verbose
    cli_logger.dump_command_output = verbose == 3  # todo: add a separate flag?

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")
            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD})

            return head + workers

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()

                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
    finally:
        provider.cleanup()
cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")