def handle_cli_override(key, override):
    if override is not None:
        if key in config:
            nonlocal printed_overrides
            printed_overrides = True
            cli_logger.warning(
                "`{}` override provided on the command line.\n"
                "  Using " + cf.bold("{}") + cf.dimmed(
                    " [configuration file has " + cf.bold("{}") + "]"),
                key, override, config[key])

        config[key] = override
def rsync_down(self, source, target, docker_mount_if_possible=False):
    cli_logger.old_info(logger, "{}Syncing {} from {}...", self.log_prefix,
                        source, target)

    options = {}
    options["docker_mount_if_possible"] = docker_mount_if_possible
    options["rsync_exclude"] = self.rsync_options.get("rsync_exclude")
    options["rsync_filter"] = self.rsync_options.get("rsync_filter")
    self.cmd_runner.run_rsync_down(source, target, options=options)
    cli_logger.verbose("`rsync`ed {} (remote) to {} (local)",
                       cf.bold(source), cf.bold(target))
def wait_ready(self, deadline):
    with cli_logger.group(
            "Waiting for SSH to become available",
            _numbered=("[]", 1, NUM_SETUP_STEPS)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
            first_conn_refused_time = None

            while True:
                if time.time() > deadline:
                    raise Exception("wait_ready timeout exceeded.")
                if self.provider.is_terminated(self.node_id):
                    raise Exception("wait_ready aborting because node "
                                    "detected as terminated.")

                try:
                    # Run outside of the container
                    self.cmd_runner.run("uptime", timeout=5, run_env="host")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    first_conn_refused_time = \
                        cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL)
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later
                    retry_str = "(" + str(e) + ")"
                    if hasattr(e, "cmd"):
                        if isinstance(e.cmd, str):
                            cmd_ = e.cmd
                        elif isinstance(e.cmd, list):
                            cmd_ = " ".join(e.cmd)
                        else:
                            logger.debug(f"e.cmd type ({type(e.cmd)}) not "
                                         "list or str.")
                            cmd_ = str(e.cmd)
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, cmd_)

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))

                    time.sleep(READY_CHECK_INTERVAL)
def run(self):
    update_start_time = time.time()
    if (cmd_output_util.does_allow_interactive()
            and cmd_output_util.is_output_redirected()):
        # this is most probably a bug since the user has no control
        # over these settings
        msg = ("Output was redirected for an interactive command. "
               "Either do not pass `--redirect-command-output` "
               "or also pass in `--use-normal-shells`.")
        cli_logger.abort(msg)

    try:
        with LogTimer(self.log_prefix +
                      "Applied config {}".format(self.runtime_hash)):
            self.do_update()
    except Exception as e:
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd), e.returncode)
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
            # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[
            TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.update_time = time.time() - update_start_time
    self.exitcode = 0
def wait_ready(self, deadline):
    with cli_logger.group(
            "Waiting for SSH to become available",
            _numbered=("[]", 1, 6)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.old_info(logger, "{}Waiting for remote shell...",
                                self.log_prefix)

            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
            first_conn_refused_time = None

            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                try:
                    cli_logger.old_debug(logger,
                                         "{}Waiting for remote shell...",
                                         self.log_prefix)

                    # Run outside of the container
                    self.cmd_runner.run("uptime", timeout=5, run_env="host")
                    cli_logger.old_debug(logger, "Uptime succeeded.")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    first_conn_refused_time = \
                        cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL)
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later
                    retry_str = str(e)
                    if hasattr(e, "cmd"):
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, " ".join(e.cmd))

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))
                    cli_logger.old_debug(logger,
                                         "{}Node not up, retrying: {}",
                                         self.log_prefix, retry_str)

                    time.sleep(READY_CHECK_INTERVAL)

    assert False, "Unable to connect to node"
def resource_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs):
    cli_logger.verbose("Creating AWS resource `{}` in `{}`", cf.bold(name),
                       cf.bold(region))
    kwargs.setdefault(
        "config",
        Config(retries={"max_attempts": max_retries}),
    )
    return boto3.resource(
        name,
        region,
        **kwargs,
    )
def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time:
            The time (as reported by this function) or None,
            indicating the last time a CONN_REFUSED error was caught.

            After exceeding a patience value, the program will be aborted
            since SSH will likely never recover.
        retry_interval: The interval after which the command will be retried,
            used here just to inform the user.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if (first_conn_refused_time is not None
                and time.time() - first_conn_refused_time
                > CONN_REFUSED_PATIENCE):
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.", cf.bold(str(CONN_REFUSED_PATIENCE)))
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessible (e.g. due to "
                           "a firewall).")

        return time.time()

    if e.special_case in ["ssh_timeout", "ssh_conn_refused"]:
        cli_logger.print("SSH still not available, "
                         "retrying in {} seconds.",
                         cf.bold(str(retry_interval)))
    else:
        raise e

    return first_conn_refused_time
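# Illustrative only: a minimal sketch of the calling pattern handle_ssh_fails
# expects, mirroring the wait_ready loop above. `check_uptime` is a
# hypothetical stand-in for `self.cmd_runner.run("uptime", ...)`; the key point
# is that the returned timestamp is threaded back into the next call so the
# CONN_REFUSED patience window is measured across retries.
def _example_ssh_retry_loop(check_uptime):
    first_conn_refused_time = None
    while True:
        try:
            check_uptime()
            return True
        except ProcessRunnerError as e:
            first_conn_refused_time = handle_ssh_fails(
                e,
                first_conn_refused_time,
                retry_interval=READY_CHECK_INTERVAL)
            time.sleep(READY_CHECK_INTERVAL)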
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    validate_config(config)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])
def client_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs):
    try:
        # try to re-use a client from the resource cache first
        return resource_cache(name, region, max_retries, **kwargs).meta.client
    except ResourceNotExistsError:
        # fall back for clients without an associated resource
        cli_logger.verbose("Creating AWS client `{}` in `{}`", cf.bold(name),
                           cf.bold(region))
        kwargs.setdefault(
            "config",
            Config(retries={"max_attempts": max_retries}),
        )
        return boto3.client(
            name,
            region,
            **kwargs,
        )
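# Illustrative only: a typical call would name a service and region (the
# region string below is a placeholder). Services without an associated boto3
# resource API fall through to the plain-client branch above.
def _example_cached_ec2_client():
    return client_cache("ec2", "us-west-2")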
def handle_yaml_error(e):
    cli_logger.error("Cluster config invalid")
    cli_logger.newline()
    cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                     config_file)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("PyYAML error:"):
        cli_logger.error(e)
    cli_logger.abort()
def run_rsync_up(self, source, target, options=None):
    self._set_ssh_ip_if_required()
    command = [
        "rsync", "--rsh",
        subprocess.list2cmdline(
            ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)),
        "-avz", source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)
    ]
    cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
    self._run_helper(command, silent=is_rsync_silent())
def _get_key(key_name, config):
    ec2 = _resource("ec2", config)

    try:
        for key in ec2.key_pairs.filter(Filters=[{
                "Name": "key-name",
                "Values": [key_name]
        }]):
            if key.name == key_name:
                return key
    except botocore.exceptions.ClientError as exc:
        handle_boto_error(exc, "Failed to fetch EC2 key pair {} from AWS.",
                          cf.bold(key_name))
        raise exc
def run_cmd_redirected(cmd,
                       process_runner=subprocess,
                       silent=False,
                       use_login_shells=False):
    """Run a command and optionally redirect output to a file.

    Args:
        cmd (List[str]): Command to run.
        process_runner: Process runner used for executing commands.
        silent: If true, the command output will be silenced completely
            (redirected to /dev/null), unless verbose logging is enabled.
            Use this for running utility commands like rsync.
        use_login_shells: If true, run the command in a login shell.
    """
    if silent and cli_logger.verbosity < 1:
        return _run_and_process_output(
            cmd,
            process_runner=process_runner,
            stdout_file=process_runner.DEVNULL,
            stderr_file=process_runner.DEVNULL,
            use_login_shells=use_login_shells)

    if not is_output_redirected():
        return _run_and_process_output(
            cmd,
            process_runner=process_runner,
            stdout_file=sys.stdout,
            stderr_file=sys.stderr,
            use_login_shells=use_login_shells)
    else:
        tmpfile_path = os.path.join(
            tempfile.gettempdir(), "ray-up-{}-{}.txt".format(
                cmd[0], time.time()))
        with open(
                tmpfile_path,
                mode="w",
                # line buffering
                buffering=1,
        ) as tmp:
            cli_logger.verbose("Command stdout is redirected to {}",
                               cf.bold(tmp.name))

            return _run_and_process_output(
                cmd,
                process_runner=process_runner,
                stdout_file=tmp,
                stderr_file=tmp,
                use_login_shells=use_login_shells)
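# Illustrative only: a hedged usage sketch. Utility commands such as rsync are
# passed with silent=True so their output is suppressed unless the user raised
# verbosity; the argument list below is a placeholder command.
def _example_silent_utility_command():
    return run_cmd_redirected(
        ["rsync", "--version"],
        process_runner=subprocess,
        silent=True,
        use_login_shells=False)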
def _get_role(role_name, config):
    iam = _resource("iam", config)
    role = iam.Role(role_name)
    try:
        role.load()
        return role
    except botocore.exceptions.ClientError as exc:
        if exc.response.get("Error", {}).get("Code") == "NoSuchEntity":
            return None
        else:
            handle_boto_error(
                exc, "Failed to fetch IAM role data for {} from AWS.",
                cf.bold(role_name))
            raise exc
def kill_node(config_file, yes, hard, override_cluster_name):
    """Kills a random Raylet worker."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "A random node will be killed.")
    cli_logger.old_confirm("This will kill a node in your cluster", yes)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        })
        node = random.choice(nodes)
        cli_logger.print("Shutdown " + cf.bold("{}"), node)
        cli_logger.old_info(logger, "kill_node: Shutdown worker {}", node)
        if hard:
            provider.terminate_node(node)
        else:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
                file_mounts_contents_hash="",
                is_head_node=False,
                docker_config=config.get("docker"))

            _exec(updater, "ray stop", False, False)

        time.sleep(POLL_INTERVAL)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            node_ip = provider.internal_ip(node)
        else:
            node_ip = provider.external_ip(node)
    finally:
        provider.cleanup()

    return node_ip
def up(cluster_config_file, min_workers, max_workers, no_restart,
       restart_only, yes, cluster_name, no_config_cache,
       redirect_command_output, use_login_shells, log_style, log_color,
       verbose):
    """Create or update a Ray cluster."""
    cli_logger.configure(log_style, log_color, verbose)

    if restart_only or no_restart:
        cli_logger.doassert(restart_only != no_restart,
                            "`{}` is incompatible with `{}`.",
                            cf.bold("--restart-only"),
                            cf.bold("--no-restart"))
        assert restart_only != no_restart, "Cannot set both 'restart_only' " \
            "and 'no_restart' at the same time!"

    if urllib.parse.urlparse(cluster_config_file).scheme in ("http", "https"):
        try:
            response = urllib.request.urlopen(cluster_config_file, timeout=5)
            content = response.read()
            file_name = cluster_config_file.split("/")[-1]
            with open(file_name, "wb") as f:
                f.write(content)
            cluster_config_file = file_name
        except urllib.error.HTTPError as e:
            cli_logger.warning("{}", str(e))
            cli_logger.warning(
                "Could not download remote cluster configuration file.")

    create_or_update_cluster(
        config_file=cluster_config_file,
        override_min_workers=min_workers,
        override_max_workers=max_workers,
        no_restart=no_restart,
        restart_only=restart_only,
        yes=yes,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        redirect_command_output=redirect_command_output,
        use_login_shells=use_login_shells)
def _get_instance_profile(profile_name, config):
    iam = _resource("iam", config)
    profile = iam.InstanceProfile(profile_name)
    try:
        profile.load()
        return profile
    except botocore.exceptions.ClientError as exc:
        if exc.response.get("Error", {}).get("Code") == "NoSuchEntity":
            return None
        else:
            handle_boto_error(
                exc,
                "Failed to fetch IAM instance profile data for {} from AWS.",
                cf.bold(profile_name))
            raise exc
def warn_about_bad_start_command(start_commands: List[str]) -> None:
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
    if not any("autoscaling-config" in x for x in ray_start_cmd):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
def _create_security_group(config, vpc_id, group_name):
    client = _client("ec2", config)
    client.create_security_group(
        Description="Auto-created security group for Ray workers",
        GroupName=group_name,
        VpcId=vpc_id)
    security_group = _get_security_group(config, vpc_id, group_name)
    cli_logger.doassert(security_group,
                        "Failed to create security group")  # err msg

    cli_logger.verbose(
        "Created new security group {}",
        cf.bold(security_group.group_name),
        _tags=dict(id=security_group.id))
    cli_logger.doassert(security_group,
                        "Failed to create security group")  # err msg

    assert security_group, "Failed to create security group"
    return security_group
def _wait_for_ip(self, deadline):
    # if we have IP do not print waiting info
    ip = self._get_node_ip()
    if ip is not None:
        cli_logger.labeled_value("Fetched IP", ip)
        return ip

    interval = 10
    with cli_logger.group("Waiting for IP"):
        while time.time() < deadline and \
                not self.provider.is_terminated(self.node_id):
            ip = self._get_node_ip()
            if ip is not None:
                cli_logger.labeled_value("Received", ip)
                return ip
            cli_logger.print("Not yet available, retrying in {} seconds",
                             cf.bold(str(interval)))
            time.sleep(interval)

    return None
def warn_about_bad_start_command(start_commands):
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger,
            "Ray start is not included in the head_start_ray_commands "
            "section.")
    if not any("autoscaling-config" in x for x in ray_start_cmd):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger, "Ray start on the head node does not have the flag "
            "--autoscaling-config set. The head node will not launch "
            "workers. Add --autoscaling-config=~/ray_bootstrap_config.yaml "
            "to ray start in the head_start_ray_commands section.")
def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + NODE_START_WAIT_S
    self.wait_ready(deadline)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
        # When resuming from a stopped instance the runtime_hash may be the
        # same, but the container will not be started.
        self.cmd_runner.run_init(
            as_head=self.is_head_node, file_mounts=self.file_mounts)

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            not self.file_mounts_contents_hash
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initialization and setup commands.",
            _numbered=("[]", "2-5", 6))
        cli_logger.old_info(logger,
                            "{}{} already up-to-date, skip to ray start",
                            self.log_prefix, self.node_id)
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6))

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 3, 5)):
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            try:
                                # Overriding the existing SSHOptions class
                                # with a new SSHOptions class that uses
                                # this ssh_private_key as its only __init__
                                # argument.
                                # Run outside docker.
                                self.cmd_runner.run(
                                    cmd,
                                    ssh_options_override_ssh_key=self.
                                    auth_config.get("ssh_private_key"),
                                    run_env="host")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error("See above for stderr.")

                                raise click.ClickException(
                                    "Initialization command failed."
                                ) from None
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 3, 6))

            self.cmd_runner.run_init(
                as_head=self.is_head_node, file_mounts=self.file_mounts)

            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        # todo: fix command numbering
                        _numbered=("[]", 4, 6)):
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):

                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}",
                                cmd_to_print,
                                _numbered=("()", i, total))

                            try:
                                # Runs in the container if docker is in use
                                self.cmd_runner.run(cmd, run_env="auto")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error("See above for stderr.")

                                raise click.ClickException(
                                    "Setup command failed.")
            else:
                cli_logger.print(
                    "No setup commands to run.", _numbered=("[]", 4, 6))

    with cli_logger.group(
            "Starting the Ray runtime", _numbered=("[]", 6, 6)):
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                if self.node_resources:
                    env_vars = {
                        RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources
                    }
                else:
                    env_vars = {}
                try:
                    old_redirected = cmd_output_util.is_output_redirected()
                    cmd_output_util.set_output_redirected(False)
                    # Runs in the container if docker is in use
                    self.cmd_runner.run(
                        cmd,
                        environment_variables=env_vars,
                        run_env="auto")
                    cmd_output_util.set_output_redirected(old_redirected)
                except ProcessRunnerError as e:
                    if e.msg_type == "ssh_command_failed":
                        cli_logger.error("Failed.")
                        cli_logger.error("See above for stderr.")

                    raise click.ClickException("Start command failed.")
def run(
        self,
        cmd,
        timeout=120,
        exit_on_fail=False,
        port_forward=None,
        with_output=False,
        environment_variables: Dict[str, object] = None,
        run_env="auto",  # Unused argument.
        ssh_options_override_ssh_key="",
        shutdown_after_run=False,
        silent=False):
    if shutdown_after_run:
        cmd += "; sudo shutdown -h now"
    if ssh_options_override_ssh_key:
        ssh_options = SSHOptions(ssh_options_override_ssh_key)
    else:
        ssh_options = self.ssh_options

    assert isinstance(
        ssh_options, SSHOptions
    ), "ssh_options must be of type SSHOptions, got {}".format(
        type(ssh_options))

    self._set_ssh_ip_if_required()

    if is_using_login_shells():
        ssh = ["ssh", "-tt"]
    else:
        ssh = ["ssh"]

    if port_forward:
        with cli_logger.group("Forwarding ports"):
            if not isinstance(port_forward, list):
                port_forward = [port_forward]
            for local, remote in port_forward:
                cli_logger.verbose(
                    "Forwarding port {} to port {} on localhost.",
                    cf.bold(local), cf.bold(remote))  # todo: msg
                ssh += ["-L", "{}:localhost:{}".format(remote, local)]

    final_cmd = ssh + ssh_options.to_ssh_options_list(timeout=timeout) + [
        "{}@{}".format(self.ssh_user, self.ssh_ip)
    ]
    if cmd:
        if environment_variables:
            cmd = _with_environment_variables(cmd, environment_variables)
        if is_using_login_shells():
            final_cmd += _with_interactive(cmd)
        else:
            final_cmd += [cmd]
    else:
        # We do this because `-o ControlMaster` causes the `-N` flag to
        # still create an interactive shell in some ssh versions.
        final_cmd.append("while true; do sleep 86400; done")

    cli_logger.verbose("Running `{}`", cf.bold(cmd))
    with cli_logger.indented():
        cli_logger.very_verbose("Full command is `{}`",
                                cf.bold(" ".join(final_cmd)))

    if cli_logger.verbosity > 0:
        with cli_logger.indented():
            return self._run_helper(
                final_cmd, with_output, exit_on_fail, silent=silent)
    else:
        return self._run_helper(
            final_cmd, with_output, exit_on_fail, silent=silent)
def get_or_create_head_node(config: Dict[str, Any],
                            config_file: str,
                            no_restart: bool,
                            restart_only: bool,
                            yes: bool,
                            override_cluster_name: Optional[str],
                            _provider: Optional[NodeProvider] = None,
                            _runner: ModuleType = subprocess) -> None:
    """Create the cluster head node, which in turn creates the workers."""
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    config = copy.deepcopy(config)
    config_file = os.path.abspath(config_file)

    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        cli_logger.confirm(
            yes,
            "No head node found. "
            "Launching a new cluster.",
            _abort=True)

    if head_node:
        if restart_only:
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "restarting the cluster Ray runtime. "
                "Setup commands will not be run due to `{}`.\n",
                cf.bold("--restart-only"),
                _abort=True)
        elif no_restart:
            cli_logger.print(
                "Cluster Ray runtime will not be restarted due "
                "to `{}`.", cf.bold("--no-restart"))
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "running setup commands.",
                _abort=True)
        else:
            cli_logger.print(
                "Updating cluster configuration and running full setup.")
            cli_logger.confirm(
                yes,
                cf.bold("Cluster Ray runtime will be restarted."),
                _abort=True)

    cli_logger.newline()

    # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
    head_node_config = copy.deepcopy(config["head_node"])
    if "head_node_type" in config:
        head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
        head_node_config.update(config["available_node_types"][config[
            "head_node_type"]]["node_config"])

    launch_hash = hash_launch_conf(head_node_config, config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        with cli_logger.group("Acquiring an up-to-date head node"):
            if head_node is not None:
                cli_logger.print(
                    "Currently running head node is out-of-date with "
                    "cluster configuration")
                cli_logger.print(
                    "hash is {}, expected {}",
                    cf.bold(
                        provider.node_tags(head_node)
                        .get(TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                cli_logger.confirm(yes, "Relaunching it.", _abort=True)

                provider.terminate_node(head_node)
                cli_logger.print("Terminated head node {}", head_node)

            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            provider.create_node(head_node_config, head_node_tags, 1)
            cli_logger.print("Launched a new head node")

            start = time.time()
            head_node = None
            with cli_logger.group("Fetching the new head node"):
                while True:
                    if time.time() - start > 50:
                        cli_logger.abort(
                            "Head node fetch timed out.")  # todo: msg
                        raise RuntimeError("Failed to create head node.")

                    nodes = provider.non_terminated_nodes(head_node_tags)
                    if len(nodes) == 1:
                        head_node = nodes[0]
                        break

                    time.sleep(POLL_INTERVAL)
            cli_logger.newline()

    with cli_logger.group(
            "Setting up head node",
            _numbered=("<>", 1, 1),
            # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
            _tags=dict()):  # add id, ARN to tags?

        # TODO(ekl) right now we always update the head node even if the
        # hash matches.
        # We could prompt the user for what they want to do here.

        # No need to pass in cluster_sync_files because we use this
        # hash to set up the head node
        (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
            config["file_mounts"], None, config)

        # Rewrite the auth config so that the head
        # node can update the workers
        remote_config = copy.deepcopy(config)

        # drop proxy options if they exist, otherwise
        # head node won't be able to connect to workers
        remote_config["auth"].pop("ssh_proxy_command", None)

        if "ssh_private_key" in config["auth"]:
            remote_key_path = "~/ray_bootstrap_key.pem"
            remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        new_mounts = {}
        for remote_path in config["file_mounts"]:
            new_mounts[remote_path] = remote_path
        remote_config["file_mounts"] = new_mounts
        remote_config["no_restart"] = no_restart

        remote_config = provider.prepare_for_head_node(remote_config)

        # Now inject the rewritten config and SSH key into the head node
        remote_config_file = tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-")
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })

        if "ssh_private_key" in config["auth"]:
            config["file_mounts"].update({
                remote_key_path: config["auth"]["ssh_private_key"],
            })
        cli_logger.print("Prepared bootstrap config")

        if restart_only:
            setup_commands = []
            ray_start_commands = config["head_start_ray_commands"]
        elif no_restart:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = []
        else:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            process_runner=_runner,
            runtime_hash=runtime_hash,
            file_mounts_contents_hash=file_mounts_contents_hash,
            is_head_node=True,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
            docker_config=config.get("docker"))
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if updater.exitcode != 0:
            # todo: this does not follow the mockup and is not good enough
            cli_logger.abort("Failed to setup head node.")
            sys.exit(1)

    monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""

    cli_logger.newline()
    with cli_logger.group("Useful commands"):
        cli_logger.print("Monitor autoscaling with")
        cli_logger.print(
            cf.bold("  ray exec {}{} {}"), config_file, modifiers,
            quote(monitor_str))

        cli_logger.print("Connect to a terminal on the cluster head:")
        cli_logger.print(cf.bold("  ray attach {}{}"), config_file, modifiers)

        remote_shell_str = updater.cmd_runner.remote_shell_command_str()
        cli_logger.print("Get a remote shell to the cluster manually:")
        cli_logger.print("  {}", remote_shell_str.strip())
def exec_cluster(config_file: str,
                 *,
                 cmd: str = None,
                 run_env: str = "auto",
                 screen: bool = False,
                 tmux: bool = False,
                 stop: bool = False,
                 start: bool = False,
                 override_cluster_name: Optional[str] = None,
                 no_config_cache: bool = False,
                 port_forward: Optional[Port_forward] = None,
                 with_output: bool = False) -> str:
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        no_config_cache: whether to bypass the cached resolved config
        port_forward ( (int, int) or list[(int, int)] ): port(s) to forward
        with_output: whether to return the output of the command
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(
        RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    updater = NodeUpdaterThread(
        node_id=head_node,
        provider_config=config["provider"],
        provider=provider,
        auth_config=config["auth"],
        cluster_name=config["cluster_name"],
        file_mounts=config["file_mounts"],
        initialization_commands=[],
        setup_commands=[],
        ray_start_commands=[],
        runtime_hash="",
        file_mounts_contents_hash="",
        is_head_node=True,
        rsync_options={
            "rsync_exclude": config.get("rsync_exclude"),
            "rsync_filter": config.get("rsync_filter")
        },
        docker_config=config.get("docker"))
    shutdown_after_run = False
    if cmd and stop:
        cmd = "; ".join([
            cmd, "ray stop",
            "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only"
        ])
        shutdown_after_run = True

    result = _exec(
        updater,
        cmd,
        screen,
        tmux,
        port_forward=port_forward,
        with_output=with_output,
        run_env=run_env,
        shutdown_after_run=shutdown_after_run)

    if tmux or screen:
        attach_command_parts = ["ray attach", config_file]
        if override_cluster_name is not None:
            attach_command_parts.append(
                "--cluster-name={}".format(override_cluster_name))
        if tmux:
            attach_command_parts.append("--tmux")
        elif screen:
            attach_command_parts.append("--screen")

        attach_command = " ".join(attach_command_parts)
        cli_logger.print("Run `{}` to check command status.",
                         cf.bold(attach_command))
    return result
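# Illustrative only: a sketch of driving exec_cluster programmatically. The
# YAML path and command are placeholders; start=True creates the head node if
# the cluster is not up, and port_forward takes (local, remote) pairs as
# described in the docstring.
def _example_exec_on_cluster():
    return exec_cluster(
        "cluster.yaml",
        cmd="nvidia-smi",
        run_env="docker",
        start=True,
        port_forward=[(8265, 8265)],
        with_output=True)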
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see the full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)

    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool) -> None:
    """Destroys all nodes of a Ray cluster described by a config file."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def remaining_nodes():
        workers = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        })

        if keep_min_workers:
            min_workers = config.get("min_workers", 0)
            cli_logger.print(
                "{} random worker nodes will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold(min_workers),
                cf.bold("--keep-min-workers"))

            workers = random.sample(workers, len(workers) - min_workers)

        # todo: it's weird to kill the head node but not all workers
        if workers_only:
            cli_logger.print(
                "The head node will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold("--workers-only"))

            return workers

        head = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        })

        return head + workers

    def run_docker_stop(node, container_name):
        try:
            exec_cluster(
                config_file,
                cmd=f"docker stop {container_name}",
                run_env="host",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception:
            cli_logger.warning(f"Docker stop failed on {node}")

    # Loop here to check that both the head and worker nodes are actually
    # really gone
    A = remaining_nodes()

    container_name = config.get("docker", {}).get("container_name")
    if container_name:
        for node in A:
            run_docker_stop(node, container_name)

    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(A)),
                _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            A = remaining_nodes()
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
def _configure_key_pair(config):
    if "ssh_private_key" in config["auth"]:
        _set_config_info(keypair_src="config")

        # If the key is not configured via the cloudinit
        # UserData, it should be configured via KeyName or
        # else we will risk starting a node that we cannot
        # SSH into:

        if "UserData" not in config["head_node"]:
            cli_logger.doassert(  # todo: verify schema beforehand?
                "KeyName" in config["head_node"],
                "`KeyName` missing for head node.")  # todo: err msg
            assert "KeyName" in config["head_node"]

        if "UserData" not in config["worker_nodes"]:
            cli_logger.doassert(
                "KeyName" in config["worker_nodes"],
                "`KeyName` missing for worker nodes.")  # todo: err msg
            assert "KeyName" in config["worker_nodes"]

        return config

    _set_config_info(keypair_src="default")

    ec2 = _resource("ec2", config)

    # Writing the new ssh key to the filesystem fails if the ~/.ssh
    # directory doesn't already exist.
    os.makedirs(os.path.expanduser("~/.ssh"), exist_ok=True)

    # Try a few times to get or create a good key pair.
    MAX_NUM_KEYS = 30
    for i in range(MAX_NUM_KEYS):

        key_name = config["provider"].get("key_pair", {}).get("key_name")

        key_name, key_path = key_pair(i, config["provider"]["region"],
                                      key_name)
        key = _get_key(key_name, config)

        # Found a good key.
        if key and os.path.exists(key_path):
            break

        # We can safely create a new key.
        if not key and not os.path.exists(key_path):
            cli_logger.verbose(
                "Creating new key pair {} for use as the default.",
                cf.bold(key_name))
            key = ec2.create_key_pair(KeyName=key_name)

            # We need to make sure to _create_ the file with the right
            # permissions. In order to do that we need to change the default
            # os.open behavior to include the mode we want.
            with open(key_path, "w", opener=partial(os.open,
                                                    mode=0o600)) as f:
                f.write(key.key_material)
            break

    if not key:
        cli_logger.abort(
            "No matching local key file for any of the key pairs in this "
            "account with ids from 0..{}. "
            "Consider deleting some unused key pairs from your account.",
            key_name)

    cli_logger.doassert(
        os.path.exists(key_path), "Private key file " + cf.bold("{}") +
        " not found for " + cf.bold("{}"), key_path, key_name)  # todo: err msg
    assert os.path.exists(key_path), \
        "Private key file {} not found for {}".format(key_path, key_name)

    config["auth"]["ssh_private_key"] = key_path
    config["head_node"]["KeyName"] = key_name
    config["worker_nodes"]["KeyName"] = key_name

    return config
def create_or_update_cluster(config_file: str,
                             override_min_workers: Optional[int],
                             override_max_workers: Optional[int],
                             no_restart: bool,
                             restart_only: bool,
                             yes: bool,
                             override_cluster_name: Optional[str] = None,
                             no_config_cache: bool = False,
                             redirect_command_output: Optional[bool] = False,
                             use_login_shells: bool = True) -> None:
    """Creates or updates an autoscaling Ray cluster from a config file."""
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        config = yaml.safe_load(open(config_file).read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        raise
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise

    # todo: validate file_mounts, ssh keys, etc.

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in _NODE_PROVIDERS.keys()
                if _NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    "  Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])

            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name)
def submit(
    address: Optional[str],
    job_id: Optional[str],
    runtime_env: Optional[str],
    runtime_env_json: Optional[str],
    working_dir: Optional[str],
    entrypoint: Tuple[str],
    no_wait: bool,
):
    """Submits a job to be run on the cluster.

    Example:
        ray job submit -- python my_script.py --arg=val
    """
    if ray_constants.RAY_JOB_SUBMIT_HOOK in os.environ:
        # Submit all args as **kwargs per the JOB_SUBMIT_HOOK contract.
        _load_class(os.environ[ray_constants.RAY_JOB_SUBMIT_HOOK])(
            address=address,
            job_id=job_id,
            runtime_env=runtime_env,
            runtime_env_json=runtime_env_json,
            working_dir=working_dir,
            entrypoint=entrypoint,
            no_wait=no_wait,
        )

    client = _get_sdk_client(address, create_cluster_if_needed=True)

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )

    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        job_id=job_id,
        runtime_env=final_runtime_env,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print("Tailing logs until the job exits "
                             "(disable with --no-wait):")
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your Ray to the latest "
                "version for this feature.")