def sync_file_mounts(self, sync_cmd, step_numbers=(0, 2)):
    """Sync the configured file mounts (and worker-only synced files) to the node.

    Args:
        sync_cmd: Callable(local_path, remote_path, file_mount=True) that
            performs the actual transfer (e.g. an rsync-up wrapper).
        step_numbers (tuple): (# of previous steps, total steps) — used only
            to label the numbered progress groups in the CLI output.
    """
    # step_numbers is (# of previous steps, total steps)
    previous_steps, total_steps = step_numbers

    # At default verbosity, suppress per-file log lines for the bootstrap
    # credentials/config so they are not echoed to the console.
    nolog_paths = []
    if cli_logger.verbosity == 0:
        nolog_paths = [
            "~/ray_bootstrap_key.pem", "~/ray_bootstrap_config.yaml"
        ]

    def do_sync(remote_path, local_path, allow_non_existing_paths=False):
        # Sync one mount entry; normalizes directory paths to trailing "/"
        # so the sync tool copies directory *contents* rather than nesting.
        if allow_non_existing_paths and not os.path.exists(local_path):
            # Ignore missing source files. In the future we should support
            # the --delete-missing-args command to delete files that have
            # been removed
            return

        assert os.path.exists(local_path), local_path

        if os.path.isdir(local_path):
            if not local_path.endswith("/"):
                local_path += "/"
            if not remote_path.endswith("/"):
                remote_path += "/"

        with LogTimer(self.log_prefix +
                      "Synced {} to {}".format(local_path, remote_path)):
            if not isinstance(self.cmd_runner, DockerCommandRunner):
                # The DockerCommandRunner handles this internally
                # (create the destination directory before syncing into it).
                self.cmd_runner.run(
                    "mkdir -p {}".format(os.path.dirname(remote_path)),
                    run_env="host")
            sync_cmd(local_path, remote_path, file_mount=True)

            if remote_path not in nolog_paths:
                # todo: timed here?
                cli_logger.print("{} from {}", cf.bold(remote_path),
                                 cf.bold(local_path))

    # Rsync file mounts
    with cli_logger.group(
            "Processing file mounts",
            _numbered=("[]", previous_steps + 1, total_steps)):
        for remote_path, local_path in self.file_mounts.items():
            do_sync(remote_path, local_path)

    # cluster_synced_files use the same path on both ends and may legitimately
    # be missing locally (hence allow_non_existing_paths).
    if self.cluster_synced_files:
        with cli_logger.group(
                "Processing worker file mounts",
                _numbered=("[]", previous_steps + 2, total_steps)):
            for path in self.cluster_synced_files:
                do_sync(path, path, allow_non_existing_paths=True)
    else:
        cli_logger.print(
            "No worker file mounts to sync",
            _numbered=("[]", previous_steps + 2, total_steps))
def sync_file_mounts(self, sync_cmd):
    """Sync all configured file mounts to the node via `sync_cmd`.

    Args:
        sync_cmd: Callable(local_path, remote_path) performing the transfer.
    """
    # At default verbosity, suppress per-file log lines for the bootstrap
    # credentials/config so they are not echoed to the console.
    nolog_paths = []
    if cli_logger.verbosity == 0:
        nolog_paths = [
            "~/ray_bootstrap_key.pem", "~/ray_bootstrap_config.yaml"
        ]

    # Rsync file mounts
    with cli_logger.group(
            "Processing file mounts", _numbered=("[]", 2, 5)):
        for remote_path, local_path in self.file_mounts.items():
            assert os.path.exists(local_path), local_path
            # Normalize directory paths to a trailing "/" so the sync tool
            # copies directory *contents* rather than nesting the directory.
            if os.path.isdir(local_path):
                if not local_path.endswith("/"):
                    local_path += "/"
                if not remote_path.endswith("/"):
                    remote_path += "/"

            with LogTimer(self.log_prefix +
                          "Synced {} to {}".format(local_path, remote_path)):
                # Ensure the destination directory exists before syncing.
                self.cmd_runner.run("mkdir -p {}".format(
                    os.path.dirname(remote_path)))
                sync_cmd(local_path, remote_path)

                if remote_path not in nolog_paths:
                    # todo: timed here?
                    cli_logger.print("{} from {}", cf.bold(remote_path),
                                     cf.bold(local_path))
def run(self,
        cmd,
        timeout=120,
        exit_on_fail=False,
        port_forward=None,
        with_output=False,
        ssh_options_override=None,
        **kwargs):
    """Run `cmd` on the remote node over SSH.

    Args:
        cmd (str): Shell command to run remotely. Falsy cmd starts a
            long-lived no-op shell instead (see ControlMaster note below).
        timeout (int): SSH connection timeout in seconds.
        exit_on_fail (bool): Whether a failure should abort the CLI.
        port_forward: Single (local, remote) pair or a list of them.
        with_output (bool): If True, return the command's output.
        ssh_options_override (SSHOptions): Replaces self.ssh_options.
    """
    ssh_options = ssh_options_override or self.ssh_options

    assert isinstance(
        ssh_options, SSHOptions
    ), "ssh_options must be of type SSHOptions, got {}".format(
        type(ssh_options))

    self._set_ssh_ip_if_required()

    # "-tt" forces a TTY, which login shells need for interactive setup.
    if is_using_login_shells():
        ssh = ["ssh", "-tt"]
    else:
        ssh = ["ssh"]

    if port_forward:
        with cli_logger.group("Forwarding ports"):
            if not isinstance(port_forward, list):
                port_forward = [port_forward]
            for local, remote in port_forward:
                cli_logger.verbose(
                    "Forwarding port {} to port {} on localhost.",
                    cf.bold(local), cf.bold(remote))  # todo: msg
                cli_logger.old_info(logger,
                                    "{}Forwarding {} -> localhost:{}",
                                    self.log_prefix, local, remote)
                # NOTE(review): local/remote appear swapped relative to the
                # log message ("{remote}:localhost:{local}") — presumably
                # intentional given the argument naming; verify with callers.
                ssh += ["-L", "{}:localhost:{}".format(remote, local)]

    final_cmd = ssh + ssh_options.to_ssh_options_list(
        timeout=timeout) + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
    if cmd:
        # Login shells need the command wrapped in an interactive bash
        # invocation; otherwise pass it verbatim.
        if is_using_login_shells():
            final_cmd += _with_interactive(cmd)
        else:
            final_cmd += [cmd]
        cli_logger.old_info(logger, "{}Running {}", self.log_prefix,
                            " ".join(final_cmd))
    else:
        # We do this because `-o ControlMaster` causes the `-N` flag to
        # still create an interactive shell in some ssh versions.
        final_cmd.append(quote("while true; do sleep 86400; done"))

    cli_logger.verbose("Running `{}`", cf.bold(cmd))
    with cli_logger.indented():
        cli_logger.very_verbose("Full command is `{}`",
                                cf.bold(" ".join(final_cmd)))

    # Indent subprocess output under the command line when verbose.
    if cli_logger.verbosity > 0:
        with cli_logger.indented():
            return self._run_helper(final_cmd, with_output, exit_on_fail)
    else:
        return self._run_helper(final_cmd, with_output, exit_on_fail)
def wait_ready(self, deadline):
    """Poll the node with `uptime` until SSH is reachable or `deadline`.

    Args:
        deadline (float): Absolute time.time() after which we stop retrying.

    Returns:
        True once a remote shell command succeeds.

    Raises:
        AssertionError: If the deadline passes (or the node is terminated)
            without ever connecting.
    """
    with cli_logger.group(
            "Waiting for SSH to become available", _numbered=("[]", 1, 6)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.old_info(logger, "{}Waiting for remote shell...",
                                self.log_prefix)

            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
            first_conn_refused_time = None
            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                try:
                    cli_logger.old_debug(logger,
                                         "{}Waiting for remote shell...",
                                         self.log_prefix)

                    # Run outside of the container
                    self.cmd_runner.run("uptime", run_env="host")
                    cli_logger.old_debug(logger, "Uptime succeeded.")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    # Structured SSH failure: delegate message handling /
                    # connection-refused bookkeeping to cmd_output_util.
                    first_conn_refused_time = \
                        cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL)
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later

                    retry_str = str(e)
                    if hasattr(e, "cmd"):
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, " ".join(e.cmd))

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))
                    cli_logger.old_debug(logger,
                                         "{}Node not up, retrying: {}",
                                         self.log_prefix, retry_str)

                    time.sleep(READY_CHECK_INTERVAL)

    assert False, "Unable to connect to node"
def wait_ready(self, deadline):
    """Poll the node with `uptime` until SSH is reachable or `deadline`.

    Args:
        deadline (float): Absolute time.time() after which we stop retrying.

    Returns:
        True once a remote shell command succeeds.

    Raises:
        AssertionError: If the deadline passes (or the node is terminated)
            without ever connecting.
    """
    with cli_logger.group(
            "Waiting for SSH to become available", _numbered=("[]", 1, 6)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.old_info(logger, "{}Waiting for remote shell...",
                                self.log_prefix)

            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))

            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                try:
                    cli_logger.old_debug(logger,
                                         "{}Waiting for remote shell...",
                                         self.log_prefix)

                    self.cmd_runner.run("uptime")
                    cli_logger.old_debug(logger, "Uptime succeeded.")
                    cli_logger.success("Success.")
                    return True
                except Exception as e:
                    # Any failure (connection refused, auth not ready, etc.)
                    # is treated as "not up yet"; summarize and retry.
                    retry_str = str(e)
                    if hasattr(e, "cmd"):
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, " ".join(e.cmd))

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.gray(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))
                    cli_logger.old_debug(logger,
                                         "{}Node not up, retrying: {}",
                                         self.log_prefix, retry_str)

                    time.sleep(READY_CHECK_INTERVAL)

    assert False, "Unable to connect to node"
def log_to_cli(config):
    """Pretty-print the resolved AWS node configuration to the CLI.

    For each resource (IAM profile, key pair, subnets, security groups, AMI)
    prints either a single combined line when head and workers share the
    value, or separate head/worker lines otherwise.

    Args:
        config (dict): Bootstrapped cluster config with "head_node" and
            "worker_nodes" sections already filled in.
    """
    provider_name = PROVIDER_PRETTY_NAMES.get("aws", None)

    cli_logger.doassert(provider_name is not None,
                        "Could not find a pretty name for the AWS provider.")

    with cli_logger.group("{} config", provider_name):

        def same_everywhere(key):
            # True when head and worker configs agree on this key.
            return config["head_node"][key] == config["worker_nodes"][key]

        def print_info(resource_string,
                       key,
                       head_src_key,
                       workers_src_key,
                       allowed_tags=("default", ),
                       list_value=False):
            # Print one labeled config value (or a head/workers pair).
            # NOTE: allowed_tags default is a tuple, not a list — a mutable
            # default argument would be shared across calls.
            head_tags = {}
            workers_tags = {}

            # Tag the printed value with its provenance (e.g. "default")
            # when the source recorded in _log_info is one we display.
            if _log_info[head_src_key] in allowed_tags:
                head_tags[_log_info[head_src_key]] = True
            if _log_info[workers_src_key] in allowed_tags:
                workers_tags[_log_info[workers_src_key]] = True

            head_value_str = config["head_node"][key]
            if list_value:
                head_value_str = cli_logger.render_list(head_value_str)

            if same_everywhere(key):
                cli_logger.labeled_value(  # todo: handle plural vs singular?
                    resource_string + " (head & workers)",
                    "{}",
                    head_value_str,
                    _tags=head_tags)
            else:
                workers_value_str = config["worker_nodes"][key]
                if list_value:
                    workers_value_str = cli_logger.render_list(
                        workers_value_str)

                cli_logger.labeled_value(
                    resource_string + " (head)",
                    "{}",
                    head_value_str,
                    _tags=head_tags)
                cli_logger.labeled_value(
                    resource_string + " (workers)",
                    "{}",
                    workers_value_str,
                    _tags=workers_tags)

        # IAM profile applies to the head node only, so it is printed
        # directly rather than through print_info.
        tags = {"default": _log_info["head_instance_profile_src"] == "default"}
        cli_logger.labeled_value(
            "IAM Profile",
            "{}",
            _arn_to_name(config["head_node"]["IamInstanceProfile"]["Arn"]),
            _tags=tags)

        print_info("EC2 Key pair", "KeyName", "keypair_src", "keypair_src")
        print_info(
            "VPC Subnets",
            "SubnetIds",
            "head_subnet_src",
            "workers_subnet_src",
            list_value=True)
        print_info(
            "EC2 Security groups",
            "SecurityGroupIds",
            "head_security_group_src",
            "workers_security_group_src",
            list_value=True)
        print_info(
            "EC2 AMI",
            "ImageId",
            "head_ami_src",
            "workers_ami_src",
            allowed_tags=["dlami"])

    cli_logger.newline()
def get_or_create_head_node(config,
                            config_file,
                            no_restart,
                            restart_only,
                            yes,
                            override_cluster_name,
                            _provider=None,
                            _runner=subprocess):
    """Create the cluster head node, which in turn creates the workers."""
    provider = (_provider or get_node_provider(config["provider"],
                                               config["cluster_name"]))

    config = copy.deepcopy(config)
    raw_config_file = config_file  # used for printing to the user
    config_file = os.path.abspath(config_file)
    try:
        # Look for an existing (non-terminated) head node.
        head_node_tags = {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        # Confirm with the user before doing anything destructive.
        if not head_node:
            cli_logger.confirm(
                yes,
                "No head node found. "
                "Launching a new cluster.",
                _abort=True)
            cli_logger.old_confirm("This will create a new cluster", yes)
        elif not no_restart:
            cli_logger.old_confirm("This will restart cluster services", yes)

        if head_node:
            if restart_only:
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "restarting the cluster Ray runtime. "
                    "Setup commands will not be run due to `{}`.\n",
                    cf.bold("--restart-only"),
                    _abort=True)
            elif no_restart:
                cli_logger.print(
                    "Cluster Ray runtime will not be restarted due "
                    "to `{}`.", cf.bold("--no-restart"))
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "running setup commands.",
                    _abort=True)
            else:
                cli_logger.print(
                    "Updating cluster configuration and running full setup.")
                cli_logger.confirm(
                    yes,
                    cf.bold("Cluster Ray runtime will be restarted."),
                    _abort=True)
        cli_logger.newline()

        # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
        head_node_config = copy.deepcopy(config["head_node"])
        if "head_node_type" in config:
            head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
            head_node_config.update(config["available_node_types"][config[
                "head_node_type"]]["node_config"])

        # The launch hash captures node config + auth; a mismatch means the
        # running head node was launched from a different configuration and
        # must be replaced.
        launch_hash = hash_launch_conf(head_node_config, config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            with cli_logger.group("Acquiring an up-to-date head node"):
                if head_node is not None:
                    cli_logger.print(
                        "Currently running head node is out-of-date with "
                        "cluster configuration")
                    cli_logger.print(
                        "hash is {}, expected {}",
                        cf.bold(
                            provider.node_tags(head_node).get(
                                TAG_RAY_LAUNCH_CONFIG)),
                        cf.bold(launch_hash))
                    cli_logger.confirm(yes, "Relaunching it.", _abort=True)
                    cli_logger.old_confirm(
                        "Head node config out-of-date. It will be terminated",
                        yes)

                    cli_logger.old_info(
                        logger, "get_or_create_head_node: "
                        "Shutting down outdated head node {}", head_node)

                    provider.terminate_node(head_node)
                    cli_logger.print("Terminated head node {}", head_node)

                cli_logger.old_info(
                    logger,
                    "get_or_create_head_node: Launching new head node...")

                head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
                head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                    config["cluster_name"])
                provider.create_node(head_node_config, head_node_tags, 1)
                cli_logger.print("Launched a new head node")

                # Poll until the provider reports exactly one matching node.
                # NOTE(review): the 50-second timeout is hard-coded here.
                start = time.time()
                head_node = None
                with cli_logger.timed("Fetching the new head node"):
                    while True:
                        if time.time() - start > 50:
                            cli_logger.abort(
                                "Head node fetch timed out.")  # todo: msg
                            raise RuntimeError("Failed to create head node.")

                        nodes = provider.non_terminated_nodes(head_node_tags)
                        if len(nodes) == 1:
                            head_node = nodes[0]
                            break

                        time.sleep(1)
            cli_logger.newline()

        with cli_logger.group(
                "Setting up head node",
                _numbered=("<>", 1, 1),
                # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
                _tags=dict()):  # add id, ARN to tags?
            # TODO(ekl) right now we always update the head node even if the
            # hash matches.
            # We could prompt the user for what they want to do here.
            # No need to pass in cluster_sync_files because we use this
            # hash to set up the head node
            (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
                config["file_mounts"], None, config)

            cli_logger.old_info(
                logger,
                "get_or_create_head_node: Updating files on head node...")

            # Rewrite the auth config so that the head
            # node can update the workers
            remote_config = copy.deepcopy(config)

            # drop proxy options if they exist, otherwise
            # head node won't be able to connect to workers
            remote_config["auth"].pop("ssh_proxy_command", None)

            if "ssh_private_key" in config["auth"]:
                remote_key_path = "~/ray_bootstrap_key.pem"
                remote_config["auth"]["ssh_private_key"] = remote_key_path

            # Adjust for new file locations
            new_mounts = {}
            for remote_path in config["file_mounts"]:
                new_mounts[remote_path] = remote_path
            remote_config["file_mounts"] = new_mounts
            remote_config["no_restart"] = no_restart

            # Now inject the rewritten config and SSH key into the head node
            # (the tempfile must stay referenced until the mounts are synced).
            remote_config_file = tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-")
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update({
                "~/ray_bootstrap_config.yaml": remote_config_file.name
            })

            if "ssh_private_key" in config["auth"]:
                config["file_mounts"].update({
                    remote_key_path: config["auth"]["ssh_private_key"],
                })
            cli_logger.print("Prepared bootstrap config")

            # Select which command sets run based on the CLI flags.
            if restart_only:
                setup_commands = []
                ray_start_commands = config["head_start_ray_commands"]
            elif no_restart:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = []
            else:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = config["head_start_ray_commands"]

            if not no_restart:
                warn_about_bad_start_command(ray_start_commands)

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=setup_commands,
                ray_start_commands=ray_start_commands,
                process_runner=_runner,
                runtime_hash=runtime_hash,
                file_mounts_contents_hash=file_mounts_contents_hash,
                is_head_node=True,
                docker_config=config.get("docker"))
            updater.start()
            updater.join()

            # Refresh the node cache so we see the external ip if available
            provider.non_terminated_nodes(head_node_tags)

            if config.get("provider", {}).get("use_internal_ips",
                                              False) is True:
                head_node_ip = provider.internal_ip(head_node)
            else:
                head_node_ip = provider.external_ip(head_node)

            if updater.exitcode != 0:
                # todo: this does not follow the mockup and is not good enough
                cli_logger.abort("Failed to setup head node.")

                cli_logger.old_error(
                    logger, "get_or_create_head_node: "
                    "Updating {} failed", head_node_ip)
                sys.exit(1)

            cli_logger.old_info(
                logger, "get_or_create_head_node: "
                "Head node up-to-date, IP address is: {}", head_node_ip)

        # Print follow-up commands for the user (old and new log styles).
        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""

        if cli_logger.old_style:
            print("To monitor autoscaling activity, you can run:\n\n"
                  "  ray exec {} {}{}\n".format(config_file,
                                                quote(monitor_str),
                                                modifiers))
            print("To open a console on the cluster:\n\n"
                  "  ray attach {}{}\n".format(config_file, modifiers))

            print("To get a remote shell to the cluster manually, run:\n\n"
                  "  {}\n".format(
                      updater.cmd_runner.remote_shell_command_str()))

        cli_logger.newline()
        with cli_logger.group("Useful commands"):
            cli_logger.print("Monitor autoscaling with")
            cli_logger.print(
                cf.bold("  ray exec {}{} {}"), raw_config_file, modifiers,
                quote(monitor_str))

            cli_logger.print("Connect to a terminal on the cluster head")
            cli_logger.print(
                cf.bold("  ray attach {}{}"), raw_config_file, modifiers)
    finally:
        provider.cleanup()
def do_update(self):
    """Bring the node up to date: wait for SSH, sync mounts, run setup, start Ray.

    Skips file mounts/init/setup when the node's recorded runtime and
    file-mount hashes already match; the Ray start commands always run at
    the end. Raises click.ClickException when a remote command fails.
    """
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + NODE_START_WAIT_S
    self.wait_ready(deadline)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            self.file_mounts_contents_hash is None
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initalization and setup commands.",
            _numbered=("[]", "2-5", 6))
        cli_logger.old_info(logger,
                            "{}{} already up-to-date, skip to ray start",
                            self.log_prefix, self.node_id)

        # When resuming from a stopped instance the runtime_hash may be the
        # same, but the container will not be started.
        self.cmd_runner.run_init(
            as_head=self.is_head_node, file_mounts=self.file_mounts)
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6))

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                # Total step count is 6 everywhere in this method (the
                # else-branch below and the later steps all use 6); it was
                # inconsistently 5 here before.
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 3, 6)):
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            try:
                                # Overriding the existing SSHOptions class
                                # with a new SSHOptions class that uses
                                # this ssh_private_key as its only __init__
                                # argument.
                                # Run outside docker.
                                self.cmd_runner.run(
                                    cmd,
                                    ssh_options_override_ssh_key=self.
                                    auth_config.get("ssh_private_key"),
                                    run_env="host")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error(
                                        "See above for stderr.")

                                raise click.ClickException(
                                    "Initialization command failed."
                                ) from None
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 3, 6))

            # (Re)start the container / runtime environment before setup.
            self.cmd_runner.run_init(
                as_head=self.is_head_node, file_mounts=self.file_mounts)

            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        # todo: fix command numbering
                        _numbered=("[]", 4, 6)):
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):

                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            # At default verbosity, truncate long commands
                            # for display only.
                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}",
                                cmd_to_print,
                                _numbered=("()", i, total))

                            try:
                                # Runs in the container if docker is in use
                                self.cmd_runner.run(cmd, run_env="auto")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error(
                                        "See above for stderr.")

                                raise click.ClickException(
                                    "Setup command failed.")
            else:
                cli_logger.print(
                    "No setup commands to run.", _numbered=("[]", 4, 6))

    # Ray start commands run regardless of whether setup was skipped.
    with cli_logger.group(
            "Starting the Ray runtime", _numbered=("[]", 6, 6)):
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                # Pass custom resources to the raylet via the environment.
                if self.node_resources:
                    env_vars = {
                        ray_constants.RESOURCES_ENVIRONMENT_VARIABLE:
                        self.node_resources
                    }
                else:
                    env_vars = {}
                try:
                    # Temporarily disable output redirection so `ray start`
                    # output reaches the user, then restore the old setting.
                    old_redirected = cmd_output_util.is_output_redirected()
                    cmd_output_util.set_output_redirected(False)
                    # Runs in the container if docker is in use
                    self.cmd_runner.run(
                        cmd,
                        environment_variables=env_vars,
                        run_env="auto")
                    cmd_output_util.set_output_redirected(old_redirected)
                except ProcessRunnerError as e:
                    if e.msg_type == "ssh_command_failed":
                        cli_logger.error("Failed.")
                        cli_logger.error("See above for stderr.")

                    raise click.ClickException("Start command failed.")
def do_update(self):
    """Bring the node up to date: wait for SSH, sync mounts, run setup, start Ray.

    Skips file mounts/init/setup when the node's recorded runtime hash
    already matches; the Ray start commands always run at the end.
    """
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + NODE_START_WAIT_S
    self.wait_ready(deadline)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            self.file_mounts_contents_hash is None
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initalization and setup commands.")
        cli_logger.old_info(logger,
                            "{}{} already up-to-date, skip to ray start",
                            self.log_prefix, self.node_id)
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(self.rsync_up)

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 4, 6)):  # todo: fix command numbering
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            self.cmd_runner.run(
                                cmd,
                                ssh_options_override=SSHOptions(
                                    self.auth_config.get(
                                        "ssh_private_key")))
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 4, 6))

            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        _numbered=("[]", 5, 6)):  # todo: fix command numbering
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):

                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            # At default verbosity, truncate long commands for
                            # display. Only append "..." when the command was
                            # actually truncated; previously short commands
                            # also got a misleading ellipsis.
                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}",
                                cmd_to_print,
                                _numbered=("()", i, total))

                            self.cmd_runner.run(cmd)
            else:
                cli_logger.print(
                    "No setup commands to run.", _numbered=("[]", 5, 6))

    # Ray start commands run regardless of whether setup was skipped.
    with cli_logger.group(
            "Starting the Ray runtime", _numbered=("[]", 6, 6)):
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                self.cmd_runner.run(cmd)
def resolve(self, is_head, node_ip_address=None):
    """Returns a copy with values filled out with system defaults.

    Args:
        is_head (bool): Whether this is the head node.
        node_ip_address (str): The IP address of the node that we are on.
            This is used to automatically create a node id resource.
    """

    resources = (self.resources or {}).copy()
    # These are derived below from dedicated fields and must not be passed
    # through the custom-resources dict.
    assert "CPU" not in resources, resources
    assert "GPU" not in resources, resources
    assert "memory" not in resources, resources
    assert "object_store_memory" not in resources, resources

    if node_ip_address is None:
        node_ip_address = ray.services.get_node_ip_address()

    # Automatically create a node id resource on each node. This is
    # queryable with ray.state.node_ids() and ray.state.current_node_id().
    resources[NODE_ID_PREFIX + node_ip_address] = 1.0

    num_cpus = self.num_cpus
    if num_cpus is None:
        num_cpus = multiprocessing.cpu_count()

    num_gpus = self.num_gpus
    gpu_ids = ray.utils.get_cuda_visible_devices()
    # Check that the number of GPUs that the raylet wants doesn't
    # excede the amount allowed by CUDA_VISIBLE_DEVICES.
    if (num_gpus is not None and gpu_ids is not None
            and num_gpus > len(gpu_ids)):
        raise ValueError("Attempting to start raylet with {} GPUs, "
                         "but CUDA_VISIBLE_DEVICES contains {}.".format(
                             num_gpus, gpu_ids))
    if num_gpus is None:
        # Try to automatically detect the number of GPUs.
        num_gpus = _autodetect_num_gpus()
        # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
        if gpu_ids is not None:
            num_gpus = min(num_gpus, len(gpu_ids))

    # Best-effort: annotate resources with detected GPU types; failures
    # here are logged but never fatal.
    try:
        info_string = _get_gpu_info_string()
        gpu_types = _constraints_from_gpu_info(info_string)
        resources.update(gpu_types)
    except Exception:
        logger.exception("Could not parse gpu information.")

    # Choose a default object store size.
    system_memory = ray.utils.get_system_memory()
    avail_memory = ray.utils.estimate_available_memory()
    object_store_memory = self.object_store_memory
    if object_store_memory is None:
        # Default: 30% of available memory, capped below.
        object_store_memory = int(avail_memory * 0.3)
        # Cap memory to avoid memory waste and perf issues on large nodes
        if (object_store_memory >
                ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES):
            logger.debug(
                "Warning: Capping object memory store to {}GB. ".format(
                    ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES //
                    1e9) +
                "To increase this further, specify `object_store_memory` "
                "when calling ray.init() or ray start.")
            object_store_memory = (
                ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)

    redis_max_memory = self.redis_max_memory
    if redis_max_memory is None:
        # Default: 10% of available memory, clamped to [minimum, default max].
        redis_max_memory = min(
            ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES,
            max(
                int(avail_memory * 0.1),
                ray_constants.REDIS_MINIMUM_MEMORY_BYTES))
    if redis_max_memory < ray_constants.REDIS_MINIMUM_MEMORY_BYTES:
        raise ValueError(
            "Attempting to cap Redis memory usage at {} bytes, "
            "but the minimum allowed is {} bytes.".format(
                redis_max_memory, ray_constants.REDIS_MINIMUM_MEMORY_BYTES))

    memory = self.memory
    if memory is None:
        # Worker-task memory is whatever remains after the object store and
        # (on the head node) Redis take their shares.
        memory = (avail_memory - object_store_memory - (redis_max_memory
                                                        if is_head else 0))
        if memory < 100e6 and memory < 0.05 * system_memory:
            raise ValueError(
                "After taking into account object store and redis memory "
                "usage, the amount of memory on this node available for "
                "tasks and actors ({} GB) is less than {}% of total. "
                "You can adjust these settings with "
                "ray.init(memory=<bytes>, "
                "object_store_memory=<bytes>).".format(
                    round(memory / 1e9, 2),
                    int(100 * (memory / system_memory))))

    # Rounded value is used only for the human-readable report below; the
    # returned spec keeps the unrounded `memory`.
    rounded_memory = ray_constants.round_to_memory_units(
        memory, round_up=False)
    worker_ram = round(rounded_memory / (1024**3), 2)
    object_ram = round(object_store_memory / (1024**3), 2)

    # TODO(maximsmol): this behavior is strange since we do not have a
    # good grasp on when this will get called
    # (you have to study node.py to make a guess)
    with cli_logger.group("Available RAM"):
        cli_logger.labeled_value("Workers", "{} GiB", str(worker_ram))
        cli_logger.labeled_value("Objects", "{} GiB", str(object_ram))
    cli_logger.newline()
    cli_logger.print("To adjust these values, use")
    with cf.with_style("monokai") as c:
        cli_logger.print(
            " ray{0}init(memory{1}{2}, "
            "object_store_memory{1}{2})", c.magenta("."), c.magenta("="),
            c.purple("<bytes>"))
    cli_logger.old_info(
        logger,
        "Starting Ray with {} GiB memory available for workers and up to "
        "{} GiB for objects. You can adjust these settings "
        "with ray.init(memory=<bytes>, "
        "object_store_memory=<bytes>).", worker_ram, object_ram)

    spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory,
                        resources, redis_max_memory)
    assert spec.resolved()
    return spec
def run(self,
        cmd,
        timeout=120,
        exit_on_fail=False,
        port_forward=None,
        with_output=False,
        ssh_options_override=None,
        **kwargs):
    """Run `cmd` on the remote node over SSH.

    Args:
        cmd (str): Shell command to run remotely. Falsy cmd starts a
            long-lived no-op shell instead (see ControlMaster note below).
        timeout (int): SSH connection timeout in seconds.
        exit_on_fail (bool): Use the "Command failed" ClickException with
            the quoted command in old-style logging mode.
        port_forward: Single (local, remote) pair or a list of them.
        with_output (bool): If True, return the command's output.
        ssh_options_override (SSHOptions): Replaces self.ssh_options.

    Raises:
        ProcessRunnerError: On failure in new-style logging mode.
        click.ClickException: On failure in old-style logging mode.
    """
    ssh_options = ssh_options_override or self.ssh_options

    assert isinstance(
        ssh_options, SSHOptions
    ), "ssh_options must be of type SSHOptions, got {}".format(
        type(ssh_options))

    self._set_ssh_ip_if_required()

    # "-tt" forces TTY allocation so interactive shells behave.
    ssh = ["ssh", "-tt"]

    if port_forward:
        with cli_logger.group("Forwarding ports"):
            if not isinstance(port_forward, list):
                port_forward = [port_forward]
            for local, remote in port_forward:
                cli_logger.verbose(
                    "Forwarding port {} to port {} on localhost.",
                    cf.bold(local), cf.bold(remote))  # todo: msg
                cli_logger.old_info(logger,
                                    "{}Forwarding {} -> localhost:{}",
                                    self.log_prefix, local, remote)
                # NOTE(review): local/remote appear swapped relative to the
                # log message ("{remote}:localhost:{local}") — presumably
                # intentional given the argument naming; verify with callers.
                ssh += ["-L", "{}:localhost:{}".format(remote, local)]

    final_cmd = ssh + ssh_options.to_ssh_options_list(
        timeout=timeout) + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
    if cmd:
        final_cmd += _with_interactive(cmd)
        cli_logger.old_info(logger, "{}Running {}", self.log_prefix,
                            " ".join(final_cmd))
    else:
        # We do this because `-o ControlMaster` causes the `-N` flag to
        # still create an interactive shell in some ssh versions.
        final_cmd.append(quote("while true; do sleep 86400; done"))

    # todo: add a flag for this, we might
    # wanna log commands with print sometimes
    cli_logger.verbose("Running `{}`", cf.bold(cmd))
    with cli_logger.indented():
        cli_logger.very_verbose("Full command is `{}`",
                                cf.bold(" ".join(final_cmd)))

    def start_process():
        # Execute the assembled SSH command, translating subprocess errors
        # into the appropriate error type for the active logging style.
        try:
            if with_output:
                return self.process_runner.check_output(final_cmd)
            else:
                self.process_runner.check_call(final_cmd)
        except subprocess.CalledProcessError as e:
            # Re-quote only the last element (the remote command string)
            # for a readable error message.
            quoted_cmd = " ".join(final_cmd[:-1] + [quote(final_cmd[-1])])
            if not cli_logger.old_style:
                raise ProcessRunnerError(
                    "Command failed",
                    "ssh_command_failed",
                    code=e.returncode,
                    command=quoted_cmd)

            if exit_on_fail:
                raise click.ClickException(
                    "Command failed: \n\n  {}\n".format(quoted_cmd)) \
                    from None
            else:
                raise click.ClickException(
                    "SSH command Failed. See above for the output from the"
                    " failure.") from None

    # Indent subprocess output under the command line when verbose.
    if cli_logger.verbosity > 0:
        with cli_logger.indented():
            return start_process()
    else:
        return start_process()
# Demo / smoke-test script exercising every cli_logger feature in sequence:
# rendering helpers, verbosity levels, severity levels, error paths, and the
# context-manager APIs.

# List rendering helper.
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()

# Verbosity-gated messages (only shown at higher verbosity settings).
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")

# Severity levels.
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()

# abort() and a failed doassert() both raise; swallowed here since this is
# just a demonstration of their output.
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()

# confirm(True, ...) auto-accepts without prompting.
cli_logger.confirm(True, "example")
cli_logger.newline()

# Context-manager APIs: indentation, groups, timing, verbatim error blocks.
with cli_logger.indented():
    cli_logger.print("Indented")
with cli_logger.group("Group"):
    cli_logger.print("Group contents")
with cli_logger.timed("Timed (unimplemented)"):
    cli_logger.print("Timed contents")
with cli_logger.verbatim_error_ctx("Verbtaim error"):
    cli_logger.print("Error contents")
def _create_node(self, node_config, tags, count):
    """Launch ``count`` EC2 instances with the given config and tags.

    Merges Ray's tags with any user-supplied instance TagSpecifications,
    round-robins across the configured subnets, and retries the boto
    call up to BOTO_CREATE_MAX_RETRIES times before giving up.

    Bug fix: the non-final retry branch previously called
    cli_logger.abort(exc), which terminates the program — so the retry
    loop could never actually retry and BOTO_CREATE_MAX_RETRIES was
    dead code. Non-final attempts now log a warning and retry; only the
    final attempt aborts/raises.
    """
    tags = to_aws_format(tags)
    conf = node_config.copy()

    # Delete unsupported keys from the node config
    try:
        del conf["Resources"]
    except KeyError:
        pass

    # Ray-managed tags; the cluster name tag is always present.
    tag_pairs = [{
        "Key": TAG_RAY_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    for k, v in tags.items():
        tag_pairs.append({
            "Key": k,
            "Value": v,
        })
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": tag_pairs,
    }]
    user_tag_specs = conf.get("TagSpecifications", [])
    # Allow users to add tags and override values of existing
    # tags with their own. This only applies to the resource type
    # "instance". All other resource types are appended to the list of
    # tag specs.
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] == "instance":
            for user_tag in user_tag_spec["Tags"]:
                exists = False
                for tag in tag_specs[0]["Tags"]:
                    if user_tag["Key"] == tag["Key"]:
                        exists = True
                        tag["Value"] = user_tag["Value"]
                        break
                if not exists:
                    tag_specs[0]["Tags"] += [user_tag]
        else:
            tag_specs += [user_tag_spec]

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
        try:
            # Round-robin over the configured subnets.
            subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
            cli_logger.old_info(
                logger, "NodeProvider: calling create_instances "
                "with {} (count={}).", subnet_id, count)
            self.subnet_idx += 1
            conf.update({
                "MinCount": 1,
                "MaxCount": count,
                "SubnetId": subnet_id,
                "TagSpecifications": tag_specs
            })
            created = self.ec2_fail_fast.create_instances(**conf)

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launching {} nodes",
                    count,
                    _tags=dict(subnet_id=subnet_id)):
                for instance in created:
                    cli_logger.print(
                        "Launched instance {}",
                        instance.instance_id,
                        _tags=dict(
                            state=instance.state["Name"],
                            info=instance.state_reason["Message"]))
                    cli_logger.old_info(
                        logger, "NodeProvider: Created instance "
                        "[id={}, name={}, info={}]", instance.instance_id,
                        instance.state["Name"],
                        instance.state_reason["Message"])
            break
        except botocore.exceptions.ClientError as exc:
            if attempt == BOTO_CREATE_MAX_RETRIES:
                # todo: err msg
                cli_logger.abort(
                    "Failed to launch instances. Max attempts exceeded.")
                cli_logger.old_error(
                    logger,
                    "create_instances: Max attempts ({}) exceeded.",
                    BOTO_CREATE_MAX_RETRIES)
                raise exc
            else:
                # Warn and retry (was cli_logger.abort(exc), which
                # terminated before any retry could happen).
                cli_logger.warning(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)
                cli_logger.old_error(logger, exc)
def create_node(self, node_config, tags, count):
    """Create ``count`` nodes, reusing compatible stopped instances first.

    Stopped/stopping instances whose cluster, node type, instance type,
    and launch config tags all match are restarted and re-tagged instead
    of launching fresh ones; any remaining count is created via
    ``_create_node``.
    """
    # Always add the instance type tag, since node reuse is unsafe
    # otherwise.
    tags = copy.deepcopy(tags)
    tags[TAG_RAY_INSTANCE_TYPE] = node_config["InstanceType"]

    # Try to reuse previously stopped nodes with compatible configs
    if self.cache_stopped_nodes:
        filters = [{
            "Name": "instance-state-name",
            "Values": ["stopped", "stopping"],
        }]
        for tag_key, tag_value in (
                (TAG_RAY_CLUSTER_NAME, self.cluster_name),
                (TAG_RAY_NODE_TYPE, tags[TAG_RAY_NODE_TYPE]),
                (TAG_RAY_INSTANCE_TYPE, tags[TAG_RAY_INSTANCE_TYPE]),
                (TAG_RAY_LAUNCH_CONFIG, tags[TAG_RAY_LAUNCH_CONFIG])):
            filters.append({
                "Name": "tag:{}".format(tag_key),
                "Values": [tag_value],
            })

        reuse_nodes = list(
            self.ec2.instances.filter(Filters=filters))[:count]
        reuse_node_ids = [node.id for node in reuse_nodes]
        if reuse_nodes:
            # todo: handle plural vs singular?
            cli_logger.print(
                "Reusing nodes {}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
                cli_logger.render_list(reuse_node_ids))
            cli_logger.old_info(
                logger, "AWSNodeProvider: reusing instances {}. "
                "To disable reuse, set "
                "'cache_stopped_nodes: False' in the provider "
                "config.", reuse_node_ids)

            # todo: timed?
            with cli_logger.group("Stopping instances to reuse"):
                for node in reuse_nodes:
                    # Cache the AWS tags, then wait out any instance
                    # still in the middle of stopping.
                    self.tag_cache[node.id] = from_aws_format(
                        {t["Key"]: t["Value"]
                         for t in node.tags})
                    if node.state["Name"] == "stopping":
                        cli_logger.print("Waiting for instance {} to stop",
                                         node.id)
                        cli_logger.old_info(
                            logger,
                            "AWSNodeProvider: waiting for instance "
                            "{} to fully stop...", node.id)
                        node.wait_until_stopped()

            self.ec2.meta.client.start_instances(
                InstanceIds=reuse_node_ids)
            for node_id in reuse_node_ids:
                self.set_node_tags(node_id, tags)
            count -= len(reuse_node_ids)

    if count:
        self._create_node(node_config, tags, count)
def create_node(self, node_config, tags, count):
    """Create ``count`` nodes, restarting compatible stopped ones first.

    When ``cache_stopped_nodes`` is enabled, stopped/stopping instances
    whose cluster, node kind, launch config (and, when present, user
    node type) tags match are restarted and re-tagged; only the
    shortfall is launched via ``_create_node``.
    """
    tags = copy.deepcopy(tags)

    # Try to reuse previously stopped nodes with compatible configs
    if self.cache_stopped_nodes:
        # TODO(ekl) this is breaking the abstraction boundary a little by
        # peeking into the tag set.
        required = {
            TAG_RAY_CLUSTER_NAME: self.cluster_name,
            TAG_RAY_NODE_KIND: tags[TAG_RAY_NODE_KIND],
            TAG_RAY_LAUNCH_CONFIG: tags[TAG_RAY_LAUNCH_CONFIG],
        }
        filters = [{
            "Name": "instance-state-name",
            "Values": ["stopped", "stopping"],
        }]
        for key, value in required.items():
            filters.append({
                "Name": "tag:{}".format(key),
                "Values": [value],
            })
        # This tag may not always be present.
        if TAG_RAY_USER_NODE_TYPE in tags:
            filters.append({
                "Name": "tag:{}".format(TAG_RAY_USER_NODE_TYPE),
                "Values": [tags[TAG_RAY_USER_NODE_TYPE]],
            })

        candidates = list(self.ec2.instances.filter(Filters=filters))
        reuse_nodes = candidates[:count]
        reuse_node_ids = [node.id for node in reuse_nodes]
        if reuse_nodes:
            # todo: handle plural vs singular?
            cli_logger.print(
                "Reusing nodes {}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
                cli_logger.render_list(reuse_node_ids))
            cli_logger.old_info(
                logger, "AWSNodeProvider: reusing instances {}. "
                "To disable reuse, set "
                "'cache_stopped_nodes: False' in the provider "
                "config.", reuse_node_ids)

            # todo: timed?
            with cli_logger.group("Stopping instances to reuse"):
                for node in reuse_nodes:
                    # Remember each node's AWS tags, then wait out any
                    # instance still in the middle of stopping.
                    self.tag_cache[node.id] = from_aws_format(
                        {t["Key"]: t["Value"]
                         for t in node.tags})
                    if node.state["Name"] == "stopping":
                        cli_logger.print("Waiting for instance {} to stop",
                                         node.id)
                        cli_logger.old_info(
                            logger,
                            "AWSNodeProvider: waiting for instance "
                            "{} to fully stop...", node.id)
                        node.wait_until_stopped()

            self.ec2.meta.client.start_instances(
                InstanceIds=reuse_node_ids)
            for node_id in reuse_node_ids:
                self.set_node_tags(node_id, tags)
            count -= len(reuse_node_ids)

    if count:
        self._create_node(node_config, tags, count)