def handle_yaml_error(e):
    """Report a YAML loading failure and abort.

    Logs that the cluster config is invalid together with the name of the
    file that failed to load, prints the PyYAML error inside a verbatim
    error context, and finally calls ``cli_logger.abort()``.

    ``config_file`` is not a parameter; it is read from the enclosing scope.

    Args:
        e: The exception raised while loading the YAML file.
    """
    headline = ("Cluster config invalid.\n"
                "Failed to load YAML file " + cf.bold("{}"))
    cli_logger.error(headline, config_file)
    cli_logger.newline()

    # The underlying parser error is printed as-is inside its own context.
    with cli_logger.verbatim_error_ctx("PyYAML error:"):
        cli_logger.error(e)

    cli_logger.abort()
def run(self):
    """Run the node update and record the resulting status on the node.

    Calls ``self.do_update()``. On success the node is tagged with
    ``STATUS_UP_TO_DATE``, the runtime hash and (when it is not ``None``)
    the file mounts contents hash, and ``self.exitcode`` is set to 0.

    On failure the node is tagged ``STATUS_UPDATE_FAILED`` and the error is
    logged. A ``click.ClickException`` is then swallowed (the method returns
    without setting ``self.exitcode``); any other exception is re-raised.

    Raises:
        Exception: Any error from ``self.do_update()`` that is not a
            ``click.ClickException``.
    """
    cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                        self.runtime_hash)
    try:
        with LogTimer(self.log_prefix +
                      "Applied config {}".format(self.runtime_hash)):
            self.do_update()
    except Exception as e:
        error_str = str(e)
        if hasattr(e, "cmd"):
            # ``cmd`` can be a list of arguments or a single command string.
            # Joining a string would put a space between every character,
            # and joining non-string parts would raise inside this handler
            # and mask the real failure.
            if isinstance(e.cmd, (list, tuple)):
                cmd_str = " ".join(str(part) for part in e.cmd)
            else:
                cmd_str = str(e.cmd)
            error_str = "(Exit Status {}) {}".format(e.returncode, cmd_str)

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

        cli_logger.old_error(logger, "{}Error executing: {}\n",
                             self.log_prefix, error_str)

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd), e.returncode)
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
        # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[
            TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.exitcode = 0
def log_to_cli(config):
    """Print a summary of the AWS-specific cluster configuration.

    Emits, inside a ``"<provider name> config"`` group, the IAM profile,
    EC2 key pair, VPC subnets, security groups and AMI configured for the
    head and worker nodes. Values that are equal for head and workers are
    printed once; otherwise they are printed separately.

    Args:
        config: Cluster config dict. Reads ``config["head_node"]`` and
            ``config["worker_nodes"]``.
    """
    provider_name = PROVIDER_PRETTY_NAMES.get("aws", None)

    cli_logger.doassert(provider_name is not None,
                        "Could not find a pretty name for the AWS provider.")

    with cli_logger.group("{} config", provider_name):

        def same_everywhere(key):
            """Return True when head and worker nodes share ``key``'s value."""
            return config["head_node"][key] == config["worker_nodes"][key]

        def print_info(resource_string,
                       key,
                       head_src_key,
                       workers_src_key,
                       allowed_tags=("default", ),
                       list_value=False):
            """Print one node-config value for the head and worker nodes.

            Args:
                resource_string: Human-readable label prefix.
                key: Key looked up in the head/worker node configs.
                head_src_key: ``_log_info`` key naming where the head value
                    came from.
                workers_src_key: ``_log_info`` key naming where the workers
                    value came from.
                allowed_tags: Source names that are shown as tags. The
                    default is an immutable tuple so it cannot be shared
                    and mutated across calls.
                list_value: Render the value with
                    ``cli_logger.render_list`` when True.
            """
            head_tags = {}
            workers_tags = {}

            if _log_info[head_src_key] in allowed_tags:
                head_tags[_log_info[head_src_key]] = True
            if _log_info[workers_src_key] in allowed_tags:
                workers_tags[_log_info[workers_src_key]] = True

            head_value_str = config["head_node"][key]
            if list_value:
                head_value_str = cli_logger.render_list(head_value_str)

            if same_everywhere(key):
                cli_logger.labeled_value(  # todo: handle plural vs singular?
                    resource_string + " (head & workers)",
                    "{}",
                    head_value_str,
                    _tags=head_tags)
            else:
                workers_value_str = config["worker_nodes"][key]
                if list_value:
                    workers_value_str = cli_logger.render_list(
                        workers_value_str)

                cli_logger.labeled_value(
                    resource_string + " (head)",
                    "{}",
                    head_value_str,
                    _tags=head_tags)
                cli_logger.labeled_value(
                    resource_string + " (workers)",
                    "{}",
                    workers_value_str,
                    _tags=workers_tags)

        tags = {"default": _log_info["head_instance_profile_src"] == "default"}
        cli_logger.labeled_value(
            "IAM Profile",
            "{}",
            _arn_to_name(config["head_node"]["IamInstanceProfile"]["Arn"]),
            _tags=tags)

        print_info("EC2 Key pair", "KeyName", "keypair_src", "keypair_src")
        print_info(
            "VPC Subnets",
            "SubnetIds",
            "head_subnet_src",
            "workers_subnet_src",
            list_value=True)
        print_info(
            "EC2 Security groups",
            "SecurityGroupIds",
            "head_security_group_src",
            "workers_security_group_src",
            list_value=True)
        print_info(
            "EC2 AMI",
            "ImageId",
            "head_ami_src",
            "workers_ami_src",
            allowed_tags=["dlami"])

    cli_logger.newline()
def create_or_update_cluster(config_file: str,
                             override_min_workers: Optional[int],
                             override_max_workers: Optional[int],
                             no_restart: bool,
                             restart_only: bool,
                             yes: bool,
                             override_cluster_name: Optional[str],
                             no_config_cache: bool = False,
                             redirect_command_output: bool = False,
                             use_login_shells: bool = True) -> None:
    """Create or update an autoscaling Ray cluster from a YAML config file.

    Loads and checks the config, applies command-line overrides, bootstraps
    the provider configuration and then creates or updates the head node.

    Args:
        config_file: Path to the cluster configuration file.
        override_min_workers: Replaces ``config["min_workers"]`` when not
            None.
        override_max_workers: Replaces ``config["max_workers"]`` when not
            None.
        no_restart: Forwarded to ``get_or_create_head_node``.
        restart_only: Forwarded to ``get_or_create_head_node``.
        yes: Forwarded to ``get_or_create_head_node``.
        override_cluster_name: Replaces ``config["cluster_name"]`` when not
            None; also forwarded to ``get_or_create_head_node``.
        no_config_cache: Forwarded to ``_bootstrap_config``.
        redirect_command_output: Passed to
            ``cmd_output_util.set_output_redirected``; None is treated as
            False.
        use_login_shells: Passed to ``set_using_login_shells``. When False,
            interactive commands are disallowed.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        yaml.parser.ParserError: If the file is not valid YAML.
        yaml.scanner.ScannerError: If the file is not valid YAML.
        NotImplementedError: If the provider type is unknown.
    """
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    if use_login_shells:
        cli_logger.warning(
            "Commands running under a login shell can produce more "
            "output than special processing can handle.")
        cli_logger.warning(
            "Thus, the output from subcommands will be logged as is.")
        cli_logger.warning(
            "Consider using {}, {}.", cf.bold("--use-normal-shells"),
            cf.underlined("if you tested your workflow and it is compatible"))
        cli_logger.newline()

    def handle_yaml_error(e):
        """Log a YAML loading failure for ``config_file`` and abort."""
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        # Use a context manager so the file handle is closed promptly, even
        # when reading or parsing fails.
        with open(config_file) as f:
            config = yaml.safe_load(f.read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        # Re-raise in case abort() returned instead of raising.
        raise
    except (yaml.parser.ParserError, yaml.scanner.ScannerError) as e:
        handle_yaml_error(e)
        raise

    # todo: validate file_mounts, ssh keys, etc.

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in NODE_PROVIDERS.keys()
                if NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    cli_logger.success("Cluster configuration valid")

    printed_overrides = False

    def handle_cli_override(key, override):
        """Apply a command-line override, warning if it replaces a value."""
        nonlocal printed_overrides
        if override is not None:
            if key in config:
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    " Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    # disable the cli_logger here if needed
    # because it only supports aws
    if config["provider"]["type"] != "aws":
        cli_logger.old_style = True
    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name)
def get_or_create_head_node(config,
                            config_file,
                            no_restart,
                            restart_only,
                            yes,
                            override_cluster_name,
                            _provider=None,
                            _runner=subprocess):
    """Create the cluster head node, which in turn creates the workers.

    Looks up an existing head node through the provider. If there is none,
    or its launch-config tag does not match the hash of the current head
    node config, a new head node is created (terminating the outdated one
    first). The head node is then updated with a ``NodeUpdaterThread``, and
    commands for monitoring and attaching to the cluster are printed.

    Args:
        config: Cluster config dict. A deep copy is made before it is
            modified.
        config_file: Path to the cluster config file; only used in the
            printed commands.
        no_restart: When true, no Ray start commands are run.
        restart_only: When true, no setup commands are run.
        yes: Passed to the confirmation prompts.
        override_cluster_name: When truthy, appended as ``--cluster-name``
            to the printed commands.
        _provider: Node provider to use instead of the one returned by
            ``get_node_provider``.
        _runner: Passed to ``NodeUpdaterThread`` as ``process_runner``.

    Raises:
        RuntimeError: If the newly launched head node is not found within
            the fetch timeout (reached only if ``cli_logger.abort`` returns).
        SystemExit: If the updater finishes with a non-zero exit code
            (reached only if ``cli_logger.abort`` returns).
    """
    provider = (_provider or get_node_provider(config["provider"],
                                               config["cluster_name"]))

    config = copy.deepcopy(config)
    raw_config_file = config_file  # used for printing to the user
    config_file = os.path.abspath(config_file)
    try:
        head_node_tags = {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        if not head_node:
            cli_logger.confirm(
                yes,
                "No head node found. "
                "Launching a new cluster.",
                _abort=True)
            cli_logger.old_confirm("This will create a new cluster", yes)
        elif not no_restart:
            cli_logger.old_confirm("This will restart cluster services", yes)

        # Tell the user what is about to happen to the existing head node.
        if head_node:
            if restart_only:
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "restarting the cluster Ray runtime. "
                    "Setup commands will not be run due to `{}`.\n",
                    cf.bold("--restart-only"),
                    _abort=True)
            elif no_restart:
                cli_logger.print(
                    "Cluster Ray runtime will not be restarted due "
                    "to `{}`.", cf.bold("--no-restart"))
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "running setup commands.",
                    _abort=True)
            else:
                cli_logger.print(
                    "Updating cluster configuration and running full setup.")
                cli_logger.confirm(
                    yes,
                    cf.bold("Cluster Ray runtime will be restarted."),
                    _abort=True)
        cli_logger.newline()

        # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
        head_node_config = copy.deepcopy(config["head_node"])
        if "head_node_type" in config:
            head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
            head_node_config.update(config["available_node_types"][
                config["head_node_type"]]["node_config"])

        # (Re)launch the head node when there is none or when its launch
        # config tag differs from the hash of the current config.
        launch_hash = hash_launch_conf(head_node_config, config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            with cli_logger.group("Acquiring an up-to-date head node"):
                if head_node is not None:
                    cli_logger.print(
                        "Currently running head node is out-of-date with "
                        "cluster configuration")
                    cli_logger.print(
                        "hash is {}, expected {}",
                        cf.bold(
                            provider.node_tags(head_node).get(
                                TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                    cli_logger.confirm(yes, "Relaunching it.", _abort=True)
                    cli_logger.old_confirm(
                        "Head node config out-of-date. It will be terminated",
                        yes)

                    cli_logger.old_info(
                        logger, "get_or_create_head_node: "
                        "Shutting down outdated head node {}", head_node)

                    provider.terminate_node(head_node)
                    cli_logger.print("Terminated head node {}", head_node)

                cli_logger.old_info(
                    logger,
                    "get_or_create_head_node: Launching new head node...")

                head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
                head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                    config["cluster_name"])
                provider.create_node(head_node_config, head_node_tags, 1)
                cli_logger.print("Launched a new head node")

                # Poll once per second until exactly one node matches the
                # head node tags, giving up after 50 seconds.
                start = time.time()
                head_node = None
                with cli_logger.timed("Fetching the new head node"):
                    while True:
                        if time.time() - start > 50:
                            cli_logger.abort(
                                "Head node fetch timed out.")  # todo: msg
                            raise RuntimeError("Failed to create head node.")
                        nodes = provider.non_terminated_nodes(head_node_tags)
                        if len(nodes) == 1:
                            head_node = nodes[0]
                            break
                        time.sleep(1)
                cli_logger.newline()

        with cli_logger.group(
                "Setting up head node",
                _numbered=("<>", 1, 1),
                # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
                _tags=dict()):  # add id, ARN to tags?

            # TODO(ekl) right now we always update the head node even if the
            # hash matches.
            # We could prompt the user for what they want to do here.
            # No need to pass in cluster_sync_files because we use this
            # hash to set up the head node
            (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
                config["file_mounts"], None, config)

            cli_logger.old_info(
                logger,
                "get_or_create_head_node: Updating files on head node...")

            # Rewrite the auth config so that the head
            # node can update the workers
            remote_config = copy.deepcopy(config)

            # drop proxy options if they exist, otherwise
            # head node won't be able to connect to workers
            remote_config["auth"].pop("ssh_proxy_command", None)

            if "ssh_private_key" in config["auth"]:
                remote_key_path = "~/ray_bootstrap_key.pem"
                remote_config["auth"]["ssh_private_key"] = remote_key_path

            # Adjust for new file locations
            new_mounts = {}
            for remote_path in config["file_mounts"]:
                new_mounts[remote_path] = remote_path
            remote_config["file_mounts"] = new_mounts
            remote_config["no_restart"] = no_restart

            # Now inject the rewritten config and SSH key into the head node
            # NOTE(review): the temporary file is never closed explicitly;
            # presumably it has to outlive the updater run below - confirm.
            remote_config_file = tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-")
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update(
                {"~/ray_bootstrap_config.yaml": remote_config_file.name})

            if "ssh_private_key" in config["auth"]:
                config["file_mounts"].update({
                    remote_key_path: config["auth"]["ssh_private_key"],
                })
            cli_logger.print("Prepared bootstrap config")

            # Select which command lists to run based on the restart flags.
            if restart_only:
                setup_commands = []
                ray_start_commands = config["head_start_ray_commands"]
            elif no_restart:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = []
            else:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = config["head_start_ray_commands"]

            if not no_restart:
                warn_about_bad_start_command(ray_start_commands)

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=setup_commands,
                ray_start_commands=ray_start_commands,
                process_runner=_runner,
                runtime_hash=runtime_hash,
                file_mounts_contents_hash=file_mounts_contents_hash,
                is_head_node=True,
                docker_config=config.get("docker"))
            # Run the update and wait for it to finish before continuing.
            updater.start()
            updater.join()

            # Refresh the node cache so we see the external ip if available
            provider.non_terminated_nodes(head_node_tags)

            if config.get("provider", {}).get("use_internal_ips",
                                              False) is True:
                head_node_ip = provider.internal_ip(head_node)
            else:
                head_node_ip = provider.external_ip(head_node)

            if updater.exitcode != 0:
                # todo: this does not follow the mockup and is not good enough
                cli_logger.abort("Failed to setup head node.")

                # Only reached when abort() returns instead of raising.
                cli_logger.old_error(
                    logger, "get_or_create_head_node: "
                    "Updating {} failed", head_node_ip)
                sys.exit(1)

            cli_logger.old_info(
                logger, "get_or_create_head_node: "
                "Head node up-to-date, IP address is: {}", head_node_ip)

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""

        # Old-style output uses plain print() with the absolute config path.
        if cli_logger.old_style:
            print("To monitor autoscaling activity, you can run:\n\n"
                  " ray exec {} {}{}\n".format(config_file,
                                               quote(monitor_str), modifiers))
            print("To open a console on the cluster:\n\n"
                  " ray attach {}{}\n".format(config_file, modifiers))

            print("To get a remote shell to the cluster manually, run:\n\n"
                  " {}\n".format(
                      updater.cmd_runner.remote_shell_command_str()))

        cli_logger.newline()
        with cli_logger.group("Useful commands"):
            cli_logger.print("Monitor autoscaling with")
            cli_logger.print(
                cf.bold(" ray exec {}{} {}"), raw_config_file, modifiers,
                quote(monitor_str))

            cli_logger.print("Connect to a terminal on the cluster head")
            cli_logger.print(
                cf.bold(" ray attach {}{}"), raw_config_file, modifiers)
    finally:
        # Always release provider resources, on success or failure.
        provider.cleanup()
def run(self):
    """Run the node update and record the resulting status on the node.

    Refuses to run when interactive commands are allowed while command
    output is redirected. Otherwise calls ``self.do_update()``. On success
    the node is tagged with ``STATUS_UP_TO_DATE``, the runtime hash and
    (when it is not ``None``) the file mounts contents hash, and
    ``self.exitcode`` is set to 0.

    On failure the node is tagged ``STATUS_UPDATE_FAILED`` and the error is
    logged. A ``click.ClickException`` raised by the update is swallowed
    (the method returns without setting ``self.exitcode``); any other
    exception is re-raised.

    Raises:
        click.ClickException: If output is redirected for an interactive
            command (reached only if ``cli_logger.abort`` returns).
        Exception: Any error from ``self.do_update()`` that is not a
            ``click.ClickException``.
    """
    cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                        self.runtime_hash)

    if (cmd_output_util.does_allow_interactive()
            and cmd_output_util.is_output_redirected()):
        # this is most probably a bug since the user has no control
        # over these settings
        msg = ("Output was redirected for an interactive command. "
               "Either do not pass `--redirect-command-output` "
               "or also pass in `--use-normal-shells`.")
        cli_logger.abort(msg)
        raise click.ClickException(msg)

    try:
        with LogTimer(self.log_prefix +
                      "Applied config {}".format(self.runtime_hash)):
            self.do_update()
    except Exception as e:
        error_str = str(e)
        if hasattr(e, "cmd"):
            # ``cmd`` can be a list of arguments or a single command string.
            # Joining a string would put a space between every character,
            # and joining non-string parts would raise inside this handler
            # and mask the real failure.
            if isinstance(e.cmd, (list, tuple)):
                cmd_str = " ".join(str(part) for part in e.cmd)
            else:
                cmd_str = str(e.cmd)
            error_str = "(Exit Status {}) {}".format(e.returncode, cmd_str)

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

        cli_logger.old_error(logger, "{}Error executing: {}\n",
                             self.log_prefix, error_str)

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd), e.returncode)
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
        # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[
            TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.exitcode = 0
def create_or_update_cluster(config_file: str,
                             override_min_workers: Optional[int],
                             override_max_workers: Optional[int],
                             no_restart: bool,
                             restart_only: bool,
                             yes: bool,
                             override_cluster_name: Optional[str],
                             no_config_cache: bool,
                             log_old_style: bool,
                             log_color: str,
                             verbose: int) -> None:
    """Create or update an autoscaling Ray cluster from a YAML config file.

    Configures ``cli_logger``, loads and checks the config, applies
    command-line overrides, bootstraps the provider configuration and then
    creates or updates the head node.

    Args:
        config_file: Path to the cluster configuration file.
        override_min_workers: Replaces ``config["min_workers"]`` when not
            None.
        override_max_workers: Replaces ``config["max_workers"]`` when not
            None.
        no_restart: Forwarded to ``get_or_create_head_node``.
        restart_only: Forwarded to ``get_or_create_head_node``.
        yes: Forwarded to ``get_or_create_head_node``.
        override_cluster_name: Replaces ``config["cluster_name"]`` when not
            None; also forwarded to ``get_or_create_head_node``.
        no_config_cache: Forwarded to ``_bootstrap_config``.
        log_old_style: Assigned to ``cli_logger.old_style``.
        log_color: Assigned to ``cli_logger.color_mode``.
        verbose: Assigned to ``cli_logger.verbosity``.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist and
            ``cli_logger.abort`` returns instead of raising.
        yaml.parser.ParserError: If the file is not valid YAML and
            ``cli_logger.abort`` returns instead of raising.
        yaml.scanner.ScannerError: Same as above.
        NotImplementedError: If the provider type is unknown.
    """
    cli_logger.old_style = log_old_style
    cli_logger.color_mode = log_color
    cli_logger.verbosity = verbose

    # todo: disable by default when the command output handling PR makes it in
    cli_logger.dump_command_output = True

    cli_logger.detect_colors()

    def handle_yaml_error(e):
        """Log a YAML loading failure for ``config_file`` and abort."""
        cli_logger.error(
            "Cluster config invalid.\n"
            "Failed to load YAML file " + cf.bold("{}"), config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        # Use a context manager so the file handle is closed promptly, even
        # when reading or parsing fails.
        with open(config_file) as f:
            config = yaml.safe_load(f.read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist.",
            cf.bold(config_file))
        # If abort() returns instead of raising, ``config`` would be
        # unbound below; surface the original error instead.
        raise
    except (yaml.parser.ParserError, yaml.scanner.ScannerError) as e:
        handle_yaml_error(e)
        raise

    # todo: validate file_mounts, ssh keys, etc.

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in NODE_PROVIDERS.keys()
                if NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    cli_logger.success("Cluster configuration valid.\n")

    printed_overrides = False

    def handle_cli_override(key, override):
        """Apply a command-line override, warning if it replaces a value."""
        nonlocal printed_overrides
        if override is not None:
            if key in config:
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    " Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    # disable the cli_logger here if needed
    # because it only supports aws
    # Remember the caller's choice so it can be restored afterwards rather
    # than being unconditionally reset to new-style logging.
    previous_old_style = cli_logger.old_style
    if config["provider"]["type"] != "aws":
        cli_logger.old_style = True
    config = _bootstrap_config(config, no_config_cache)
    if config["provider"]["type"] != "aws":
        cli_logger.old_style = previous_old_style

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name)
def resolve(self, is_head, node_ip_address=None):
    """Return a new spec in which every unset field has a system default.

    Args:
        is_head (bool): Whether this is the head node. Redis memory is only
            subtracted from the default worker memory on the head node.
        node_ip_address (str): The IP address of the node that we are on.
            This is used to automatically create a node id resource. When
            None, it is looked up with ray.services.get_node_ip_address().

    Returns:
        A ResourceSpec built from the resolved values.

    Raises:
        ValueError: If more GPUs are requested than CUDA_VISIBLE_DEVICES
            lists, if the Redis memory cap is below the allowed minimum, or
            if the default memory left for tasks and actors is too small.
    """
    resources = (self.resources or {}).copy()
    # These names are filled in by the spec itself and must not be given as
    # custom resources.
    for reserved_name in ("CPU", "GPU", "memory", "object_store_memory"):
        assert reserved_name not in resources, resources

    if node_ip_address is None:
        node_ip_address = ray.services.get_node_ip_address()

    # Automatically create a node id resource on each node. This is
    # queryable with ray.state.node_ids() and ray.state.current_node_id().
    resources[NODE_ID_PREFIX + node_ip_address] = 1.0

    num_cpus = self.num_cpus
    if num_cpus is None:
        num_cpus = multiprocessing.cpu_count()

    num_gpus = self.num_gpus
    gpu_ids = ray.utils.get_cuda_visible_devices()
    if num_gpus is None:
        # Try to automatically detect the number of GPUs.
        num_gpus = _autodetect_num_gpus()
        # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
        if gpu_ids is not None:
            num_gpus = min(num_gpus, len(gpu_ids))
    elif gpu_ids is not None and num_gpus > len(gpu_ids):
        # An explicit GPU count must not exceed what CUDA_VISIBLE_DEVICES
        # allows.
        raise ValueError("Attempting to start raylet with {} GPUs, "
                         "but CUDA_VISIBLE_DEVICES contains {}.".format(
                             num_gpus, gpu_ids))

    try:
        resources.update(
            _constraints_from_gpu_info(_get_gpu_info_string()))
    except Exception:
        logger.exception("Could not parse gpu information.")

    # Choose a default object store size.
    system_memory = ray.utils.get_system_memory()
    avail_memory = ray.utils.estimate_available_memory()
    object_store_memory = self.object_store_memory
    if object_store_memory is None:
        object_store_memory = int(avail_memory * 0.3)
        # Cap memory to avoid memory waste and perf issues on large nodes
        store_cap = ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES
        if object_store_memory > store_cap:
            cap_notice = (
                "Warning: Capping object memory store to {}GB. ".format(
                    store_cap // 1e9))
            logger.debug(
                cap_notice +
                "To increase this further, specify `object_store_memory` "
                "when calling ray.init() or ray start.")
            object_store_memory = store_cap

    redis_floor = ray_constants.REDIS_MINIMUM_MEMORY_BYTES
    redis_max_memory = self.redis_max_memory
    if redis_max_memory is None:
        redis_default = max(int(avail_memory * 0.1), redis_floor)
        redis_max_memory = min(ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES,
                               redis_default)
    if redis_max_memory < redis_floor:
        raise ValueError(
            "Attempting to cap Redis memory usage at {} bytes, "
            "but the minimum allowed is {} bytes.".format(
                redis_max_memory, redis_floor))

    memory = self.memory
    if memory is None:
        redis_reserved = redis_max_memory if is_head else 0
        memory = avail_memory - object_store_memory - redis_reserved
        if memory < 100e6 and memory < 0.05 * system_memory:
            raise ValueError(
                "After taking into account object store and redis memory "
                "usage, the amount of memory on this node available for "
                "tasks and actors ({} GB) is less than {}% of total. "
                "You can adjust these settings with "
                "ray.init(memory=<bytes>, "
                "object_store_memory=<bytes>).".format(
                    round(memory / 1e9, 2),
                    int(100 * (memory / system_memory))))

    gib = 1024**3
    rounded_memory = ray_constants.round_to_memory_units(
        memory, round_up=False)
    worker_ram = round(rounded_memory / gib, 2)
    object_ram = round(object_store_memory / gib, 2)

    # TODO(maximsmol): this behavior is strange since we do not have a
    # good grasp on when this will get called
    # (you have to study node.py to make a guess)
    with cli_logger.group("Available RAM"):
        cli_logger.labeled_value("Workers", "{} GiB", str(worker_ram))
        cli_logger.labeled_value("Objects", "{} GiB", str(object_ram))
        cli_logger.newline()
        cli_logger.print("To adjust these values, use")
        with cf.with_style("monokai") as c:
            dot = c.magenta(".")
            equals = c.magenta("=")
            placeholder = c.purple("<bytes>")
            cli_logger.print(
                " ray{0}init(memory{1}{2}, "
                "object_store_memory{1}{2})", dot, equals, placeholder)

    cli_logger.old_info(
        logger,
        "Starting Ray with {} GiB memory available for workers and up to "
        "{} GiB for objects. You can adjust these settings "
        "with ray.init(memory=<bytes>, "
        "object_store_memory=<bytes>).", worker_ram, object_ram)

    spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory,
                        resources, redis_max_memory)
    assert spec.resolved()
    return spec
def handle_boto_error(exc, msg, *args, **kwargs):
    """Log a boto error with a helpful message and abort.

    Does nothing when ``cli_logger.old_style`` is set. Otherwise the error
    code is read from ``exc.response["Error"]["Code"]`` when available. For
    expired-credential codes, instructions for requesting a new AWS session
    are printed before aborting; for everything else the generic message and
    the exception itself are printed before aborting.

    Args:
        exc: The exception that was raised.
        msg: Format string describing what failed.
        *args: Positional arguments for ``msg.format``.
        **kwargs: Keyword arguments for ``msg.format``.
    """
    if cli_logger.old_style:
        # old-style logging doesn't do anything here
        # so we exit early
        return

    error_code = None
    error_info = None
    # todo: not sure if these exceptions always have response
    # Tolerate both a missing ``response`` attribute and one set to None.
    response = getattr(exc, "response", None)
    if response is not None:
        error_info = response.get("Error", None)
    if error_info is not None:
        error_code = error_info.get("Code", None)

    generic_message_args = [
        "{}\n"
        "Error code: {}",
        msg.format(*args, **kwargs),
        cf.bold(error_code)
    ]

    # apparently
    # ExpiredTokenException
    # ExpiredToken
    # RequestExpired
    # are all the same pretty much
    credentials_expiration_codes = [
        "ExpiredTokenException", "ExpiredToken", "RequestExpired"
    ]

    if error_code in credentials_expiration_codes:
        # "An error occurred (ExpiredToken) when calling the
        # GetInstanceProfile operation: The security token
        # included in the request is expired"

        # "An error occurred (RequestExpired) when calling the
        # DescribeKeyPairs operation: Request has expired."

        token_command = (
            "aws sts get-session-token "
            "--serial-number arn:aws:iam::" + cf.underlined("ROOT_ACCOUNT_ID")
            + ":mfa/" + cf.underlined("AWS_USERNAME") + " --token-code " +
            cf.underlined("TWO_FACTOR_AUTH_CODE"))

        # No spaces around "=": `export NAME = value` is not a valid shell
        # assignment, so the printed commands would fail when pasted.
        secret_key_var = (
            "export AWS_SECRET_ACCESS_KEY=" + cf.underlined("REPLACE_ME") +
            " # found at Credentials.SecretAccessKey")
        session_token_var = (
            "export AWS_SESSION_TOKEN=" + cf.underlined("REPLACE_ME") +
            " # found at Credentials.SessionToken")
        access_key_id_var = (
            "export AWS_ACCESS_KEY_ID=" + cf.underlined("REPLACE_ME") +
            " # found at Credentials.AccessKeyId")

        # fixme: replace with a Github URL that points
        # to our repo
        aws_session_script_url = ("https://gist.github.com/maximsmol/"
                                  "a0284e1d97b25d417bd9ae02e5f450cf")

        cli_logger.verbose_error(*generic_message_args)
        cli_logger.verbose(vars(exc))

        cli_logger.abort(
            "Your AWS session has expired.\n\n"
            "You can request a new one using\n{}\n"
            "then expose it to Ray by setting\n{}\n{}\n{}\n\n"
            "You can find a script that automates this at:\n{}",
            cf.bold(token_command), cf.bold(secret_key_var),
            cf.bold(session_token_var), cf.bold(access_key_id_var),
            cf.underlined(aws_session_script_url))

    # todo: any other errors that we should catch separately?

    cli_logger.error(*generic_message_args)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("Boto3 error:"):
        cli_logger.verbose(vars(exc))
        cli_logger.error(exc)
    cli_logger.abort()
# This is an executable script that runs an example of every single CliLogger # function for demonstration purposes. Primarily useful for tuning color and # other formatting. from ray.autoscaler.cli_logger import cli_logger import colorful as cf cli_logger.old_style = False cli_logger.verbosity = 999 cli_logger.detect_colors() cli_logger.print( cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined")) cli_logger.labeled_value("Label", "value") cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3])) cli_logger.newline() cli_logger.very_verbose("Very verbose") cli_logger.verbose("Verbose") cli_logger.verbose_warning("Verbose warning") cli_logger.verbose_error("Verbose error") cli_logger.print("Info") cli_logger.success("Success") cli_logger.warning("Warning") cli_logger.error("Error") cli_logger.newline() try: cli_logger.abort("Abort") except Exception: pass try: cli_logger.doassert(False, "Assert")
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    """Resolve the provider-specific parts of a cluster config, with caching.

    The prepared config is hashed to derive a cache file path inside the
    system temp directory. When that file exists, caching is enabled and the
    cached version matches ``CONFIG_CACHE_VERSION``, the cached resolved
    config is returned. Otherwise the config is validated, resolved through
    the provider's ``bootstrap_config`` and (unless caching is disabled)
    written to the cache file.

    Args:
        config: The cluster config to resolve.
        no_config_cache: When True, the cache is neither read nor written.

    Returns:
        The resolved cluster config.

    Raises:
        NotImplementedError: If the provider type is not supported.
    """
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        # Read through a context manager so the handle is closed promptly.
        with open(cache_key) as cache_file:
            config_cache = json.loads(cache_file.read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))
            cli_logger.newline()
            cli_logger.verbose_warning(
                "Loaded cached provider configuration "
                "from " + cf.bold("{}"), cache_key)
            if cli_logger.verbosity == 0:
                cli_logger.warning("Loaded cached provider configuration")
            cli_logger.warning(
                "If you experience issues with "
                "the cloud provider, try re-running "
                "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)
    validate_config(config)

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    with cli_logger.timed(  # todo: better message
            "Bootstraping {} config",
            PROVIDER_PRETTY_NAMES.get(config["provider"]["type"])):
        resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config