def _get_sdk_client( address: Optional[str], create_cluster_if_needed: bool = False) -> JobSubmissionClient: if address is None: if "RAY_ADDRESS" not in os.environ: raise ValueError( "Address must be specified using either the --address flag " "or RAY_ADDRESS environment variable.") address = os.environ["RAY_ADDRESS"] cli_logger.labeled_value("Job submission server address", address) return JobSubmissionClient(address, create_cluster_if_needed)
def run(self): update_start_time = time.time() if cmd_output_util.does_allow_interactive( ) and cmd_output_util.is_output_redirected(): # this is most probably a bug since the user has no control # over these settings msg = ("Output was redirected for an interactive command. " "Either do not pass `--redirect-command-output` " "or also pass in `--use-normal-shells`.") cli_logger.abort(msg) try: with LogTimer(self.log_prefix + "Applied config {}".format(self.runtime_hash)): self.do_update() except Exception as e: self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}) cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED)) cli_logger.error("!!!") if hasattr(e, "cmd"): cli_logger.error( "Setup command `{}` failed with exit code {}. stderr:", cf.bold(e.cmd), e.returncode) else: cli_logger.verbose_error("{}", str(vars(e))) # todo: handle this better somehow? cli_logger.error("{}", str(e)) # todo: print stderr here cli_logger.error("!!!") cli_logger.newline() if isinstance(e, click.ClickException): # todo: why do we ignore this here return raise tags_to_set = { TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, TAG_RAY_RUNTIME_CONFIG: self.runtime_hash, } if self.file_mounts_contents_hash is not None: tags_to_set[ TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash self.provider.set_node_tags(self.node_id, tags_to_set) cli_logger.labeled_value("New status", STATUS_UP_TO_DATE) self.update_time = time.time() - update_start_time self.exitcode = 0
def _wait_for_ip(self, deadline): # if we have IP do not print waiting info ip = self._get_node_ip() if ip is not None: cli_logger.labeled_value("Fetched IP", ip) return ip interval = 10 with cli_logger.group("Waiting for IP"): while time.time() < deadline and \ not self.provider.is_terminated(self.node_id): ip = self._get_node_ip() if ip is not None: cli_logger.labeled_value("Received", ip) return ip cli_logger.print("Not yet available, retrying in {} seconds", cf.bold(str(interval))) time.sleep(interval) return None
def print_info(resource_string: str, key: str, src_key: str, allowed_tags: Optional[List[str]] = None, list_value: bool = False) -> None: if allowed_tags is None: allowed_tags = ["default"] node_tags = {} # set of configurations corresponding to `key` unique_settings = set() for node_type_key, node_type in config[ "available_node_types"].items(): node_tags[node_type_key] = {} tag = _log_info[src_key][node_type_key] if tag in allowed_tags: node_tags[node_type_key][tag] = True setting = node_type["node_config"].get(key) if list_value: unique_settings.add(tuple(setting)) else: unique_settings.add(setting) head_value_str = head_node_config[key] if list_value: head_value_str = cli_logger.render_list(head_value_str) if len(unique_settings) == 1: # all node types are configured the same, condense # log output cli_logger.labeled_value( resource_string + " (all available node types)", "{}", head_value_str, _tags=node_tags[config["head_node_type"]]) else: # do head node type first cli_logger.labeled_value(resource_string + f" ({head_node_type})", "{}", head_value_str, _tags=node_tags[head_node_type]) # go through remaining types for node_type_key, node_type in config[ "available_node_types"].items(): if node_type_key == head_node_type: continue workers_value_str = node_type["node_config"][key] if list_value: workers_value_str = cli_logger.render_list( workers_value_str) cli_logger.labeled_value(resource_string + f" ({node_type_key})", "{}", workers_value_str, _tags=node_tags[node_type_key])
def print_info(resource_string, key, head_src_key, workers_src_key, allowed_tags=None, list_value=False): if allowed_tags is None: allowed_tags = ["default"] head_tags = {} workers_tags = {} if _log_info[head_src_key] in allowed_tags: head_tags[_log_info[head_src_key]] = True if _log_info[workers_src_key] in allowed_tags: workers_tags[_log_info[workers_src_key]] = True head_value_str = config["head_node"][key] if list_value: head_value_str = cli_logger.render_list(head_value_str) if same_everywhere(key): cli_logger.labeled_value( # todo: handle plural vs singular? resource_string + " (head & workers)", "{}", head_value_str, _tags=head_tags) else: workers_value_str = config["worker_nodes"][key] if list_value: workers_value_str = cli_logger.render_list( workers_value_str) cli_logger.labeled_value( resource_string + " (head)", "{}", head_value_str, _tags=head_tags) cli_logger.labeled_value( resource_string + " (workers)", "{}", workers_value_str, _tags=workers_tags)
def create_or_update_cluster(config_file: str, override_min_workers: Optional[int], override_max_workers: Optional[int], no_restart: bool, restart_only: bool, yes: bool, override_cluster_name: Optional[str], no_config_cache: bool = False, redirect_command_output: bool = False, use_login_shells: bool = True) -> None: """Create or updates an autoscaling Ray cluster from a config json.""" set_using_login_shells(use_login_shells) if not use_login_shells: cmd_output_util.set_allow_interactive(False) if redirect_command_output is None: # Do not redirect by default. cmd_output_util.set_output_redirected(False) else: cmd_output_util.set_output_redirected(redirect_command_output) def handle_yaml_error(e): cli_logger.error("Cluster config invalid") cli_logger.newline() cli_logger.error("Failed to load YAML file " + cf.bold("{}"), config_file) cli_logger.newline() with cli_logger.verbatim_error_ctx("PyYAML error:"): cli_logger.error(e) cli_logger.abort() try: config = yaml.safe_load(open(config_file).read()) except FileNotFoundError: cli_logger.abort( "Provided cluster configuration file ({}) does not exist", cf.bold(config_file)) raise except yaml.parser.ParserError as e: handle_yaml_error(e) raise except yaml.scanner.ScannerError as e: handle_yaml_error(e) raise # todo: validate file_mounts, ssh keys, etc. importer = _NODE_PROVIDERS.get(config["provider"]["type"]) if not importer: cli_logger.abort( "Unknown provider type " + cf.bold("{}") + "\n" "Available providers are: {}", config["provider"]["type"], cli_logger.render_list([ k for k in _NODE_PROVIDERS.keys() if _NODE_PROVIDERS[k] is not None ])) raise NotImplementedError("Unsupported provider {}".format( config["provider"])) printed_overrides = False def handle_cli_override(key, override): if override is not None: if key in config: nonlocal printed_overrides printed_overrides = True cli_logger.warning( "`{}` override provided on the command line.\n" " Using " + cf.bold("{}") + cf.dimmed(" [configuration file has " + cf.bold("{}") + "]"), key, override, config[key]) config[key] = override handle_cli_override("min_workers", override_min_workers) handle_cli_override("max_workers", override_max_workers) handle_cli_override("cluster_name", override_cluster_name) if printed_overrides: cli_logger.newline() cli_logger.labeled_value("Cluster", config["cluster_name"]) # disable the cli_logger here if needed # because it only supports aws if config["provider"]["type"] != "aws": cli_logger.old_style = True cli_logger.newline() config = _bootstrap_config(config, no_config_cache=no_config_cache) try_logging_config(config) get_or_create_head_node(config, config_file, no_restart, restart_only, yes, override_cluster_name)
#!/usr/bin/env python # This is an executable script that runs an example of every single CliLogger # function for demonstration purposes. Primarily useful for tuning color and # other formatting. from ray.autoscaler._private.cli_logger import cli_logger import colorful as cf cli_logger.old_style = False cli_logger.verbosity = 999 cli_logger.detect_colors() cli_logger.print( cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined")) cli_logger.labeled_value("Label", "value") cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3])) cli_logger.newline() cli_logger.very_verbose("Very verbose") cli_logger.verbose("Verbose") cli_logger.verbose_warning("Verbose warning") cli_logger.verbose_error("Verbose error") cli_logger.print("Info") cli_logger.success("Success") cli_logger.warning("Warning") cli_logger.error("Error") cli_logger.newline() try: cli_logger.abort("Abort") except Exception: pass
def log_to_cli(config): provider_name = _PROVIDER_PRETTY_NAMES.get("aws", None) cli_logger.doassert(provider_name is not None, "Could not find a pretty name for the AWS provider.") with cli_logger.group("{} config", provider_name): def same_everywhere(key): return config["head_node"][key] == config["worker_nodes"][key] def print_info(resource_string, key, head_src_key, workers_src_key, allowed_tags=None, list_value=False): if allowed_tags is None: allowed_tags = ["default"] head_tags = {} workers_tags = {} if _log_info[head_src_key] in allowed_tags: head_tags[_log_info[head_src_key]] = True if _log_info[workers_src_key] in allowed_tags: workers_tags[_log_info[workers_src_key]] = True head_value_str = config["head_node"][key] if list_value: head_value_str = cli_logger.render_list(head_value_str) if same_everywhere(key): cli_logger.labeled_value( # todo: handle plural vs singular? resource_string + " (head & workers)", "{}", head_value_str, _tags=head_tags) else: workers_value_str = config["worker_nodes"][key] if list_value: workers_value_str = cli_logger.render_list( workers_value_str) cli_logger.labeled_value( resource_string + " (head)", "{}", head_value_str, _tags=head_tags) cli_logger.labeled_value( resource_string + " (workers)", "{}", workers_value_str, _tags=workers_tags) tags = {"default": _log_info["head_instance_profile_src"] == "default"} profile_arn = config["head_node"]["IamInstanceProfile"].get("Arn") profile_name = _arn_to_name(profile_arn) \ if profile_arn \ else config["head_node"]["IamInstanceProfile"]["Name"] cli_logger.labeled_value("IAM Profile", "{}", profile_name, _tags=tags) if ("KeyName" in config["head_node"] and "KeyName" in config["worker_nodes"]): print_info("EC2 Key pair", "KeyName", "keypair_src", "keypair_src") print_info( "VPC Subnets", "SubnetIds", "head_subnet_src", "workers_subnet_src", list_value=True) print_info( "EC2 Security groups", "SecurityGroupIds", "head_security_group_src", "workers_security_group_src", list_value=True) print_info( "EC2 AMI", "ImageId", "head_ami_src", "workers_ami_src", allowed_tags=["dlami"]) cli_logger.newline()
def do_update(self): self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH}) cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH) deadline = time.time() + NODE_START_WAIT_S self.wait_ready(deadline) node_tags = self.provider.node_tags(self.node_id) logger.debug("Node tags: {}".format(str(node_tags))) if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash: # When resuming from a stopped instance the runtime_hash may be the # same, but the container will not be started. self.cmd_runner.run_init( as_head=self.is_head_node, file_mounts=self.file_mounts) # runtime_hash will only change whenever the user restarts # or updates their cluster with `get_or_create_head_node` if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and ( not self.file_mounts_contents_hash or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) == self.file_mounts_contents_hash): # todo: we lie in the confirmation message since # full setup might be cancelled here cli_logger.print( "Configuration already up to date, " "skipping file mounts, initalization and setup commands.", _numbered=("[]", "2-5", 6)) cli_logger.old_info(logger, "{}{} already up-to-date, skip to ray start", self.log_prefix, self.node_id) else: cli_logger.print( "Updating cluster configuration.", _tags=dict(hash=self.runtime_hash)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES}) cli_logger.labeled_value("New status", STATUS_SYNCING_FILES) self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6)) # Only run setup commands if runtime_hash has changed because # we don't want to run setup_commands every time the head node # file_mounts folders have changed. if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash: # Run init commands self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP}) cli_logger.labeled_value("New status", STATUS_SETTING_UP) if self.initialization_commands: with cli_logger.group( "Running initialization commands", _numbered=("[]", 3, 5)): with LogTimer( self.log_prefix + "Initialization commands", show_status=True): for cmd in self.initialization_commands: try: # Overriding the existing SSHOptions class # with a new SSHOptions class that uses # this ssh_private_key as its only __init__ # argument. # Run outside docker. self.cmd_runner.run( cmd, ssh_options_override_ssh_key=self. auth_config.get("ssh_private_key"), run_env="host") except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error( "See above for stderr.") raise click.ClickException( "Initialization command failed." ) from None else: cli_logger.print( "No initialization commands to run.", _numbered=("[]", 3, 6)) self.cmd_runner.run_init( as_head=self.is_head_node, file_mounts=self.file_mounts) if self.setup_commands: with cli_logger.group( "Running setup commands", # todo: fix command numbering _numbered=("[]", 4, 6)): with LogTimer( self.log_prefix + "Setup commands", show_status=True): total = len(self.setup_commands) for i, cmd in enumerate(self.setup_commands): if cli_logger.verbosity == 0 and len(cmd) > 30: cmd_to_print = cf.bold(cmd[:30]) + "..." else: cmd_to_print = cf.bold(cmd) cli_logger.print( "{}", cmd_to_print, _numbered=("()", i, total)) try: # Runs in the container if docker is in use self.cmd_runner.run(cmd, run_env="auto") except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error( "See above for stderr.") raise click.ClickException( "Setup command failed.") else: cli_logger.print( "No setup commands to run.", _numbered=("[]", 4, 6)) with cli_logger.group( "Starting the Ray runtime", _numbered=("[]", 6, 6)): with LogTimer( self.log_prefix + "Ray start commands", show_status=True): for cmd in self.ray_start_commands: if self.node_resources: env_vars = { RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources } else: env_vars = {} try: old_redirected = cmd_output_util.is_output_redirected() cmd_output_util.set_output_redirected(False) # Runs in the container if docker is in use self.cmd_runner.run( cmd, environment_variables=env_vars, run_env="auto") cmd_output_util.set_output_redirected(old_redirected) except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error("See above for stderr.") raise click.ClickException("Start command failed.")
def do_update(self): self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH}) cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH) deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S self.wait_ready(deadline) global_event_system.execute_callback( CreateClusterEvent.ssh_control_acquired) node_tags = self.provider.node_tags(self.node_id) logger.debug("Node tags: {}".format(str(node_tags))) if self.provider_type == "aws" and self.provider.provider_config: from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper \ import CloudwatchHelper CloudwatchHelper(self.provider.provider_config, [self.node_id], self.provider.cluster_name). \ update_from_config(self.is_head_node) if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash: # When resuming from a stopped instance the runtime_hash may be the # same, but the container will not be started. init_required = self.cmd_runner.run_init( as_head=self.is_head_node, file_mounts=self.file_mounts, sync_run_yet=False) if init_required: node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate" # This ensures that `setup_commands` are not removed self.restart_only = False if self.restart_only: self.setup_commands = [] # runtime_hash will only change whenever the user restarts # or updates their cluster with `get_or_create_head_node` if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and ( not self.file_mounts_contents_hash or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) == self.file_mounts_contents_hash): # todo: we lie in the confirmation message since # full setup might be cancelled here cli_logger.print( "Configuration already up to date, " "skipping file mounts, initalization and setup commands.", _numbered=("[]", "2-6", NUM_SETUP_STEPS)) else: cli_logger.print( "Updating cluster configuration.", _tags=dict(hash=self.runtime_hash)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES}) cli_logger.labeled_value("New status", STATUS_SYNCING_FILES) self.sync_file_mounts( self.rsync_up, step_numbers=(1, NUM_SETUP_STEPS)) # Only run setup commands if runtime_hash has changed because # we don't want to run setup_commands every time the head node # file_mounts folders have changed. if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash: # Run init commands self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP}) cli_logger.labeled_value("New status", STATUS_SETTING_UP) if self.initialization_commands: with cli_logger.group( "Running initialization commands", _numbered=("[]", 4, NUM_SETUP_STEPS)): global_event_system.execute_callback( CreateClusterEvent.run_initialization_cmd) with LogTimer( self.log_prefix + "Initialization commands", show_status=True): for cmd in self.initialization_commands: global_event_system.execute_callback( CreateClusterEvent.run_initialization_cmd, {"command": cmd}) try: # Overriding the existing SSHOptions class # with a new SSHOptions class that uses # this ssh_private_key as its only __init__ # argument. # Run outside docker. self.cmd_runner.run( cmd, ssh_options_override_ssh_key=self. auth_config.get("ssh_private_key"), run_env="host") except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error( "See above for stderr.") raise click.ClickException( "Initialization command failed." ) from None else: cli_logger.print( "No initialization commands to run.", _numbered=("[]", 4, NUM_SETUP_STEPS)) with cli_logger.group( "Initalizing command runner", # todo: fix command numbering _numbered=("[]", 5, NUM_SETUP_STEPS)): self.cmd_runner.run_init( as_head=self.is_head_node, file_mounts=self.file_mounts, sync_run_yet=True) if self.setup_commands: with cli_logger.group( "Running setup commands", # todo: fix command numbering _numbered=("[]", 6, NUM_SETUP_STEPS)): global_event_system.execute_callback( CreateClusterEvent.run_setup_cmd) with LogTimer( self.log_prefix + "Setup commands", show_status=True): total = len(self.setup_commands) for i, cmd in enumerate(self.setup_commands): global_event_system.execute_callback( CreateClusterEvent.run_setup_cmd, {"command": cmd}) if cli_logger.verbosity == 0 and len(cmd) > 30: cmd_to_print = cf.bold(cmd[:30]) + "..." else: cmd_to_print = cf.bold(cmd) cli_logger.print( "{}", cmd_to_print, _numbered=("()", i, total)) try: # Runs in the container if docker is in use self.cmd_runner.run(cmd, run_env="auto") except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error( "See above for stderr.") raise click.ClickException( "Setup command failed.") else: cli_logger.print( "No setup commands to run.", _numbered=("[]", 6, NUM_SETUP_STEPS)) with cli_logger.group( "Starting the Ray runtime", _numbered=("[]", 7, NUM_SETUP_STEPS)): global_event_system.execute_callback( CreateClusterEvent.start_ray_runtime) with LogTimer( self.log_prefix + "Ray start commands", show_status=True): for cmd in self.ray_start_commands: # Add a resource override env variable if needed: if self.provider_type == "local": # Local NodeProvider doesn't need resource override. env_vars = {} elif self.node_resources: env_vars = { RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources } else: env_vars = {} try: old_redirected = cmd_output_util.is_output_redirected() cmd_output_util.set_output_redirected(False) # Runs in the container if docker is in use self.cmd_runner.run( cmd, environment_variables=env_vars, run_env="auto") cmd_output_util.set_output_redirected(old_redirected) except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error("See above for stderr.") raise click.ClickException("Start command failed.") global_event_system.execute_callback( CreateClusterEvent.start_ray_runtime_completed)
def create_or_update_cluster( config_file: str, override_min_workers: Optional[int], override_max_workers: Optional[int], no_restart: bool, restart_only: bool, yes: bool, override_cluster_name: Optional[str] = None, no_config_cache: bool = False, redirect_command_output: Optional[bool] = False, use_login_shells: bool = True, no_monitor_on_head: bool = False) -> Dict[str, Any]: """Creates or updates an autoscaling Ray cluster from a config json.""" # no_monitor_on_head is an internal flag used by the Ray K8s operator. # If True, prevents autoscaling config sync to the Ray head during cluster # creation. See https://github.com/ray-project/ray/pull/13720. set_using_login_shells(use_login_shells) if not use_login_shells: cmd_output_util.set_allow_interactive(False) if redirect_command_output is None: # Do not redirect by default. cmd_output_util.set_output_redirected(False) else: cmd_output_util.set_output_redirected(redirect_command_output) def handle_yaml_error(e): cli_logger.error("Cluster config invalid") cli_logger.newline() cli_logger.error("Failed to load YAML file " + cf.bold("{}"), config_file) cli_logger.newline() with cli_logger.verbatim_error_ctx("PyYAML error:"): cli_logger.error(e) cli_logger.abort() try: config = yaml.safe_load(open(config_file).read()) except FileNotFoundError: cli_logger.abort( "Provided cluster configuration file ({}) does not exist", cf.bold(config_file)) raise except yaml.parser.ParserError as e: handle_yaml_error(e) raise except yaml.scanner.ScannerError as e: handle_yaml_error(e) raise global_event_system.execute_callback(CreateClusterEvent.up_started, {"cluster_config": config}) # todo: validate file_mounts, ssh keys, etc. importer = _NODE_PROVIDERS.get(config["provider"]["type"]) if not importer: cli_logger.abort( "Unknown provider type " + cf.bold("{}") + "\n" "Available providers are: {}", config["provider"]["type"], cli_logger.render_list([ k for k in _NODE_PROVIDERS.keys() if _NODE_PROVIDERS[k] is not None ])) raise NotImplementedError("Unsupported provider {}".format( config["provider"])) printed_overrides = False def handle_cli_override(key, override): if override is not None: if key in config: nonlocal printed_overrides printed_overrides = True cli_logger.warning( "`{}` override provided on the command line.\n" " Using " + cf.bold("{}") + cf.dimmed( " [configuration file has " + cf.bold("{}") + "]"), key, override, config[key]) config[key] = override handle_cli_override("min_workers", override_min_workers) handle_cli_override("max_workers", override_max_workers) handle_cli_override("cluster_name", override_cluster_name) if printed_overrides: cli_logger.newline() cli_logger.labeled_value("Cluster", config["cluster_name"]) cli_logger.newline() config = _bootstrap_config(config, no_config_cache=no_config_cache) try_logging_config(config) get_or_create_head_node(config, config_file, no_restart, restart_only, yes, override_cluster_name, no_monitor_on_head) return config
def start(node_ip_address, address, port, redis_password, redis_shard_ports, object_manager_port, node_manager_port, gcs_server_port, min_worker_port, max_worker_port, worker_port_list, memory, object_store_memory, redis_max_memory, num_cpus, num_gpus, resources, head, include_dashboard, dashboard_host, dashboard_port, block, plasma_directory, autoscaling_config, no_redirect_worker_output, no_redirect_output, plasma_store_socket_name, raylet_socket_name, temp_dir, java_worker_options, load_code_from_local, code_search_path, system_config, lru_evict, enable_object_reconstruction, metrics_export_port, log_style, log_color, verbose): """Start Ray processes manually on the local machine.""" cli_logger.configure(log_style, log_color, verbose) if gcs_server_port and not head: raise ValueError( "gcs_server_port can be only assigned when you specify --head.") # Convert hostnames to numerical IP address. if node_ip_address is not None: node_ip_address = services.address_to_ip(node_ip_address) redis_address = None if address is not None: (redis_address, redis_address_ip, redis_address_port) = services.validate_redis_address(address) try: resources = json.loads(resources) except Exception: cli_logger.error("`{}` is not a valid JSON string.", cf.bold("--resources")) cli_logger.abort( "Valid values look like this: `{}`", cf.bold("--resources='\"CustomResource3\": 1, " "\"CustomResource2\": 2}'")) raise Exception("Unable to parse the --resources argument using " "json.loads. Try using a format like\n\n" " --resources='{\"CustomResource1\": 3, " "\"CustomReseource2\": 2}'") redirect_worker_output = None if not no_redirect_worker_output else True redirect_output = None if not no_redirect_output else True ray_params = ray.parameter.RayParams( node_ip_address=node_ip_address, min_worker_port=min_worker_port, max_worker_port=max_worker_port, worker_port_list=worker_port_list, object_manager_port=object_manager_port, node_manager_port=node_manager_port, gcs_server_port=gcs_server_port, memory=memory, object_store_memory=object_store_memory, redis_password=redis_password, redirect_worker_output=redirect_worker_output, redirect_output=redirect_output, num_cpus=num_cpus, num_gpus=num_gpus, resources=resources, plasma_directory=plasma_directory, huge_pages=False, plasma_store_socket_name=plasma_store_socket_name, raylet_socket_name=raylet_socket_name, temp_dir=temp_dir, include_dashboard=include_dashboard, dashboard_host=dashboard_host, dashboard_port=dashboard_port, java_worker_options=java_worker_options, load_code_from_local=load_code_from_local, code_search_path=code_search_path, _system_config=system_config, lru_evict=lru_evict, enable_object_reconstruction=enable_object_reconstruction, metrics_export_port=metrics_export_port) if head: # Use default if port is none, allocate an available port if port is 0 if port is None: port = ray_constants.DEFAULT_PORT if port == 0: with socket() as s: s.bind(("", 0)) port = s.getsockname()[1] num_redis_shards = None # Start Ray on the head node. if redis_shard_ports is not None: redis_shard_ports = redis_shard_ports.split(",") # Infer the number of Redis shards from the ports if the number is # not provided. num_redis_shards = len(redis_shard_ports) if redis_address is not None: cli_logger.abort( "`{}` starts a new Redis server, `{}` should not be set.", cf.bold("--head"), cf.bold("--address")) raise Exception("If --head is passed in, a Redis server will be " "started, so a Redis address should not be " "provided.") node_ip_address = services.get_node_ip_address() # Get the node IP address if one is not provided. ray_params.update_if_absent(node_ip_address=node_ip_address) cli_logger.labeled_value("Local node IP", ray_params.node_ip_address) ray_params.update_if_absent( redis_port=port, redis_shard_ports=redis_shard_ports, redis_max_memory=redis_max_memory, num_redis_shards=num_redis_shards, redis_max_clients=None, autoscaling_config=autoscaling_config, ) # Fail early when starting a new cluster when one is already running if address is None: default_address = f"{node_ip_address}:{port}" redis_addresses = services.find_redis_address(default_address) if len(redis_addresses) > 0: raise ConnectionError( f"Ray is already running at {default_address}. " f"Please specify a different port using the `--port`" f" command to `ray start`.") node = ray.node.Node( ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block) redis_address = node.redis_address # this is a noop if new-style is not set, so the old logger calls # are still in place cli_logger.newline() startup_msg = "Ray runtime started." cli_logger.success("-" * len(startup_msg)) cli_logger.success(startup_msg) cli_logger.success("-" * len(startup_msg)) cli_logger.newline() with cli_logger.group("Next steps"): cli_logger.print( "To connect to this Ray runtime from another node, run") cli_logger.print( cf.bold(" ray start --address='{}'{}"), redis_address, f" --redis-password='******'" if redis_password else "") cli_logger.newline() cli_logger.print("Alternatively, use the following Python code:") with cli_logger.indented(): with cf.with_style("monokai") as c: cli_logger.print("{} ray", c.magenta("import")) cli_logger.print( "ray{}init(address{}{}{})", c.magenta("."), c.magenta("="), c.yellow("'auto'"), ", _redis_password{}{}".format( c.magenta("="), c.yellow("'" + redis_password + "'")) if redis_password else "") cli_logger.newline() cli_logger.print( cf.underlined("If connection fails, check your " "firewall settings and " "network configuration.")) cli_logger.newline() cli_logger.print("To terminate the Ray runtime, run") cli_logger.print(cf.bold(" ray stop")) else: # Start Ray on a non-head node. if not (port is None): cli_logger.abort("`{}` should not be specified without `{}`.", cf.bold("--port"), cf.bold("--head")) raise Exception("If --head is not passed in, --port is not " "allowed.") if redis_shard_ports is not None: cli_logger.abort("`{}` should not be specified without `{}`.", cf.bold("--redis-shard-ports"), cf.bold("--head")) raise Exception("If --head is not passed in, --redis-shard-ports " "is not allowed.") if redis_address is None: cli_logger.abort("`{}` is required unless starting with `{}`.", cf.bold("--address"), cf.bold("--head")) raise Exception("If --head is not passed in, --address must " "be provided.") if include_dashboard: cli_logger.abort("`{}` should not be specified without `{}`.", cf.bold("--include-dashboard"), cf.bold("--head")) raise ValueError( "If --head is not passed in, the --include-dashboard" "flag is not relevant.") # Wait for the Redis server to be started. And throw an exception if we # can't connect to it. services.wait_for_redis_to_start( redis_address_ip, redis_address_port, password=redis_password) # Create a Redis client. redis_client = services.create_redis_client( redis_address, password=redis_password) # Check that the version information on this node matches the version # information that the cluster was started with. services.check_version_info(redis_client) # Get the node IP address if one is not provided. ray_params.update_if_absent( node_ip_address=services.get_node_ip_address(redis_address)) cli_logger.labeled_value("Local node IP", ray_params.node_ip_address) # Check that there aren't already Redis clients with the same IP # address connected with this Redis instance. This raises an exception # if the Redis server already has clients on this node. check_no_existing_redis_clients(ray_params.node_ip_address, redis_client) ray_params.update(redis_address=redis_address) node = ray.node.Node( ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block) cli_logger.newline() startup_msg = "Ray runtime started." cli_logger.success("-" * len(startup_msg)) cli_logger.success(startup_msg) cli_logger.success("-" * len(startup_msg)) cli_logger.newline() cli_logger.print("To terminate the Ray runtime, run") cli_logger.print(cf.bold(" ray stop")) if block: cli_logger.newline() with cli_logger.group(cf.bold("--block")): cli_logger.print( "This command will now block until terminated by a signal.") cli_logger.print( "Runing subprocesses are monitored and a message will be " "printed if any of them terminate unexpectedly.") while True: time.sleep(1) deceased = node.dead_processes() if len(deceased) > 0: cli_logger.newline() cli_logger.error("Some Ray subprcesses exited unexpectedly:") with cli_logger.indented(): for process_type, process in deceased: cli_logger.error( "{}", cf.bold(str(process_type)), _tags={"exit code": str(process.returncode)}) # shutdown_at_exit will handle cleanup. cli_logger.newline() cli_logger.error("Remaining processes will be killed.") sys.exit(1)
def log_to_cli(config: Dict[str, Any]) -> None: provider_name = _PROVIDER_PRETTY_NAMES.get("aws", None) cli_logger.doassert( provider_name is not None, "Could not find a pretty name for the AWS provider." ) head_node_type = config["head_node_type"] head_node_config = config["available_node_types"][head_node_type]["node_config"] with cli_logger.group("{} config", provider_name): def print_info( resource_string: str, key: str, src_key: str, allowed_tags: Optional[List[str]] = None, list_value: bool = False, ) -> None: if allowed_tags is None: allowed_tags = ["default"] node_tags = {} # set of configurations corresponding to `key` unique_settings = set() for node_type_key, node_type in config["available_node_types"].items(): node_tags[node_type_key] = {} tag = _log_info[src_key][node_type_key] if tag in allowed_tags: node_tags[node_type_key][tag] = True setting = node_type["node_config"].get(key) if list_value: unique_settings.add(tuple(setting)) else: unique_settings.add(setting) head_value_str = head_node_config[key] if list_value: head_value_str = cli_logger.render_list(head_value_str) if len(unique_settings) == 1: # all node types are configured the same, condense # log output cli_logger.labeled_value( resource_string + " (all available node types)", "{}", head_value_str, _tags=node_tags[config["head_node_type"]], ) else: # do head node type first cli_logger.labeled_value( resource_string + f" ({head_node_type})", "{}", head_value_str, _tags=node_tags[head_node_type], ) # go through remaining types for node_type_key, node_type in config["available_node_types"].items(): if node_type_key == head_node_type: continue workers_value_str = node_type["node_config"][key] if list_value: workers_value_str = cli_logger.render_list(workers_value_str) cli_logger.labeled_value( resource_string + f" ({node_type_key})", "{}", workers_value_str, _tags=node_tags[node_type_key], ) tags = {"default": _log_info["head_instance_profile_src"] == "default"} # head_node_config is the head_node_type's config, # config["head_node"] is a field that gets applied only to the actual # head node (and not workers of the head's node_type) assert ( "IamInstanceProfile" in head_node_config or "IamInstanceProfile" in config["head_node"] ) if "IamInstanceProfile" in head_node_config: # If the user manually configured the role we're here. IamProfile = head_node_config["IamInstanceProfile"] elif "IamInstanceProfile" in config["head_node"]: # If we filled the default IAM role, we're here. IamProfile = config["head_node"]["IamInstanceProfile"] profile_arn = IamProfile.get("Arn") profile_name = _arn_to_name(profile_arn) if profile_arn else IamProfile["Name"] cli_logger.labeled_value("IAM Profile", "{}", profile_name, _tags=tags) if all( "KeyName" in node_type["node_config"] for node_type in config["available_node_types"].values() ): print_info("EC2 Key pair", "KeyName", "keypair_src") print_info("VPC Subnets", "SubnetIds", "subnet_src", list_value=True) print_info( "EC2 Security groups", "SecurityGroupIds", "security_group_src", list_value=True, ) print_info("EC2 AMI", "ImageId", "ami_src", allowed_tags=["dlami"]) cli_logger.newline()
<<<<<<< HEAD:python/ray/autoscaler/_private/updater.py ======= # When resuming from a stopped instance the runtime_hash may be the # same, but the container will not be started. self.cmd_runner.run_init( as_head=self.is_head_node, file_mounts=self.file_mounts) >>>>>>> upstream/releases/1.0.0:python/ray/autoscaler/updater.py else: cli_logger.print( "Updating cluster configuration.", _tags=dict(hash=self.runtime_hash)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES}) cli_logger.labeled_value("New status", STATUS_SYNCING_FILES) self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6)) # Only run setup commands if runtime_hash has changed because # we don't want to run setup_commands every time the head node # file_mounts folders have changed. if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash: # Run init commands self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP}) cli_logger.labeled_value("New status", STATUS_SETTING_UP) if self.initialization_commands: with cli_logger.group( "Running initialization commands", _numbered=("[]", 3, 5)):