Exemplo n.º 1
0
    def do_update(self):
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
        cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

        deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
        self.wait_ready(deadline)
        global_event_system.execute_callback(
            CreateClusterEvent.ssh_control_acquired)

        node_tags = self.provider.node_tags(self.node_id)
        logger.debug("Node tags: {}".format(str(node_tags)))

        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
            # When resuming from a stopped instance the runtime_hash may be the
            # same, but the container will not be started.
            init_required = self.cmd_runner.run_init(
                as_head=self.is_head_node,
                file_mounts=self.file_mounts,
                sync_run_yet=False)
            if init_required:
                node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate"
                # This ensures that `setup_commands` are not removed
                self.restart_only = False

        if self.restart_only:
            self.setup_commands = []

        # runtime_hash will only change whenever the user restarts
        # or updates their cluster with `get_or_create_head_node`
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
                not self.file_mounts_contents_hash
                or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS)
                == self.file_mounts_contents_hash):
            # todo: we lie in the confirmation message since
            # full setup might be cancelled here
            cli_logger.print(
                "Configuration already up to date, "
                "skipping file mounts, initalization and setup commands.",
                _numbered=("[]", "2-6", NUM_SETUP_STEPS))

        else:
            cli_logger.print("Updating cluster configuration.",
                             _tags=dict(hash=self.runtime_hash))

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
            cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
            self.sync_file_mounts(self.rsync_up,
                                  step_numbers=(1, NUM_SETUP_STEPS))

            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    with cli_logger.group("Running initialization commands",
                                          _numbered=("[]", 4,
                                                     NUM_SETUP_STEPS)):
                        global_event_system.execute_callback(
                            CreateClusterEvent.run_initialization_cmd)
                        with LogTimer(self.log_prefix +
                                      "Initialization commands",
                                      show_status=True):
                            for cmd in self.initialization_commands:
                                global_event_system.execute_callback(
                                    CreateClusterEvent.run_initialization_cmd,
                                    {"command": cmd})
                                try:
                                    # Overriding the existing SSHOptions class
                                    # with a new SSHOptions class that uses
                                    # this ssh_private_key as its only __init__
                                    # argument.
                                    # Run outside docker.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override_ssh_key=self.
                                        auth_config.get("ssh_private_key"),
                                        run_env="host")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Initialization command failed."
                                    ) from None
                else:
                    cli_logger.print("No initialization commands to run.",
                                     _numbered=("[]", 4, NUM_SETUP_STEPS))
                with cli_logger.group(
                        "Initalizing command runner",
                        # todo: fix command numbering
                        _numbered=("[]", 5, NUM_SETUP_STEPS)):
                    self.cmd_runner.run_init(as_head=self.is_head_node,
                                             file_mounts=self.file_mounts,
                                             sync_run_yet=True)
                if self.setup_commands:
                    with cli_logger.group(
                            "Running setup commands",
                            # todo: fix command numbering
                            _numbered=("[]", 6, NUM_SETUP_STEPS)):
                        global_event_system.execute_callback(
                            CreateClusterEvent.run_setup_cmd)
                        with LogTimer(self.log_prefix + "Setup commands",
                                      show_status=True):

                            total = len(self.setup_commands)
                            for i, cmd in enumerate(self.setup_commands):
                                global_event_system.execute_callback(
                                    CreateClusterEvent.run_setup_cmd,
                                    {"command": cmd})
                                if cli_logger.verbosity == 0 and len(cmd) > 30:
                                    cmd_to_print = cf.bold(cmd[:30]) + "..."
                                else:
                                    cmd_to_print = cf.bold(cmd)

                                cli_logger.print("{}",
                                                 cmd_to_print,
                                                 _numbered=("()", i, total))

                                try:
                                    # Runs in the container if docker is in use
                                    self.cmd_runner.run(cmd, run_env="auto")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Setup command failed.")
                else:
                    cli_logger.print("No setup commands to run.",
                                     _numbered=("[]", 6, NUM_SETUP_STEPS))

        with cli_logger.group("Starting the Ray runtime",
                              _numbered=("[]", 7, NUM_SETUP_STEPS)):
            global_event_system.execute_callback(
                CreateClusterEvent.start_ray_runtime)
            with LogTimer(self.log_prefix + "Ray start commands",
                          show_status=True):
                for cmd in self.ray_start_commands:
                    if self.node_resources:
                        env_vars = {
                            RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources
                        }
                    else:
                        env_vars = {}
                    try:
                        old_redirected = cmd_output_util.is_output_redirected()
                        cmd_output_util.set_output_redirected(False)
                        # Runs in the container if docker is in use
                        self.cmd_runner.run(cmd,
                                            environment_variables=env_vars,
                                            run_env="auto")
                        cmd_output_util.set_output_redirected(old_redirected)
                    except ProcessRunnerError as e:
                        if e.msg_type == "ssh_command_failed":
                            cli_logger.error("Failed.")
                            cli_logger.error("See above for stderr.")

                        raise click.ClickException("Start command failed.")
            global_event_system.execute_callback(
                CreateClusterEvent.start_ray_runtime_completed)
Exemplo n.º 2
0
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine."""

    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 charactors of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["ray_process_reaper.py", False],
    ]

    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd
                      if filter_by_cmd else subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            if verbose:
                operation = "Terminating" if force else "Killing"
                cli_logger.old_info(logger, "%s process %s: %s", operation,
                                    proc.pid, proc_string)
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ", cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Send termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
                pass
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))
                cli_logger.old_error(logger, "Error: %s", ex)

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
def _read_subprocess_stream(f, output_file, is_stdout=False):
    """Read and process a subprocess output stream.

    The goal is to find error messages and respond to them in a clever way.
    Currently just used for SSH messages (CONN_REFUSED, TIMEOUT, etc.), so
    the user does not get confused by these.

    Ran in a thread each for both `stdout` and `stderr` to
    allow for cross-platform asynchronous IO.

    Note: `select`-based IO is another option, but Windows has
    no support for `select`ing pipes, and Linux support varies somewhat.
    Spefically, Older *nix systems might also have quirks in how they
    handle `select` on pipes.

    Args:
        f: File object for the stream.
        output_file: File object to which filtered output is written.
        is_stdout (bool):
            When `is_stdout` is `False`, the stream is assumed to
            be `stderr`. Different error message detectors are used,
            and the output is displayed to the user unless it matches
            a special case (e.g. SSH timeout), in which case this is
            left up to the caller.
    """

    detected_special_case = None
    while True:
        # ! Readline here is crucial.
        # ! Normal `read()` will block until EOF instead of until
        #   something is available.
        line = f.readline()

        if line is None or line == "":
            # EOF
            break

        if line[-1] == "\n":
            line = line[:-1]

        if not is_stdout:
            if _ssh_output_regexes["connection_closed"]\
                    .fullmatch(line) is not None:
                # Do not log "connection closed" messages which SSH
                # puts in stderr for no reason.
                #
                # They are never errors since the connection will
                # close no matter whether the command succeeds or not.
                continue

            if _ssh_output_regexes["timeout"].fullmatch(line) is not None:
                # Timeout is not really an error but rather a special
                # condition. It should be handled by the caller, since
                # network conditions/nodes in the early stages of boot
                # are expected to sometimes cause connection timeouts.
                if detected_special_case is not None:
                    raise ValueError("Bug: ssh_timeout conflicts with another "
                                     "special codition: " +
                                     detected_special_case)

                detected_special_case = "ssh_timeout"
                continue

            if _ssh_output_regexes["conn_refused"]\
                    .fullmatch(line) is not None:
                # Connection refused is not really an error but
                # rather a special condition. It should be handled by
                # the caller, since network conditions/nodes in the
                # early stages of boot are expected to sometimes cause
                # CONN_REFUSED.
                if detected_special_case is not None:
                    raise ValueError(
                        "Bug: ssh_conn_refused conflicts with another "
                        "special codition: " + detected_special_case)

                detected_special_case = "ssh_conn_refused"
                continue

            if _ssh_output_regexes["known_host_update"]\
                    .fullmatch(line) is not None:
                # Since we ignore SSH host control anyway
                # (-o UserKnownHostsFile=/dev/null),
                # we should silence the host control warnings.
                continue

            cli_logger.error(line)

        if output_file is not None and output_file != subprocess.DEVNULL:
            output_file.write(line + "\n")

    return detected_special_case
Exemplo n.º 4
0
    def run_init(self, *, as_head, file_mounts):
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        image = self.docker_config.get("image")
        image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image", image)

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert image, "Image must be included in config if " + \
                "pull_before_run is specified"

            self.run("docker pull {}".format(image), run_env="host")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) + self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", []))

        if not self._check_container_status():
            self.run(start_command, run_env="host")
        else:
            running_image = self.run(
                check_docker_image(self.container_name),
                with_output=True,
                run_env="host").decode("utf-8").strip()
            if running_image != image:
                logger.error(f"A container with name {self.container_name} " +
                             f"is running image {running_image} instead " +
                             f"of {image} (which was provided in the YAML")
            mounts = self.run(
                check_bind_mounts_cmd(self.container_name),
                with_output=True,
                run_env="host").decode("utf-8").strip()
            try:
                active_mounts = json.loads(mounts)
                active_remote_mounts = [
                    mnt["Destination"] for mnt in active_mounts
                ]
                # Ignore ray bootstrap files.
                for remote, local in cleaned_bind_mounts.items():
                    remote = self._docker_expand_user(remote)
                    if remote not in active_remote_mounts:
                        cli_logger.error(
                            "Please ray stop & restart cluster to "
                            f"allow mount {remote}:{local} to take hold")
            except json.JSONDecodeError:
                cli_logger.verbose(
                    "Unable to check if file_mounts specified in the YAML "
                    "differ from those on the running container.")

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                self.ssh_command_runner.run(
                    "docker cp {src} {container}:{dst}".format(
                        src=os.path.join(DOCKER_MOUNT_PREFIX, mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
        self.initialized = True
Exemplo n.º 5
0
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head,
          include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
          temp_dir, java_worker_options, load_code_from_local,
          code_search_path, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)

    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold("  ray start --address='{}'{}"), redis_address,
                f" --redis-password='******'"
                if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold("  ray stop"))

        cli_logger.old_info(
            logger,
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            "    ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop".format(
                redis_address, " --redis-password='******'"
                if redis_password else "",
                ", _redis_password='******'"
                if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if not (port is None):
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"), cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard"
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))

        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold("  ray stop"))

        cli_logger.old_info(
            logger, "\nStarted Ray on this node. If you wish to terminate the "
            "processes that have been started, run\n\n"
            "    ray stop")

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Runing subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprcesses exited unexpectedly:")
                cli_logger.old_error(logger,
                                     "Ray processes died unexpectedly:")

                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})
                        cli_logger.old_error(
                            logger, "\t{} died with exit code {}".format(
                                process_type, process.returncode))

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                cli_logger.old_error(
                    logger, "Killing remaining processes and exiting...")
                sys.exit(1)
Exemplo n.º 6
0
cli_logger.configure(log_style="auto", verbosity=999)

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")
with cli_logger.group("Group"):
Exemplo n.º 7
0
def get_cluster_dump_archive(cluster_config_file: Optional[str] = None,
                             host: Optional[str] = None,
                             ssh_user: Optional[str] = None,
                             ssh_key: Optional[str] = None,
                             docker: Optional[str] = None,
                             local: Optional[bool] = None,
                             output: Optional[str] = None,
                             logs: bool = True,
                             debug_state: bool = True,
                             pip: bool = True,
                             processes: bool = True,
                             processes_verbose: bool = False) -> Optional[str]:

    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += \
            "  - The logfiles of your Ray session\n" \
            "    This usually includes Python outputs (stdout/stderr)\n"

    if debug_state:
        content_str += \
            "  - Debug state information on your Ray cluster \n" \
            "    e.g. number of workers, drivers, objects, etc.\n"

    if pip:
        content_str += "  - Your installed Python packages (`pip freeze`)\n"

    if processes:
        content_str += \
            "  - Information on your running Ray processes\n" \
            "    This includes command line arguments\n"

    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data from "
        "cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        f"If you are concerned about leaking private information, extract "
        f"the archive and inspect its contents before sharing it with "
        f"anyone.")

    # Parse arguments (e.g. fetch info from cluster config)
    cluster_config_file, hosts, ssh_user, ssh_key, docker, cluster_name = \
        _info_from_params(cluster_config_file, host, ssh_user, ssh_key, docker)

    nodes = [
        Node(
            host=h,
            ssh_user=ssh_user,
            ssh_key=ssh_key,
            docker_container=docker) for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            f"No nodes found. Specify with `--host` or by passing a ray "
            f"cluster config to `--cluster`.")
        return None

    if cluster_config_file:
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose)

    with Archive() as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)

    if not output:
        if cluster_name:
            filename = f"{cluster_name}_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        else:
            filename = f"collected_logs_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    os.rename(archive.file, output)
    return output
Exemplo n.º 8
0
def _log_big_error_msg(success_msg):
    cli_logger.newline()
    cli_logger.error("-" * len(success_msg))
    cli_logger.error(success_msg)
    cli_logger.error("-" * len(success_msg))
    cli_logger.newline()
Exemplo n.º 9
0
    def run(self):
        cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                            self.runtime_hash)

        if cmd_output_util.does_allow_interactive(
        ) and cmd_output_util.is_output_redirected():
            # this is most probably a bug since the user has no control
            # over these settings
            msg = ("Output was redirected for an interactive command. "
                   "Either do not pass `--redirect-command-output` "
                   "or also pass in `--use-normal-shells`.")
            cli_logger.abort(msg)
            raise click.ClickException(msg)

        try:
            with LogTimer(self.log_prefix +
                          "Applied config {}".format(self.runtime_hash)):
                self.do_update()
        except Exception as e:
            error_str = str(e)
            if hasattr(e, "cmd"):
                error_str = "(Exit Status {}) {}".format(
                    e.returncode, " ".join(e.cmd))

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
            cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

            cli_logger.old_error(logger, "{}Error executing: {}\n",
                                 self.log_prefix, error_str)

            cli_logger.error("!!!")
            if hasattr(e, "cmd"):
                cli_logger.error(
                    "Setup command `{}` failed with exit code {}. stderr:",
                    cf.bold(e.cmd), e.returncode)
            else:
                cli_logger.verbose_error("{}", str(vars(e)))
                # todo: handle this better somehow?
                cli_logger.error("{}", str(e))
            # todo: print stderr here
            cli_logger.error("!!!")
            cli_logger.newline()

            if isinstance(e, click.ClickException):
                # todo: why do we ignore this here
                return
            raise

        tags_to_set = {
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
            TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
        }
        if self.file_mounts_contents_hash is not None:
            tags_to_set[
                TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

        self.provider.set_node_tags(self.node_id, tags_to_set)
        cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

        self.exitcode = 0
Exemplo n.º 10
0
    def run_init(
        self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
    ):
        BOOTSTRAP_MOUNTS = ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]

        specific_image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image", self.docker_config.get("image")
        )

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert specific_image, (
                "Image must be included in config if " + "pull_before_run is specified"
            )
            self.run(
                "{} pull {}".format(self.docker_cmd, specific_image), run_env="host"
            )
        else:

            self.run(
                f"{self.docker_cmd} image inspect {specific_image} "
                "1> /dev/null  2>&1 || "
                f"{self.docker_cmd} pull {specific_image}"
            )

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        docker_run_executed = False

        container_running = self._check_container_status()
        requires_re_init = False
        if container_running:
            requires_re_init = self._check_if_container_restart_is_needed(
                specific_image, cleaned_bind_mounts
            )
            if requires_re_init:
                self.run(
                    f"{self.docker_cmd} stop {self.container_name}", run_env="host"
                )

        if (not container_running) or requires_re_init:
            if not sync_run_yet:
                # Do not start the actual image as we need to run file_sync
                # first to ensure that all folders are created with the
                # correct ownership. Docker will create the folders with
                # `root` as the owner.
                return True
            # Get home directory
            image_env = (
                self.ssh_command_runner.run(
                    f"{self.docker_cmd} "
                    + "inspect -f '{{json .Config.Env}}' "
                    + specific_image,
                    with_output=True,
                )
                .decode()
                .strip()
            )
            home_directory = "/root"
            try:
                for env_var in json.loads(image_env):
                    if env_var.startswith("HOME="):
                        home_directory = env_var.split("HOME=")[1]
                        break
            except json.JSONDecodeError as e:
                cli_logger.error(
                    "Unable to deserialize `image_env` to Python object. "
                    f"The `image_env` is:\n{image_env}"
                )
                raise e

            user_docker_run_options = self.docker_config.get(
                "run_options", []
            ) + self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", []
            )
            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user,
                specific_image,
                cleaned_bind_mounts,
                self.container_name,
                self._configure_runtime(
                    self._auto_configure_shm(user_docker_run_options)
                ),
                self.ssh_command_runner.cluster_name,
                home_directory,
                self.docker_cmd,
            )
            self.run(start_command, run_env="host")
            docker_run_executed = True

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                if not sync_run_yet:
                    # NOTE(ilr) This rsync is needed because when starting from
                    #  a stopped instance,  /tmp may be deleted and `run_init`
                    # is called before the first `file_sync` happens
                    self.run_rsync_up(file_mounts[mount], mount)
                self.ssh_command_runner.run(
                    "rsync -e '{cmd} exec -i' -avz {src} {container}:{dst}".format(
                        cmd=self.docker_cmd,
                        src=os.path.join(
                            self._get_docker_host_mount_location(
                                self.ssh_command_runner.cluster_name
                            ),
                            mount,
                        ),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount),
                    )
                )
                try:
                    # Check if the current user has read permission.
                    # If they do not, try to change ownership!
                    self.run(
                        f"cat {mount} >/dev/null 2>&1 || "
                        f"sudo chown $(id -u):$(id -g) {mount}"
                    )
                except Exception:
                    lsl_string = (
                        self.run(f"ls -l {mount}", with_output=True)
                        .decode("utf-8")
                        .strip()
                    )
                    # The string is of format <Permission> <Links>
                    # <Owner> <Group> <Size> <Date> <Name>
                    permissions = lsl_string.split(" ")[0]
                    owner = lsl_string.split(" ")[2]
                    group = lsl_string.split(" ")[3]
                    current_user = (
                        self.run("whoami", with_output=True).decode("utf-8").strip()
                    )
                    cli_logger.warning(
                        f"File ({mount}) is owned by user:{owner} and group:"
                        f"{group} with permissions ({permissions}). The "
                        f"current user ({current_user}) does not have "
                        "permission to read these files, and Ray may not be "
                        "able to autoscale. This can be resolved by "
                        "installing `sudo` in your container, or adding a "
                        f"command like 'chown {current_user} {mount}' to "
                        "your `setup_commands`."
                    )
        self.initialized = True
        return docker_run_executed