Example #1
    def rsync_down(self, source, target):
        cli_logger.old_info(logger, "{}Syncing {} from {}...", self.log_prefix,
                            source, target)

        self.cmd_runner.run_rsync_down(source, target)
        cli_logger.verbose("`rsync`ed {} (remote) to {} (local)",
                           cf.bold(source), cf.bold(target))
Example #2
    def run(self,
            cmd,
            timeout=120,
            exit_on_fail=False,
            port_forward=None,
            with_output=False,
            ssh_options_override=None,
            **kwargs):
        ssh_options = ssh_options_override or self.ssh_options

        assert isinstance(
            ssh_options, SSHOptions
        ), "ssh_options must be of type SSHOptions, got {}".format(
            type(ssh_options))

        self._set_ssh_ip_if_required()

        if is_using_login_shells():
            ssh = ["ssh", "-tt"]
        else:
            ssh = ["ssh"]

        if port_forward:
            with cli_logger.group("Forwarding ports"):
                if not isinstance(port_forward, list):
                    port_forward = [port_forward]
                for local, remote in port_forward:
                    cli_logger.verbose(
                        "Forwarding port {} to port {} on localhost.",
                        cf.bold(local), cf.bold(remote))  # todo: msg
                    cli_logger.old_info(logger,
                                        "{}Forwarding {} -> localhost:{}",
                                        self.log_prefix, local, remote)
                    ssh += ["-L", "{}:localhost:{}".format(remote, local)]

        final_cmd = ssh + ssh_options.to_ssh_options_list(
            timeout=timeout) + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
        if cmd:
            if is_using_login_shells():
                final_cmd += _with_interactive(cmd)
            else:
                final_cmd += [cmd]
            cli_logger.old_info(logger, "{}Running {}", self.log_prefix,
                                " ".join(final_cmd))
        else:
            # We do this because `-o ControlMaster` causes the `-N` flag to
            # still create an interactive shell in some ssh versions.
            final_cmd.append(quote("while true; do sleep 86400; done"))

        cli_logger.verbose("Running `{}`", cf.bold(cmd))
        with cli_logger.indented():
            cli_logger.very_verbose("Full command is `{}`",
                                    cf.bold(" ".join(final_cmd)))

        if cli_logger.verbosity > 0:
            with cli_logger.indented():
                return self._run_helper(final_cmd, with_output, exit_on_fail)
        else:
            return self._run_helper(final_cmd, with_output, exit_on_fail)
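
A note on the assembly above: the command is built as an argv list, with each `-L` forward appended before the `user@host` target, and a keep-alive loop substituted when no command is given. Below is a minimal standalone sketch of the same pattern; the user, host, and ports are placeholder values, and the SSHOptions handling is omitted.

from shlex import quote

ssh_user, ssh_ip = "ubuntu", "203.0.113.10"  # placeholders
port_forward = [(8265, 8265)]  # (local, remote) pairs

ssh = ["ssh", "-tt"]
for local, remote in port_forward:
    # Mirrors the example: `-L <port>:localhost:<port>` forwards through
    # the SSH tunnel to the remote machine's localhost.
    ssh += ["-L", "{}:localhost:{}".format(remote, local)]

final_cmd = ssh + ["{}@{}".format(ssh_user, ssh_ip)]
# No command given: keep the connection alive without an interactive shell.
final_cmd.append(quote("while true; do sleep 86400; done"))
print(" ".join(final_cmd))
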
Example #3
    def rsync_down(self, source, target, file_mount=False):
        cli_logger.old_info(logger, "{}Syncing {} from {}...", self.log_prefix,
                            source, target)

        options = {}
        options["file_mount"] = file_mount
        self.cmd_runner.run_rsync_down(source, target, options=options)
        cli_logger.verbose("`rsync`ed {} (remote) to {} (local)",
                           cf.bold(source), cf.bold(target))
Example #4
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))
            cli_logger.verbose("Loaded cached config from " + cf.bold("{}"),
                               cache_key)

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)
    validate_config(config)

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    with cli_logger.timed(  # todo: better message
            "Bootstraping {} config",
            PROVIDER_PRETTY_NAMES.get(config["provider"]["type"])):
        resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
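
The caching scheme above keys the cache file on a content hash of the prepared config, so identical configs always map to the same file in the temp directory. A self-contained sketch of just the key derivation (the config dict here is a toy stand-in):

import hashlib
import json
import os
import tempfile

config = {"provider": {"type": "aws", "region": "us-west-2"}}  # toy config

hasher = hashlib.sha1()
# sort_keys keeps the hash stable regardless of dict key order.
hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
cache_key = os.path.join(tempfile.gettempdir(),
                         "ray-config-{}".format(hasher.hexdigest()))
print(cache_key)  # e.g. /tmp/ray-config-<sha1 hex digest>
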
Example #5
    def run_rsync_up(self, source, target):
        self._set_ssh_ip_if_required()
        command = [
            "rsync", "--rsh",
            subprocess.list2cmdline(["ssh"] +
                                    self.ssh_options.to_ssh_options_list(
                                        timeout=120)), "-avz", source,
            "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)
        ]
        cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
        self._run_helper(command, silent=is_rsync_silent())
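
`rsync --rsh` takes a single string rather than an argv list, which is why the ssh arguments are flattened with `subprocess.list2cmdline`. A standalone sketch of the command construction; the ssh options, paths, and host are placeholder assumptions:

import subprocess

ssh_user, ssh_ip = "ubuntu", "203.0.113.10"  # placeholders
source, target = "./app/", "/home/ubuntu/app/"
ssh_argv = ["ssh", "-o", "ConnectTimeout=120s"]  # placeholder options

command = [
    "rsync", "--rsh",
    subprocess.list2cmdline(ssh_argv),  # -> "ssh -o ConnectTimeout=120s"
    "-avz", source,
    "{}@{}:{}".format(ssh_user, ssh_ip, target)
]
print(command)  # built only; not executed here
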
Example #6
    def run_rsync_down(self, source, target):
        self._set_ssh_ip_if_required()

        command = [
            "rsync", "--rsh",
            subprocess.list2cmdline(["ssh"] +
                                    self.ssh_options.to_ssh_options_list(
                                        timeout=120)), "-avz",
            "{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target
        ]
        cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
        self.process_runner.check_call(command)
Example #7
    def run_init(self, *, as_head, file_mounts):
        image = self.docker_config.get("image")
        image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image", image)

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert image, "Image must be included in config if " + \
                "pull_before_run is specified"

            self.run("docker pull {}".format(image), run_env="host")

        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, image, file_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) + self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", []))

        if not self._check_container_status():
            self.run(start_command, run_env="host")
        else:
            running_image = self.run(
                check_docker_image(self.container_name),
                with_output=True,
                run_env="host").decode("utf-8").strip()
            if running_image != image:
                logger.error(f"A container with name {self.container_name} " +
                             f"is running image {running_image} instead " +
                             f"of {image} (which was provided in the YAML")
            mounts = self.run(
                check_bind_mounts_cmd(self.container_name),
                with_output=True,
                run_env="host").decode("utf-8").strip()
            try:
                active_mounts = json.loads(mounts)
                active_remote_mounts = [
                    mnt["Destination"] for mnt in active_mounts
                ]
                for remote, local in file_mounts.items():
                    remote = self._docker_expand_user(remote)
                    if remote not in active_remote_mounts:
                        cli_logger.error(
                            "Please ray stop & restart cluster to "
                            f"allow mount {remote}:{local} to take hold")
            except json.JSONDecodeError:
                cli_logger.verbose(
                    "Unable to check if file_mounts specified in the YAML "
                    "differ from those on the running container.")
        self.initialized = True
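
The mount verification hinges on parsing the JSON that the bind-mount inspection command reports for the container. The parsing step in isolation, with canned output standing in for a live container (an assumption made for the sketch):

import json

# Canned stand-in for the output of the bind-mount inspection command.
mounts = '[{"Destination": "/root/app", "Source": "/tmp/app"}]'

try:
    active_mounts = json.loads(mounts)
    active_remote_mounts = [mnt["Destination"] for mnt in active_mounts]
    print(active_remote_mounts)  # ['/root/app']
except json.JSONDecodeError:
    print("Unable to parse mount information.")
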
Example #8
def _create_security_group(config, vpc_id, group_name):
    client = _client("ec2", config)
    client.create_security_group(
        Description="Auto-created security group for Ray workers",
        GroupName=group_name,
        VpcId=vpc_id)
    security_group = _get_security_group(config, vpc_id, group_name)

    cli_logger.verbose("Created new security group {}",
                       cf.bold(security_group.group_name),
                       _tags=dict(id=security_group.id))
    cli_logger.old_info(
        logger, "_create_security_group: Created new security group {} ({})",
        security_group.group_name, security_group.id)

    cli_logger.doassert(security_group,
                        "Failed to create security group")  # err msg
    assert security_group, "Failed to create security group"
    return security_group
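
For reference, the EC2 `create_security_group` call returns essentially just the new GroupId, which is why the example re-fetches the group afterwards to get a full resource object. A hedged boto3 sketch of the same call; the region, group name, and VPC id are placeholders, and running it requires AWS credentials:

import boto3

client = boto3.client("ec2", region_name="us-west-2")  # placeholder region
client.create_security_group(
    Description="Auto-created security group for Ray workers",
    GroupName="ray-autoscaler-default",  # placeholder name
    VpcId="vpc-0123456789abcdef0")  # placeholder VPC id
# The response carries little beyond GroupId; re-query for a full resource.
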
Example #9
def run_cmd_redirected(cmd,
                       process_runner=subprocess,
                       silent=False,
                       use_login_shells=False):
    """Run a command and optionally redirect output to a file.

    Args:
        cmd (List[str]): Command to run.
        process_runner: Process runner used for executing commands.
        silent (bool): If true, the command output will be silenced completely
                       (redirected to /dev/null), unless verbose logging
                       is enabled. Use this for running utility commands like
                       rsync.
        use_login_shells (bool): If true, the command is treated as running
                                 in a login shell, which affects how its
                                 output is processed.
    """
    if silent and cli_logger.verbosity < 1:
        return _run_and_process_output(cmd,
                                       process_runner=process_runner,
                                       stdout_file=None,
                                       use_login_shells=use_login_shells)

    if not is_output_redirected():
        return _run_and_process_output(cmd,
                                       process_runner=process_runner,
                                       stdout_file=sys.stdout,
                                       use_login_shells=use_login_shells)
    else:
        tmpfile_path = os.path.join(
            tempfile.gettempdir(),
            "ray-up-{}-{}.txt".format(cmd[0], time.time()))
        with open(
                tmpfile_path,
                mode="w",
                # line buffering
                buffering=1) as tmp:
            cli_logger.verbose("Command stdout is redirected to {}",
                               cf.bold(tmp.name))

            return _run_and_process_output(cmd,
                                           process_runner=process_runner,
                                           stdout_file=tmp,
                                           stderr_file=tmp,
                                           use_login_shells=use_login_shells)
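
The redirected branch relies on line buffering (`buffering=1`) so the temp file can be tailed while the command is still running. A minimal POSIX sketch of the pattern, with `echo` standing in for the real command:

import os
import subprocess
import tempfile
import time

tmpfile_path = os.path.join(
    tempfile.gettempdir(), "ray-up-{}-{}.txt".format("echo", time.time()))
with open(tmpfile_path, mode="w", buffering=1) as tmp:  # line buffering
    subprocess.check_call(["echo", "hello"], stdout=tmp, stderr=tmp)
with open(tmpfile_path) as f:
    print(f.read())  # "hello"
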
Example #10
def _configure_key_pair(config):
    if "ssh_private_key" in config["auth"]:
        _set_config_info(keypair_src="config")

        cli_logger.doassert(  # todo: verify schema beforehand?
            "KeyName" in config["head_node"],
            "`KeyName` missing for head node.")  # todo: err msg
        cli_logger.doassert(
            "KeyName" in config["worker_nodes"],
            "`KeyName` missing for worker nodes.")  # todo: err msg

        assert "KeyName" in config["head_node"]
        assert "KeyName" in config["worker_nodes"]
        return config
    _set_config_info(keypair_src="default")

    ec2 = _resource("ec2", config)

    # Try a few times to get or create a good key pair.
    MAX_NUM_KEYS = 30
    for i in range(MAX_NUM_KEYS):

        key_name = config["provider"].get("key_pair", {}).get("key_name")

        key_name, key_path = key_pair(i, config["provider"]["region"],
                                      key_name)
        key = _get_key(key_name, config)

        # Found a good key.
        if key and os.path.exists(key_path):
            break

        # We can safely create a new key.
        if not key and not os.path.exists(key_path):
            cli_logger.verbose(
                "Creating new key pair {} for use as the default.",
                cf.bold(key_name))
            cli_logger.old_info(
                logger, "_configure_key_pair: "
                "Creating new key pair {}", key_name)
            key = ec2.create_key_pair(KeyName=key_name)

            # We need to make sure to _create_ the file with the right
            # permissions. In order to do that we need to change the default
            # os.open behavior to include the mode we want.
            with open(key_path, "w", opener=partial(os.open, mode=0o600)) as f:
                f.write(key.key_material)
            break

    if not key:
        cli_logger.abort(
            "No matching local key file for any of the key pairs in this "
            "account with ids from 0..{}. "
            "Consider deleting some unused keys pairs from your account.",
            key_name)  # todo: err msg
        raise ValueError(
            "No matching local key file for any of the key pairs in this "
            "account with ids from 0..{}. ".format(key_name) +
            "Consider deleting some unused keys pairs from your account.")

    cli_logger.doassert(os.path.exists(key_path), "Private key file " +
                        cf.bold("{}") + " not found for " + cf.bold("{}"),
                        key_path, key_name)  # todo: err msg
    assert os.path.exists(key_path), \
        "Private key file {} not found for {}".format(key_path, key_name)

    cli_logger.old_info(
        logger, "_configure_key_pair: "
        "KeyName not specified for nodes, using {}", key_name)

    config["auth"]["ssh_private_key"] = key_path
    config["head_node"]["KeyName"] = key_name
    config["worker_nodes"]["KeyName"] = key_name

    return config
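
The `opener=` trick in the example deserves a closer look: calling `os.chmod` after `open()` would leave a window in which the private key is readable by other users, so the file is instead created with mode 0o600 atomically. A standalone sketch; the path and key material are placeholders:

import os
from functools import partial

key_path = "/tmp/example-key.pem"  # placeholder path
# The custom opener makes the underlying os.open apply mode 0o600 at
# creation time, so the key file is never group/world readable.
with open(key_path, "w", opener=partial(os.open, mode=0o600)) as f:
    f.write("-----BEGIN RSA PRIVATE KEY-----\n...")  # placeholder material
print(oct(os.stat(key_path).st_mode & 0o777))  # 0o600
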
Example #11
def _configure_iam_role(config):
    if "IamInstanceProfile" in config["head_node"]:
        _set_config_info(head_instance_profile_src="config")
        return config
    _set_config_info(head_instance_profile_src="default")

    profile = _get_instance_profile(DEFAULT_RAY_INSTANCE_PROFILE, config)

    if profile is None:
        cli_logger.verbose(
            "Creating new IAM instance profile {} for use as the default.",
            cf.bold(DEFAULT_RAY_INSTANCE_PROFILE))
        cli_logger.old_info(
            logger, "_configure_iam_role: "
            "Creating new instance profile {}", DEFAULT_RAY_INSTANCE_PROFILE)
        client = _client("iam", config)
        client.create_instance_profile(
            InstanceProfileName=DEFAULT_RAY_INSTANCE_PROFILE)
        profile = _get_instance_profile(DEFAULT_RAY_INSTANCE_PROFILE, config)
        time.sleep(15)  # wait for propagation

    cli_logger.doassert(profile is not None,
                        "Failed to create instance profile.")  # todo: err msg
    assert profile is not None, "Failed to create instance profile"

    if not profile.roles:
        role = _get_role(DEFAULT_RAY_IAM_ROLE, config)
        if role is None:
            cli_logger.verbose(
                "Creating new IAM role {} for "
                "use as the default instance role.",
                cf.bold(DEFAULT_RAY_IAM_ROLE))
            cli_logger.old_info(logger, "_configure_iam_role: "
                                "Creating new role {}", DEFAULT_RAY_IAM_ROLE)
            iam = _resource("iam", config)
            iam.create_role(RoleName=DEFAULT_RAY_IAM_ROLE,
                            AssumeRolePolicyDocument=json.dumps({
                                "Statement": [
                                    {
                                        "Effect": "Allow",
                                        "Principal": {
                                            "Service": "ec2.amazonaws.com"
                                        },
                                        "Action": "sts:AssumeRole",
                                    },
                                ],
                            }))
            role = _get_role(DEFAULT_RAY_IAM_ROLE, config)

            cli_logger.doassert(role is not None,
                                "Failed to create role.")  # todo: err msg
            assert role is not None, "Failed to create role"
        role.attach_policy(
            PolicyArn="arn:aws:iam::aws:policy/AmazonEC2FullAccess")
        role.attach_policy(
            PolicyArn="arn:aws:iam::aws:policy/AmazonS3FullAccess")
        profile.add_role(RoleName=role.name)
        time.sleep(15)  # wait for propagation

    cli_logger.old_info(
        logger, "_configure_iam_role: "
        "Role not specified for head node, using {}", profile.arn)
    config["head_node"]["IamInstanceProfile"] = {"Arn": profile.arn}

    return config
Example #12
    def run_init(self, *, as_head, file_mounts):
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        image = self.docker_config.get("image")
        image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image", image)

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert image, "Image must be included in config if " + \
                "pull_before_run is specified"

            self.run("docker pull {}".format(image), run_env="host")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) + self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", []))

        if not self._check_container_status():
            self.run(start_command, run_env="host")
        else:
            running_image = self.run(check_docker_image(self.container_name),
                                     with_output=True,
                                     run_env="host").decode("utf-8").strip()
            if running_image != image:
                logger.error(f"A container with name {self.container_name} " +
                             f"is running image {running_image} instead " +
                             f"of {image} (which was provided in the YAML")
            mounts = self.run(check_bind_mounts_cmd(self.container_name),
                              with_output=True,
                              run_env="host").decode("utf-8").strip()
            try:
                active_mounts = json.loads(mounts)
                active_remote_mounts = [
                    mnt["Destination"] for mnt in active_mounts
                ]
                # Ignore ray bootstrap files.
                for remote, local in cleaned_bind_mounts.items():
                    remote = self._docker_expand_user(remote)
                    if remote not in active_remote_mounts:
                        cli_logger.error(
                            "Please ray stop & restart cluster to "
                            f"allow mount {remote}:{local} to take hold")
            except json.JSONDecodeError:
                cli_logger.verbose(
                    "Unable to check if file_mounts specified in the YAML "
                    "differ from those on the running container.")

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                self.ssh_command_runner.run(
                    "docker cp {src} {container}:{dst}".format(
                        src=os.path.join(DOCKER_MOUNT_PREFIX, mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
        self.initialized = True
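
The difference from Example #7 is the treatment of the bootstrap files: they are excluded from the bind mounts (a bind mount pins the file's original inode, so a later replacement of the file would not be seen inside the container) and copied in explicitly with `docker cp`. The exclusion step in isolation, with toy mounts:

BOOTSTRAP_MOUNTS = [
    "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
]
file_mounts = {
    "~/ray_bootstrap_key.pem": "/local/key.pem",  # toy values
    "~/app": "/local/app",
}

cleaned_bind_mounts = file_mounts.copy()
for mnt in BOOTSTRAP_MOUNTS:
    cleaned_bind_mounts.pop(mnt, None)  # drop if present, ignore otherwise
print(cleaned_bind_mounts)  # {'~/app': '/local/app'}
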
Example #13
    def run(self,
            cmd,
            timeout=120,
            exit_on_fail=False,
            port_forward=None,
            with_output=False,
            ssh_options_override=None,
            **kwargs):
        ssh_options = ssh_options_override or self.ssh_options

        assert isinstance(
            ssh_options, SSHOptions
        ), "ssh_options must be of type SSHOptions, got {}".format(
            type(ssh_options))

        self._set_ssh_ip_if_required()

        ssh = ["ssh", "-tt"]

        if port_forward:
            with cli_logger.group("Forwarding ports"):
                if not isinstance(port_forward, list):
                    port_forward = [port_forward]
                for local, remote in port_forward:
                    cli_logger.verbose(
                        "Forwarding port {} to port {} on localhost.",
                        cf.bold(local), cf.bold(remote))  # todo: msg
                    cli_logger.old_info(logger,
                                        "{}Forwarding {} -> localhost:{}",
                                        self.log_prefix, local, remote)
                    ssh += ["-L", "{}:localhost:{}".format(remote, local)]

        final_cmd = ssh + ssh_options.to_ssh_options_list(
            timeout=timeout) + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
        if cmd:
            final_cmd += _with_interactive(cmd)
            cli_logger.old_info(logger, "{}Running {}", self.log_prefix,
                                " ".join(final_cmd))
        else:
            # We do this because `-o ControlMaster` causes the `-N` flag to
            # still create an interactive shell in some ssh versions.
            final_cmd.append(quote("while true; do sleep 86400; done"))

        # todo: add a flag for this, we might
        # wanna log commands with print sometimes
        cli_logger.verbose("Running `{}`", cf.bold(cmd))
        with cli_logger.indented():
            cli_logger.very_verbose("Full command is `{}`",
                                    cf.bold(" ".join(final_cmd)))

        def start_process():
            try:
                if with_output:
                    return self.process_runner.check_output(final_cmd)
                else:
                    self.process_runner.check_call(final_cmd)
            except subprocess.CalledProcessError as e:
                quoted_cmd = " ".join(final_cmd[:-1] + [quote(final_cmd[-1])])
                if not cli_logger.old_style:
                    raise ProcessRunnerError("Command failed",
                                             "ssh_command_failed",
                                             code=e.returncode,
                                             command=quoted_cmd)

                if exit_on_fail:
                    raise click.ClickException(
                        "Command failed: \n\n  {}\n".format(quoted_cmd)) \
                        from None
                else:
                    raise click.ClickException(
                        "SSH command Failed. See above for the output from the"
                        " failure.") from None

        if cli_logger.verbosity > 0:
            with cli_logger.indented():
                return start_process()
        else:
            return start_process()
Example #14
def handle_boto_error(exc, msg, *args, **kwargs):
    if cli_logger.old_style:
        # old-style logging doesn't do anything here
        # so we exit early
        return

    error_code = None
    error_info = None
    # todo: not sure if these exceptions always have response
    if hasattr(exc, "response"):
        error_info = exc.response.get("Error", None)
    if error_info is not None:
        error_code = error_info.get("Code", None)

    generic_message_args = [
        "{}\n"
        "Error code: {}",
        msg.format(*args, **kwargs),
        cf.bold(error_code)
    ]

    # apparently
    # ExpiredTokenException
    # ExpiredToken
    # RequestExpired
    # are all the same pretty much
    credentials_expiration_codes = [
        "ExpiredTokenException", "ExpiredToken", "RequestExpired"
    ]

    if error_code in credentials_expiration_codes:
        # "An error occurred (ExpiredToken) when calling the
        # GetInstanceProfile operation: The security token
        # included in the request is expired"

        # "An error occurred (RequestExpired) when calling the
        # DescribeKeyPairs operation: Request has expired."

        token_command = ("aws sts get-session-token "
                         "--serial-number arn:aws:iam::" +
                         cf.underlined("ROOT_ACCOUNT_ID") + ":mfa/" +
                         cf.underlined("AWS_USERNAME") + " --token-code " +
                         cf.underlined("TWO_FACTOR_AUTH_CODE"))

        secret_key_var = ("export AWS_SECRET_ACCESS_KEY = " +
                          cf.underlined("REPLACE_ME") +
                          " # found at Credentials.SecretAccessKey")
        session_token_var = ("export AWS_SESSION_TOKEN = " +
                             cf.underlined("REPLACE_ME") +
                             " # found at Credentials.SessionToken")
        access_key_id_var = ("export AWS_ACCESS_KEY_ID = " +
                             cf.underlined("REPLACE_ME") +
                             " # found at Credentials.AccessKeyId")

        # fixme: replace with a Github URL that points
        # to our repo
        aws_session_script_url = ("https://gist.github.com/maximsmol/"
                                  "a0284e1d97b25d417bd9ae02e5f450cf")

        cli_logger.verbose_error(*generic_message_args)
        cli_logger.verbose(vars(exc))

        cli_logger.abort(
            "Your AWS session has expired.\n\n"
            "You can request a new one using\n{}\n"
            "then expose it to Ray by setting\n{}\n{}\n{}\n\n"
            "You can find a script that automates this at:\n{}",
            cf.bold(token_command), cf.bold(secret_key_var),
            cf.bold(session_token_var), cf.bold(access_key_id_var),
            cf.underlined(aws_session_script_url))

    # todo: any other errors that we should catch separately?

    cli_logger.error(*generic_message_args)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("Boto3 error:"):
        cli_logger.verbose(vars(exc))
        cli_logger.error(exc)
    cli_logger.abort()
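
The extraction of the error code is the subtle part: not every boto exception carries a `response` attribute, hence the `hasattr` guard before the nested `.get` calls. A self-contained sketch with a stand-in exception class (the real one would be botocore's `ClientError`):

class FakeBotoError(Exception):
    # Stand-in shaped like botocore.exceptions.ClientError, whose
    # `response` dict looks like {"Error": {"Code": ..., "Message": ...}}.
    def __init__(self, response):
        self.response = response

exc = FakeBotoError({"Error": {"Code": "RequestExpired"}})

error_code = None
error_info = None
if hasattr(exc, "response"):
    error_info = exc.response.get("Error", None)
if error_info is not None:
    error_code = error_info.get("Code", None)
print(error_code)  # RequestExpired
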
Example #15
# Demo of cli_logger output styles and other formatting.

from ray.autoscaler.cli_logger import cli_logger
import colorful as cf

cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass