Example #1
File: command_runner.py Project: rlan/ray
    def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
                 sync_run_yet: bool):
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        specific_image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image",
            self.docker_config.get("image"))

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert specific_image, "Image must be included in config if " + \
                "pull_before_run is specified"
            self.run("{} pull {}".format(self.docker_cmd, specific_image),
                     run_env="host")
        else:
            self.run(f"{self.docker_cmd} image inspect {specific_image} "
                     "1> /dev/null  2>&1 || "
                     f"{self.docker_cmd} pull {specific_image}")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        docker_run_executed = False

        container_running = self._check_container_status()
        requires_re_init = False
        if container_running:
            requires_re_init = self._check_if_container_restart_is_needed(
                specific_image, cleaned_bind_mounts)
            if requires_re_init:
                self.run(f"{self.docker_cmd} stop {self.container_name}",
                         run_env="host")

        if (not container_running) or requires_re_init:
            if not sync_run_yet:
                # Do not start the actual image as we need to run file_sync
                # first to ensure that all folders are created with the
                # correct ownership. Docker will create the folders with
                # `root` as the owner.
                return True
            # Get home directory
            image_env = self.ssh_command_runner.run(
                f"{self.docker_cmd} " + "inspect -f '{{json .Config.Env}}' " +
                specific_image,
                with_output=True).decode().strip()
            home_directory = "/root"
            for env_var in json.loads(image_env):
                if env_var.startswith("HOME="):
                    home_directory = env_var.split("HOME=")[1]
                    break

            user_docker_run_options = self.docker_config.get(
                "run_options", []) + self.docker_config.get(
                    f"{'head' if as_head else 'worker'}_run_options", [])
            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user, specific_image,
                cleaned_bind_mounts, self.container_name,
                self._configure_runtime(
                    self._auto_configure_shm(user_docker_run_options)),
                self.ssh_command_runner.cluster_name, home_directory,
                self.docker_cmd)
            self.run(start_command, run_env="host")
            docker_run_executed = True

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                if not sync_run_yet:
                    # NOTE(ilr) This rsync is needed because when starting from
                    #  a stopped instance,  /tmp may be deleted and `run_init`
                    # is called before the first `file_sync` happens
                    self.run_rsync_up(file_mounts[mount], mount)
                self.ssh_command_runner.run(
                    "{cmd} cp {src} {container}:{dst}".format(
                        cmd=self.docker_cmd,
                        src=os.path.join(
                            self._get_docker_host_mount_location(
                                self.ssh_command_runner.cluster_name), mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
                try:
                    # Check if the current user has read permission.
                    # If they do not, try to change ownership!
                    self.run(f"cat {mount} >/dev/null 2>&1 || "
                             f"sudo chown $(id -u):$(id -g) {mount}")
                except Exception:
                    lsl_string = self.run(
                        f"ls -l {mount}",
                        with_output=True).decode("utf-8").strip()
                    # The string is of format <Permission> <Links>
                    # <Owner> <Group> <Size> <Date> <Name>
                    permissions = lsl_string.split(" ")[0]
                    owner = lsl_string.split(" ")[2]
                    group = lsl_string.split(" ")[3]
                    current_user = self.run(
                        "whoami", with_output=True).decode("utf-8").strip()
                    cli_logger.warning(
                        f"File ({mount}) is owned by user:{owner} and group:"
                        f"{group} with permissions ({permissions}). The "
                        f"current user ({current_user}) does not have "
                        "permission to read these files, and Ray may not be "
                        "able to autoscale. This can be resolved by "
                        "installing `sudo` in your container, or adding a "
                        f"command like 'chown {current_user} {mount}' to "
                        "your `setup_commands`.")
        self.initialized = True
        return docker_run_executed
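
The home-directory lookup above can be exercised on its own. Below is a minimal sketch, assuming only that the docker CLI is on PATH and the image is already present locally; get_image_home is a hypothetical helper, not part of the Ray code shown here.

import json
import subprocess


def get_image_home(image, docker_cmd="docker"):
    # Ask docker for the image's environment as a JSON array, e.g.
    # '["PATH=/usr/local/sbin:...", "HOME=/home/ray"]'.
    out = subprocess.check_output(
        [docker_cmd, "inspect", "-f", "{{json .Config.Env}}", image])
    env_vars = json.loads(out.decode().strip()) or []
    # Fall back to /root when the image does not set HOME explicitly,
    # mirroring the default used in run_init above.
    for env_var in env_vars:
        if env_var.startswith("HOME="):
            return env_var.split("HOME=")[1]
    return "/root"


# Usage (hypothetical image name):
# print(get_image_home("rayproject/ray:latest"))
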
Example #2
    def run_init(self, *, as_head, file_mounts, sync_run_yet):
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        specific_image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image",
            self.docker_config.get("image"))

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert specific_image, "Image must be included in config if " + \
                "pull_before_run is specified"
            self.run("docker pull {}".format(specific_image), run_env="host")
        else:
            self.run(
                f"docker image inspect {specific_image} 1> /dev/null  2>&1 || "
                f"docker pull {specific_image}")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        docker_run_executed = False

        container_running = self._check_container_status()
        requires_re_init = False
        if container_running:
            requires_re_init = self._check_if_container_restart_is_needed(
                specific_image, cleaned_bind_mounts)
            if requires_re_init:
                self.run(f"docker stop {self.container_name}", run_env="host")

        if (not container_running) or requires_re_init:
            # Get home directory
            image_env = self.ssh_command_runner.run(
                "docker inspect -f '{{json .Config.Env}}' " + specific_image,
                with_output=True).decode().strip()
            home_directory = "/root"
            for env_var in json.loads(image_env):
                if env_var.startswith("HOME="):
                    home_directory = env_var.split("HOME=")[1]
                    break

            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user, specific_image,
                cleaned_bind_mounts, self.container_name,
                self.docker_config.get("run_options", []) +
                self.docker_config.get(
                    f"{'head' if as_head else 'worker'}_run_options", []) +
                self._configure_runtime() + self._auto_configure_shm(),
                self.ssh_command_runner.cluster_name, home_directory)
            self.run(start_command, run_env="host")
            docker_run_executed = True

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                if not sync_run_yet:
                    # NOTE(ilr) This rsync is needed because when starting from
                    #  a stopped instance,  /tmp may be deleted and `run_init`
                    # is called before the first `file_sync` happens
                    self.run_rsync_up(file_mounts[mount], mount)
                self.ssh_command_runner.run(
                    "docker cp {src} {container}:{dst}".format(
                        src=os.path.join(
                            self._get_docker_host_mount_location(
                                self.ssh_command_runner.cluster_name), mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
        self.initialized = True
        return docker_run_executed
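
The else branch above pulls the image only when it is missing locally. A standalone sketch of that shell pattern, assuming a host with the docker CLI; pull_if_missing is an illustrative name, not part of the project.

import subprocess


def pull_if_missing(image, docker_cmd="docker"):
    # `docker image inspect` exits non-zero when the image is not cached
    # locally; only then does the shell fall through to `docker pull`.
    cmd = (f"{docker_cmd} image inspect {image} > /dev/null 2>&1 || "
           f"{docker_cmd} pull {image}")
    subprocess.check_call(cmd, shell=True)


# Usage (hypothetical image name):
# pull_if_missing("rayproject/ray:latest")
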
Example #3
    def run_init(self, *, as_head, file_mounts):
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        image = self.docker_config.get("image")
        image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image", image)

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert image, "Image must be included in config if " + \
                "pull_before_run is specified"

            self.run("docker pull {}".format(image), run_env="host")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        if not self._check_container_status():
            # Get home directory
            image_env = self.ssh_command_runner.run(
                "docker inspect -f '{{json .Config.Env}}' " + image,
                with_output=True).decode().strip()
            home_directory = "/root"
            for env_var in json.loads(image_env):
                if env_var.startswith("HOME="):
                    home_directory = env_var.split("HOME=")[1]
                    break

            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
                self.container_name,
                self.docker_config.get("run_options", []) +
                self.docker_config.get(
                    f"{'head' if as_head else 'worker'}_run_options", []) +
                self._configure_runtime() + self._auto_configure_shm(),
                self.ssh_command_runner.cluster_name, home_directory)
            self.run(start_command, run_env="host")
        else:
            running_image = self.run(check_docker_image(self.container_name),
                                     with_output=True,
                                     run_env="host").decode("utf-8").strip()
            if running_image != image:
                logger.error(f"A container with name {self.container_name} " +
                             f"is running image {running_image} instead " +
                             f"of {image} (which was provided in the YAML")
            mounts = self.run(check_bind_mounts_cmd(self.container_name),
                              with_output=True,
                              run_env="host").decode("utf-8").strip()
            try:
                active_mounts = json.loads(mounts)
                active_remote_mounts = [
                    mnt["Destination"] for mnt in active_mounts
                ]
                # Ignore ray bootstrap files.
                for remote, local in cleaned_bind_mounts.items():
                    remote = self._docker_expand_user(remote)
                    if remote not in active_remote_mounts:
                        cli_logger.error(
                            "Please run `ray stop` and restart the cluster "
                            f"to allow mount {remote}:{local} to take effect")
            except json.JSONDecodeError:
                cli_logger.verbose(
                    "Unable to check if file_mounts specified in the YAML "
                    "differ from those on the running container.")

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                self.ssh_command_runner.run(
                    "docker cp {src} {container}:{dst}".format(
                        src=os.path.join(
                            self._get_docker_host_mount_location(
                                self.ssh_command_runner.cluster_name), mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
        self.initialized = True
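
The mount check in the else branch of example #3 can be reproduced outside the class. A minimal sketch, assuming check_bind_mounts_cmd amounts to `docker inspect -f '{{json .Mounts}}' <container>`; the helper name and arguments are illustrative, and the expected paths should already be expanded the way _docker_expand_user does above.

import json
import subprocess


def find_missing_mounts(container_name, expected_remote_paths,
                        docker_cmd="docker"):
    # List the container's active mounts as JSON and collect their
    # in-container destinations.
    out = subprocess.check_output(
        [docker_cmd, "inspect", "-f", "{{json .Mounts}}", container_name])
    try:
        active_mounts = json.loads(out.decode().strip())
    except json.JSONDecodeError:
        # Mirror the original behaviour: skip the check if docker's
        # output cannot be parsed.
        return []
    active_destinations = {mnt["Destination"] for mnt in active_mounts}
    # Report every expected remote path that the running container does
    # not actually have mounted; these need a container restart.
    return [path for path in expected_remote_paths
            if path not in active_destinations]


# Usage (hypothetical container and path):
# find_missing_mounts("ray_container", ["/home/ray/project"])
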