Пример #1
0
    def launch_flambe(self, config_file: str, secrets_file: str,
                      force: bool) -> None:
        """Start flambe on the remote host inside a detached tmux session.

        The session is named 'flambe' and runs flambe through a bash
        login shell, redirecting its output to `output.log`.

        Parameters
        ----------
        config_file: str
            The config filename relative to the orchestrator
        secrets_file: str
            The filepath containing the secrets for the orchestrator
        force: bool
            When True, the `--force` flag is forwarded to flambe

        Raises
        ------
        RemoteCommandError
            If the tmux command does not succeed.

        """
        force_flag = "--force" if force else ""
        flambe_call = (
            f"flambe {config_file} -i --secrets {secrets_file} "
            f"{force_flag} &> output.log"
        )
        tmux_cmd = f"tmux new-session -d -s 'flambe' 'bash -lc \"{flambe_call}\"'"

        result = self._run_cmd(tmux_cmd)
        if not result.success:
            raise errors.RemoteCommandError(
                f"Not able to run flambe. {result.msg}")

        logger.info(cl.GR("Running flambe in Orchestrator"))
Пример #2
0
    def launch_report_site(self, progress_file: str, port: int,
                           output_log: str, output_dir: str,
                           tensorboard_port: int) -> None:
        """Launch the report site.

        The report site is a Flask web app. It is started with the
        `flambe-site` executable inside a detached tmux session named
        'flambe-site'; its output is appended to `outputsite.log`.

        Parameters
        ----------
        progress_file: str
            Positional argument handed to `flambe-site`
        port: int
            Port the site listens on (`--port`)
        output_log: str
            Value for the `--output-log` option
        output_dir: str
            Value for the `--output-dir` option
        tensorboard_port: int
            Port used to build the `--tensorboard_url` option

        Raises
        ------
        RemoteCommandError
            In case the launch process fails

        """
        tensorboard_url = f"http://{self.host}:{tensorboard_port}"

        cmd = (
            f"tmux new-session -d -s 'flambe-site' 'bash -lc \"flambe-site {progress_file} "
            f"--tensorboard_url {tensorboard_url} "
            f"--host 0.0.0.0 --port {port} "
            f"--output-dir {output_dir} "
            f"--output-log {output_log} &>> outputsite.log\"'")

        res = self._run_cmd(cmd)

        # Sometimes the tmux command returns failure (because of some
        # timeout) but the website is running. The second check must
        # therefore be an alternative to `res.success` (`or`), otherwise
        # it would never be evaluated in exactly that situation.
        if res.success or self.is_report_site_running():
            logger.info(cl.BL(f"Report site at http://{self.host}:{port}"))
        else:
            raise errors.RemoteCommandError(
                f"Report site failed to run. {res.msg}")
Пример #3
0
    def launch_node(self, redis_address: str) -> None:
        """Launch the ray worker node.

        Parameters
        ----------
        redis_address : str
            The URL of the main node. Must be IP:port

        Raises
        ------
        RemoteCommandError
            If not able to run node.

        """
        # https://stackoverflow.com/a/18665363.
        # `ray` lives in ~/.local/bin, which is not in $PATH under
        # paramiko, hence the bash login shell (-lc).
        result = self._run_cmd(
            f"bash -lc 'ray start --redis-address {redis_address}'")

        if not result.success:
            raise errors.RemoteCommandError(
                f"Could not launch worker node. {result.msg}")

        logger.debug(f"Ray worker node launched at {self.host}")
Пример #4
0
    def launch_tensorboard(self, logs_dir: str, tensorboard_port: int) -> None:
        """Launch tensorboard in a detached docker container.

        If docker is not installed an error is logged and the method
        returns without raising.

        Parameters
        ----------
        logs_dir : str
            Tensorboard logs directory. It is joined to the home path
            and mounted in the container as /tensorboard_logs
        tensorboard_port: int
            The port where tensorboard will be available (mapped to
            the container port 6006)

        Raises
        ------
        RemoteCommandError
            In case the launch process fails

        """
        if not self.is_docker_installed():
            logger.error("Can't run tensorboard. Docker not installed.")
            return

        host_logs = os.path.join(self.get_home_path(), logs_dir)
        docker_cmd = (
            f"docker run -d -p {tensorboard_port}:6006 "
            f"-v {host_logs}:/tensorboard_logs "
            f"{const.TENSORBOARD_IMAGE} tensorboard --logdir /tensorboard_logs"
        )
        result = self._run_cmd(docker_cmd)

        if not result.success:
            raise errors.RemoteCommandError(
                f"Tensorboard stable failed to run. {result.msg}")

        logger.debug(
            f"Tensorboard running at http://{self.host}:{tensorboard_port} . "
            "Be aware that it can take a while until it starts showing results."
        )
Пример #5
0
    def is_flambe_installed(self, version: bool = True) -> bool:
        """Check if flambe is installed and, optionally, its version.

        Parameters
        ----------
        version: bool
            If True, the remote flambe version must also match the
            local one for this method to return True. If False, any
            flambe version found in the host is accepted.

        Returns
        -------
        bool
            False when `flambe --help` fails in the host. Otherwise
            True, or the result of the version comparison when
            `version` is set.

        Raises
        ------
        RemoteCommandError
            If the binary is found but flambe can't be imported by
            the remote python3.

        """
        # First check that some flambe binary is reachable
        probe = self._run_cmd("bash -lc 'flambe --help'")
        if not probe.success:
            return False

        if not version:
            return True

        version_probe = self._run_cmd(
            "python3 -c 'import flambe; print(flambe.__version__)'")
        if not version_probe.success:
            raise errors.RemoteCommandError(
                f"Could not run flambe in python at {self.host} even if binary was found."
            )

        # The command output is compared as bytes
        local_version = bytes(flambe.__version__, 'utf-8')
        return version_probe.msg.strip() == local_version
Пример #6
0
    def worker_nodes(self) -> List[str]:
        """Returns the list of worker nodes

        A small Python script is executed in the host. It connects to
        ray and prints the set of addresses returned by
        `ray.services.get_node_ip_address()` over 1000 short remote
        tasks. That printed set is then parsed.

        Returns
        -------
        List[str]
            The sorted list of worker nodes, identified by the address
            each of them reported

        Raises
        ------
        RemoteCommandError
            If the script fails or its output can't be parsed.

        """
        import ast

        redis_address = f"\"{self.private_host}:{const.RAY_REDIS_PORT}\""
        cmd = "python3 -c '\n"\
              "import time\n"\
              "import ray\n"\
              f"ray.init(redis_address={redis_address})\n"\
              "@ray.remote\n"\
              "def f():\n"\
              "    time.sleep(0.01)\n"\
              "    return ray.services.get_node_ip_address()\n"\
              "print(set(ray.get([f.remote() for _ in range(1000)])))\n'"

        ret = self._run_cmd(cmd)

        if not ret.success:
            raise errors.RemoteCommandError(
                f"Failed to run Python script. {ret.msg}")

        # The script prints a set literal such as "{'10.0.0.1', '10.0.0.2'}".
        # Parse it as a literal: splitting on ',' and slicing leaves a
        # stray quote on every element after the first one because of
        # the blank that follows each comma.
        output = ret.msg.decode("utf-8").strip()
        try:
            nodes = ast.literal_eval(output)
        except (ValueError, SyntaxError) as exc:
            raise errors.RemoteCommandError(
                f"Could not parse the worker nodes from: {output}") from exc

        return sorted(str(node) for node in nodes)
Пример #7
0
    def launch_node(self, port: int) -> None:
        """Launch the main (head) ray node in this host.

        The node is started with `--num-cpus=0`.

        Parameters
        ----------
        port: int
            Available port to launch the redis DB of the main ray node

        Raises
        ------
        RemoteCommandError
            In case the launch process fails

        """
        # https://stackoverflow.com/a/18665363.
        # `ray` lives in ~/.local/bin, which is not in $PATH under
        # paramiko, hence the bash login shell (-lc).
        head_cmd = f"bash -lc 'ray start --head --num-cpus=0 --redis-port={port}'"
        result = self._run_cmd(head_cmd)

        if not result.success:
            raise errors.RemoteCommandError(
                f"Ray main node failed to run. {result.msg}")

        logger.debug(f"Ray main node running in {self.host}")
Пример #8
0
    def _remote_script(self, host_fname: str,
                       desc: str) -> Generator[str, None, None]:
        """Sends a local file containing a script to the instance
        using Paramiko SFTP.

        It is intended to be used as a context manager for later
        execution of the script. See `_run_script` on how to use it.

        After the context manager exits, the file is removed from
        the instance and the SFTP/SSH connections are closed.

        This is a private method and should only be used in this module.

        Parameters
        ----------
        host_fname : str
            The local script filename
        desc : str
            A description for the script purpose. This will be used
            as the prefix of the copied filename

        Yields
        -------
        str
            The remote filename of the copied local file.

        Raises
        ------
        RemoteCommandError
            In case the copied script can't be made executable.

        """
        random_fname = f"{desc}_{uuid.uuid4().hex}.sh"

        cli = paramiko.SSHClient()
        cli.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        try:
            cli.connect(hostname=self.host,
                        username=self.username,
                        key_filename=self.key)
            sftp = cli.open_sftp()

            # Only a file that was actually uploaded is removed on exit.
            # Otherwise a failed upload would be masked by the error of
            # removing a file that does not exist.
            uploaded = False
            try:
                sftp.put(host_fname, random_fname)
                uploaded = True
                cmd = self._run_cmd(f"chmod +x {random_fname}")

                if not cmd.success:
                    raise errors.RemoteCommandError(
                        f"Error sending local script. {cmd.msg}")

                yield random_fname
            finally:
                try:
                    if uploaded:
                        sftp.remove(random_fname)
                finally:
                    # Close the SFTP session even if the removal failed
                    sftp.close()
        finally:
            # Always release the SSH connection, also when connecting
            # or opening the SFTP session fails
            cli.close()
Пример #9
0
    def num_cpus(self) -> int:
        """Return the number of CPUs this host contains.

        The value is what `multiprocessing.cpu_count()` prints when
        executed by the remote python3.

        Raises
        ------
        RemoteCommandError
            If the remote command fails.

        """
        probe = "python3 -c 'import multiprocessing; print(multiprocessing.cpu_count())'"
        result = self._run_cmd(probe)

        if not result.success:
            raise errors.RemoteCommandError(f"Could not find out the number of CPUs. {result.msg}")

        return int(result.msg)
Пример #10
0
    def start_docker(self) -> None:
        """Restart the docker service through systemctl (with sudo).

        Raises
        ------
        RemoteCommandError
            If it's not able to restart docker.

        """
        result = self._run_cmd("sudo systemctl restart docker")
        if result.success:
            return

        raise errors.RemoteCommandError(f"Could not start docker. {result.msg}")
Пример #11
0
    def install_flambe(self) -> None:
        """Pip install Flambe.

        When `self.debug` is set, the local flambe repository is
        rsynced to `<home>/extensions/flambe` in the host and that copy
        is installed. Otherwise the local flambe version is installed
        from pypi. In both cases the `[cuda]` extra is requested when
        the host contains a GPU, and the install is retried up to 3
        times.

        Raises
        ------
        RemoteCommandError
            If it's not able to install flambe.

        """
        # Optional pip flags taken from the 'PIP' section of the config
        pip_flags = []
        if 'PIP' in self.config:
            pip_section = self.config['PIP']
            for key, option in (('HOST', '--trusted-host'),
                                ('HOST_URL', '--extra-index-url')):
                value = pip_section.get(key, None)
                if value:
                    pip_flags.append(f"{option} {value}")
        joined_flags = ' '.join(pip_flags)

        if self.debug:
            repo = get_flambe_repo_location()
            # Avoid rsyncing resources listed in .gitignore
            gitignore = os.path.join(repo, ".gitignore")
            rsync_filter = f"--filter=':- {gitignore}'" if os.path.exists(gitignore) else ""

            remote_dir = os.path.join(self.get_home_path(), "extensions", "flambe")
            self.send_rsync(repo, remote_dir, rsync_filter)
            logger.debug(f"Sent flambe {repo} -> {remote_dir}")
            target = f"{remote_dir}[cuda]" if self.contains_gpu() else remote_dir
            result = self._run_cmd(
                f"python3 -m pip install --user --upgrade {joined_flags} {target}",
                retries=3
            )
        else:
            package = "flambe[cuda]" if self.contains_gpu() else "flambe"
            logger.debug(f"Installing flambe in {self.host} using pypi")
            result = self._run_cmd(
                f"python3 -m pip install --user --upgrade "
                f"{joined_flags} {package}=={flambe.__version__}",
                retries=3
            )

        if not result.success:
            raise errors.RemoteCommandError(f"Could not install flambe. {result.msg}")

        logger.debug(f"Installed flambe in {self.host} successfully")
Пример #12
0
    def install_docker(self) -> None:
        """Install docker in a Ubuntu 18.04 distribution.

        Runs `scripts/install_docker.sh`, located next to this module,
        through `_run_script`.

        Raises
        ------
        RemoteCommandError
            If it's not able to install docker,
            i.e. when the installation script fails

        """
        module_dir = os.path.dirname(__file__)
        script = os.path.join(module_dir, "scripts/install_docker.sh")

        result = self._run_script(script, "install_docker")
        if result.success:
            return

        raise errors.RemoteCommandError(f"Could not install docker. {result.msg}")
Пример #13
0
    def install_cuda(self) -> None:
        """Install CUDA 10.0 drivers in an Ubuntu 18.04 distribution.

        Runs `scripts/install_cuda_ubuntu1804.sh`, located next to this
        module, through `_run_script`.

        Raises
        ------
        RemoteCommandError
            If it's not able to install drivers, i.e. if the script fails

        """
        module_dir = os.path.dirname(__file__)
        script = os.path.join(module_dir, "scripts/install_cuda_ubuntu1804.sh")

        result = self._run_script(script, "install_cuda")
        if result.success:
            return

        raise errors.RemoteCommandError(f"Could not install CUDA. {result.msg}")
Пример #14
0
    def contains_gpu(self) -> bool:
        """Return if this machine contains GPU.

        The answer is what `torch.cuda.is_available()` prints when
        executed by the remote python3.

        This method will be used to possibly upgrade
        this factory to a GPUFactoryInstance.

        Raises
        ------
        RemoteCommandError
            If the remote command fails.

        """
        probe = self._run_cmd(
            "python3 -c 'import torch; print(torch.cuda.is_available())'")

        if probe.success:
            # The command output is raw bytes
            return probe.msg.strip() == b"True"

        raise errors.RemoteCommandError("Factory does not contain torch installed")
Пример #15
0
    def shutdown_flambe(self) -> None:
        """Shut down flambe in the host

        Does nothing (besides logging) when flambe is not running.

        Raises
        ------
        RemoteCommandError
            If the kill command fails.

        """
        if self.is_flambe_running():
            result = self._run_cmd("killall -9 flambe")

            if not result.success:
                raise errors.RemoteCommandError(f"Flambe failed to be shutdown. {result.msg}")

            logger.debug(f"Flambe killed in {self.host}")
        else:
            logger.debug("Tried to shutdown flambe in a host that it's not runing flambe")
Пример #16
0
    def kill_tmux_session(self, session_name: str) -> None:
        """Kill an existing tmux session

        Parameters
        ----------
        session_name: str
            The exact name of the tmux session to be removed

        Raises
        ------
        RemoteCommandError
            If the tmux command fails.

        """
        result = self._run_cmd(f'tmux kill-session -t {session_name}')

        if not result.success:
            raise errors.RemoteCommandError(f"Tried to remove a session. {result.msg}")

        logger.debug(f"Remove existing tmux session {session_name}")
Пример #17
0
    def clean_containers(self) -> None:
        """Stop and remove all containers (running or not)

        Raises
        ------
        RemoteCommandError
            If command fails

        """
        # NOTE(review): with no containers present both command
        # substitutions expand to nothing - confirm how docker exits
        # in that case.
        script = (
            "\n"
            "        docker stop $(docker ps -a -q);\n"
            "        docker rm $(docker ps -a -q);\n"
            "        "
        )

        result = self._run_cmd(script)
        if result.success:
            return

        raise errors.RemoteCommandError("Could not clean containers")
Пример #18
0
    def shutdown_node(self) -> None:
        """Shut down the ray node in the host.

        If the node is also the main node, then the entire
        cluster will shut down. Nothing is done (besides logging)
        when no node is running.

        Raises
        ------
        RemoteCommandError
            If `ray stop` fails.

        """
        if self.is_node_running():
            result = self._run_cmd("bash -lc 'ray stop'")

            if not result.success:
                raise errors.RemoteCommandError(
                    f"Ray node failed to stop. {result.msg}")

            logger.debug(f"Ray node stopped at {self.host}")
        else:
            logger.debug("Tried to shutdown a non existing node")
Пример #19
0
    def run_cmds(self, setup_cmds: List[str]) -> None:
        """Execute a list of sequential commands

        Each command is attempted up to 3 times. Execution stops at
        the first command that does not succeed.

        Parameters
        ----------
        setup_cmds: List[str]
            The list of commands

        Raises
        ------
        RemoteCommandError
            In case at least one command is not successful

        """
        for command in setup_cmds:
            outcome = self._run_cmd(command, retries=3)
            if outcome.success:
                continue

            raise errors.RemoteCommandError(
                f"Error executing {command} in {self.host}. {outcome.msg}")
Пример #20
0
    def install_extensions(self, extensions: Dict[str, str]) -> None:
        """Install local + pypi extensions.

        Every extension is installed with its own
        `python3 -m pip install -U --user` call. The optional 'HOST'
        and 'HOST_URL' entries of the 'PIP' config section are
        forwarded as `--trusted-host` / `--extra-index-url`.

        Parameters
        ----------
        extensions: Dict[str, str]
            The extensions, as a dict from module_name to location

        Raises
        ------
        errors.RemoteCommandError
            If could not install an extension

        """
        import shlex

        # The pip flags do not depend on the extension: build them once
        base_cmd = ['python3', '-m', 'pip', 'install', '-U', '--user']
        if 'PIP' in self.config:
            host = self.config['PIP'].get('HOST', None)
            if host:
                base_cmd.extend(["--trusted-host", host])

            host_url = self.config['PIP'].get('HOST_URL', None)
            if host_url:
                base_cmd.extend(["--extra-index-url", host_url])

        for resource in extensions.values():
            # A local package given as a bare name needs the './' prefix.
            # Anything else follows pypi notation ("torch>=0.4.1,<1.1")
            # and is used unchanged.
            if os.path.exists(resource) and os.sep not in resource:
                resource = f"./{resource}"

            # Quote every argument: an unquoted version specifier such
            # as 'torch>=0.4.1,<1.1' would be parsed by the remote shell
            # as output/input redirections.
            curr_cmd = " ".join(shlex.quote(arg) for arg in [*base_cmd, resource])

            ret = self._run_cmd(curr_cmd)
            if not ret.success:
                raise errors.RemoteCommandError(
                    f"Could not install package {resource} in {self.host}"
                )
Пример #21
0
    def num_gpus(self) -> int:
        """Get the number of GPUs this host contains

        The value is what `torch.cuda.device_count()` prints when
        executed by the remote python3.

        Returns
        -------
        int
            The number of GPUs

        Raises
        ------
        RemoteCommandError
            If command to get the number of GPUs fails.

        """
        probe = "python3 -c 'import torch; print(torch.cuda.device_count())'"
        result = self._run_cmd(probe)

        if not result.success:
            raise errors.RemoteCommandError(
                f"Could not find out how many GPUs available. {result.msg}")

        return int(result.msg)
Пример #22
0
    def get_home_path(self) -> str:
        """Return the $HOME value of the instance.

        Returns
        -------
        str
            The $HOME env value, decoded as utf-8 and stripped.

        Raises
        ------
        RemoteCommandError
            If after 3 retries it is not able to get $HOME.

        """
        result = self._run_cmd("echo $HOME", retries=3)

        if not result.success:
            raise errors.RemoteCommandError(
                f"Could not access $HOME env variable. {result.msg}")

        home = result.msg.decode("utf-8")
        return home.strip()
Пример #23
0
    def clean_container_by_image(self, image_name: str) -> None:
        """Stop and remove all containers given an image name.

        Parameters
        ----------
        image_name : str
            The name of the image for which all containers
            should be stopped and removed. It is used as the
            `ancestor` filter of `docker ps`.

        Raises
        ------
        RemoteCommandError
            If command fails

        """
        # Ids of every container (running or not) created from the image
        id_query = f"docker ps -a -q --filter ancestor={image_name} --format='{{{{.ID}}}}'"
        result = self._run_cmd(f"docker rm $(docker stop $({id_query}))")

        if result.success:
            return

        raise errors.RemoteCommandError(f"Could not clean container {image_name}. {result.msg}")
Пример #24
0
    def clean_container_by_command(self, command: str) -> None:
        """Force-remove all containers whose command matches.

        Parameters
        ----------
        command : str
            The pattern handed to `grep` to select containers by
            their "<id> <Config.Cmd>" line

        Raises
        ------
        RemoteCommandError
            If command fails

        """
        inspect_format = "'{{.ID}} {{.Config.Cmd}}'"
        # One "<id> <cmd>" line per container, filtered down to the ids
        matching_ids = (
            f"docker inspect -f {inspect_format} $(docker ps -a -q) "
            f"| grep {command} | awk '{{print $1}}'"
        )
        result = self._run_cmd(f"docker rm -f $({matching_ids})")

        if result.success:
            return

        raise errors.RemoteCommandError(
            f"Could not clean container with cmd {command}. {result.msg}")
Пример #25
0
    def remove_dir(self, _dir: str, content_only: bool = True) -> None:
        """Delete the specified directory (or only its content).

        Parameters
        ----------
        _dir: str
            The directory. It needs to be relative to the $HOME path as
            it will be prepended as a prefix.
        content_only: bool
            If True, the folder itself will not be erased.

        Raises
        ------
        RemoteCommandError
            If the remove command fails.

        """
        target = f"{self.get_home_path()}/{_dir}"
        rm_cmd = f"rm -rf {target}/*" if content_only else f"rm -rf {target}/"

        result = self._run_cmd(rm_cmd)
        if not result.success:
            raise errors.RemoteCommandError(
                f"Failed to remove {target} on {self.host}. {result.msg}")

        logger.debug(f"Removed {target} at {self.host}.")
Пример #26
0
    def _run_cmd(self,
                 cmd: str,
                 retries: int = 1,
                 wd: str = None) -> RemoteCommand:
        """Runs a single shell command in the instance through SSH.

        The command will be executed in one ssh connection.
        Don't expect state to be kept between several calls to
        `_run_cmd`. To use multiple commands, use: `_run_script`

        *Important: when running docker containers, don't use -it flag!*

        This is a private method and should only be used in this module.

        Parameters
        ----------
        cmd : str
            The command to execute.
        retries : int
            The amount of attempts to run the command if it fails.
            Default to 1.
        wd : str
            The working directory to 'cd' before running the command

        Returns
        -------
        RemoteCommand
            A `RemoteCommand` instance with success boolean and message.
            The message is the raw stdout of the command on success,
            or the raw stderr of the last attempt when every attempt
            exited with a non-zero status.

        Examples
        --------
        To get $HOME env

        >>> instance._run_cmd("echo $HOME")

        This will not print 10, as no state is kept between calls

        >>> instance._run_cmd("export var=10")
        >>> instance._run_cmd("echo $var")

        This will print 10

        >>> instance._run_cmd("export var=10; echo $var")

        Raises
        ------
        ValueError
            If `retries` is not a positive number.
        SSHConnectingError
            Re-raised untouched when it occurs while executing.
        RemoteCommandError
            If any other exception occurs while executing the command.
            A non-zero exit status does not raise; it is reported
            through the returned `RemoteCommand`.

        """
        if retries <= 0:
            raise ValueError("'retries' parameter should be > 0")

        # Prepend the working directory once. Doing it per attempt
        # would stack an additional 'cd' on every retry.
        if wd:
            cmd = f"cd {wd}; {cmd}"

        err = b""
        for i in range(retries):
            cli = self._get_cli()

            try:
                # exec_command returns (stdin, stdout, stderr). The
                # exit status has to be requested from the channel.
                _, stdout, stderr = cli.exec_command(cmd)

                # Blocks until done. Calling it unconditionally also
                # covers commands that finished before the first check.
                status = stdout.channel.recv_exit_status()

                out, err = stdout.read(), stderr.read()
            except errors.SSHConnectingError:
                raise
            except Exception as exc:
                raise errors.RemoteCommandError(exc) from exc

            if status == 0:
                logger.debug(f"'{cmd}' ran successfully")
                return RemoteCommand(True, out)

            logger.debug(
                f"Retry {i}. {cmd} failed with message: {err}")

        logger.debug(
            f"'{cmd}' returning after {retries} intents returning != 0")
        return RemoteCommand(False, err)