示例#1
0
    def run(self) -> None:
        """Run the MLCube task on the remote host and sync the workspace back.

        The method runs the interpreter's activation command over SSH, changes into the MLCube directory on the
        remote host and executes `mlcube run` there with the platform configured for this runner. Then it rsyncs
        the remote `workspace/` directory into the local MLCube root directory.

        Raises:
            ExecutionError: If the remote command or the workspace synchronization fails. The original error is
                attached as the cause.
        """
        conn: t.Text = self.get_connection_string()
        remote_env: PythonInterpreter = PythonInterpreter.create(
            self.mlcube.runner.interpreter)

        # The 'remote_path' variable points to the MLCube root directory on remote host.
        remote_path: t.Text = os.path.join(
            self.mlcube.runner.remote_root,
            os.path.basename(self.mlcube.runtime.root))

        try:
            cmd = f"mlcube run --mlcube=. --platform={self.mlcube.runner.platform} --task={self.task}"
            Shell.ssh(
                conn,
                f'{remote_env.activate_cmd(noop=":")} && cd {remote_path} && {cmd}'
            )
        except ExecutionError as err:
            # Chain explicitly so the traceback shows the shell failure as the direct cause.
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                f"Error occurred while running MLCube task (name={self.task}).",
                **err.context) from err

        # Sync back results
        try:
            # TODO: Only workspace/ directory is synced. Better solution?
            Shell.rsync_dirs(source=f'{conn}:{remote_path}/workspace/',
                             dest=f'{self.mlcube.runtime.root}/workspace/')
        except ExecutionError as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                "Error occurred while syncing workspace.", **err.context) from err
示例#2
0
    def configure(self) -> None:
        """Run 'configure' phase for SSH runner.

        Steps, each executed against the host identified by `get_connection_string`:
            1. Run the interpreter's `create_cmd` and `configure_cmd` commands on the remote host.
            2. Create the remote MLCube directory and rsync the local MLCube root directory into it.
            3. Run `mlcube configure` on the remote host so that the remote runner configures itself.

        Raises:
            ExecutionError: If any of the steps fails. The original error is attached as the cause.
        """
        conn: t.Text = self.get_connection_string()
        remote_env: PythonInterpreter = PythonInterpreter.create(
            self.mlcube.runner.interpreter)

        # If required, create and configure python environment on remote host
        try:
            Shell.ssh(conn, remote_env.create_cmd())
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                f"Error occurred while creating remote python environment (env={remote_env}).",
                **err.context) from err
        try:
            Shell.ssh(conn, remote_env.configure_cmd())
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                f"Error occurred while configuring remote python environment (env={remote_env}).",
                **err.context) from err

        # The 'local_path' and 'remote_path' must both be directories. They are computed outside of the `try` block
        # because 'remote_path' is also needed by the last step below.
        local_path: str = self.mlcube.runtime.root
        remote_path: str = os.path.join(self.mlcube.runner.remote_root,
                                        os.path.basename(local_path))
        try:
            Shell.ssh(conn, f'mkdir -p {remote_path}')
            Shell.rsync_dirs(source=f'{local_path}/',
                             dest=f'{conn}:{remote_path}/')
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                "Error occurred while syncing local and remote folders.",
                **err.context) from err

        # Configure remote MLCube runner. Idea is that we use chain of runners, for instance, SSH Runner -> Docker
        # runner. So, the runner to be used on a remote host must configure itself.
        try:
            cmd = f"mlcube configure --mlcube=. --platform={self.mlcube.runner.platform}"
            Shell.ssh(
                conn,
                f'{remote_env.activate_cmd(noop=":")} && cd {remote_path} && {cmd}'
            )
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                "Error occurred while configuring MLCube on a remote machine.",
                **err.context) from err
 def check_install(singularity_exec: str = "singularity") -> None:
     """Verify that the singularity executable can be used, raise otherwise.

     Args:
         singularity_exec: Name of (or path to) the singularity executable to check.

     Raises:
         ExecutionError: If `check_singularity_installed` returns a false value for this executable.
     """
     if check_singularity_installed(software=singularity_exec):
         return

     summary = f"{SingularityRun.__name__} runner failed to configure or to run MLCube."
     details = (
         "SingularityRun check_install returned false ('singularity --version' failed to run). MLCube cannot "
         "run singularity images unless this check passes. Singularity runner uses `check_install` function "
         "from singularity-cli python library (https://github.com/singularityhub/singularity-cli)."
     )
     raise ExecutionError(summary, details)
示例#4
0
 def test_execution_error_mlcube_run_error_method(self) -> None:
     """Check the error produced by the `ExecutionError.mlcube_run_error` factory method."""
     expected_context = {'param_a': 'value_a', 'param_b': 1.2}
     error = ExecutionError.mlcube_run_error(
         "MLCube Reference Runner", "Long error description.", **expected_context
     )
     self.check_execution_error_state(
         error,
         "MLCube Reference Runner runner failed to run MLCube.",
         "Long error description.",
         expected_context
     )
示例#5
0
    def run(self) -> None:
        """Run a cube task in a docker container.

        If the build strategy is 'always' or the docker image does not exist, the 'configure' phase runs first. Then
        the workspace is synced, mount points and task arguments are generated, and `docker run` is executed with
        CPU or GPU arguments depending on the platform's `accelerator_count` value.

        Raises:
            ExecutionError: If syncing the workspace, generating mount points or running the container fails. The
                original error is attached as the cause.
        """
        docker: t.Text = self.mlcube.runner.docker
        image: t.Text = self.mlcube.runner.image

        build_strategy: t.Text = self.mlcube.runner.build_strategy
        if build_strategy == Config.BuildStrategy.ALWAYS or not Shell.docker_image_exists(docker, image):
            logger.warning("Docker image (%s) does not exist or build strategy is 'always'. "
                           "Will run 'configure' phase.", image)
            self.configure()
        # Deal with user-provided workspace
        try:
            Shell.sync_workspace(self.mlcube, self.task)
        except Exception as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                f"Error occurred while syncing MLCube workspace (task={self.task}). Actual error is {type(err)} - see "
                "context for details.",
                error=str(err)
            ) from err

        # The 'mounts' dictionary maps host paths to container paths
        try:
            mounts, task_args = Shell.generate_mounts_and_args(self.mlcube, self.task)
        except ConfigurationError as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                f"Error occurred while generating mount points for docker run command (task={self.task}). See context "
                "for details and check your MLCube configuration file.",
                error=str(err)
            ) from err
        # Lazy %-formatting: the message is only built when the INFO level is enabled.
        logger.info("mounts=%s, task_args=%s", mounts, task_args)

        volumes = Shell.to_cli_args(mounts, sep=':', parent_arg='--volume')
        env_args = self.mlcube.runner.env_args
        # A missing or `None` accelerator count is treated as zero GPUs.
        num_gpus: int = self.mlcube.platform.get('accelerator_count', None) or 0
        run_args: t.Text = self.mlcube.runner.cpu_args if num_gpus == 0 else self.mlcube.runner.gpu_args
        try:
            Shell.run([docker, 'run', run_args, env_args, volumes, image, ' '.join(task_args)])
        except ExecutionError as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                f"Error occurred while running MLCube task (docker={docker}, run_args={run_args}, env_args={env_args}, "
                f"volumes={volumes}, image={image}, task_args={task_args}).",
                **err.context
            ) from err
示例#6
0
    def check_execution_error_state(self, err: ExecutionError, message: str, description: str, context: t.Dict) -> None:
        """Assert that an error carries the expected fields and renders the expected text report.

        Args:
            err: Error instance to inspect.
            message: Expected value of `err.message`.
            description: Expected value of `err.description`.
            context: Expected value of `err.context`.
        """
        # Individual fields first.
        self.assertEqual(err.message, message)
        self.assertEqual(err.description, description)
        self.assertDictEqual(err.context, context)

        # Then the text report: a header followed by one tab-indented line per field.
        expected_report = "\n\t".join([
            "ERROR:",
            f"message: {message}",
            f"description: {description}",
            f"context: {context}",
        ])
        actual_report = err.describe(frmt='text')
        self.assertEqual(actual_report, expected_report)
示例#7
0
 def run(self) -> None:
     """Run a cube by creating a Kubeflow pipeline for it.

     Raises:
         ExecutionError: If creating the pipeline fails for any reason. The original exception is attached as the
             cause, and its text is passed to the error as the `error` keyword argument.
     """
     try:
         logging.info("Configuring MLCube to run in Kubeflow Pipelines...")
         # The return value is not used here.
         self.create_kf_pipeline()
     except Exception as err:
         raise ExecutionError.mlcube_run_error(
             self.__class__.__name__,
             "See context for more details.",
             error=str(err)) from err
示例#8
0
    def configure(self) -> None:
        """Build or pull Docker image on a current host.

        The image is pulled when the build strategy is `pull` or when the build recipe file does not exist.
        Otherwise, the image is built from the recipe using the configured build context and build arguments.

        Raises:
            ExecutionError: If `docker pull` or `docker build` fails. The original error is attached as the cause.
        """
        image: t.Text = self.mlcube.runner.image
        context: t.Text = os.path.abspath(os.path.join(self.mlcube.runtime.root, self.mlcube.runner.build_context))
        recipe: t.Text = os.path.abspath(os.path.join(context, self.mlcube.runner.build_file))
        docker: t.Text = self.mlcube.runner.docker

        # Build strategies: `pull`, `auto` and `always`.
        build_strategy: t.Text = self.mlcube.runner.build_strategy
        build_recipe_exists: bool = os.path.exists(recipe)
        if build_strategy == Config.BuildStrategy.PULL or not build_recipe_exists:
            logger.info("Will pull image (%s) because (build_strategy=%s, build_recipe_exists=%r)",
                        image, build_strategy, build_recipe_exists)
            if build_recipe_exists:
                # Reaching this branch with an existing recipe means the strategy is `pull`: warn that local
                # sources are ignored.
                logger.warning(
                    "Docker recipe exists (%s), but your build strategy is `%s`, and so the image will be pulled, not "
                    "built. Make sure your image is up-to-date with your source code. If you want to rebuilt MLCube "
                    "docker image locally, rerun with `-Prunner.build_strategy=always`.",
                    recipe, build_strategy
                )
            try:
                Shell.run([docker, 'pull', image])
            except ExecutionError as err:
                description = f"Error occurred while pulling docker image (docker={docker}, image={image})."
                if build_recipe_exists:
                    description += \
                        f" By the way, docker recipe ({recipe}) exists, but your build strategy is set to "\
                        "pull. Consider rerunning with: `-Prunner.build_strategy=auto` to build image locally."
                raise ExecutionError.mlcube_configure_error(
                    self.__class__.__name__, description, **err.context
                ) from err

        else:
            logger.info("Will build image (%s) because (build_strategy=%s, build_recipe_exists=%r)",
                        image, build_strategy, build_recipe_exists)
            build_args: t.Text = self.mlcube.runner.build_args
            try:
                Shell.run([docker, 'build', build_args, '-t', image, '-f', recipe, context])
            except ExecutionError as err:
                raise ExecutionError.mlcube_configure_error(
                    self.__class__.__name__,
                    f"Error occurred while building docker image (docker={docker}, build_args={build_args}, "
                    f"image={image}, recipe={recipe}, context={context}).",
                    **err.context
                ) from err
示例#9
0
 def run(self) -> None:
     """Run the MLCube task by invoking `mlcube run` with the platform configured for this runner.

     Raises:
         ExecutionError: If the `mlcube run` command fails. The original error is attached as the cause.
     """
     gcp: DictConfig = self.mlcube.runner
     try:
         Shell.run(
             f"mlcube run --mlcube={self.mlcube.root} --platform={gcp.platform} --task={self.task}"
         )
     except ExecutionError as err:
         raise ExecutionError.mlcube_run_error(
             self.__class__.__name__,
             f"Error occurred while running MLCube task (platform={gcp.platform}, task={self.task}).",
             **err.context) from err
示例#10
0
    def run(self) -> None:
        """Run a cube task using a singularity image.

        If the image file does not exist, the 'configure' phase runs first; otherwise only the singularity
        installation check is performed. Then the workspace is synced, mount points and task arguments are
        generated, and `singularity run` is executed.

        Raises:
            ExecutionError: If syncing the workspace, generating mount points or running the image fails. The
                original error is attached as the cause.
        """
        image_file = Path(
            self.mlcube.runner.image_dir) / self.mlcube.runner.image
        if not image_file.exists():
            self.configure()
        else:
            SingularityRun.check_install()

        # Deal with user-provided workspace
        try:
            Shell.sync_workspace(self.mlcube, self.task)
        except Exception as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                "Error occurred while syncing MLCube workspace. See context for more details.",
                error=str(err)) from err

        try:
            mounts, task_args = Shell.generate_mounts_and_args(
                self.mlcube, self.task)
            # Lazy %-formatting: the message is only built when the INFO level is enabled.
            logger.info("mounts=%s, task_args=%s", mounts, task_args)
        except ConfigurationError as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                "Error occurred while generating mount points for singularity run command. See context for more "
                "details and check your MLCube configuration file.",
                error=str(err)) from err

        volumes = Shell.to_cli_args(mounts, sep=":", parent_arg="--bind")
        try:
            Shell.run([
                self.mlcube.runner.singularity, 'run',
                self.mlcube.runner.run_args, volumes,
                str(image_file), ' '.join(task_args)
            ])
        except ExecutionError as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                f"Error occurred while running MLCube task (task={self.task}). See context for more details.",
                **err.context) from err
示例#11
0
    def run(self) -> None:
        """Run a cube as a Kubernetes job.

        Loads the kube config, creates a job manifest, creates the job from it and waits for its completion.

        Raises:
            ExecutionError: If any of the steps fails for any reason. The original exception is attached as the
                cause, and its text is passed to the error as the `error` keyword argument.
        """
        try:
            logging.info("Configuring MLCube as a Kubernetes Job...")
            kubernetes.config.load_kube_config()

            mlcube_job_manifest = self.create_job_manifest()
            job = self.create_job(mlcube_job_manifest)
            self.wait_for_completion(job)
        except Exception as err:
            raise ExecutionError.mlcube_run_error(
                self.__class__.__name__,
                "See context for more details.",
                error=str(err)) from err
示例#12
0
    def configure(self) -> None:
        """Build Singularity Image on a current host.

        Nothing is built if the image file already exists. Otherwise, the image is built either from a docker
        source (`docker://` or `docker-archive:` prefix) or from a recipe file located in the MLCube root directory.

        Raises:
            IOError: If the build file is expected to be a recipe file, and it does not exist.
            ExecutionError: If the singularity installation check or the build command fails. For a failed build
                command the original error is attached as the cause.
        """
        SingularityRun.check_install()

        s_cfg: DictConfig = self.mlcube.runner

        # Get full path to a singularity image. By design, we compute it relative to {mlcube.root}/workspace.
        image_file = Path(s_cfg.image_dir, s_cfg.image)
        if image_file.exists():
            logger.info(
                "SingularityRun SIF exists (%s) - no need to run the configure step.",
                image_file,
            )
            return

        # Make sure a directory to store image exists. If paths are like "/opt/...", the call may fail.
        image_file.parent.mkdir(parents=True, exist_ok=True)

        # Let's assume that build context is the root MLCube directory
        build_path = Path(self.mlcube.runtime.root)
        recipe: str = s_cfg.build_file  # This is the recipe file, or docker image.
        if recipe.startswith(("docker://", "docker-archive:")):
            # https://sylabs.io/guides/3.0/user-guide/build_a_container.html
            # URI beginning with docker:// to build from Docker Hub
            logger.info("SingularityRun building SIF from docker image (%s).",
                        recipe)
        else:
            # This must be a recipe file. Make sure it exists.
            if not Path(build_path, recipe).exists():
                raise IOError(
                    f"SIF recipe file does not exist (path={build_path}, file={recipe})"
                )
            logger.info("Building SIF from recipe file (path=%s, file=%s).",
                        build_path, recipe)
        try:
            # The build runs from the build context directory, so a recipe path is relative to it.
            Shell.run([
                'cd',
                str(build_path), ';', s_cfg.singularity, 'build',
                s_cfg.build_args,
                str(image_file), recipe
            ])
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                "Error occurred while building SIF image. See context for more details.",
                **err.context) from err
示例#13
0
    def run(cmd: t.Union[str, t.List], on_error: str = 'raise') -> int:
        """Execute shell command.

        Args:
            cmd: Command to execute, e.g. Shell.run(['ls', '-lh']). If it is a list or a tuple, this method will join
                its items into one string using whitespace as a separator.
            on_error: Action to perform if the command exit code is non-zero. Options - 'ignore' (do nothing, return
                exit code), 'raise' (raise an ExecutionError exception), 'die' (exit the process).
        Returns:
            Exit status. On Windows, the exit status is the output of `os.system`. On Linux, the output is either
                process exit status if that processes exited, or -1 in other cases (e.g., process was killed).
        Raises:
            ValueError: If `on_error` is not one of the supported actions.
            ExecutionError: If the command fails and `on_error` is 'raise'.
        """
        # Tuples are accepted as well: previously they reached `os.system` unjoined.
        if isinstance(cmd, (list, tuple)):
            cmd = ' '.join(cmd)

        if on_error not in ('raise', 'die', 'ignore'):
            raise ValueError(
                f"Unrecognized 'on_error' action ({on_error}). Valid options are ('raise', 'die', 'ignore')."
            )

        status: int = os.system(cmd)
        exit_code, exit_status = Shell.parse_exec_status(status)
        if exit_status == 'na':
            logger.warning("Command (cmd=%s) did not exit properly (status=%d).", cmd, status)

        msg = f"Command='{cmd}' status={status} exit_status={exit_status} exit_code={exit_code} on_error={on_error}"
        if exit_code != 0:
            logger.error(msg)
            if on_error == 'die':
                sys.exit(exit_code)
            if on_error == 'raise':
                raise ExecutionError(
                    'Failed to execute shell command.', status=exit_status, code=exit_code, cmd=cmd
                )
            # on_error == 'ignore': fall through and return the non-zero exit code to the caller.
        else:
            logger.info(msg)
        return exit_code
示例#14
0
 def test_execution_error_init_method(self) -> None:
     """Check the error produced by calling the `ExecutionError` constructor directly."""
     expected_context = {'param_a': 'value_a', 'param_b': 1.2}
     error = ExecutionError("Brief error description.", "Long error description.", **expected_context)
     self.check_execution_error_state(
         error,
         "Brief error description.",
         "Long error description.",
         expected_context
     )
示例#15
0
    def configure(self) -> None:
        """Configure a GCP instance to run MLCube.

        Steps:
            1. Check that the `mlcube` SSH configuration file in the user's `.ssh` directory describes the GCP
               instance and provides the `User` and `IdentityFile` options.
            2. Connect to GCP, create the instance if it does not exist and start it if it is not running.
            3. Update `HostName` in the SSH configuration file if the public IP of the instance differs.
            4. Install system packages (docker, pip, virtualenv) on the remote instance.
            5. Run `mlcube configure` with the platform specified in the runner configuration.

        Raises:
            ExecutionError: If any of the steps fails. When a step fails because of another exception, that
                exception is attached as the cause.
        """
        gcp: DictConfig = self.mlcube.runner

        # Check that SSH is configured.
        # TODO: (Sergey) why am I doing it here (copy-paste bug)?
        ssh_config_file = os.path.join(Path.home(), '.ssh', 'mlcube')
        try:
            ssh_config = SSHConfig.load(ssh_config_file)
            gcp_host: Host = ssh_config.get(gcp.instance.name)
        except KeyError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                f"SSH configuration file ({ssh_config_file}) does not provide connection details for GCP instance "
                f"(name={gcp.instance.name}). Most likely this error has occurred due to implementation error - "
                "please, contact MLCube developers.") from err
        # TODO: I can try to add this info on the fly assuming standard paths. Need to figure out the user name.
        if gcp_host.get('User', None) is None or gcp_host.get(
                'IdentityFile', None) is None:
            # The message names the same option the check above reads (`IdentityFile`).
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                f"SSH configuration file ({ssh_config_file}) provides connection details for GCP instance "
                f"(name={gcp.instance.name}), but these details do not include information about `User` "
                "and/or `IdentityFile`.")

        # Connect to GCP
        logger.info("Connecting to GCP ...")
        try:
            service = Service(project_id=gcp.gcp.project_id,
                              zone=gcp.gcp.zone,
                              credentials=gcp.gcp.credentials)
        except Exception as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                "The error most likely is associated with either reading credentials, or connecting using google API ("
                f"project_id={gcp.gcp.project_id}, zone={gcp.gcp.zone}, credentials={gcp.gcp.credentials}). See "
                "context for more details.",
                error=str(err),
                gcp_info={
                    'project_id': gcp.gcp.project_id,
                    'zone': gcp.gcp.zone,
                    'credentials': gcp.gcp.credentials
                }) from err

        # Figure out if an instance needs to be created
        try:
            instance = GCPInstance(service.get_instance(gcp.instance.name))
            if instance.name is None:
                print("Creating GCP instance ...")
                service.wait_for_operation(
                    service.create_instance(
                        name=gcp.instance.name,
                        machine_type=gcp.instance.machine_type,
                        disk_size_gb=gcp.instance.disk_size_gb))
                instance = GCPInstance(service.get_instance(gcp.instance.name))

            # Check its running status
            if instance.status != GCPInstanceStatus.RUNNING:
                print("Starting GCP instance ...")
                service.wait_for_operation(
                    service.start_instance(instance.name))
                # Re-read the instance description after the start operation completes.
                instance = GCPInstance(service.get_instance(gcp.instance.name))
        except Exception as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                "Failed to create or connect to remote GCP instance. See context for more details.",
                error=str(err),
                gcp_instance_info={
                    'name': gcp.instance.name,
                    'machine_type': gcp.instance.machine_type,
                    'disk_size_gb': gcp.instance.disk_size_gb
                }) from err

        # Make sure SSH mlcube is up-to-date
        if gcp_host.get('HostName', None) != instance.public_ip:
            print(
                f"Updating SSH mlcube (prev={gcp_host.get('HostName')}, new={instance.public_ip}, "
                f"file={ssh_config_file})")
            ssh_config.update(instance.name, {'HostName': instance.public_ip})
            ssh_config.write(ssh_config_file)
            # TODO: clean '.ssh/known_hosts'.

        # Configure remote instance. This is specific for docker-based images now.
        try:
            Shell.ssh(
                gcp.instance.name,
                'sudo snap install docker && sudo addgroup --system docker && sudo adduser ${USER} docker && '
                'sudo snap disable docker && sudo snap enable docker && '
                'sudo apt update && yes | sudo apt install python3-pip virtualenv && sudo apt clean'
            )
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                "Failed to install system packages on a remote instance. See context for more details.",
                error=str(err)) from err

        # Remote GCP instance has been configured
        print(instance)

        # Should be as simple as invoking SSH configure.
        try:
            Shell.run(
                f"mlcube configure --mlcube={self.mlcube.root} --platform={gcp.platform}"
            )
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                f"Error occurred while running mlcube configure with GCP platform (platform={gcp.platform}). See "
                "context for more details.",
                error=str(err)) from err