def run(self) -> None:
    """Execute the current MLCube task on the remote host and sync results back.

    Assumes the `configure` phase has already copied the MLCube directory to the
    remote host. After the remote run completes, only the `workspace/` directory
    is synced back to the local MLCube root.
    """
    connection: t.Text = self.get_connection_string()
    interpreter: PythonInterpreter = PythonInterpreter.create(self.mlcube.runner.interpreter)
    # The MLCube root directory on the remote host.
    remote_root: t.Text = os.path.join(
        self.mlcube.runner.remote_root, os.path.basename(self.mlcube.runtime.root))

    run_cmd = f"mlcube run --mlcube=. --platform={self.mlcube.runner.platform} --task={self.task}"
    try:
        Shell.ssh(connection, f'{interpreter.activate_cmd(noop=":")} && cd {remote_root} && {run_cmd}')
    except ExecutionError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            f"Error occurred while running MLCube task (name={self.task}).",
            **err.context)

    # Sync back results
    # TODO: Only workspace/ directory is synced. Better solution?
    try:
        Shell.rsync_dirs(source=f'{connection}:{remote_root}/workspace/',
                         dest=f'{self.mlcube.runtime.root}/workspace/')
    except ExecutionError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            "Error occurred while syncing workspace.",
            **err.context)
def configure(self) -> None:
    """Run 'configure' phase for SSH runner.

    Steps performed here (each wrapped to re-raise as `mlcube_configure_error`):
      1. Create and configure a python environment on the remote host.
      2. Rsync the local MLCube directory to the remote host.
      3. Run `mlcube configure` remotely so that the downstream runner
         (e.g. docker) configures itself on the remote machine.
    """
    conn: t.Text = self.get_connection_string()
    remote_env: PythonInterpreter = PythonInterpreter.create(self.mlcube.runner.interpreter)
    # If required, create and configure python environment on remote host
    try:
        Shell.ssh(conn, remote_env.create_cmd())
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            f"Error occurred while creating remote python environment (env={remote_env}).",
            **err.context)
    try:
        Shell.ssh(conn, remote_env.configure_cmd())
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            f"Error occurred while configuring remote python environment (env={remote_env}).",
            **err.context)
    # The 'local_path' and 'remote_path' must both be directories.
    try:
        local_path: str = self.mlcube.runtime.root
        # Remote MLCube root mirrors the local directory name under `remote_root`.
        remote_path: str = os.path.join(self.mlcube.runner.remote_root, os.path.basename(local_path))
        Shell.ssh(conn, f'mkdir -p {remote_path}')
        Shell.rsync_dirs(source=f'{local_path}/', dest=f'{conn}:{remote_path}/')
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            "Error occurred while syncing local and remote folders.",
            **err.context)
    # Configure remote MLCube runner. Idea is that we use chain of runners, for instance, SSH Runner -> Docker
    # runner. So, the runner to be used on a remote host must configure itself.
    try:
        cmd = f"mlcube configure --mlcube=. --platform={self.mlcube.runner.platform}"
        # `noop=":"` keeps the activation step a shell no-op when no env activation is needed.
        Shell.ssh(conn, f'{remote_env.activate_cmd(noop=":")} && cd {remote_path} && {cmd}')
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            "Error occurred while configuring MLCube on a remote machine.",
            **err.context)
def check_install(singularity_exec: str = "singularity") -> None:
    """Raise `ExecutionError` when the singularity CLI is not usable.

    Args:
        singularity_exec: Name or path of the singularity executable to probe.
    """
    if check_singularity_installed(software=singularity_exec):
        return
    raise ExecutionError(
        f"{SingularityRun.__name__} runner failed to configure or to run MLCube.",
        "SingularityRun check_install returned false ('singularity --version' failed to run). MLCube cannot "
        "run singularity images unless this check passes. Singularity runner uses `check_install` function "
        "from singularity-cli python library (https://github.com/singularityhub/singularity-cli)."
    )
def test_execution_error_mlcube_run_error_method(self) -> None:
    """`ExecutionError.mlcube_run_error` must build the standard run-error state."""
    error = ExecutionError.mlcube_run_error(
        "MLCube Reference Runner", "Long error description.", param_a='value_a', param_b=1.2)
    self.check_execution_error_state(
        error,
        "MLCube Reference Runner runner failed to run MLCube.",
        "Long error description.",
        {'param_a': 'value_a', 'param_b': 1.2}
    )
def run(self) -> None:
    """Run a cube with the docker runtime.

    Re-runs the `configure` phase when the image is missing or the build
    strategy is 'always', syncs the user workspace, computes mounts and
    task arguments, and finally invokes `docker run`.
    """
    docker: t.Text = self.mlcube.runner.docker
    image: t.Text = self.mlcube.runner.image
    build_strategy: t.Text = self.mlcube.runner.build_strategy

    # Note: order matters — with strategy 'always' we never probe the docker daemon.
    needs_configure = (build_strategy == Config.BuildStrategy.ALWAYS
                       or not Shell.docker_image_exists(docker, image))
    if needs_configure:
        logger.warning("Docker image (%s) does not exist or build strategy is 'always'. "
                       "Will run 'configure' phase.", image)
        self.configure()

    # Deal with user-provided workspace
    try:
        Shell.sync_workspace(self.mlcube, self.task)
    except Exception as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            f"Error occurred while syncing MLCube workspace (task={self.task}). Actual error is {type(err)} - see "
            "context for details.",
            error=str(err)
        )

    # The 'mounts' dictionary maps host paths to container paths
    try:
        mounts, task_args = Shell.generate_mounts_and_args(self.mlcube, self.task)
    except ConfigurationError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            f"Error occurred while generating mount points for docker run command (task={self.task}). See context "
            "for details and check your MLCube configuration file.",
            error=str(err)
        )
    logger.info(f"mounts={mounts}, task_args={task_args}")

    volumes = Shell.to_cli_args(mounts, sep=':', parent_arg='--volume')
    env_args = self.mlcube.runner.env_args
    num_gpus: int = self.mlcube.platform.get('accelerator_count', None) or 0
    # Zero accelerators -> CPU arguments, otherwise GPU arguments.
    run_args: t.Text = self.mlcube.runner.cpu_args if not num_gpus else self.mlcube.runner.gpu_args
    try:
        Shell.run([docker, 'run', run_args, env_args, volumes, image, ' '.join(task_args)])
    except ExecutionError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            f"Error occurred while running MLCube task (docker={docker}, run_args={run_args}, env_args={env_args}, "
            f"volumes={volumes}, image={image}, task_args={task_args}).",
            **err.context
        )
def check_execution_error_state(self, err: ExecutionError, message: str, description: str,
                                context: t.Dict) -> None:
    """Assert that `err` exposes the expected message, description, context and text rendering."""
    self.assertEqual(err.message, message)
    self.assertEqual(err.description, description)
    self.assertDictEqual(err.context, context)
    expected_text = f"ERROR:\n\tmessage: {message}\n\tdescription: {description}\n\tcontext: {context}"
    self.assertEqual(err.describe(frmt='text'), expected_text)
def run(self) -> None:
    """Run a cube by creating a Kubeflow pipeline.

    Any failure is wrapped into `ExecutionError.mlcube_run_error` with the
    original error text stored in the context.
    """
    try:
        logging.info("Configuring MLCube to run in Kubeflow Pipelines...")
        self.create_kf_pipeline()
    except Exception as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__, "See context for more details.", error=str(err))
def configure(self) -> None:
    """Build or pull the MLCube docker image on the current host.

    The decision between pulling and building depends on the configured build
    strategy (`pull`, `auto`, `always`) and on whether a build recipe file
    exists in the build context.
    """
    image: t.Text = self.mlcube.runner.image
    context: t.Text = os.path.abspath(os.path.join(self.mlcube.runtime.root, self.mlcube.runner.build_context))
    recipe: t.Text = os.path.abspath(os.path.join(context, self.mlcube.runner.build_file))
    docker: t.Text = self.mlcube.runner.docker

    # Build strategies: `pull`, `auto` and `always`.
    build_strategy: t.Text = self.mlcube.runner.build_strategy
    build_recipe_exists: bool = os.path.exists(recipe)

    pull_image = build_strategy == Config.BuildStrategy.PULL or not build_recipe_exists
    if pull_image:
        logger.info("Will pull image (%s) because (build_strategy=%s, build_recipe_exists=%r)",
                    image, build_strategy, build_recipe_exists)
        if build_recipe_exists:
            logger.warning(
                "Docker recipe exists (%s), but your build strategy is `%s`, and so the image will be pulled, not "
                "built. Make sure your image is up-to-date with your source code. If you want to rebuilt MLCube "
                "docker image locally, rerun with `-Prunner.build_strategy=always`.", recipe, build_strategy
            )
        try:
            Shell.run([docker, 'pull', image])
        except ExecutionError as err:
            description = f"Error occurred while pulling docker image (docker={docker}, image={image})."
            if build_recipe_exists:
                # A recipe was present — hint at building locally instead of pulling.
                description += \
                    f" By the way, docker recipe ({recipe}) exists, but your build strategy is set to "\
                    "pull. Consider rerunning with: `-Prunner.build_strategy=auto` to build image locally."
            raise ExecutionError.mlcube_configure_error(self.__class__.__name__, description, **err.context)
    else:
        logger.info("Will build image (%s) because (build_strategy=%s, build_recipe_exists=%r)",
                    image, build_strategy, build_recipe_exists)
        build_args: t.Text = self.mlcube.runner.build_args
        try:
            Shell.run([docker, 'build', build_args, '-t', image, '-f', recipe, context])
        except ExecutionError as err:
            raise ExecutionError.mlcube_configure_error(
                self.__class__.__name__,
                f"Error occurred while building docker image (docker={docker}, build_args={build_args}, "
                f"image={image}, recipe={recipe}, context={context}).",
                **err.context
            )
def run(self) -> None:
    """Delegate MLCube task execution to the runner configured for the GCP platform."""
    gcp: DictConfig = self.mlcube.runner
    command = f"mlcube run --mlcube={self.mlcube.root} --platform={gcp.platform} --task={self.task}"
    try:
        Shell.run(command)
    except ExecutionError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            f"Error occurred while running MLCube task (platform={gcp.platform}, task={self.task}).",
            **err.context)
def run(self) -> None:
    """Run the current MLCube task with the singularity runtime.

    Builds the SIF image first (via `configure`) when it does not exist yet,
    then syncs the user workspace, computes bind mounts and task arguments,
    and invokes `singularity run`.
    """
    sif_path = Path(self.mlcube.runner.image_dir) / self.mlcube.runner.image
    if sif_path.exists():
        SingularityRun.check_install()
    else:
        self.configure()

    # Deal with user-provided workspace
    try:
        Shell.sync_workspace(self.mlcube, self.task)
    except Exception as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            "Error occurred while syncing MLCube workspace. See context for more details.",
            error=str(err))

    try:
        mounts, task_args = Shell.generate_mounts_and_args(self.mlcube, self.task)
        logger.info(f"mounts={mounts}, task_args={task_args}")
    except ConfigurationError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            "Error occurred while generating mount points for singularity run command. See context for more "
            "details and check your MLCube configuration file.",
            error=str(err))

    bind_args = Shell.to_cli_args(mounts, sep=":", parent_arg="--bind")
    try:
        Shell.run([self.mlcube.runner.singularity, 'run', self.mlcube.runner.run_args,
                   bind_args, str(sif_path), ' '.join(task_args)])
    except ExecutionError as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__,
            f"Error occurred while running MLCube task (task={self.task}). See context for more details.",
            **err.context)
def run(self) -> None:
    """Run a cube as a Kubernetes Job and wait for it to finish.

    Any failure is wrapped into `ExecutionError.mlcube_run_error` with the
    original error text stored in the context.
    """
    try:
        logging.info("Configuring MLCube as a Kubernetes Job...")
        kubernetes.config.load_kube_config()
        manifest = self.create_job_manifest()
        self.wait_for_completion(self.create_job(manifest))
    except Exception as err:
        raise ExecutionError.mlcube_run_error(
            self.__class__.__name__, "See context for more details.", error=str(err))
def configure(self) -> None:
    """Build Singularity Image on a current host.

    Skips the build when the SIF file already exists. The build source is
    either a docker image URI (`docker://...` / `docker-archive:...`) or a
    recipe file located in the MLCube root directory.

    Raises:
        IOError: If the build source is a recipe file that does not exist.
        ExecutionError: If the `singularity build` command fails.
    """
    SingularityRun.check_install()

    s_cfg: DictConfig = self.mlcube.runner

    # Get full path to a singularity image. By design, we compute it relative to {mlcube.root}/workspace.
    image_file = Path(s_cfg.image_dir, s_cfg.image)
    if image_file.exists():
        logger.info(
            "SingularityRun SIF exists (%s) - no need to run the configure step.",
            image_file,
        )
        return

    # Make sure a directory to store image exists. If paths are like "/opt/...", the call may fail.
    image_file.parent.mkdir(parents=True, exist_ok=True)

    # Let's assume that build context is the root MLCube directory
    build_path = Path(self.mlcube.runtime.root)
    recipe: str = s_cfg.build_file  # This is the recipe file, or docker image.
    # A single `startswith` with a tuple of prefixes replaces the chained `or`.
    if recipe.startswith(("docker://", "docker-archive:")):
        # https://sylabs.io/guides/3.0/user-guide/build_a_container.html
        # URI beginning with docker:// to build from Docker Hub
        logger.info("SingularityRun building SIF from docker image (%s).", recipe)
    else:
        # This must be a recipe file. Make sure it exists.
        if not Path(build_path, recipe).exists():
            raise IOError(
                f"SIF recipe file does not exist (path={build_path}, file={recipe})"
            )
        logger.info("Building SIF from recipe file (path=%s, file=%s).",
                    build_path, recipe)
    try:
        Shell.run([
            'cd', str(build_path), ';', s_cfg.singularity, 'build',
            s_cfg.build_args, str(image_file), recipe
        ])
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            "Error occurred while building SIF image. See context for more details.",
            **err.context)
def run(cmd: t.Union[str, t.List], on_error: str = 'raise') -> int:
    """Execute shell command.

    Args:
        cmd: Command to execute, e.g. Shell.run(['ls', '-lh']). If type is iterable (anything but a string),
            this method will join into one string using whitespace as a separator.
        on_error: Action to perform if `os.system` returns a non-zero status. Options - 'ignore' (do nothing,
            return exit code), 'raise' (raise an ExecutionError exception), 'die' (exit the process).

    Returns:
        Exit status. On Windows, the exit status is the output of `os.system`. On Linux, the output is either
        process exit status if that processes exited, or -1 in other cases (e.g., process was killed).

    Raises:
        ValueError: If `on_error` is not one of 'raise', 'die', 'ignore'.
        ExecutionError: If the command fails and `on_error` is 'raise'.
    """
    # Accept any non-string iterable of command fragments, matching the docstring
    # (the previous `isinstance(cmd, t.List)` check only handled lists).
    if not isinstance(cmd, str):
        cmd = ' '.join(cmd)
    # Validate the error policy before executing anything.
    if on_error not in ('raise', 'die', 'ignore'):
        raise ValueError(
            f"Unrecognized 'on_error' action ({on_error}). Valid options are ('raise', 'die', 'ignore')."
        )

    status: int = os.system(cmd)
    exit_code, exit_status = Shell.parse_exec_status(status)
    if exit_status == 'na':
        # Process did not exit normally (e.g., killed by a signal).
        logger.warning("Command (cmd=%s) did not exit properly (status=%d).", cmd, status)

    msg = f"Command='{cmd}' status={status} exit_status={exit_status} exit_code={exit_code} on_error={on_error}"
    if exit_code != 0:
        logger.error(msg)
        if on_error == 'die':
            sys.exit(exit_code)
        if on_error == 'raise':
            raise ExecutionError(
                'Failed to execute shell command.', status=exit_status, code=exit_code, cmd=cmd
            )
    else:
        logger.info(msg)
    return exit_code
def test_execution_error_init_method(self) -> None:
    """`ExecutionError.__init__` must store message, description and context as given."""
    error = ExecutionError("Brief error description.", "Long error description.",
                           param_a='value_a', param_b=1.2)
    self.check_execution_error_state(
        error,
        "Brief error description.",
        "Long error description.",
        {'param_a': 'value_a', 'param_b': 1.2}
    )
def configure(self) -> None:
    """Configure a GCP instance so that MLCubes can run on it.

    Steps: validate local SSH configuration for the instance, connect to GCP,
    create/start the instance if needed, refresh the SSH `HostName` entry,
    install docker and python tooling on the instance, and finally delegate to
    `mlcube configure` for the downstream (SSH-chained) runner.

    Raises:
        ExecutionError: Wrapped `mlcube_configure_error` on any failed step.
    """
    gcp: DictConfig = self.mlcube.runner

    # Check that SSH is configured.
    # TODO: (Sergey) why am I doing it here (copy-paste bug)?
    ssh_config_file = os.path.join(Path.home(), '.ssh', 'mlcube')
    try:
        ssh_config = SSHConfig.load(ssh_config_file)
        gcp_host: Host = ssh_config.get(gcp.instance.name)
    except KeyError:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            f"SSH configuration file ({ssh_config_file}) does not provide connection details for GCP instance "
            f"(name={gcp.instance.name}). Most likely this error has occurred due to implementation error - "
            "please, contact MLCube developers.")
    # TODO: I can try to add this info on the fly assuming standard paths. Need to figure out the user name.
    if gcp_host.get('User', None) is None or gcp_host.get('IdentityFile', None) is None:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            f"SSH configuration file ({ssh_config_file}) provides connection details for GCP instance "
            f"(name={gcp.instance.name}), but these details do not include information about `User` "
            # Fixed message: the check above is for `IdentityFile`, not `IdentifyFile`.
            "and/or `IdentityFile`.")

    # Connect to GCP
    logger.info("Connecting to GCP ...")
    try:
        service = Service(project_id=gcp.gcp.project_id,
                          zone=gcp.gcp.zone,
                          credentials=gcp.gcp.credentials)
    except Exception as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            "The error most likely is associated with either reading credentials, or connecting using google API ("
            f"project_id={gcp.gcp.project_id}, zone={gcp.gcp.zone}, credentials={gcp.gcp.credentials}). See "
            "context for more details.",
            error=str(err),
            gcp_info={
                'project_id': gcp.gcp.project_id,
                'zone': gcp.gcp.zone,
                'credentials': gcp.gcp.credentials
            })

    # Figure out if an instance needs to be created
    try:
        instance = GCPInstance(service.get_instance(gcp.instance.name))
        if instance.name is None:
            print("Creating GCP instance ...")
            service.wait_for_operation(
                service.create_instance(
                    name=gcp.instance.name,
                    machine_type=gcp.instance.machine_type,
                    disk_size_gb=gcp.instance.disk_size_gb))
            instance = GCPInstance(service.get_instance(gcp.instance.name))
        # Check its running status
        if instance.status != GCPInstanceStatus.RUNNING:
            print("Starting GCP instance ...")
            service.wait_for_operation(service.start_instance(instance.name))
            instance = GCPInstance(service.get_instance(gcp.instance.name))
    except Exception as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            "Failed to create or connect to remote GCP instance. See context for more details.",
            error=str(err),
            gcp_instance_info={
                'name': gcp.instance.name,
                'machine_type': gcp.instance.machine_type,
                'disk_size_gb': gcp.instance.disk_size_gb
            })

    # Make sure SSH mlcube is up-to-date
    if gcp_host.get('HostName', None) != instance.public_ip:
        print(f"Updating SSH mlcube (prev={gcp_host.get('HostName')}, new={instance.public_ip}, "
              f"file={ssh_config_file})")
        ssh_config.update(instance.name, {'HostName': instance.public_ip})
        ssh_config.write(ssh_config_file)
        # TODO: clean '.ssh/known_hosts'.

    # Configure remote instance. This is specific for docker-based images now.
    try:
        Shell.ssh(
            gcp.instance.name,
            'sudo snap install docker && sudo addgroup --system docker && sudo adduser ${USER} docker && '
            'sudo snap disable docker && sudo snap enable docker && '
            'sudo apt update && yes | sudo apt install python3-pip virtualenv && sudo apt clean'
        )
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            "Failed to install system packages on a remote instance. See context for more details.",
            error=str(err))

    # Remote GCP instance has been configured
    print(instance)

    # Should be as simple as invoking SSH configure.
    try:
        Shell.run(
            f"mlcube configure --mlcube={self.mlcube.root} --platform={gcp.platform}"
        )
    except ExecutionError as err:
        raise ExecutionError.mlcube_configure_error(
            self.__class__.__name__,
            f"Error occurred while running mlcube configure with GCP platform (platform={gcp.platform}). See "
            "context for more details.",
            error=str(err))