def _parse_kubernetes_config(backend_config):
    """Parse and validate the Kubernetes ``backend_config``, returning a
    config dict with the job template loaded from
    'kube-job-template-path'."""
    if not backend_config:
        raise ExecutionException('Backend_config file not found.')
    kube_config = backend_config.copy()
    if 'kube-job-template-path' not in backend_config.keys():
        raise ExecutionException(
            "'kube-job-template-path' attribute must be specified in "
            'backend_config.')
    kube_job_template = backend_config['kube-job-template-path']
    if os.path.exists(kube_job_template):
        with open(kube_job_template, 'r') as job_template:
            yaml_obj = yaml.safe_load(job_template.read())
        kube_job_template = yaml_obj
        kube_config['kube-job-template'] = kube_job_template
    else:
        raise ExecutionException(
            "Could not find 'kube-job-template-path': {}".format(
                kube_job_template))
    if 'kube-context' not in backend_config.keys():
        _logger.debug('Could not find kube-context in backend_config.'
                      ' Using current context or in-cluster config.')
    if 'repository-uri' not in backend_config.keys():
        raise ExecutionException(
            "Could not find 'repository-uri' in backend_config.")
    return kube_config
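# Illustrative shape of the backend_config accepted by
# _parse_kubernetes_config above (a sketch only; the context name, template
# path, and registry URI are hypothetical):
#
#   backend_config = {
#       'kube-context': 'my-cluster-context',
#       'kube-job-template-path': 'kubernetes_job_template.yaml',
#       'repository-uri': 'registry.example.com/my-project',
#   }
#   kube_config = _parse_kubernetes_config(backend_config)
#   # kube_config['kube-job-template'] now holds the parsed YAML job spec.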
def _fetch_git_repo(uri, version, dst_dir):
    """Clone the git repo at ``uri`` into ``dst_dir``, checking out commit
    ``version`` (or defaulting to the head commit of the repository's master
    branch if version is unspecified).

    Assumes authentication parameters are specified by the environment,
    e.g. by a Git credential helper.
    """
    # We defer importing git until the last moment, because the import
    # requires that the git executable is available on the PATH, so we only
    # want to fail if we actually need it.
    import git
    repo = git.Repo.init(dst_dir)
    origin = repo.create_remote('origin', uri)
    origin.fetch()
    if version is not None:
        try:
            repo.git.checkout(version)
        except git.exc.GitCommandError as e:
            raise ExecutionException(
                "Unable to checkout version '%s' of git repo %s "
                '- please ensure that the version exists in the repo. '
                'Error: %s' % (version, uri, e))
    else:
        repo.create_head('master', origin.refs.master)
        repo.heads.master.checkout()
    repo.submodule_update(init=True, recursive=True)
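# Example use of _fetch_git_repo above (illustrative only; the repository
# URL, commit hash, and destination path are hypothetical):
#
#   _fetch_git_repo('https://github.com/org/repo.git', 'abc1234',
#                   '/tmp/project-checkout')
#   # Clones the repo into /tmp/project-checkout and checks out commit
#   # abc1234; passing version=None checks out origin's master head instead.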
def _validate_parameters(self, user_parameters):
    missing_params = []
    for name in self.parameters:
        if (name not in user_parameters
                and self.parameters[name].default is None):
            missing_params.append(name)
    if missing_params:
        raise ExecutionException(
            'No value given for missing parameters: %s' %
            ', '.join(["'%s'" % name for name in missing_params]))
def _parse_subdirectory(uri):
    # Parses a uri and returns the uri and subdirectory as separate values.
    # Uses '#' as a delimiter.
    subdirectory = ''
    parsed_uri = uri
    if '#' in uri:
        subdirectory = uri[uri.find('#') + 1:]
        parsed_uri = uri[:uri.find('#')]
    if subdirectory and '.' in subdirectory:
        raise ExecutionException(
            "'.' is not allowed in project subdirectory paths.")
    return parsed_uri, subdirectory
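# Example behavior of _parse_subdirectory above (illustrative; the URIs are
# hypothetical):
#
#   _parse_subdirectory('https://github.com/org/repo#examples/sklearn')
#   # -> ('https://github.com/org/repo', 'examples/sklearn')
#
#   _parse_subdirectory('/local/project/path')
#   # -> ('/local/project/path', '')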
def _fetch_project(uri, force_tempdir, version=None):
    """Fetch a project into a local directory, returning the path to the
    local project directory.

    :param force_tempdir: If True, will fetch the project into a temporary
        directory. Otherwise, will fetch ZIP or Git projects into a temporary
        directory but simply return the path of local projects (i.e. perform
        a no-op for local projects).
    """
    parsed_uri, subdirectory = _parse_subdirectory(uri)
    use_temp_dst_dir = force_tempdir or _is_zip_uri(
        parsed_uri) or not _is_local_uri(parsed_uri)
    dst_dir = tempfile.mkdtemp() if use_temp_dst_dir else parsed_uri
    if use_temp_dst_dir:
        _logger.info('=== Fetching project from %s into %s ===', uri, dst_dir)
    if _is_zip_uri(parsed_uri):
        if _is_file_uri(parsed_uri):
            parsed_file_uri = urllib.parse.urlparse(
                urllib.parse.unquote(parsed_uri))
            parsed_uri = os.path.join(parsed_file_uri.netloc,
                                      parsed_file_uri.path)
        _unzip_repo(zip_file=(parsed_uri if _is_local_uri(parsed_uri) else
                              _fetch_zip_repo(parsed_uri)),
                    dst_dir=dst_dir)
    elif _is_local_uri(uri):
        if version is not None:
            raise ExecutionException(
                'Setting a version is only supported for Git project URIs')
        if use_temp_dst_dir:
            dir_util.copy_tree(src=parsed_uri, dst=dst_dir)
    else:
        assert _GIT_URI_REGEX.match(
            parsed_uri), 'Non-local URI %s should be a Git URI' % parsed_uri
        _fetch_git_repo(parsed_uri, version, dst_dir)
    res = os.path.abspath(os.path.join(dst_dir, subdirectory))
    if not os.path.exists(res):
        raise ExecutionException('Could not find subdirectory %s of %s' %
                                 (subdirectory, dst_dir))
    return res
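# Example use of _fetch_project above (illustrative; the paths and URIs are
# hypothetical):
#
#   # Local project with force_tempdir=False: returns the path as-is.
#   _fetch_project('/home/user/my_project', force_tempdir=False)
#
#   # Git project: cloned into a fresh temporary directory whose path is
#   # returned.
#   _fetch_project('https://github.com/org/repo.git', force_tempdir=False,
#                  version='abc1234')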
def _compute_path_value(self, user_param_value, storage_dir):
    local_path = get_local_path_or_none(user_param_value)
    if local_path:
        if not os.path.exists(local_path):
            raise ExecutionException(
                'Got value %s for parameter %s, but no such file or '
                'directory was found.' % (user_param_value, self.name))
        return os.path.abspath(local_path)
    basename = os.path.basename(user_param_value)
    dest_path = os.path.join(storage_dir, basename)
    if dest_path != user_param_value:
        data.download_uri(uri=user_param_value, output_path=dest_path)
    return os.path.abspath(dest_path)
def _fetch_zip_repo(uri):
    import requests
    from io import BytesIO

    # TODO (dbczumar): Replace HTTP resolution via ``requests.get`` with an
    # invocation of ``segmind_track.data.download_uri()`` when the API
    # supports the same set of available stores as the artifact repository
    # (Azure, FTP, etc). See the following issue:
    # https://github.com/mlflow/mlflow/issues/763.
    response = requests.get(uri)
    try:
        response.raise_for_status()
    except requests.HTTPError as error:
        raise ExecutionException('Unable to retrieve ZIP file. Reason: %s' %
                                 str(error))
    return BytesIO(response.content)
def get_entry_point(self, entry_point):
    if entry_point in self._entry_points:
        return self._entry_points[entry_point]
    _, file_extension = os.path.splitext(entry_point)
    ext_to_cmd = {'.py': 'python', '.sh': os.environ.get('SHELL', 'bash')}
    if file_extension in ext_to_cmd:
        command = '%s %s' % (ext_to_cmd[file_extension],
                             shlex_quote(entry_point))
        if not is_string_type(command):
            command = command.encode('utf-8')
        return EntryPoint(name=entry_point, parameters={}, command=command)
    raise ExecutionException(
        'Could not find {0} among entry points {1} or interpret {0} as a '
        'runnable script. Supported script file extensions: '
        '{2}'.format(entry_point, list(self._entry_points.keys()),
                     list(ext_to_cmd.keys())))
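# Example behavior of get_entry_point above (illustrative; 'main' and
# 'train.py' are hypothetical entry point names):
#
#   project.get_entry_point('main')      # returns the declared EntryPoint
#   project.get_entry_point('train.py')  # not declared, but ends in '.py',
#                                        # so falls back to an ad-hoc
#                                        # EntryPoint with command
#                                        # 'python train.py'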
def _wait_for(submitted_run_obj):
    """Wait on the passed-in submitted run, reporting its status to the
    tracking server."""
    run_id = submitted_run_obj.run_id
    active_run = None
    # Note: there's a small chance we fail to report the run's status to the
    # tracking server if we're interrupted before we reach the try block
    # below
    try:
        active_run = MlflowClient().get_run(
            run_id) if run_id is not None else None
        if submitted_run_obj.wait():
            _logger.info("=== Run (ID '%s') succeeded ===", run_id)
            _maybe_set_run_terminated(active_run, 'FINISHED')
        else:
            _maybe_set_run_terminated(active_run, 'FAILED')
            raise ExecutionException("Run (ID '%s') failed" % run_id)
    except KeyboardInterrupt:
        _logger.error("=== Run (ID '%s') interrupted, cancelling run ===",
                      run_id)
        submitted_run_obj.cancel()
        _maybe_set_run_terminated(active_run, 'FAILED')
        raise
def load_project(directory):
    mlproject_path = _find_mlproject(directory)

    # TODO: Validate structure of YAML loaded from the file
    yaml_obj = {}
    if mlproject_path is not None:
        with open(mlproject_path) as mlproject_file:
            yaml_obj = yaml.safe_load(mlproject_file)

    project_name = yaml_obj.get('name')

    # Validate config if docker_env parameter is present
    docker_env = yaml_obj.get('docker_env')
    if docker_env:
        if not docker_env.get('image'):
            raise ExecutionException(
                'Project configuration (MLproject file) was invalid: Docker '
                'environment specified but no image attribute found.')
        if docker_env.get('volumes'):
            if not (isinstance(docker_env['volumes'], list) and all(
                    [isinstance(i, str) for i in docker_env['volumes']])):  # noqa: E125, E501
                raise ExecutionException(
                    'Project configuration (MLproject file) was invalid: '
                    'Docker volumes must be a list of strings, '
                    """e.g.: '["/path1/:/path1", "/path2/:/path2"])""")
        if docker_env.get('environment'):
            if not (isinstance(docker_env['environment'], list) and all([
                    isinstance(i, list) or isinstance(i, str)
                    for i in docker_env['environment']
            ])):
                raise ExecutionException(
                    'Project configuration (MLproject file) was invalid: '
                    'environment must be a list containing either strings '
                    '(to copy env variables from host system) or lists'
                    ' of string pairs (to define new environment variables).')

    # Validate config if conda_env parameter is present
    conda_path = yaml_obj.get('conda_env')
    if conda_path and docker_env:
        raise ExecutionException('Project cannot contain both a docker and '
                                 'conda environment.')

    # Parse entry points
    entry_points = {}
    for name, entry_point_yaml in yaml_obj.get('entry_points', {}).items():
        parameters = entry_point_yaml.get('parameters', {})
        command = entry_point_yaml.get('command')
        entry_points[name] = EntryPoint(name, parameters, command)

    if conda_path:
        conda_env_path = os.path.join(directory, conda_path)
        if not os.path.exists(conda_env_path):
            raise ExecutionException(
                'Project specified conda environment file %s, but no such '
                'file was found.' % conda_env_path)
        return Project(
            conda_env_path=conda_env_path,
            entry_points=entry_points,
            docker_env=docker_env,
            name=project_name,
        )

    default_conda_path = os.path.join(directory, DEFAULT_CONDA_FILE_NAME)
    if os.path.exists(default_conda_path):
        return Project(conda_env_path=default_conda_path,
                       entry_points=entry_points,
                       docker_env=docker_env,
                       name=project_name)

    return Project(conda_env_path=None,
                   entry_points=entry_points,
                   docker_env=docker_env,
                   name=project_name)
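# Illustrative shape of the parsed MLproject YAML that load_project above
# validates (a sketch only; the project name, env file, and command are
# hypothetical):
#
#   yaml_obj = {
#       'name': 'my_project',
#       'conda_env': 'conda.yaml',
#       'entry_points': {
#           'main': {
#               'parameters': {'alpha': {'type': 'float', 'default': 0.5}},
#               'command': 'python train.py --alpha {alpha}',
#           },
#       },
#   }
#
# A project may specify either 'conda_env' or 'docker_env', but not both.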
def _compute_uri_value(self, user_param_value):
    if not data.is_uri(user_param_value):
        raise ExecutionException('Expected URI for parameter %s but got '
                                 '%s' % (self.name, user_param_value))
    return user_param_value
def _run(uri,
         experiment_id,
         entry_point='main',
         version=None,
         parameters=None,
         backend=None,
         backend_config=None,
         use_conda=True,
         storage_dir=None,
         synchronous=True,
         run_id=None):
    """Helper that delegates to the project-running method corresponding to
    the passed-in backend. Returns a ``SubmittedRun`` corresponding to the
    project run.
    """
    parameters = parameters or {}
    work_dir = _fetch_project(uri=uri, force_tempdir=False, version=version)
    project = _project_spec.load_project(work_dir)
    _validate_execution_environment(project, backend)  # noqa
    project.get_entry_point(entry_point)._validate_parameters(parameters)
    if run_id:
        active_run = MlflowClient().get_run(run_id)
    else:
        active_run = _create_run(uri, experiment_id, work_dir, entry_point)

    # Consolidate parameters for logging.
    # `storage_dir` is `None` since we want to log the actual path, not the
    # downloaded local path
    entry_point_obj = project.get_entry_point(entry_point)
    final_params, extra_params = entry_point_obj.compute_parameters(
        parameters, storage_dir=None)
    for key, value in (list(final_params.items()) +
                       list(extra_params.items())):
        MlflowClient().log_param(active_run.info.run_id, key, value)

    repo_url = _get_git_repo_url(work_dir)
    if repo_url is not None:
        for tag in [MLFLOW_GIT_REPO_URL, LEGACY_MLFLOW_GIT_REPO_URL]:
            MlflowClient().set_tag(active_run.info.run_id, tag, repo_url)

    # Add branch name tag if a branch is specified through --version
    if _is_valid_branch_name(work_dir, version):
        for tag in [MLFLOW_GIT_BRANCH, LEGACY_MLFLOW_GIT_BRANCH_NAME]:
            MlflowClient().set_tag(active_run.info.run_id, tag, version)

    if backend == 'local' or backend is None:
        MlflowClient().set_tag(active_run.info.run_id,
                               MLFLOW_PROJECT_BACKEND, 'local')
        command_args = []
        command_separator = ' '
        # If a docker_env attribute is defined in MLproject then it takes
        # precedence over conda yaml environments, so the project will be
        # executed inside a docker container.
        if project.docker_env:
            pass
            # MlflowClient().set_tag(active_run.info.run_id,
            #                        MLFLOW_PROJECT_ENV, 'docker')
            # _validate_docker_env(project)
            # _validate_docker_installation()
            # image = _build_docker_image(
            #     work_dir=work_dir,
            #     repository_uri=project.name,
            #     base_image=project.docker_env.get('image'),
            #     run_id=active_run.info.run_id)
            # command_args += _get_docker_command(
            #     image=image,
            #     active_run=active_run,
            #     volumes=project.docker_env.get('volumes'),
            #     user_env_vars=project.docker_env.get('environment'))
        # Synchronously create a conda environment (even though this may take
        # some time) to avoid failures due to multiple concurrent attempts to
        # create the same conda env.
        # elif use_conda:
        #     MlflowClient().set_tag(active_run.info.run_id,
        #                            MLFLOW_PROJECT_ENV, 'conda')
        #     command_separator = ' && '
        #     conda_env_name = _get_or_create_conda_env(
        #         project.conda_env_path)
        #     command_args += _get_conda_command(conda_env_name)

        # In synchronous mode, run the entry point command in a blocking
        # fashion, sending status updates to the tracking server when
        # finished. Note that the run state may not be persisted to the
        # tracking server if interrupted
        if synchronous:
            command_args += _get_entry_point_command(project, entry_point,
                                                     parameters, storage_dir)
            command_str = command_separator.join(command_args)
            return _run_entry_point(command_str,
                                    work_dir,
                                    experiment_id,
                                    run_id=active_run.info.run_id)
        # Otherwise, invoke `mlflow run` in a subprocess
        return _invoke_mlflow_run_subprocess(work_dir=work_dir,
                                             entry_point=entry_point,
                                             parameters=parameters,
                                             experiment_id=experiment_id,
                                             use_conda=use_conda,
                                             storage_dir=storage_dir,
                                             run_id=active_run.info.run_id)
    # elif backend == 'kubernetes':
    #     from segmind.projects import kubernetes as kb
    #     MlflowClient().set_tag(active_run.info.run_id,
    #                            MLFLOW_PROJECT_ENV, 'docker')
    #     MlflowClient().set_tag(active_run.info.run_id,
    #                            MLFLOW_PROJECT_BACKEND, 'kubernetes')
    #     _validate_docker_env(project)
    #     _validate_docker_installation()
    #     kube_config = _parse_kubernetes_config(backend_config)
    #     image = _build_docker_image(
    #         work_dir=work_dir,
    #         repository_uri=kube_config['repository-uri'],
    #         base_image=project.docker_env.get('image'),
    #         run_id=active_run.info.run_id)
    #     image_digest = kb.push_image_to_registry(image.tags[0])
    #     submitted_run = kb.run_kubernetes_job(
    #         project.name, active_run, image.tags[0], image_digest,
    #         _get_entry_point_command(project, entry_point, parameters,
    #                                  storage_dir),
    #         _get_run_env_vars(
    #             run_id=active_run.info.run_uuid,
    #             experiment_id=active_run.info.experiment_id),
    #         kube_config.get('kube-context', None),
    #         kube_config['kube-job-template'])
    #     return submitted_run

    supported_backends = ['local', 'kubernetes']
    raise ExecutionException('Got unsupported execution mode %s. Supported '
                             'values: %s' % (backend, supported_backends))
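# Example invocation of _run above (illustrative only; the URI, experiment
# id, and parameter values are hypothetical):
#
#   submitted_run = _run(uri='https://github.com/org/repo#examples/sklearn',
#                        experiment_id='0',
#                        entry_point='main',
#                        parameters={'alpha': 0.5},
#                        backend='local',
#                        synchronous=True)
#   # Blocks until the local run finishes and returns a SubmittedRun.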