def _show_dependencies_tree(self, target, commands, outs):
    from treelib import Tree

    nodes, edges, is_tree = self._build_graph(target, commands, outs)
    if not nodes:
        return
    if not is_tree:
        raise DvcException(
            "DAG is not a tree, can not print it in tree-structure way, "
            "please use --ascii instead")

    tree = Tree()
    tree.create_node(target, target)  # Root node
    observe_list = [target]
    while len(observe_list) > 0:
        current_root = observe_list[0]
        for edge in edges:
            if edge[0] == current_root:
                tree.create_node(edge[1], edge[1], parent=current_root)
                observe_list.append(edge[1])
        observe_list.pop(0)
    tree.show()

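# A minimal, standalone sketch of the breadth-first tree-building loop above,
# assuming a hypothetical edge list of (parent, child) pairs instead of DVC's
# _build_graph() output. It uses the same treelib calls (create_node/show) and,
# like the original, only works when the DAG is actually a tree, since treelib
# requires unique node identifiers.
from treelib import Tree

edges = [  # hypothetical stage dependencies
    ("train.dvc", "featurize.dvc"),
    ("featurize.dvc", "prepare.dvc"),
    ("prepare.dvc", "data.dvc"),
]
root = "train.dvc"

tree = Tree()
tree.create_node(root, root)
queue = [root]
while queue:
    current = queue.pop(0)
    for parent, child in edges:
        if parent == current:
            tree.create_node(child, child, parent=current)
            queue.append(child)
tree.show()  # prints the dependency tree rooted at train.dvc
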
def hadoop_fs(self, cmd, user=None):
    cmd = 'hadoop fs -' + cmd
    if user:
        cmd = 'HADOOP_USER_NAME={} '.format(user) + cmd

    # NOTE: close_fds doesn't work with redirected stdin/stdout/stderr.
    # See https://github.com/iterative/dvc/issues/1197.
    close_fds = (os.name != 'nt')

    executable = os.getenv('SHELL') if os.name != 'nt' else None
    p = Popen(cmd,
              shell=True,
              close_fds=close_fds,
              executable=executable,
              env=fix_env(os.environ),
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE)
    out, err = p.communicate()
    if p.returncode != 0:
        raise DvcException('HDFS command failed: {}: {}'.format(cmd, err))
    return out.decode('utf-8')

def md5(self, path):
    """
    Use different md5 commands depending on the OS:

     - Darwin's `md5` returns BSD-style checksums by default
     - Linux's `md5sum` needs the `--tag` flag for a similar output

    Example:
        MD5 (foo.txt) = f3d220a856b52aabbf294351e8a24300
    """
    path = shlex.quote(path)
    if self.uname == "Linux":
        md5 = self.execute("md5sum " + path).split()[0]
    elif self.uname == "Darwin":
        md5 = self.execute("md5 " + path).split()[-1]
    else:
        raise DvcException(
            f"'{self.uname}' is not supported as a SSH remote"
        )

    assert len(md5) == 32
    return md5

def _git_object_by_path(self, path):
    import git

    path = relpath(os.path.realpath(path), self.git.working_dir)
    if path.split(os.sep, 1)[0] == "..":
        # path points outside of git repository
        return None

    try:
        tree = self.git.tree(self.rev)
    except git.exc.BadName as exc:  # pylint: disable=no-member
        raise DvcException(
            "revision '{}' not found in Git '{}'".format(
                self.rev, os.path.relpath(self.git.working_dir)
            )
        ) from exc

    if not path or path == ".":
        return tree

    for i in path.split(os.sep):
        if not self._is_tree_and_contains(tree, i):
            # there is no tree for specified path
            return None
        tree = tree[i]

    return tree

def resolve_paths(repo, out):
    from urllib.parse import urlparse

    from ..dvcfile import DVC_FILE_SUFFIX
    from ..exceptions import DvcException
    from ..path_info import PathInfo
    from ..system import System
    from .fs import contains_symlink_up_to

    abspath = PathInfo(os.path.abspath(out))
    dirname = os.path.dirname(abspath)
    base = os.path.basename(os.path.normpath(out))

    scheme = urlparse(out).scheme
    if os.name == "nt" and scheme == abspath.drive[0].lower():
        # urlparse interprets windows drive letters as URL scheme
        scheme = ""

    if scheme or not abspath.isin_or_eq(repo.root_dir):
        wdir = os.getcwd()
    elif contains_symlink_up_to(dirname, repo.root_dir) or (
        os.path.isdir(abspath) and System.is_symlink(abspath)
    ):
        msg = (
            "Cannot add files inside symlinked directories to DVC. "
            "See {} for more information."
        ).format(
            format_link(
                "https://dvc.org/doc/user-guide/troubleshooting#add-symlink"
            )
        )
        raise DvcException(msg)
    else:
        wdir = dirname
        out = base

    path = os.path.join(wdir, base + DVC_FILE_SUFFIX)

    return (path, wdir, out)

def __init__(self, repo, config):
    super().__init__(repo, config)

    self.path_info = self.PATH_CLS(config["url"])

    if not self.path_info.bucket:
        raise DvcException(
            "Empty GDrive URL '{}'. Learn more at {}".format(
                config["url"],
                format_link("https://man.dvc.org/remote/add"),
            )
        )

    self._bucket = self.path_info.bucket
    self._path = self.path_info.path
    self._trash_only = config.get("gdrive_trash_only")
    self._use_service_account = config.get("gdrive_use_service_account")
    self._service_account_email = config.get(
        "gdrive_service_account_email"
    )
    self._service_account_user_email = config.get(
        "gdrive_service_account_user_email"
    )
    self._service_account_p12_file_path = config.get(
        "gdrive_service_account_p12_file_path"
    )
    self._client_id = config.get("gdrive_client_id")
    self._client_secret = config.get("gdrive_client_secret")
    self._validate_config()

    self._gdrive_user_credentials_path = (
        tmp_fname(os.path.join(self.repo.tmp_dir, ""))
        if os.getenv(GDriveTree.GDRIVE_CREDENTIALS_DATA)
        else config.get(
            "gdrive_user_credentials_file",
            os.path.join(
                self.repo.tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE,
            ),
        )
    )

def _filter_fields(data_points, filename, revision, fields=None, **kwargs):
    if not fields:
        return data_points
    assert isinstance(fields, set)

    new_data = []
    for data_point in data_points:
        new_dp = copy(data_point)

        keys = set(data_point.keys())
        if keys & fields != fields:
            raise DvcException(
                "Could not find fields: '{}' for '{}' at '{}'.".format(
                    ", ".join(fields), filename, revision
                )
            )

        to_del = keys - fields
        for key in to_del:
            del new_dp[key]
        new_data.append(new_dp)
    return new_data

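# A self-contained sketch of the same field-filtering behaviour on plain
# dicts, outside of DVC: keep only the requested keys and fail loudly when a
# requested field is missing. The names (rows, wanted) and the ValueError are
# illustrative only.
from copy import copy

rows = [
    {"step": 1, "loss": 0.9, "acc": 0.4},
    {"step": 2, "loss": 0.5, "acc": 0.7},
]
wanted = {"step", "loss"}

filtered = []
for row in rows:
    keys = set(row)
    if keys & wanted != wanted:
        raise ValueError("missing fields: {}".format(", ".join(wanted - keys)))
    new_row = copy(row)
    for key in keys - wanted:
        del new_row[key]
    filtered.append(new_row)

print(filtered)  # [{'step': 1, 'loss': 0.9}, {'step': 2, 'loss': 0.5}]
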
def _collect_dir_cache(self, out, branch=None, remote=None, force=False,
                       jobs=None):
    info = out.dumpd()
    ret = [info]
    r = out.remote
    md5 = info[r.PARAM_CHECKSUM]

    if self.cache.local.changed_cache_file(md5):
        try:
            self.cloud.pull(ret,
                            jobs=jobs,
                            remote=remote,
                            show_checksums=False)
        except DvcException as exc:
            msg = "Failed to pull cache for '{}': {}"
            logger.debug(msg.format(out, exc))

    if self.cache.local.changed_cache_file(md5):
        msg = ("Missing cache for directory '{}'. "
               "Cache for files inside will be lost. "
               "Would you like to continue? Use '-f' to force. ")
        if not force and not prompt.confirm(msg.format(out)):
            raise DvcException(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(out))
        else:
            return ret

    for i in self.cache.local.load_dir_cache(md5):
        i["branch"] = branch
        i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH],
                                       i[r.PARAM_RELPATH])
        ret.append(i)

    return ret

def reproduce(self, force=False, dry=False, interactive=False):
    if not self.changed(print_info=True) and not force:
        return None

    if (self.cmd or self.is_import) and not self.locked and not dry:
        # Removing outputs only if we actually have command to reproduce
        self.remove_outs(ignore_remove=False)

    msg = "Going to reproduce '{}'. Are you sure you want to continue?"
    msg = msg.format(self.relpath)
    if interactive and not self.project.prompt.prompt(msg):
        raise DvcException('Reproduction aborted by the user')

    self.project.logger.info(u'Reproducing \'{}\''.format(self.relpath))

    self.run(dry=dry)

    msg = u'\'{}\' was reproduced'.format(self.relpath)
    self.project.logger.debug(msg)

    return self

def __init__(
    self,
    stage,
    path,
    info=None,
    remote=None,
    cache=True,
    metric=False,
    persist=False,
    tags=None,
):
    # Output (and dependency) objects have too many paths/urls.
    # Here is a list with comments:
    #
    #   .def_path  - path from definition in stage file
    #   .path_info - PathInfo/URLInfo structured resolved path
    #   .fspath    - local only, resolved
    #   .__str__   - for presentation purposes, def_path/relpath
    #
    # A resolved path contains the actual location; it should be
    # absolute and must not contain remote:// refs.
    self.stage = stage
    self.repo = stage.repo
    self.def_path = path
    self.info = info
    self.remote = remote or self.REMOTE(self.repo, {})
    self.use_cache = False if self.IS_DEPENDENCY else cache
    self.metric = False if self.IS_DEPENDENCY else metric
    self.persist = persist
    self.tags = None if self.IS_DEPENDENCY else (tags or {})

    if self.use_cache and self.cache is None:
        raise DvcException(
            "no cache location setup for '{}' outputs.".format(
                self.REMOTE.scheme
            )
        )

    self.path_info = self._parse_path(remote, path)

def test_nested_exceptions(self, caplog):
    with caplog.at_level(logging.DEBUG, logger="dvc"):
        try:
            raise Exception("first")
        except Exception as exc:
            try:
                raise DvcException("second") from exc
            except DvcException:
                stack_trace = traceback.format_exc()
                logger.exception("message")

        expected = (
            "{red}ERROR{nc}: message - second: first\n"
            "{red}{line}{nc}\n"
            "{stack_trace}"
            "{red}{line}{nc}\n".format(
                line="-" * 60, stack_trace=stack_trace, **colors
            )
        )
        assert expected == formatter.format(caplog.records[0])

        assert "Exception: first" in stack_trace
        assert "dvc.exceptions.DvcException: second" in stack_trace

def __init__(
    self,
    selected: Optional[Iterable[str]],
    tmp_dir: StrPath,
    **kwargs,
) -> None:
    selected = selected or list(self.DEFAULT)
    self.backends: Dict[str, "BackendCls"] = {}
    for key in selected:
        cls = self.DEFAULT.get(key)
        if cls is None:
            raise DvcException(
                f"'dvc machine' backend '{key}' is missing required "
                "dependencies. Install them with:\n"
                f"\tpip install dvc[{key}]"
            )
        self.backends[key] = cls

    self.initialized: Dict[str, "BaseMachineBackend"] = {}

    self.tmp_dir = tmp_dir
    self.kwargs = kwargs

def reproduce(self, force=False, dry=False, interactive=False):
    if not self.changed() and not force:
        return None

    if (self.cmd or self.is_import) and not self.locked and not dry:
        # Removing outputs only if we actually have command to reproduce
        self.remove_outs(ignore_remove=False)

    msg = (
        "Going to reproduce '{stage}'. "
        "Are you sure you want to continue?".format(stage=self.relpath)
    )

    if interactive and not prompt.confirm(msg):
        raise DvcException("reproduction aborted by the user")

    logger.info("Reproducing '{stage}'".format(stage=self.relpath))

    self.run(dry=dry)

    logger.debug("'{stage}' was reproduced".format(stage=self.relpath))

    return self

def loads(project=None, cmd=None, deps=[], outs=[], outs_no_cache=[],
          metrics_no_cache=[], fname=None, cwd=os.curdir, locked=False,
          add=False, overwrite=True):
    stage = Stage(project=project,
                  cwd=cwd,
                  cmd=cmd,
                  locked=locked)

    stage.outs = output.loads_from(stage, outs, use_cache=True)
    stage.outs += output.loads_from(stage, outs_no_cache, use_cache=False)
    stage.outs += output.loads_from(stage, metrics_no_cache,
                                    use_cache=False, metric=True)
    stage.deps = dependency.loads_from(stage, deps)

    fname, cwd = Stage._stage_fname_cwd(fname, cwd, stage.outs, add=add)

    cwd = os.path.abspath(cwd)
    path = os.path.join(cwd, fname)

    if os.path.exists(path):
        relpath = os.path.relpath(path)
        msg = "'{}' already exists. " \
              "Do you wish to run the command and overwrite it?"
        if not overwrite \
           and not project.prompt.prompt(msg.format(relpath), False):
            raise DvcException("'{}' already exists".format(relpath))

    stage.cwd = cwd
    stage.path = path

    return stage

def __init__(self, **config):
    from fsspec.utils import infer_storage_options

    super().__init__(**config)

    self.url = config["url"]
    opts = infer_storage_options(self.url)

    if not opts["host"]:
        raise DvcException(
            "Empty GDrive URL '{}'. Learn more at {}".format(
                config["url"],
                format_link("https://man.dvc.org/remote/add"),
            )
        )

    self._bucket = opts["host"]
    self._path = opts["path"].lstrip("/")
    self._trash_only = config.get("gdrive_trash_only")
    self._use_service_account = config.get("gdrive_use_service_account")
    self._service_account_user_email = config.get(
        "gdrive_service_account_user_email"
    )
    self._service_account_json_file_path = config.get(
        "gdrive_service_account_json_file_path"
    )
    self._client_id = config.get("gdrive_client_id")
    self._client_secret = config.get("gdrive_client_secret")
    self._validate_config()

    tmp_dir = config["gdrive_credentials_tmp_dir"]
    assert tmp_dir

    self._gdrive_service_credentials_path = tmp_fname(
        os.path.join(tmp_dir, "")
    )
    self._gdrive_user_credentials_path = (
        tmp_fname(os.path.join(tmp_dir, ""))
        if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
        else config.get(
            "gdrive_user_credentials_file",
            os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
        )
    )

def reproduce(self, interactive=False, **kwargs):
    if not (kwargs.get("force", False) or self.changed()):
        if not isinstance(self, PipelineStage) and self.is_data_source:
            logger.info("'%s' didn't change, skipping", self.addressing)
        else:
            logger.info(
                "Stage '%s' didn't change, skipping", self.addressing
            )
        return None

    msg = (
        "Going to reproduce {stage}. "
        "Are you sure you want to continue?".format(stage=self)
    )

    if interactive and not prompt.confirm(msg):
        raise DvcException("reproduction aborted by the user")

    self.run(**kwargs)

    logger.debug(f"{self} was reproduced")

    return self

def test_nested_exceptions(self, caplog):
    with caplog.at_level(logging.DEBUG, logger="dvc"):
        try:
            raise Exception("first")
        except Exception as exc:
            first_traceback = traceback.format_exc()
            try:
                raise DvcException("second", cause=exc)
            except DvcException:
                second_traceback = traceback.format_exc()
                logger.exception("message")

        expected = (
            "{red}ERROR{nc}: message - second: first\n"
            "{red}{line}{nc}\n"
            "{stack_trace}"
            "{red}{line}{nc}\n".format(
                line="-" * 60,
                stack_trace="\n".join([first_traceback, second_traceback]),
                **colors
            )
        )
        assert expected == formatter.format(caplog.records[0])

def checkout_exp(self, rev, force=False):
    """Checkout an experiment to the user's workspace."""
    from git.exc import GitCommandError

    from dvc.repo.checkout import _checkout as dvc_checkout

    if force:
        self.repo.scm.repo.git.reset(hard=True)

    self._scm_checkout(rev)

    tmp = tempfile.NamedTemporaryFile(delete=False).name
    self.scm.repo.head.commit.diff("HEAD~1", patch=True, output=tmp)

    try:
        if os.path.getsize(tmp):
            logger.debug("Patching local workspace")
            self.repo.scm.repo.git.apply(tmp, reverse=True)
        dvc_checkout(self.repo)
    except GitCommandError:
        raise DvcException(
            "Checkout failed, experiment contains changes which "
            "conflict with your current workspace. To overwrite "
            "your workspace, use `dvc experiments checkout --force`.")
    finally:
        remove(tmp)

def link(self, cache, path):
    assert os.path.isfile(cache)

    dname = os.path.dirname(path)
    if not os.path.exists(dname):
        os.makedirs(dname)

    i = len(self.cache_types)
    while i > 0:
        try:
            self.CACHE_TYPE_MAP[self.cache_types[0]](cache, path)

            msg = "Created '{}': {} -> {}"
            Logger.info(msg.format(self.cache_types[0],
                                   os.path.relpath(cache),
                                   os.path.relpath(path)))
            return
        except DvcException as exc:
            msg = 'Cache type \'{}\' is not supported: {}'
            Logger.debug(msg.format(self.cache_types[0], str(exc)))
            del self.cache_types[0]
            i -= 1

    raise DvcException('No possible cache types left to try out.')

def git_object_by_path(self, path):
    import git

    path = relpath(os.path.realpath(path), self.git.working_dir)
    assert path.split(os.sep, 1)[0] != ".."

    try:
        tree = self.git.tree(self.rev)
    except git.exc.BadName as exc:
        raise DvcException(
            "revision '{}' not found in git '{}'".format(
                self.rev, os.path.relpath(self.git.working_dir)
            ),
            cause=exc,
        )

    if not path or path == ".":
        return tree

    for i in path.split(os.sep):
        if not self._is_tree_and_contains(tree, i):
            # there is no tree for specified path
            return None
        tree = tree[i]

    return tree

def _request(self, method, url, **kwargs):
    import requests

    kwargs.setdefault("allow_redirects", True)
    kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)

    try:
        res = self._session.request(method, url, **kwargs)

        redirect_no_location = (
            kwargs["allow_redirects"]
            and res.status_code in (301, 302)
            and "location" not in res.headers
        )

        if redirect_no_location:
            # AWS s3 doesn't like to add a location header to its redirects
            # from https://s3.amazonaws.com/<bucket name>/* type URLs.
            # This should be treated as an error
            raise requests.exceptions.RequestException

        return res
    except requests.exceptions.RequestException:
        raise DvcException("could not perform a {} request".format(method))

def walk(self, directory, topdown=True):
    # NOTE: original os.walk() implementation [1] with default options was
    # used as a template.
    #
    # [1] https://github.com/python/cpython/blob/master/Lib/os.py
    self._sftp_connect()

    try:
        dir_entries = self._sftp.listdir_attr(directory)
    except IOError as exc:
        raise DvcException(
            "couldn't get the '{}' remote directory files list".format(
                directory
            ),
            cause=exc,
        )

    dirs = []
    nondirs = []
    for entry in dir_entries:
        name = entry.filename
        if stat.S_ISDIR(entry.st_mode):
            dirs.append(name)
        else:
            nondirs.append(name)

    if topdown:
        yield directory, dirs, nondirs

    for dname in dirs:
        newpath = posixpath.join(directory, dname)
        for entry in self.walk(newpath, topdown=topdown):
            yield entry

    if not topdown:
        yield directory, dirs, nondirs

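# A standalone sketch of the same os.walk()-style traversal, assuming an
# in-memory dict tree instead of an SFTP listing (dict values are
# subdirectories, None marks a file). It shows how the topdown flag decides
# whether a directory is yielded before or after its children; the FS layout
# below is purely illustrative.
import posixpath

FS = {"data": {"raw": {"a.csv": None}, "model.pkl": None}, "readme.md": None}

def walk(tree, directory="/", topdown=True):
    dirs = [name for name, value in tree.items() if isinstance(value, dict)]
    nondirs = [name for name, value in tree.items() if value is None]
    if topdown:
        yield directory, dirs, nondirs
    for dname in dirs:
        yield from walk(tree[dname], posixpath.join(directory, dname), topdown)
    if not topdown:
        yield directory, dirs, nondirs

for root, dirs, files in walk(FS, topdown=False):
    print(root, dirs, files)  # children are printed before their parent
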
def checkout_exp(self, rev):
    """Checkout an experiment to the user's workspace."""
    from git.exc import GitCommandError

    from dvc.repo.checkout import checkout as dvc_checkout

    baseline_rev = self._check_baseline(rev)
    self._scm_checkout(rev)

    tmp = tempfile.NamedTemporaryFile(delete=False).name
    self.scm.repo.head.commit.diff(
        baseline_rev, patch=True, full_index=True, binary=True, output=tmp
    )

    dirty = self.repo.scm.is_dirty()
    if dirty:
        logger.debug("Stashing workspace changes.")
        self.repo.scm.repo.git.stash("push", "--include-untracked")

    try:
        if os.path.getsize(tmp):
            logger.debug("Patching local workspace")
            self.repo.scm.repo.git.apply(tmp, reverse=True)
            need_checkout = True
        else:
            need_checkout = False
    except GitCommandError:
        raise DvcException("failed to apply experiment changes.")
    finally:
        remove(tmp)
        if dirty:
            self._unstash_workspace()

    if need_checkout:
        dvc_checkout(self.repo)

def link(self, cache, path):
    assert os.path.isfile(cache)

    dname = os.path.dirname(path)
    if not os.path.exists(dname):
        os.makedirs(dname)

    # NOTE: just create an empty file for an empty cache
    if os.path.getsize(cache) == 0:
        open(path, 'w+').close()

        msg = "Created empty file: {} -> {}".format(cache, path)
        logger.debug(msg)
        return

    i = len(self.cache_types)
    while i > 0:
        try:
            self.CACHE_TYPE_MAP[self.cache_types[0]](cache, path)

            if self.protected:
                os.chmod(path, stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)

            msg = "Created {}'{}': {} -> {}".format(
                'protected ' if self.protected else '',
                self.cache_types[0],
                cache,
                path)
            logger.debug(msg)
            return

        except DvcException as exc:
            msg = "Cache type '{}' is not supported: {}"
            logger.debug(msg.format(self.cache_types[0], str(exc)))
            del self.cache_types[0]
            i -= 1

    raise DvcException('no possible cache types left to try out.')

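# A simplified, standalone sketch of the fallback pattern used by link()
# above: try each link strategy in order of preference and drop the ones that
# fail. It uses plain os/shutil calls instead of DVC's CACHE_TYPE_MAP; the
# strategy names, ordering, and RuntimeError here are illustrative only.
import os
import shutil

def link_with_fallback(cache, path, strategies=None):
    strategies = list(strategies or ["hardlink", "symlink", "copy"])
    handlers = {
        "hardlink": os.link,
        "symlink": os.symlink,
        "copy": shutil.copyfile,
    }
    while strategies:
        name = strategies[0]
        try:
            handlers[name](cache, path)
            return name  # report which strategy succeeded
        except OSError:
            # e.g. hardlinks/symlinks across filesystems or without privileges
            del strategies[0]
    raise RuntimeError("no possible cache types left to try out")
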
def checkout(self, target=None, with_deps=False, force=False,
             recursive=False):
    from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

    if target and not recursive:
        all_stages = self.active_stages()
        try:
            stages = self.collect(target, with_deps=with_deps)
        except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
            raise DvcException(
                str(exc) + " Did you mean 'git checkout {}'?".format(target)
            )
    else:
        all_stages = self.active_stages(target)
        stages = all_stages

    with self.state:
        _cleanup_unused_links(self, all_stages)

        for stage in stages:
            if stage.locked:
                logger.warning(
                    "DVC file '{path}' is locked. Its dependencies are"
                    " not going to be checked out.".format(path=stage.relpath)
                )

            stage.checkout(force=force)

def md5(self, path):
    """
    Use different md5 commands depending on the OS:

     - Darwin's `md5` returns BSD-style checksums by default
     - Linux's `md5sum` needs the `--tag` flag for a similar output

    Example:
        MD5 (foo.txt) = f3d220a856b52aabbf294351e8a24300
    """
    uname = self.execute("uname").strip()

    command = {
        "Darwin": "md5 {}".format(path),
        "Linux": "md5sum --tag {}".format(path),
    }.get(uname)

    if not command:
        raise DvcException(
            "'{uname}' is not supported as a remote".format(uname=uname))

    md5 = self.execute(command).split()[-1]
    assert len(md5) == 32
    return md5

def reflink(source, link_name):
    from dvc.utils.fs import umask

    source, link_name = os.fspath(source), os.fspath(link_name)

    system = platform.system()
    try:
        if system == "Windows":
            ret = System._reflink_windows(source, link_name)
        elif system == "Darwin":
            ret = System._reflink_darwin(source, link_name)
        elif system == "Linux":
            ret = System._reflink_linux(source, link_name)
        else:
            ret = -1
    except OSError:
        ret = -1

    if ret != 0:
        raise DvcException("reflink is not supported")

    # NOTE: reflink has a new inode, but has the same mode as the src,
    # so we need to chmod it to look like a normal copy.
    os.chmod(link_name, 0o666 & ~umask)

def parse_target(target, default=None):
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK
    from dvc.exceptions import DvcException

    if not target:
        return None, None

    match = TARGET_REGEX.match(target)
    if not match:
        return target, None

    path, name = (
        match.group("path"),
        match.group("name"),
    )
    if not path:
        path = default or PIPELINE_FILE
        logger.debug("Assuming file to be '%s'", path)

    if os.path.basename(path) == PIPELINE_LOCK:
        raise DvcException(
            "Did you mean: `{}`?".format(
                target.replace(".lock", ".yaml", 1)
            )
        )

    return path, name

def get_file_hash(self, path_info):
    import dropbox

    path = path_info_to_dropbox_path(path_info)
    logger.debug("Getting hash of {0}".format(path))
    try:
        return HashInfo(
            self.PARAM_CHECKSUM,
            self.client.files_get_metadata(path).content_hash,
        )
    except dropbox.exceptions.ApiError as ex:
        if ex.error.is_path() and ex.error.get_path().is_not_found():
            raise DvcException(
                "Path not found for '{}':\n\n"
                "1. Confirm the file exists and you can access it.\n"
                "2. Make sure that credentials in '{}'\n"
                " are correct for this remote e.g. "
                "use the `dropbox_user_credentials_file` config\n"
                " option if you use multiple Dropbox remotes with "
                "different email accounts.\n\nDetails".format(
                    path, FileCredProvider.DEFAULT_FILE
                )
            ) from ex
        raise

def makedirs(self, path):
    # Single stat call will say whether this is a dir, a file or a link
    st_mode = self.st_mode(path)

    if stat.S_ISDIR(st_mode):
        return

    if stat.S_ISREG(st_mode) or stat.S_ISLNK(st_mode):
        raise DvcException(
            "a file with the same name '{}' already exists".format(path)
        )

    head, tail = posixpath.split(path)

    if head:
        self.makedirs(head)

    if tail:
        try:
            self.sftp.mkdir(path)
        except IOError as e:
            # Since paramiko errors are very vague we need to recheck
            # whether it's because path already exists or something else
            if e.errno == errno.EACCES or not self.exists(path):
                raise