class RepoTree(BaseTree):  # pylint:disable=abstract-method
    """DVC + git-tracked files tree.

    Every path is answered by a pair of trees: the working/git tree of the
    repo that owns the path and, when that repo is a DVC repo, a `DvcTree`
    for its outputs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcTree()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(
        self,
        repo,
        subrepos=False,
        repo_factory: Optional[Callable[[str], "Repo"]] = None,
        **kwargs
    ):
        super().__init__(repo, {"url": repo.root_dir})

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory = Repo
        else:
            self.repo_factory = repo_factory

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = StringTrie(separator=os.sep)
        self._subrepos_trie[self.root_dir] = repo

        # Keeps a DvcTree instance for each repo, keyed by the repo root_dir.
        self._dvctrees = {}
        self._dvctree_configs = kwargs

        # Only repos exposing `dvc_dir` get a DvcTree; for any other repo
        # `_get_tree_pair()` hands out `None` as the dvc tree.
        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)

    def _get_repo(self, path) -> Optional["Repo"]:
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's parents
        and then returns the appropriate one. Returns None when no known
        prefix covers the path.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Directories between the known prefix (exclusive) and `path`
        # (inclusive), ordered from the outermost to the innermost.
        parents = (parent.fspath for parent in PathInfo(path).parents)
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(d)
                self._dvctrees[repo.root_dir] = DvcTree(
                    repo, **self._dvctree_configs
                )
            # Directories below a subrepo inherit it until another subrepo
            # is found further down.
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo.

        Always False when subrepo traversal is disabled.
        """
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        # dvcignore will ignore subrepos, therefore using `use_dvcignore=False`
        return self._main_repo.tree.isdir(repo_path, use_dvcignore=False)

    def _get_tree_pair(
        self, path
    ) -> Tuple[Union["GitTree", "LocalTree"], DvcTree]:
        """
        Returns a pair of trees based on repo the path falls in, using prefix.

        The second element is None when no DvcTree is registered for the
        repo that owns the path.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_tree = self._dvctrees.get(repo.root_dir)
        return repo.tree, dvc_tree

    @property
    def fetch(self):
        """True when a `fetch` keyword was given, whatever its value."""
        return "fetch" in self._dvctree_configs

    @property
    def stream(self):
        """True when a `stream` keyword was given, whatever its value."""
        return "stream" in self._dvctree_configs

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-differ
        """Open `path` from the DVC tree if it exists there, else the repo tree.

        Extra keyword arguments are forwarded to the DVC tree only.
        """
        if "b" in mode:
            encoding = None

        tree, dvc_tree = self._get_tree_pair(path)
        if dvc_tree and dvc_tree.exists(path):
            return dvc_tree.open(path, mode=mode, encoding=encoding, **kwargs)
        return tree.open(path, mode=mode, encoding=encoding)

    def exists(
        self, path, use_dvcignore=True
    ):  # pylint: disable=arguments-differ
        """Return True if `path` exists in the repo tree or the DVC tree.

        `use_dvcignore` is accepted for signature compatibility and is not
        consulted here.
        """
        tree, dvc_tree = self._get_tree_pair(path)
        # bool(): without a DvcTree the `and` short-circuits to None.
        return bool(tree.exists(path) or (dvc_tree and dvc_tree.exists(path)))

    def isdir(self, path):  # pylint: disable=arguments-differ
        """Return True if `path` is a directory in either tree."""
        tree, dvc_tree = self._get_tree_pair(path)
        # bool(): without a DvcTree the `and` short-circuits to None.
        return bool(tree.isdir(path) or (dvc_tree and dvc_tree.isdir(path)))

    def isdvc(self, path, **kwargs):
        """Return whether the DVC tree (if any) reports `path` as DVC-tracked."""
        _, dvc_tree = self._get_tree_pair(path)
        return dvc_tree is not None and dvc_tree.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        """Return True if `path` is a file in either tree."""
        tree, dvc_tree = self._get_tree_pair(path)
        # bool(): without a DvcTree the `and` short-circuits to None.
        return bool(tree.isfile(path) or (dvc_tree and dvc_tree.isfile(path)))

    def isexec(self, path):
        """Return the executable flag, preferring the DVC tree's answer."""
        tree, dvc_tree = self._get_tree_pair(path)
        if dvc_tree and dvc_tree.exists(path):
            return dvc_tree.isexec(path)
        return tree.isexec(path)

    def stat(self, path):
        """Return the repo tree's stat result for `path`."""
        tree, _ = self._get_tree_pair(path)
        return tree.stat(path)

    def _dvc_walk(self, walk):
        """Yield one level from a DVC-only walk, then recurse per subdir."""
        try:
            root, dirs, files = next(walk)
        except StopIteration:
            return
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

        NOTE: subrepo will only be discovered when walking if
        ignore_subrepos is set to False.
        """
        tree, dvc_tree = self._get_tree_pair(dir_path)
        tree_walk = tree.walk(
            dir_path, topdown=True, ignore_subrepos=not self._traverse_subrepos
        )
        if dvc_tree:
            dvc_walk = dvc_tree.walk(dir_path, topdown=True, **kwargs)
        else:
            dvc_walk = None
        yield from self._walk(tree_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one level of the repo walk with the optional DVC walk.

        Both generators are advanced in lock-step; the directory lists they
        yielded are rewritten in place so that their next steps visit
        directories in the same order.
        """
        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (
                next(dvc_walk) if dvc_walk else (None, [], [])
            )
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        shared_set = dvc_set & repo_set
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(shared_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

        for dirname in dirs:
            dir_path = os.path.join(repo_root, dirname)
            if self._is_dvc_repo(dir_path):
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared_set:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(
        self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs
    ):  # pylint: disable=arguments-differ
        """Walk and merge both DVC and repo trees.

        Args:
            top: path to walk from
            topdown: if True, tree will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        tree, dvc_tree = self._get_tree_pair(top)
        dvc_exists = dvc_tree and dvc_tree.exists(top)
        repo_exists = tree.exists(top)

        if dvc_exists:
            dvc_walk = dvc_tree.walk(top, topdown=topdown, **kwargs)
            if repo_exists:
                repo_walk = tree.walk(
                    top,
                    topdown=topdown,
                    ignore_subrepos=not self._traverse_subrepos,
                )
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            else:
                # `top` only exists as DVC data: nothing to merge with.
                yield from dvc_walk
        else:
            repo_walk = tree.walk(
                top,
                topdown=topdown,
                onerror=onerror,
                ignore_subrepos=not self._traverse_subrepos,
            )
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
        """Yield a `PathInfo` for every file found by `walk(top, **kwargs)`."""
        for root, _, files in self.walk(top, **kwargs):
            for fname in files:
                yield PathInfo(root) / fname

    def get_file_hash(self, path_info):
        """Return file checksum for specified path.

        If path_info is a DVC out, the pre-computed checksum for the file
        will be used. If path_info is a git file, MD5 will be computed for
        the git object.

        Raises:
            FileNotFoundError: if the path exists in neither tree.
        """
        if not self.exists(path_info):
            raise FileNotFoundError

        _, dvc_tree = self._get_tree_pair(path_info)
        if dvc_tree and dvc_tree.exists(path_info):
            try:
                return dvc_tree.get_file_hash(path_info)
            except OutputNotFoundError:
                # Not a DVC output after all: hash the file contents below.
                pass
        return self.PARAM_CHECKSUM, file_md5(path_info, self)[0]

    def copytree(self, top, dest):
        """Copy the file or directory tree at `top` to `dest`.

        Raises:
            FileNotFoundError: if `top` exists in neither tree.
        """
        top = PathInfo(top)
        dest = PathInfo(dest)

        if not self.exists(top):
            raise FileNotFoundError

        if self.isfile(top):
            makedirs(dest.parent, exist_ok=True)
            with self.open(top, mode="rb") as fobj:
                copy_fobj_to_file(fobj, dest)
            return

        for root, _, files in self.walk(top):
            root = PathInfo(root)
            dest_dir = dest / root.relative_to(top)
            makedirs(dest_dir, exist_ok=True)
            for fname in files:
                src = root / fname
                with self.open(src, mode="rb") as fobj:
                    copy_fobj_to_file(fobj, dest_dir / fname)

    @property
    def hash_jobs(self):  # pylint: disable=invalid-overridden-method
        """Delegate to the main repo tree's `hash_jobs`."""
        return self._main_repo.tree.hash_jobs

    def metadata(self, path):
        """Return `Metadata` for `path`, combining DVC and repo-tree info.

        Raises:
            FileNotFoundError: if neither tree knows about the path.
        """
        path_info = PathInfo(os.path.abspath(path))
        tree, dvc_tree = self._get_tree_pair(path_info)

        dvc_meta = None
        if dvc_tree:
            with suppress(OutputNotFoundError):
                dvc_meta = dvc_tree.metadata(path_info)

        stat_result = None
        with suppress(FileNotFoundError):
            stat_result = tree.stat(path_info)

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        meta = dvc_meta or Metadata(path_info=path_info)

        isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or isdir

        # The executable bit is taken from the stat result only when there
        # is no DVC metadata for the path.
        if not dvc_meta:
            meta.is_exec = bool(stat_result) and is_exec(stat_result.st_mode)
        return meta
class DvcIgnoreFilter:
    """Filter directories and files using per-directory ignore patterns.

    Patterns are stored in a trie keyed by directory path and are collected
    lazily, the first time a directory is queried.
    """

    @staticmethod
    def _is_dvc_repo(root, directory):
        """Return True if `root/directory` contains a DVC directory."""
        from dvc.repo import Repo

        return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

    def __init__(self, tree, root_dir):
        """Seed the trie with the default patterns and scan `root_dir`.

        Args:
            tree: tree used to check for and read ignore files and to list
                directories.
            root_dir: root directory of the repo.
        """
        from dvc.repo import Repo

        default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

        self.tree = tree
        self.root_dir = root_dir
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            default_ignore_patterns, root_dir
        )
        self._update(self.root_dir)

    def _update(self, dirname):
        """Compute and store the patterns that apply inside `dirname`."""
        old_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
        matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False)

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        # An ignore file that is itself matched by the inherited patterns is
        # not loaded.
        if not matches and self.tree.exists(
            ignore_file_path, use_dvcignore=False
        ):
            new_pattern = DvcIgnorePatterns.from_files(
                ignore_file_path, self.tree
            )
            if old_pattern:
                self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        new_pattern.pattern_list,
                        new_pattern.dirname,
                    )
                )
            else:
                self.ignores_trie_tree[dirname] = new_pattern
        elif old_pattern:
            self.ignores_trie_tree[dirname] = old_pattern

        # NOTE: using `walk` + `break` because tree doesn't have `listdir()`
        for root, dirs, _ in self.tree.walk(dirname, use_dvcignore=False):
            self._update_sub_repo(root, dirs)
            break

    def _update_sub_repo(self, root, dirs):
        """Add an ignore pattern for every subdirectory that is a DVC repo."""
        for d in dirs:
            if self._is_dvc_repo(root, d):
                # Looked up on every iteration because the entry for `root`
                # may just have been replaced by a previous subrepo.
                old_pattern = self.ignores_trie_tree.longest_prefix(root).value
                if old_pattern:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        *merge_patterns(
                            old_pattern.pattern_list,
                            old_pattern.dirname,
                            ["/{}/".format(d)],
                            root,
                        )
                    )
                else:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        ["/{}/".format(d)], root
                    )

    def __call__(self, root, dirs, files):
        """Return `(dirs, files)` filtered by the patterns valid in `root`.

        The inputs are returned untouched when no pattern applies.
        """
        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            return ignore_pattern(root, dirs, files)
        return dirs, files

    def _get_trie_pattern(self, dirname):
        """Return the patterns for `dirname`, collecting them if needed.

        Returns None when `dirname` is not below any known prefix.
        """
        ignore_pattern = self.ignores_trie_tree.get(dirname)
        if ignore_pattern:
            return ignore_pattern

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        # Directories between the known prefix (exclusive) and `dirname`
        # (inclusive), outermost first, so that patterns accumulate downwards.
        dirs = list(
            takewhile(
                lambda path: path != prefix,
                (parent.fspath for parent in PathInfo(dirname).parents),
            )
        )
        dirs.reverse()
        dirs.append(dirname)

        for parent in dirs:
            self._update(parent)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        """Return True if `path` is outside the repo or matches a pattern."""
        if self._outside_repo(path):
            return True
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if ignore_pattern:
            return ignore_pattern.matches(dirname, basename, is_dir)
        return False

    def is_ignored_dir(self, path):
        """Return True if the directory is ignored; the root never is."""
        path = os.path.abspath(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        """Return True if the file at `path` is ignored."""
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        """Return True if `path` is located outside of `self.root_dir`."""
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)

        # Compare whole path components: a bare `startswith("..")` would also
        # flag in-repo entries whose name merely begins with two dots
        # (e.g. "..cache" or "...").
        parent_prefixes = tuple(
            os.pardir + sep for sep in (os.sep, os.altsep) if sep
        )
        if path == os.pardir or path.startswith(parent_prefixes):
            return True

        # NOTE(review): presumably covers paths on another Windows drive,
        # which share no common prefix with the repo root - confirm against
        # the `relpath` helper.
        if os.name == "nt" and not os.path.commonprefix(
            [os.path.abspath(path), self.root_dir]
        ):
            return True
        return False