class RepoFileSystem(BaseFileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    Presents a single merged view over the git working tree and the
    DVC-tracked outputs: plain files are served by the repo's own fs,
    while missing paths fall back to the per-repo ``DvcFileSystem``.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcFileSystem()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(
        self,
        repo=None,
        subrepos=False,
        repo_factory: RepoFactory = None,
    ):
        super().__init__()

        from dvc.utils.collections import PathStringTrie

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo
        else:
            self.repo_factory = repo_factory

        self._main_repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        self._subrepos_trie = PathStringTrie()
        """Keeps track of each and every path with the corresponding repo."""

        self._subrepos_trie[self.root_dir] = repo

        self._dvcfss = {}
        """Keep a dvcfs instance of each repo."""

        # Only a DVC repo (has `dvc_dir`) gets a DvcFileSystem; a plain git
        # repo is served by its own fs alone.
        if hasattr(repo, "dvc_dir"):
            self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)

    def _get_repo(self, path: str) -> Optional["Repo"]:
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's
        parents and then returns the appropriate one.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            # Path is outside of the root repo entirely.
            return None

        # Walk from the known prefix down to `path`, discovering any
        # subrepos along the way (ordered root-first via reverse()).
        parents = (parent.fspath for parent in PathInfo(path).parents)
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(
                    d,
                    scm=self._main_repo.scm,
                    rev=self._main_repo.get_rev(),
                    repo_factory=self.repo_factory,
                )
                self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)
            # Every visited dir is cached, mapping to the innermost repo.
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo."""
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        return self._main_repo.fs.isdir(repo_path)

    def _get_fs_pair(
        self, path
    ) -> Tuple[BaseFileSystem, Optional[DvcFileSystem]]:
        """
        Returns a pair of fss based on repo the path falls in, using prefix.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_fs = self._dvcfss.get(repo.root_dir)
        return repo.fs, dvc_fs

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-differ
        """Open `path`, preferring the working tree and falling back to DVC."""
        if "b" in mode:
            encoding = None

        fs, dvc_fs = self._get_fs_pair(path)
        path_info = PathInfo(path)
        try:
            return fs.open(path_info, mode=mode, encoding=encoding)
        except FileNotFoundError:
            if not dvc_fs:
                raise

        return dvc_fs.open(path_info, mode=mode, encoding=encoding, **kwargs)

    def exists(self, path_info) -> bool:
        fs, dvc_fs = self._get_fs_pair(path_info)

        if not dvc_fs:
            return fs.exists(path_info)

        if dvc_fs.repo.dvcignore.is_ignored(fs, path_info):
            return False

        if fs.exists(path_info):
            return True

        try:
            meta = dvc_fs.metadata(path_info)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # If the out itself exists in the working tree, the DVC metadata is
        # shadowed by it, so this (missing) path does not exist.
        if fs.exists(out.path_info):
            return False
        return True

    def isdir(self, path):  # pylint: disable=arguments-differ
        fs, dvc_fs = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_dir(path):
            return False

        try:
            st = fs.stat(path)
            return stat.S_ISDIR(st.st_mode)
        except (OSError, ValueError):
            # from CPython's os.path.isdir()
            pass

        if not dvc_fs:
            return False

        try:
            meta = dvc_fs.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # Out materialized in the working tree shadows the DVC answer.
        if fs.exists(out.path_info):
            return False

        return meta.isdir

    def isdvc(self, path, **kwargs):
        """True if `path` is a DVC-tracked output (never for plain git files)."""
        _, dvc_fs = self._get_fs_pair(path)
        return dvc_fs is not None and dvc_fs.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        fs, dvc_fs = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_file(path):
            return False

        try:
            st = fs.stat(path)
            return stat.S_ISREG(st.st_mode)
        except (OSError, ValueError):
            # from CPython's os.path.isfile()
            pass

        if not dvc_fs:
            return False

        try:
            meta = dvc_fs.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # Out materialized in the working tree shadows the DVC answer.
        if fs.exists(out.path_info):
            return False

        return meta.isfile

    def isexec(self, path_info):
        fs, dvc_fs = self._get_fs_pair(path_info)
        if dvc_fs and dvc_fs.exists(path_info):
            return dvc_fs.isexec(path_info)
        return fs.isexec(path_info)

    def stat(self, path):
        """Stat via the plain (git/local) fs only — never the DVC overlay."""
        fs, _ = self._get_fs_pair(path)
        return fs.stat(path)

    def _dvc_walk(self, walk):
        # Drain a dvc-only walk generator: one recursive call per subdir,
        # mirroring how `_walk` advances shared generators in lockstep.
        try:
            root, dirs, files = next(walk)
        except StopIteration:
            return
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

        NOTE: subrepo will only be discovered when walking if
        ignore_subrepos is set to False.
        """
        fs, dvc_fs = self._get_fs_pair(dir_path)
        fs_walk = fs.walk(dir_path, topdown=True)
        if dvc_fs:
            dvc_walk = dvc_fs.walk(dir_path, topdown=True, **kwargs)
        else:
            dvc_walk = None
        yield from self._walk(fs_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        # Merge one directory level of the repo walk and (optionally) the
        # dvc walk, then recurse in an order that keeps both generators
        # advancing through the same shared directories.
        from dvc.dvcfile import is_valid_filename
        from dvc.ignore import DvcIgnore

        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (
                next(dvc_walk) if dvc_walk else (None, [], [])
            )
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        def _func(fname):
            # Hide .dvc files and .dvcignore unless explicitly requested.
            if dvcfiles:
                return True
            return not (
                is_valid_filename(fname)
                or fname == DvcIgnore.DVCIGNORE_FILE
            )

        # merge file lists
        files = set(filter(_func, dvc_fnames + repo_fnames))

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent it from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk and merge both DVC and repo fss.

        Args:
            top: path to walk from
            topdown: if True, fs will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        repo = self._get_repo(os.path.abspath(top))
        dvcfiles = kwargs.pop("dvcfiles", False)

        fs, dvc_fs = self._get_fs_pair(top)
        repo_exists = fs.exists(top)

        repo_walk = repo.dvcignore.walk(
            fs, top, topdown=topdown, onerror=onerror, **kwargs
        )

        if not dvc_fs or (repo_exists and dvc_fs.isdvc(top)):
            # No DVC overlay, or `top` is itself a DVC out: plain walk only.
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        if not repo_exists:
            # NOTE(review): no `return` here — repo_walk is presumably empty
            # when `top` is missing from the working tree, so the final
            # `_walk` yields nothing; confirm against dvcignore.walk().
            yield from dvc_fs.walk(
                top, topdown=topdown, onerror=onerror, **kwargs
            )

        dvc_walk = None
        if dvc_fs.exists(top):
            dvc_walk = dvc_fs.walk(
                top, topdown=topdown, onerror=onerror, **kwargs
            )

        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def walk_files(self, path_info, **kwargs):
        """Yield every file path (as PathInfo) under `path_info`."""
        for root, _, files in self.walk(path_info, **kwargs):
            for fname in files:
                yield PathInfo(root) / fname

    def _download(
        self, from_info, to_file, name=None, no_progress_bar=False, **kwargs
    ):
        """Copy `from_info` to local `to_file` with a progress bar."""
        import shutil

        from dvc.progress import Tqdm

        with open(to_file, "wb+") as to_fobj:
            with Tqdm.wrapattr(
                to_fobj,
                "write",
                desc=name,
                disable=no_progress_bar,
            ) as wrapped:
                with self.open(from_info, "rb", **kwargs) as from_fobj:
                    shutil.copyfileobj(from_fobj, wrapped)

    def metadata(self, path):
        """Merged metadata: DVC metadata when present, else a stat-based one."""
        abspath = os.path.abspath(path)
        path_info = PathInfo(abspath)
        fs, dvc_fs = self._get_fs_pair(path_info)

        dvc_meta = None
        if dvc_fs:
            with suppress(FileNotFoundError):
                dvc_meta = dvc_fs.metadata(path_info)

        stat_result = None
        with suppress(FileNotFoundError):
            stat_result = fs.stat(path_info)

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        from ._metadata import Metadata

        meta = dvc_meta or Metadata(
            path_info=path_info,
            repo=self._get_repo(abspath) or self._main_repo,
        )

        isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or isdir

        if not dvc_meta:
            from dvc.utils import is_exec

            meta.is_exec = bool(stat_result) and is_exec(
                stat_result.st_mode
            )
        return meta

    def info(self, path_info):
        # NOTE(review): if both fss miss, and dvc_fs is None, this raises
        # AttributeError rather than FileNotFoundError — confirm callers.
        fs, dvc_fs = self._get_fs_pair(path_info)

        try:
            return fs.info(path_info)
        except FileNotFoundError:
            return dvc_fs.info(path_info)
class _RepoFileSystem(AbstractFileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    fsspec-style implementation: every public method resolves a virtual
    repo path into a (plain fs, fs path, dvc fs, dvc path) quadruple via
    `_get_fs_pair` and consults the working tree first, then the DVC
    overlay.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcFileSystem()`.
    """

    PARAM_REPO_URL = "repo_url"
    PARAM_REPO_ROOT = "repo_root"
    PARAM_REV = "rev"
    PARAM_CACHE_DIR = "cache_dir"
    PARAM_CACHE_TYPES = "cache_types"
    PARAM_SUBREPOS = "subrepos"

    def __init__(
        self,
        repo: Optional["Repo"] = None,
        subrepos=False,
        repo_factory: RepoFactory = None,
        **kwargs,
    ):
        super().__init__()

        from dvc.utils.collections import PathStringTrie

        if repo is None:
            # Rebuild the repo from a serialized `config` dict (see the
            # `config` property) — used when the fs is reinstantiated.
            repo, repo_factory = self._repo_from_fs_config(
                subrepos=subrepos, **kwargs
            )

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo
        else:
            self.repo_factory = repo_factory

        self.path = Path(self.sep)
        self.repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self._root_dir: str = repo.root_dir
        self._traverse_subrepos = subrepos

        self._subrepos_trie = PathStringTrie()
        """Keeps track of each and every path with the corresponding repo."""

        self._subrepos_trie[self._root_dir] = repo

        self._dvcfss = {}
        """Keep a dvcfs instance of each repo."""

        # Only a DVC repo (has `dvc_dir`) gets a DvcFileSystem overlay.
        if hasattr(repo, "dvc_dir"):
            self._dvcfss[self._root_dir] = DvcFileSystem(repo=repo)

    @property
    def repo_url(self):
        if self.repo is None:
            return None
        return self.repo.url

    @property
    def config(self):
        """Serializable parameters sufficient to rebuild this fs."""
        return {
            self.PARAM_REPO_URL: self.repo_url,
            self.PARAM_REPO_ROOT: self.repo.root_dir,
            self.PARAM_REV: getattr(self.repo.fs, "rev", None),
            self.PARAM_CACHE_DIR: os.path.abspath(
                self.repo.odb.local.cache_dir
            ),
            self.PARAM_CACHE_TYPES: self.repo.odb.local.cache_types,
            self.PARAM_SUBREPOS: self._traverse_subrepos,
        }

    @classmethod
    def _repo_from_fs_config(
        cls, **config
    ) -> Tuple["Repo", Optional["RepoFactory"]]:
        """Reconstruct a Repo (and subrepo factory) from a `config` dict."""
        from dvc.external_repo import erepo_factory, external_repo
        from dvc.repo import Repo

        url = config.get(cls.PARAM_REPO_URL)
        root = config.get(cls.PARAM_REPO_ROOT)
        assert url or root

        def _open(*args, **kwargs):
            # NOTE: if original repo was an erepo (and has a URL),
            # we cannot use Repo.open() since it will skip erepo
            # cache/remote setup for local URLs
            if url is None:
                return Repo.open(*args, **kwargs)
            return external_repo(*args, **kwargs)

        cache_dir = config.get(cls.PARAM_CACHE_DIR)
        cache_config = (
            {}
            if not cache_dir
            else {
                "cache": {
                    "dir": cache_dir,
                    "type": config.get(cls.PARAM_CACHE_TYPES),
                }
            }
        )

        repo_kwargs: dict = {
            "rev": config.get(cls.PARAM_REV),
            "subrepos": config.get(cls.PARAM_SUBREPOS, False),
            "uninitialized": True,
        }

        factory: Optional["RepoFactory"] = None
        if url is None:
            repo_kwargs["config"] = cache_config
        else:
            repo_kwargs["cache_dir"] = cache_dir
            factory = erepo_factory(url, cache_config)

        with _open(
            url if url else root,
            **repo_kwargs,
        ) as repo:
            return repo, factory

    def _get_repo(self, path: str) -> "Repo":
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's
        parents and then returns the appropriate one.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return self.repo

        # Walk from the known prefix down to `path`, discovering any
        # subrepos along the way (ordered root-first via reverse()).
        parents = (parent for parent in self.repo.fs.path.parents(path))
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path) or self.repo

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(
                    d,
                    fs=self.repo.fs,
                    repo_factory=self.repo_factory,
                )
                self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)
            # Every visited dir is cached, mapping to the innermost repo.
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo."""
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        return self.repo.fs.isdir(repo_path)

    def _get_fs_pair(
        self, path
    ) -> Tuple[
        Optional[FileSystem],
        Optional[str],
        Optional[DvcFileSystem],
        Optional[str],
    ]:
        """
        Returns a pair of fss based on repo the path falls in, using prefix.

        Returns (plain fs, plain fs path, dvc fs, dvc-relative path); the
        plain pair is (None, None) for absolute paths outside the repo.
        """
        from dvc.utils import as_posix

        if os.path.isabs(path):
            if self.repo.fs.path.isin_or_eq(path, self.repo.root_dir):
                path = self.repo.fs.path.relpath(
                    path, self.repo.root_dir
                )
            else:
                # Outside the repo: only the top-level dvcfs can serve it.
                return None, None, self.repo.dvcfs, path
        path = as_posix(path)

        parts = self.path.parts(path)
        if parts and parts[0] == os.curdir:
            parts = parts[1:]

        fs_path = self.repo.fs.path.join(self.repo.root_dir, *parts)

        repo = self._get_repo(fs_path)
        fs = repo.fs

        # Strip the (sub)repo's own prefix to get the dvc-relative parts.
        repo_parts = fs.path.relparts(repo.root_dir, self.repo.root_dir)
        if repo_parts[0] == os.curdir:
            repo_parts = repo_parts[1:]

        dvc_parts = parts[len(repo_parts):]
        if dvc_parts and dvc_parts[0] == os.curdir:
            dvc_parts = dvc_parts[1:]

        dvc_fs = self._dvcfss.get(repo.root_dir)
        if dvc_fs:
            dvc_path = dvc_fs.path.join(*dvc_parts) if dvc_parts else ""
        else:
            dvc_path = None

        return fs, fs_path, dvc_fs, dvc_path

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-renamed, arguments-differ
        """Open `path`, preferring the working tree and falling back to DVC."""
        if "b" in mode:
            encoding = None

        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)
        try:
            return fs.open(fs_path, mode=mode, encoding=encoding)
        except FileNotFoundError:
            if not dvc_fs:
                raise

        return dvc_fs.open(dvc_path, mode=mode, encoding=encoding, **kwargs)

    def isdvc(self, path, **kwargs):
        """True if `path` is a DVC-tracked output."""
        _, _, dvc_fs, dvc_path = self._get_fs_pair(path)
        return dvc_fs is not None and dvc_fs.isdvc(dvc_path, **kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List `path`, merging working-tree and DVC entries by name."""
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        repo = dvc_fs.repo if dvc_fs else self.repo
        dvcignore = repo.dvcignore
        ignore_subrepos = kwargs.get("ignore_subrepos", True)

        names = set()
        if dvc_fs:
            with suppress(FileNotFoundError):
                for entry in dvc_fs.ls(dvc_path, detail=False):
                    names.add(dvc_fs.path.name(entry))

        if fs:
            try:
                for entry in dvcignore.ls(
                    fs,
                    fs_path,
                    detail=False,
                    ignore_subrepos=ignore_subrepos,
                ):
                    names.add(fs.path.name(entry))
            except (FileNotFoundError, NotADirectoryError):
                pass

        dvcfiles = kwargs.get("dvcfiles", False)

        def _func(fname):
            from dvc.dvcfile import is_valid_filename
            from dvc.ignore import DvcIgnore

            # Hide .dvc files and .dvcignore unless explicitly requested.
            if dvcfiles:
                return True

            return not (
                is_valid_filename(fname)
                or fname == DvcIgnore.DVCIGNORE_FILE
            )

        names = filter(_func, names)

        infos = []
        paths = []
        for name in names:
            entry_path = self.path.join(path, name)
            try:
                info = self.info(
                    entry_path, ignore_subrepos=ignore_subrepos
                )
            except FileNotFoundError:
                # Name disappeared between listing and stat; skip it.
                continue
            infos.append(info)
            paths.append(entry_path)

        if not detail:
            return paths

        return infos

    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
        """Download `rpath` to local `lpath`; working tree first, then DVC."""
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(rpath)

        if fs:
            try:
                fs.get_file(  # pylint: disable=protected-access
                    fs_path, lpath, callback=callback, **kwargs
                )
                return
            except FileNotFoundError:
                if not dvc_fs:
                    raise

        dvc_fs.get_file(  # pylint: disable=protected-access
            dvc_path, lpath, callback=callback, **kwargs
        )

    def info(self, path, **kwargs):
        """fsspec-style info dict merged from working-tree and DVC views."""
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        repo = dvc_fs.repo if dvc_fs else self.repo
        dvcignore = repo.dvcignore
        ignore_subrepos = kwargs.get("ignore_subrepos", True)

        dvc_info = None
        if dvc_fs:
            try:
                dvc_info = dvc_fs.info(dvc_path)
            except FileNotFoundError:
                pass

        fs_info = None
        if fs:
            try:
                fs_info = fs.info(fs_path)
                if dvcignore.is_ignored(
                    fs, fs_path, ignore_subrepos=ignore_subrepos
                ):
                    # Ignored paths are treated as absent in the work tree.
                    fs_info = None
            except (FileNotFoundError, NotADirectoryError):
                if not dvc_info:
                    raise

        # NOTE: if some parent in fs_path turns out to be a file, it means
        # that the whole repofs branch doesn't exist.
        if fs and not fs_info and dvc_info:
            for parent in fs.path.parents(fs_path):
                try:
                    if fs.info(parent)["type"] != "directory":
                        dvc_info = None
                        break
                except FileNotFoundError:
                    continue

        if not dvc_info and not fs_info:
            raise FileNotFoundError

        info = _merge_info(dvc_fs.repo, fs_info, dvc_info)
        info["name"] = path
        return info

    def checksum(self, path):
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        try:
            return fs.checksum(fs_path)
        except FileNotFoundError:
            return dvc_fs.checksum(dvc_path)
class RepoTree(BaseTree):  # pylint:disable=abstract-method
    """DVC + git-tracked files tree.

    Older tree-based counterpart of RepoFileSystem: serves plain files
    from the repo's own tree and falls back to a per-repo ``DvcTree``
    for DVC-tracked outputs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcTree()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(
        self, repo, subrepos=False, repo_factory: RepoFactory = None, **kwargs
    ):
        super().__init__(repo, {"url": repo.root_dir})

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo
        else:
            self.repo_factory = repo_factory

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        self._subrepos_trie = PathStringTrie()
        """Keeps track of each and every path with the corresponding repo."""

        self._subrepos_trie[self.root_dir] = repo

        self._dvctrees = {}
        """Keep a dvctree instance of each repo."""

        self._dvctree_configs = kwargs

        # Only a DVC repo (has `dvc_dir`) gets a DvcTree overlay.
        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)

    def _get_repo(self, path) -> Optional["Repo"]:
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's
        parents and then returns the appropriate one.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            # Path is outside of the root repo entirely.
            return None

        # Walk from the known prefix down to `path`, discovering any
        # subrepos along the way (ordered root-first via reverse()).
        parents = (parent.fspath for parent in PathInfo(path).parents)
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(d)
                self._dvctrees[repo.root_dir] = DvcTree(
                    repo, **self._dvctree_configs
                )
            # Every visited dir is cached, mapping to the innermost repo.
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo."""
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        # dvcignore will ignore subrepos, therefore using `use_dvcignore=False`
        return self._main_repo.tree.isdir(repo_path, use_dvcignore=False)

    def _get_tree_pair(self, path) -> Tuple[BaseTree, Optional[DvcTree]]:
        """
        Returns a pair of trees based on repo the path falls in, using prefix.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_tree = self._dvctrees.get(repo.root_dir)
        return repo.tree, dvc_tree

    @property
    def fetch(self):
        # Presence of the key enables the mode, regardless of its value.
        return "fetch" in self._dvctree_configs

    @property
    def stream(self):
        # Presence of the key enables the mode, regardless of its value.
        return "stream" in self._dvctree_configs

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-differ
        """Open `path`, preferring the working tree and falling back to DVC."""
        if "b" in mode:
            encoding = None

        tree, dvc_tree = self._get_tree_pair(path)
        path_info = PathInfo(path)
        try:
            return tree.open(path_info, mode=mode, encoding=encoding)
        except FileNotFoundError:
            if not dvc_tree:
                raise

        return dvc_tree.open(
            path_info, mode=mode, encoding=encoding, **kwargs
        )

    def exists(
        self, path, use_dvcignore=True
    ):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path)

        if not dvc_tree:
            return tree.exists(path)

        if tree.exists(path):
            return True

        try:
            meta = dvc_tree.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # Out materialized in the working tree shadows the DVC answer.
        if tree.exists(out.path_info):
            return False
        return True

    def isdir(self, path):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path)

        try:
            st = tree.stat(path)
            return stat.S_ISDIR(st.st_mode)
        except (OSError, ValueError):
            # from CPython's os.path.isdir()
            pass

        if not dvc_tree:
            return False

        try:
            meta = dvc_tree.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # Out materialized in the working tree shadows the DVC answer.
        if tree.exists(out.path_info):
            return False

        return meta.isdir

    def isdvc(self, path, **kwargs):
        """True if `path` is a DVC-tracked output."""
        _, dvc_tree = self._get_tree_pair(path)
        return dvc_tree is not None and dvc_tree.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path)

        try:
            st = tree.stat(path)
            return stat.S_ISREG(st.st_mode)
        except (OSError, ValueError):
            # from CPython's os.path.isfile()
            pass

        if not dvc_tree:
            return False

        try:
            meta = dvc_tree.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # Out materialized in the working tree shadows the DVC answer.
        if tree.exists(out.path_info):
            return False

        return meta.isfile

    def isexec(self, path_info):
        tree, dvc_tree = self._get_tree_pair(path_info)
        if dvc_tree and dvc_tree.exists(path_info):
            return dvc_tree.isexec(path_info)
        return tree.isexec(path_info)

    def stat(self, path):
        """Stat via the plain (git/local) tree only — never the DVC overlay."""
        tree, _ = self._get_tree_pair(path)
        return tree.stat(path)

    def _dvc_walk(self, walk):
        # Drain a dvc-only walk generator: one recursive call per subdir,
        # mirroring how `_walk` advances shared generators in lockstep.
        try:
            root, dirs, files = next(walk)
        except StopIteration:
            return
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

        NOTE: subrepo will only be discovered when walking if
        ignore_subrepos is set to False.
        """
        tree, dvc_tree = self._get_tree_pair(dir_path)
        tree_walk = tree.walk(
            dir_path,
            topdown=True,
            ignore_subrepos=not self._traverse_subrepos,
        )
        if dvc_tree:
            dvc_walk = dvc_tree.walk(dir_path, topdown=True, **kwargs)
        else:
            dvc_walk = None
        yield from self._walk(tree_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        # Merge one directory level of the repo walk and (optionally) the
        # dvc walk, then recurse in an order that keeps both generators
        # advancing through the same shared directories.
        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (
                next(dvc_walk) if dvc_walk else (None, [], [])
            )
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent it from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(
        self,
        top,
        topdown=True,
        onerror=None,
        dvcfiles=False,
        follow_subrepos=None,
        **kwargs,
    ):  # pylint: disable=arguments-differ
        """Walk and merge both DVC and repo trees.

        Args:
            top: path to walk from
            topdown: if True, tree will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        # Per-call `follow_subrepos` overrides the instance-level setting.
        ignore_subrepos = not self._traverse_subrepos
        if follow_subrepos is not None:
            ignore_subrepos = not follow_subrepos

        tree, dvc_tree = self._get_tree_pair(top)
        repo_exists = tree.exists(top)

        repo_walk = tree.walk(
            top,
            topdown=topdown,
            onerror=onerror,
            ignore_subrepos=ignore_subrepos,
        )

        if not dvc_tree or (repo_exists and dvc_tree.isdvc(top)):
            # No DVC overlay, or `top` is itself a DVC out: plain walk only.
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        if not repo_exists:
            # NOTE(review): no `return` here — repo_walk is presumably empty
            # when `top` is missing from the working tree, so the final
            # `_walk` yields nothing; confirm against tree.walk().
            yield from dvc_tree.walk(top, topdown=topdown, **kwargs)

        dvc_walk = None
        if dvc_tree.exists(top):
            dvc_walk = dvc_tree.walk(top, topdown=topdown, **kwargs)

        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
        """Yield every file path (as PathInfo) under `top`."""
        for root, _, files in self.walk(top, **kwargs):
            for fname in files:
                yield PathInfo(root) / fname

    def get_dir_hash(
        self, path_info, follow_subrepos=None, **kwargs
    ):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path_info)
        if tree.exists(path_info):
            return super().get_dir_hash(
                path_info, follow_subrepos=follow_subrepos, **kwargs
            )
        if not dvc_tree:
            raise FileNotFoundError
        return dvc_tree.get_dir_hash(path_info, **kwargs)

    def get_file_hash(self, path_info):
        """Return file checksum for specified path.

        If path_info is a DVC out, the pre-computed checksum for the file
        will be used. If path_info is a git file, MD5 will be computed for
        the git object.
        """
        if not self.exists(path_info):
            raise FileNotFoundError

        _, dvc_tree = self._get_tree_pair(path_info)
        if dvc_tree and dvc_tree.exists(path_info):
            try:
                return dvc_tree.get_file_hash(path_info)
            except FileNotFoundError:
                pass
        return HashInfo(
            self.PARAM_CHECKSUM, file_md5(path_info, self)[0]
        )

    def copytree(self, top, dest):
        """Recursively copy `top` (file or dir) from this tree to local `dest`."""
        top = PathInfo(top)
        dest = PathInfo(dest)

        if not self.exists(top):
            raise FileNotFoundError

        if self.isfile(top):
            makedirs(dest.parent, exist_ok=True)
            with self.open(top, mode="rb") as fobj:
                copy_fobj_to_file(fobj, dest)
            return

        for root, _, files in self.walk(top):
            root = PathInfo(root)
            dest_dir = dest / root.relative_to(top)
            makedirs(dest_dir, exist_ok=True)
            for fname in files:
                src = root / fname
                with self.open(src, mode="rb") as fobj:
                    copy_fobj_to_file(fobj, dest_dir / fname)

    @property
    def hash_jobs(self):  # pylint: disable=invalid-overridden-method
        return self._main_repo.tree.hash_jobs

    def metadata(self, path):
        """Merged metadata: DVC metadata when present, else a stat-based one."""
        abspath = os.path.abspath(path)
        path_info = PathInfo(abspath)
        tree, dvc_tree = self._get_tree_pair(path_info)

        dvc_meta = None
        if dvc_tree:
            with suppress(FileNotFoundError):
                dvc_meta = dvc_tree.metadata(path_info)

        stat_result = None
        with suppress(FileNotFoundError):
            stat_result = tree.stat(path_info)

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        meta = dvc_meta or Metadata(
            path_info=path_info,
            repo=self._get_repo(abspath) or self._main_repo,
        )

        isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or isdir

        if not dvc_meta:
            meta.is_exec = bool(stat_result) and is_exec(
                stat_result.st_mode
            )
        return meta
class DvcIgnoreFilter:
    """Aggregates .dvcignore patterns for a repo into a path-prefix trie.

    Each trie node holds the merged patterns applying at that directory:
    built-in defaults (.hg/, .git/, .dvc/), patterns read from .dvcignore
    files, and synthetic patterns that hide discovered subrepos.
    """

    def __init__(self, tree, root_dir):
        from dvc.repo import Repo

        default_ignore_patterns = [
            ".hg/",
            ".git/",
            "{}/".format(Repo.DVC_DIR),
        ]

        self.tree = tree
        self.root_dir = root_dir
        self.ignores_trie_tree = PathStringTrie()
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            default_ignore_patterns, root_dir
        )
        # Maps a directory to the set of subrepo dir names found inside it.
        self._ignored_subrepos = PathStringTrie()
        self._update(self.root_dir)

    def _update(self, dirname):
        """(Re)compute the merged ignore pattern cached for `dirname`."""
        self._update_sub_repo(dirname)

        old_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
        # If an ancestor pattern already ignores the .dvcignore file
        # itself, don't read it.
        matches = old_pattern.matches(
            dirname, DvcIgnore.DVCIGNORE_FILE, False
        )

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not matches and self.tree.exists(
            ignore_file_path, use_dvcignore=False
        ):
            new_pattern = DvcIgnorePatterns.from_files(
                ignore_file_path, self.tree
            )
            if old_pattern:
                # Ancestor patterns are merged in first so they keep
                # precedence semantics over the local .dvcignore.
                self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        new_pattern.pattern_list,
                        new_pattern.dirname,
                    )
                )
            else:
                self.ignores_trie_tree[dirname] = new_pattern
        elif old_pattern:
            # No local .dvcignore: cache the inherited pattern at this node.
            self.ignores_trie_tree[dirname] = old_pattern

    def _update_sub_repo(self, path):
        """If `path` is a subrepo, record it and add a pattern hiding it."""
        from dvc.repo import Repo

        if path == self.root_dir:
            return

        dvc_dir = os.path.join(path, Repo.DVC_DIR)
        if not os.path.exists(dvc_dir):
            return

        root, dname = os.path.split(path)
        self._ignored_subrepos[root] = self._ignored_subrepos.get(
            root, set()
        ) | {dname}
        pattern_info = PatternInfo(f"/{dname}/", f"in sub_repo:{dname}")
        new_pattern = DvcIgnorePatterns([pattern_info], root)
        old_pattern = self.ignores_trie_tree.longest_prefix(root).value
        if old_pattern:
            self.ignores_trie_tree[root] = DvcIgnorePatterns(
                *merge_patterns(
                    old_pattern.pattern_list,
                    old_pattern.dirname,
                    new_pattern.pattern_list,
                    new_pattern.dirname,
                )
            )
        else:
            self.ignores_trie_tree[root] = new_pattern

    def __call__(self, root, dirs, files, ignore_subrepos=True):
        """os.walk-style filter: return (dirs, files) with ignored names removed."""
        for dname in dirs:
            self._update_sub_repo(os.path.join(root, dname))

        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            dirs, files = ignore_pattern(root, dirs, files)
            if not ignore_subrepos:
                # Re-add subrepo dirs that the synthetic patterns hid.
                dirs.extend(self._ignored_subrepos.get(root, []))
        return dirs, files

    def _get_trie_pattern(self, dirname):
        """Return the merged pattern for `dirname`, computing ancestors lazily."""
        ignore_pattern = self.ignores_trie_tree.get(dirname)
        if ignore_pattern:
            return ignore_pattern

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        # Fill in every uncached ancestor from the prefix down to `dirname`.
        dirs = list(
            takewhile(
                lambda path: path != prefix,
                (parent.fspath for parent in PathInfo(dirname).parents),
            )
        )
        dirs.reverse()
        dirs.append(dirname)

        for parent in dirs:
            self._update(parent)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        if self._outside_repo(path):
            return False
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if ignore_pattern:
            return ignore_pattern.matches(dirname, basename, is_dir)
        return False

    def _is_subrepo(self, path):
        dirname, basename = os.path.split(os.path.normpath(path))
        return basename in self._ignored_subrepos.get(dirname, set())

    def is_ignored_dir(self, path, ignore_subrepos=True):
        path = os.path.abspath(path)
        if not ignore_subrepos:
            # When traversing subrepos, only subrepo roots stay "ignored".
            return not self._is_subrepo(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        path = os.path.abspath(path)
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)
        if path.startswith("..") or (
            os.name == "nt"
            and not os.path.commonprefix(
                [os.path.abspath(path), self.root_dir]
            )
        ):
            return True
        return False

    def check_ignore(self, target):
        """Return a CheckIgnoreResult with the patterns matching `target`."""
        full_target = os.path.abspath(target)
        if not self._outside_repo(full_target):
            dirname, basename = os.path.split(
                os.path.normpath(full_target)
            )
            pattern = self._get_trie_pattern(dirname)
            if pattern:
                matches = pattern.match_details(
                    dirname, basename, os.path.isdir(full_target)
                )

                if matches:
                    return CheckIgnoreResult(target, True, matches)
        return _no_match(target)

    def is_ignored(self, path):
        # NOTE: can't use self.check_ignore(path).match for now, see
        # https://github.com/iterative/dvc/issues/4555
        return self.is_ignored_dir(path) or self.is_ignored_file(path)
class RepoFileSystem(FileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcFileSystem()`.
    """

    sep = os.sep

    scheme = "local"
    PARAM_CHECKSUM = "md5"
    # Keys understood by ``config`` / ``_repo_from_fs_config``:
    PARAM_REPO_URL = "repo_url"
    PARAM_REPO_ROOT = "repo_root"
    PARAM_REV = "rev"
    PARAM_CACHE_DIR = "cache_dir"
    PARAM_CACHE_TYPES = "cache_types"
    PARAM_SUBREPOS = "subrepos"

    def __init__(
        self,
        repo: Optional["Repo"] = None,
        subrepos=False,
        repo_factory: RepoFactory = None,
        **kwargs,
    ):
        super().__init__()

        from dvc.utils.collections import PathStringTrie

        # Without an explicit repo, rebuild one from the PARAM_* kwargs
        # (the same dict that the ``config`` property emits).
        if repo is None:
            repo, repo_factory = self._repo_from_fs_config(
                subrepos=subrepos, **kwargs
            )

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo
        else:
            self.repo_factory = repo_factory

        self._main_repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self.root_dir: str = repo.root_dir
        self._traverse_subrepos = subrepos

        self._subrepos_trie = PathStringTrie()
        """Keeps track of each and every path with the corresponding repo."""

        self._subrepos_trie[self.root_dir] = repo

        self._dvcfss = {}
        """Keep a dvcfs instance of each repo."""

        # Only DVC repos (which have a dvc_dir) get a DvcFileSystem;
        # a plain git repo does not.
        if hasattr(repo, "dvc_dir"):
            self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)

    @property
    def repo_url(self):
        """URL of the main repo, or None when there is no main repo."""
        if self._main_repo is None:
            return None
        return self._main_repo.url

    @property
    def config(self):
        """PARAM_* dict from which ``_repo_from_fs_config`` can rebuild an
        equivalent filesystem."""
        return {
            self.PARAM_REPO_URL: self.repo_url,
            self.PARAM_REPO_ROOT: self.root_dir,
            self.PARAM_REV: getattr(self._main_repo.fs, "rev", None),
            self.PARAM_CACHE_DIR: os.path.abspath(
                self._main_repo.odb.local.cache_dir
            ),
            self.PARAM_CACHE_TYPES: self._main_repo.odb.local.cache_types,
            self.PARAM_SUBREPOS: self._traverse_subrepos,
        }

    @classmethod
    def _repo_from_fs_config(
        cls, **config
    ) -> Tuple["Repo", Optional["RepoFactory"]]:
        """Instantiate a Repo (and optional subrepo factory) from a PARAM_*
        config dict.  Requires at least a repo URL or a repo root."""
        from dvc.external_repo import erepo_factory, external_repo
        from dvc.repo import Repo

        url = config.get(cls.PARAM_REPO_URL)
        root = config.get(cls.PARAM_REPO_ROOT)
        assert url or root

        def _open(*args, **kwargs):
            # NOTE: if original repo was an erepo (and has a URL),
            # we cannot use Repo.open() since it will skip erepo
            # cache/remote setup for local URLs
            if url is None:
                return Repo.open(*args, **kwargs)
            return external_repo(*args, **kwargs)

        cache_dir = config.get(cls.PARAM_CACHE_DIR)
        cache_config = (
            {}
            if not cache_dir
            else {
                "cache": {
                    "dir": cache_dir,
                    "type": config.get(cls.PARAM_CACHE_TYPES),
                }
            }
        )

        repo_kwargs: dict = {
            "rev": config.get(cls.PARAM_REV),
            "subrepos": config.get(cls.PARAM_SUBREPOS, False),
            "uninitialized": True,
        }

        factory: Optional["RepoFactory"] = None
        if url is None:
            # Local repo: cache settings go through regular repo config.
            repo_kwargs["config"] = cache_config
        else:
            # External repo: cache dir is passed directly and the factory
            # carries the cache config into subrepos.
            repo_kwargs["cache_dir"] = cache_dir
            factory = erepo_factory(url, cache_config)

        with _open(
            url if url else root,
            **repo_kwargs,
        ) as repo:
            return repo, factory

    def _get_repo(self, path: str) -> Optional["Repo"]:
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's parents
        and then returns the appropriate one.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Register every directory between the known prefix and `path`,
        # top-down, so subrepos in between are discovered.
        parents = (parent for parent in self.path.parents(path))
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(
                    d,
                    fs=self._main_repo.fs,
                    repo_factory=self.repo_factory,
                )
                self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)
            # Every visited dir maps to the innermost repo seen so far.
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo."""
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        return self._main_repo.fs.isdir(repo_path)

    def _get_fs_pair(
        self, path
    ) -> Tuple[FileSystem, Optional[DvcFileSystem], str]:
        """
        Returns a pair of fss based on repo the path falls in, using prefix.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_fs = self._dvcfss.get(repo.root_dir)
        if path.startswith(repo.root_dir):
            # Path relative to the owning repo root, for DvcFileSystem use.
            dvc_path = path[len(repo.root_dir) + 1 :]
        else:
            dvc_path = path

        return repo.fs, dvc_fs, dvc_path

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-renamed
        """Open *path* from the working tree, falling back to the dvc fs
        (e.g. a not-checked-out output) when the file is missing on disk."""
        if "b" in mode:
            encoding = None

        fs, dvc_fs, dvc_path = self._get_fs_pair(path)
        try:
            return fs.open(path, mode=mode, encoding=encoding)
        except FileNotFoundError:
            if not dvc_fs:
                raise

        return dvc_fs.open(dvc_path, mode=mode, encoding=encoding, **kwargs)

    def exists(self, path) -> bool:
        """True when *path* exists either on disk or in the dvc fs (and is
        not dvcignored)."""
        path = os.path.abspath(path)
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        if not dvc_fs:
            return fs.exists(path)

        # NOTE(review): `is_ignored` is called here with (fs, path), but the
        # DvcIgnoreFilter visible in this file takes only (path) — confirm
        # which dvcignore implementation `repo.dvcignore` provides.
        if dvc_fs.repo.dvcignore.is_ignored(fs, path):
            return False

        if fs.exists(path):
            return True

        if not dvc_fs.exists(dvc_path):
            return False

        # A dvc-only path is shadowed if any existing on-disk ancestor is
        # not a directory; missing ancestors are skipped.
        for p in self.path.parents(path):
            try:
                if fs.info(p)["type"] != "directory":
                    return False
            except FileNotFoundError:
                continue

        return True

    def isdir(self, path):  # pylint: disable=arguments-renamed
        """True when *path* is a directory on disk or in the dvc fs."""
        path = os.path.abspath(path)
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_dir(path):
            return False

        try:
            info = fs.info(path)
            return info["type"] == "directory"
        except (OSError, ValueError):
            # from CPython's os.path.isdir()
            pass

        if not dvc_fs:
            return False

        try:
            info = dvc_fs.info(dvc_path)
        except FileNotFoundError:
            return False

        # Same ancestor-shadowing check as in ``exists``.
        for p in self.path.parents(path):
            try:
                if fs.info(p)["type"] != "directory":
                    return False
            except FileNotFoundError:
                continue

        return info["type"] == "directory"

    def isdvc(self, path, **kwargs):
        """True when *path* is tracked by DVC (has a corresponding output)."""
        _, dvc_fs, dvc_path = self._get_fs_pair(path)
        return dvc_fs is not None and dvc_fs.isdvc(dvc_path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-renamed
        """True when *path* is a file on disk or in the dvc fs."""
        path = os.path.abspath(path)
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_file(path):
            return False

        try:
            info = fs.info(path)
            return info["type"] == "file"
        except (OSError, ValueError):
            # from CPython's os.path.isfile()
            pass

        if not dvc_fs:
            return False

        try:
            info = dvc_fs.info(dvc_path)
        except FileNotFoundError:
            return False

        # Same ancestor-shadowing check as in ``exists``.
        for p in self.path.parents(path):
            try:
                if fs.info(p)["type"] != "directory":
                    return False
            except FileNotFoundError:
                continue

        return info["type"] == "file"

    def _dvc_walk(self, walk):
        """Drain one subtree from a shared topdown dvc walk generator:
        yields the current entry, then recurses once per subdir."""
        try:
            root, dirs, files = next(walk)
        except StopIteration:
            return
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

        NOTE: subrepo will only be discovered when walking if
        ignore_subrepos is set to False.
        """
        fs, dvc_fs, dvc_path = self._get_fs_pair(dir_path)
        fs_walk = fs.walk(dir_path, topdown=True)
        if dvc_fs:
            dvc_walk = _wrap_walk(dvc_fs, dvc_path, topdown=True, **kwargs)
        else:
            dvc_walk = None
        yield from self._walk(fs_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one directory level of the repo walk and the dvc walk,
        then recurse, keeping the two generators in lockstep on dirs that
        exist in both."""
        from dvc.dvcfile import is_valid_filename
        from dvc.ignore import DvcIgnore

        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (
                next(dvc_walk) if dvc_walk else (None, [], [])
            )
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # Unless dvcfiles is requested, hide .dvc files and .dvcignore.
        def _func(fname):
            if dvcfiles:
                return True

            return not (
                is_valid_filename(fname) or fname == DvcIgnore.DVCIGNORE_FILE
            )

        # merge file lists
        files = set(filter(_func, dvc_fnames + repo_fnames))

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent it from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk and merge both DVC and repo fss.

        Args:
            top: path to walk from
            topdown: if True, fs will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        repo = self._get_repo(os.path.abspath(top))
        dvcfiles = kwargs.pop("dvcfiles", False)

        fs, dvc_fs, dvc_path = self._get_fs_pair(top)
        repo_exists = fs.exists(top)
        repo_walk = repo.dvcignore.walk(
            fs, top, topdown=topdown, onerror=onerror, **kwargs
        )

        if not dvc_fs or (repo_exists and dvc_fs.isdvc(dvc_path)):
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        if not repo_exists:
            # dvc-only subtree: stream the dvc walk directly.
            # NOTE(review): the merged _walk below is presumably empty in
            # this case (repo_walk yields nothing) — confirm no duplication.
            yield from _wrap_walk(
                dvc_fs, dvc_path, topdown=topdown, onerror=onerror, **kwargs
            )

        dvc_walk = None
        if dvc_fs.exists(dvc_path):
            dvc_walk = _wrap_walk(
                dvc_fs, dvc_path, topdown=topdown, onerror=onerror, **kwargs
            )

        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def find(self, path, prefix=None):
        """Yield every file path under *path* via ``walk``.  *prefix* is
        accepted for interface compatibility and not used here."""
        for root, _, files in self.walk(path):
            for fname in files:
                yield self.path.join(root, fname)

    def get_file(
        self, from_info, to_file, callback=DEFAULT_CALLBACK, **kwargs
    ):
        """Copy *from_info* to local *to_file*, falling back to the dvc fs
        when the source is missing on disk."""
        fs, dvc_fs, dvc_path = self._get_fs_pair(from_info)
        try:
            fs.get_file(  # pylint: disable=protected-access
                from_info, to_file, callback=callback, **kwargs
            )
            return
        except FileNotFoundError:
            if not dvc_fs:
                raise

        dvc_fs.get_file(  # pylint: disable=protected-access
            dvc_path, to_file, callback=callback, **kwargs
        )

    def info(self, path):
        """Return fs info for *path*, annotated with DVC metadata
        (repo, isout, outs, isdvc, meta, isexec) when available."""
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        # NOTE(review): assumes dvc_fs is not None here — this raises
        # AttributeError (not FileNotFoundError) for a plain git repo with
        # no DvcFileSystem; confirm callers guarantee a dvc repo.
        try:
            dvc_info = dvc_fs.info(dvc_path)
        except FileNotFoundError:
            dvc_info = None

        try:
            from dvc.utils import is_exec

            fs_info = fs.info(path)
            fs_info["repo"] = dvc_fs.repo
            fs_info["isout"] = (
                dvc_info.get("isout", False) if dvc_info else False
            )
            fs_info["outs"] = dvc_info["outs"] if dvc_info else None
            fs_info["isdvc"] = dvc_info["isdvc"] if dvc_info else False
            fs_info["meta"] = dvc_info.get("meta") if dvc_info else None
            isexec = False
            if dvc_info:
                isexec = dvc_info["isexec"]
            elif fs_info["type"] == "file":
                isexec = is_exec(fs_info["mode"])
            fs_info["isexec"] = isexec
            return fs_info
        except FileNotFoundError:
            if not dvc_info:
                raise

        # Path exists only in the dvc fs.
        dvc_info["repo"] = dvc_fs.repo
        dvc_info["isdvc"] = True
        return dvc_info

    def checksum(self, path):
        """Checksum from the working tree, falling back to the dvc fs.

        NOTE(review): if *path* is missing on disk and dvc_fs is None this
        raises AttributeError — confirm callers only use this on dvc repos.
        """
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        try:
            return fs.checksum(path)
        except FileNotFoundError:
            return dvc_fs.checksum(dvc_path)
class DvcIgnoreFilter:
    """Collects ignore files found under *root_dir* into a prefix trie and
    answers ignore queries for paths beneath it."""

    @staticmethod
    def _is_dvc_repo(root, directory):
        """True when ``root/directory`` contains a DVC metadata dir."""
        from dvc.repo import Repo

        return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

    def __init__(self, tree, root_dir):
        from dvc.repo import Repo

        default_ignore_patterns = [
            ".hg/",
            ".git/",
            "{}/".format(Repo.DVC_DIR),
        ]

        self.tree = tree
        self.root_dir = root_dir
        # Maps a directory prefix to the merged patterns applying below it.
        self.ignores_trie_tree = PathStringTrie()
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            default_ignore_patterns, root_dir
        )
        # Maps a parent dir to the set of subrepo basenames inside it.
        self._ignored_subrepos = PathStringTrie()
        self._update(self.root_dir)

    def _merged(self, base, extra):
        """Return a DvcIgnorePatterns combining *base* with *extra*."""
        return DvcIgnorePatterns(
            *merge_patterns(
                base.pattern_list,
                base.dirname,
                extra.pattern_list,
                extra.dirname,
            )
        )

    def _update(self, dirname):
        """Fold *dirname*'s ignore file (if present and not itself ignored)
        into the trie, then register any subrepos among its children."""
        inherited = self.ignores_trie_tree.longest_prefix(dirname).value
        file_ignored = inherited.matches(
            dirname, DvcIgnore.DVCIGNORE_FILE, False
        )

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not file_ignored and self.tree.exists(
            ignore_file_path, use_dvcignore=False
        ):
            loaded = DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
            if inherited:
                self.ignores_trie_tree[dirname] = self._merged(
                    inherited, loaded
                )
            else:
                self.ignores_trie_tree[dirname] = loaded
        elif inherited:
            # Pin the inherited patterns at this exact prefix.
            self.ignores_trie_tree[dirname] = inherited

        # NOTE: using `walk` + `break` because tree doesn't have `listdir()`
        for root, dirs, _ in self.tree.walk(dirname, use_dvcignore=False):
            self._update_sub_repo(root, dirs)
            break

    def _update_sub_repo(self, root, dirs):
        """Record each subrepo in *dirs* and add a pattern ignoring it."""
        for subdir in dirs:
            if not self._is_dvc_repo(root, subdir):
                continue
            self._ignored_subrepos[root] = self._ignored_subrepos.get(
                root, set()
            ) | {subdir}
            subrepo_pattern = DvcIgnorePatterns([f"/{subdir}/"], root)
            current = self.ignores_trie_tree.longest_prefix(root).value
            if current:
                self.ignores_trie_tree[root] = self._merged(
                    current, subrepo_pattern
                )
            else:
                self.ignores_trie_tree[root] = subrepo_pattern

    def __call__(self, root, dirs, files, ignore_subrepos=True):
        """os.walk-style hook: filter *dirs*/*files* under *root*."""
        pattern = self._get_trie_pattern(root)
        if pattern:
            dirs, files = pattern(root, dirs, files)
        if not ignore_subrepos:
            dirs.extend(self._ignored_subrepos.get(root, []))
        return dirs, files

    def _get_trie_pattern(self, dirname):
        """Return merged patterns for *dirname*, lazily registering every
        directory between the nearest known prefix and *dirname*."""
        cached = self.ignores_trie_tree.get(dirname)
        if cached:
            return cached

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        ancestors = (parent.fspath for parent in PathInfo(dirname).parents)
        pending = list(takewhile(lambda path: path != prefix, ancestors))
        pending.reverse()
        pending.append(dirname)

        for directory in pending:
            self._update(directory)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        """True when *path* matches an ignore pattern of its parent dir."""
        if self._outside_repo(path):
            return False
        dirname, basename = os.path.split(os.path.normpath(path))
        pattern = self._get_trie_pattern(dirname)
        return (
            pattern.matches(dirname, basename, is_dir) if pattern else False
        )

    def _is_subrepo(self, path):
        """True when *path*'s basename is a recorded subrepo of its parent."""
        dirname, basename = os.path.split(os.path.normpath(path))
        return basename in self._ignored_subrepos.get(dirname, set())

    def is_ignored_dir(self, path, ignore_subrepos=True):
        """True when directory *path* is ignored; the root never is."""
        path = os.path.abspath(path)
        if not ignore_subrepos:
            return not self._is_subrepo(path)
        return (
            False if path == self.root_dir else self._is_ignored(path, True)
        )

    def is_ignored_file(self, path):
        """True when file *path* is ignored."""
        return self._is_ignored(os.path.abspath(path), False)

    def _outside_repo(self, path):
        """True when *path* does not live under ``self.root_dir``."""
        # paths outside of the repo should be ignored
        rel = relpath(PathInfo(path), self.root_dir)
        if rel.startswith(".."):
            return True
        if os.name == "nt" and not os.path.commonprefix(
            [os.path.abspath(rel), self.root_dir]
        ):
            return True
        return False

    def check_ignore(self, target):
        """Describe which pattern(s) match *target*, git-check-ignore
        style; a non-matching result otherwise."""
        full_target = os.path.abspath(target)
        if not self._outside_repo(full_target):
            dirname, basename = os.path.split(os.path.normpath(full_target))
            pattern = self._get_trie_pattern(dirname)
            if pattern:
                matches = pattern.match_details(
                    dirname, basename, os.path.isdir(full_target)
                )
                if matches:
                    return CheckIgnoreResult(target, True, matches)
        return CheckIgnoreResult(target, False, ["::"])