Exemplo n.º 1
0
    def _update_sub_repo(self, path, ignore_trie: PathStringTrie):
        """Register an ignore pattern for ``path`` when it holds a DVC dir.

        Nothing happens when ``path`` is ``self.root_dir`` or when it does
        not contain ``Repo.DVC_DIR``.  Otherwise a ``/<dirname>/`` pattern is
        stored in ``ignore_trie`` under the parent directory of ``path``,
        merged with whatever patterns the longest matching prefix already
        carries.

        Args:
            path: directory to inspect.
            ignore_trie: trie of ignore patterns keyed by directory.
        """
        from dvc.repo import Repo

        if path == self.root_dir or not os.path.exists(
            os.path.join(path, Repo.DVC_DIR)
        ):
            return

        parent, name = os.path.split(path)
        subrepo_patterns = DvcIgnorePatterns(
            [PatternInfo(f"/{name}/", f"in sub_repo:{name}")], parent
        )
        inherited = ignore_trie.longest_prefix(parent).value
        if not inherited:
            ignore_trie[parent] = subrepo_patterns
            return

        merged = merge_patterns(
            inherited.pattern_list,
            inherited.dirname,
            subrepo_patterns.pattern_list,
            subrepo_patterns.dirname,
        )
        ignore_trie[parent] = DvcIgnorePatterns(*merged)
Exemplo n.º 2
0
    def _update_trie(self, dirname: str, trie: PathStringTrie) -> None:
        """Store the ignore patterns that apply to ``dirname`` in ``trie``.

        The patterns of the longest matching prefix are looked up first.  If
        they do not exclude the ignore file of ``dirname`` and that file
        exists on ``self.fs``, its patterns are loaded and merged with the
        inherited ones; otherwise the inherited patterns (if any) are stored
        for ``dirname`` unchanged.

        Args:
            dirname: directory whose trie entry is being updated.
            trie: trie of ignore patterns keyed by directory.
        """
        old_pattern = trie.longest_prefix(dirname).value
        # The prefix lookup may yield no value (the checks further down
        # already allow for that); in that case nothing can have excluded
        # the ignore file, so do not dereference the missing object.
        matches = old_pattern is not None and old_pattern.matches(
            dirname, DvcIgnore.DVCIGNORE_FILE, False
        )

        path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not matches and self.fs.exists(path):
            name = os.path.relpath(path, self.root_dir)
            new_pattern = DvcIgnorePatterns.from_file(path, self.fs, name)
            if old_pattern:
                trie[dirname] = DvcIgnorePatterns(*merge_patterns(
                    old_pattern.pattern_list,
                    old_pattern.dirname,
                    new_pattern.pattern_list,
                    new_pattern.dirname,
                ))
            else:
                trie[dirname] = new_pattern
        elif old_pattern:
            trie[dirname] = old_pattern
Exemplo n.º 3
0
class RepoFileSystem(BaseFileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcFileSystem()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(
        self,
        repo=None,
        subrepos=False,
        repo_factory: RepoFactory = None,
    ):
        """Set up the per-repo bookkeeping for the main ``repo``.

        Args:
            repo: main repo.  Its ``fs`` and ``root_dir`` are read
                unconditionally, so the ``None`` default cannot actually be
                used.
            subrepos: flag later consulted by ``_is_dvc_repo``.
            repo_factory: callable used to create subrepos; ``dvc.repo.Repo``
                is used when this is falsy.
        """
        super().__init__()

        from dvc.utils.collections import PathStringTrie

        if repo_factory:
            self.repo_factory: RepoFactory = repo_factory
        else:
            from dvc.repo import Repo

            self.repo_factory = Repo

        self._main_repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = PathStringTrie()
        self._subrepos_trie[self.root_dir] = repo

        # Keeps a dvcfs instance of each repo, keyed by the repo root.
        self._dvcfss = {}
        if hasattr(repo, "dvc_dir"):
            self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)

    def _get_repo(self, path: str) -> Optional["Repo"]:
        """Return the repo that ``path`` belongs to, or None.

        A path already present in the trie is answered directly.  Otherwise
        the directories between the longest known prefix and ``path`` are
        passed to ``_update`` (top-most first) and the trie is queried again.
        None is returned when no prefix of ``path`` is known.
        """
        known = self._subrepos_trie.get(path)
        if known:
            return known

        prefix, nearest = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Collect ``path`` and its parents up to (excluding) the prefix.
        pending = [path]
        for parent in PathInfo(path).parents:
            parent_path = parent.fspath
            if parent_path == prefix:
                break
            pending.append(parent_path)
        pending.reverse()

        self._update(pending, starting_repo=nearest)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Record the owning repo for each of ``dirs`` in the trie.

        ``dirs`` is processed in order.  Whenever ``_is_dvc_repo`` reports a
        directory as a DVC repo, a new repo is created through
        ``self.repo_factory`` and becomes the owner of that directory and of
        the ones that follow; a ``DvcFileSystem`` for it is stored as well.
        """
        current = starting_repo
        for dir_path in dirs:
            if not self._is_dvc_repo(dir_path):
                self._subrepos_trie[dir_path] = current
                continue

            current = self.repo_factory(
                dir_path,
                scm=self._main_repo.scm,
                rev=self._main_repo.get_rev(),
                repo_factory=self.repo_factory,
            )
            self._dvcfss[current.root_dir] = DvcFileSystem(repo=current)
            self._subrepos_trie[dir_path] = current

    def _is_dvc_repo(self, dir_path):
        """Tell whether ``dir_path`` holds a ``Repo.DVC_DIR`` directory.

        Always False when subrepo traversal is disabled.
        """
        if self._traverse_subrepos:
            from dvc.repo import Repo

            return self._main_repo.fs.isdir(
                os.path.join(dir_path, Repo.DVC_DIR)
            )
        return False

    def _get_fs_pair(self,
                     path) -> Tuple[BaseFileSystem, Optional[DvcFileSystem]]:
        """Return ``(fs, dvc_fs)`` of the repo that ``path`` falls in.

        ``path`` is made absolute first.  ``dvc_fs`` is None when no
        ``DvcFileSystem`` is registered for the repo's root.
        """
        abs_path = os.path.abspath(path)

        # A path outside of every known repo has no owner; the top-level
        # repo is used for it instead.
        owner = self._get_repo(abs_path)
        if not owner:
            owner = self._main_repo

        dvc_fs = self._dvcfss.get(owner.root_dir)
        return owner.fs, dvc_fs

    def open(self, path, mode="r", encoding="utf-8", **kwargs):  # pylint: disable=arguments-differ
        """Open ``path`` from the repo fs, falling back to the DVC fs.

        The encoding is dropped for binary modes.  ``kwargs`` are forwarded
        to the DVC fs only.  FileNotFoundError from the repo fs is re-raised
        when the repo has no DVC fs.
        """
        effective_encoding = None if "b" in mode else encoding

        fs, dvc_fs = self._get_fs_pair(path)
        path_info = PathInfo(path)
        try:
            return fs.open(path_info, mode=mode, encoding=effective_encoding)
        except FileNotFoundError:
            if not dvc_fs:
                raise

        return dvc_fs.open(
            path_info, mode=mode, encoding=effective_encoding, **kwargs
        )

    def exists(self, path_info) -> bool:
        """Tell whether ``path_info`` exists in the repo fs or the DVC fs.

        Without a DVC fs the repo fs alone decides.  With one, ignored paths
        do not exist; a path missing from the repo fs exists only if the DVC
        fs has metadata for it and the single output behind that metadata is
        itself absent from the repo fs.
        """
        fs, dvc_fs = self._get_fs_pair(path_info)

        if not dvc_fs:
            return fs.exists(path_info)

        ignored = dvc_fs.repo.dvcignore.is_ignored(fs, path_info)
        if ignored:
            return False

        if fs.exists(path_info):
            return True

        try:
            meta = dvc_fs.metadata(path_info)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        return not fs.exists(out.path_info)

    def isdir(self, path):  # pylint: disable=arguments-differ
        """Tell whether ``path`` is a directory in the repo fs or the DVC fs.

        Ignored directories are reported as False.  The repo fs ``stat`` is
        tried first; when it fails the answer comes from the DVC metadata,
        unless the single output behind it exists in the repo fs.
        """
        fs, dvc_fs = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_dir(path):
            return False

        try:
            return stat.S_ISDIR(fs.stat(path).st_mode)
        except (OSError, ValueError):
            # same exceptions as CPython's os.path.isdir() swallows
            pass

        if not dvc_fs:
            return False

        try:
            meta = dvc_fs.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        return False if fs.exists(out.path_info) else meta.isdir

    def isdvc(self, path, **kwargs):
        """Delegate to the DVC fs ``isdvc``; False when there is no DVC fs."""
        dvc_fs = self._get_fs_pair(path)[1]
        if dvc_fs is None:
            return False
        return dvc_fs.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        """Tell whether ``path`` is a regular file in the repo or DVC fs.

        Ignored files are reported as False.  The repo fs ``stat`` is tried
        first; when it fails the answer comes from the DVC metadata, unless
        the single output behind it exists in the repo fs.
        """
        fs, dvc_fs = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_file(path):
            return False

        try:
            return stat.S_ISREG(fs.stat(path).st_mode)
        except (OSError, ValueError):
            # same exceptions as CPython's os.path.isfile() swallows
            pass

        if not dvc_fs:
            return False

        try:
            meta = dvc_fs.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        return False if fs.exists(out.path_info) else meta.isfile

    def isexec(self, path_info):
        """Ask the DVC fs when it knows ``path_info``, else the repo fs."""
        fs, dvc_fs = self._get_fs_pair(path_info)
        use_dvc = dvc_fs and dvc_fs.exists(path_info)
        source = dvc_fs if use_dvc else fs
        return source.isexec(path_info)

    def stat(self, path):
        """Return the repo fs ``stat`` result for ``path``."""
        return self._get_fs_pair(path)[0].stat(path)

    def _dvc_walk(self, walk):
        """Re-yield one entry of ``walk`` and recurse once per sub-directory.

        Stops silently when ``walk`` is exhausted.  ``dirs`` is iterated
        after the entry has been yielded, so changes the consumer makes to
        that list are honoured.
        """
        exhausted = object()
        entry = next(walk, exhausted)
        if entry is exhausted:
            return

        root, dirs, files = entry
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk ``dir_path`` using the fs pair of the repo that owns it.

        The repo fs is walked top-down; the DVC fs is walked alongside it
        when one is registered for that repo.  ``kwargs`` go both to the DVC
        fs walk and to ``_walk``.
        """
        fs, dvc_fs = self._get_fs_pair(dir_path)
        fs_walk = fs.walk(dir_path, topdown=True)
        dvc_walk = (
            dvc_fs.walk(dir_path, topdown=True, **kwargs) if dvc_fs else None
        )
        yield from self._walk(fs_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one level of ``repo_walk`` and ``dvc_walk`` and recurse.

        Args:
            repo_walk: walk generator over the repo fs (required).
            dvc_walk: optional walk generator over the DVC fs for the same
                directory.
            dvcfiles: when True, file names are not filtered; otherwise
                names accepted by ``is_valid_filename`` and the
                ``DvcIgnore.DVCIGNORE_FILE`` name are left out.

        Yields:
            ``(root, dirs, files)`` for the current directory, followed by
            the entries of its sub-directories.
        """
        from dvc.dvcfile import is_valid_filename
        from dvc.ignore import DvcIgnore

        assert repo_walk
        try:
            # Advance both generators by one entry; either one running dry
            # ends this level.
            _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else
                                       (None, [], []))
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        def _func(fname):
            # Keep every name when dvcfiles is set, otherwise drop DVC
            # files and the ignore file.
            if dvcfiles:
                return True

            return not (is_valid_filename(fname)
                        or fname == DvcIgnore.DVCIGNORE_FILE)

        # merge file lists
        files = set(filter(_func, dvc_fnames + repo_fnames))

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent it from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                # Subrepos were removed from repo_dirs above and get a
                # separate walk with the fs pair of their own repo.
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk and merge both DVC and repo fss.

        Args:
            top: path to walk from
            topdown: if True, fs will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.

        Yields:
            ``(root, dirs, files)`` entries.  Nothing is yielded when ``top``
            does not exist or is not a directory; ``onerror`` (if given) is
            called with the matching exception instead.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        # `_get_repo` returns None when no known prefix covers the path.
        # Fall back to the top-level repo, as `_get_fs_pair` does, instead
        # of failing on `None.dvcignore` below.
        repo = self._get_repo(os.path.abspath(top)) or self._main_repo
        dvcfiles = kwargs.pop("dvcfiles", False)

        fs, dvc_fs = self._get_fs_pair(top)
        repo_exists = fs.exists(top)

        repo_walk = repo.dvcignore.walk(fs,
                                        top,
                                        topdown=topdown,
                                        onerror=onerror,
                                        **kwargs)

        # Without a DVC fs, or when `top` itself is a DVC output present in
        # the repo fs, only the repo fs is walked.
        if not dvc_fs or (repo_exists and dvc_fs.isdvc(top)):
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        # NOTE(review): there is no `return` after this DVC-only walk, so
        # the merged walk below still runs for a `top` missing from the repo
        # fs - confirm whether that is intended.
        if not repo_exists:
            yield from dvc_fs.walk(top,
                                   topdown=topdown,
                                   onerror=onerror,
                                   **kwargs)

        dvc_walk = None
        if dvc_fs.exists(top):
            dvc_walk = dvc_fs.walk(top,
                                   topdown=topdown,
                                   onerror=onerror,
                                   **kwargs)

        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def walk_files(self, path_info, **kwargs):
        """Yield a ``PathInfo`` for every file found by ``walk``."""
        for root, _, files in self.walk(path_info, **kwargs):
            yield from (PathInfo(root) / fname for fname in files)

    def _download(self,
                  from_info,
                  to_file,
                  name=None,
                  no_progress_bar=False,
                  **kwargs):
        """Copy ``from_info`` of this fs into the local file ``to_file``.

        Writes go through a ``Tqdm`` wrapper labelled ``name`` (disabled via
        ``no_progress_bar``); ``kwargs`` are forwarded to ``self.open``.
        """
        import shutil

        from dvc.progress import Tqdm

        with open(to_file, "wb+") as to_fobj, \
                Tqdm.wrapattr(
                    to_fobj,
                    "write",
                    desc=name,
                    disable=no_progress_bar,
                ) as wrapped:
            with self.open(from_info, "rb", **kwargs) as from_fobj:
                shutil.copyfileobj(from_fobj, wrapped)

    def metadata(self, path):
        """Build the metadata object for ``path``.

        DVC metadata is preferred when the DVC fs knows the path; otherwise
        a fresh ``Metadata`` bound to the owning repo (or the main repo) is
        created.  ``isdir`` is additionally set from the repo fs ``stat``,
        and ``is_exec`` is filled from it when no DVC metadata exists.

        Raises:
            FileNotFoundError: neither the repo fs nor the DVC fs knows
                ``path``.
        """
        abspath = os.path.abspath(path)
        path_info = PathInfo(abspath)
        fs, dvc_fs = self._get_fs_pair(path_info)

        dvc_meta = None
        if dvc_fs:
            try:
                dvc_meta = dvc_fs.metadata(path_info)
            except FileNotFoundError:
                pass

        try:
            stat_result = fs.stat(path_info)
        except FileNotFoundError:
            stat_result = None

        if not (stat_result or dvc_meta):
            raise FileNotFoundError

        from ._metadata import Metadata

        if dvc_meta:
            meta = dvc_meta
        else:
            meta = Metadata(
                path_info=path_info,
                repo=self._get_repo(abspath) or self._main_repo,
            )

        isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or isdir

        if not dvc_meta:
            from dvc.utils import is_exec

            meta.is_exec = bool(stat_result) and is_exec(stat_result.st_mode)
        return meta

    def info(self, path_info):
        """Return the repo fs info for ``path_info``, else the DVC fs info.

        Raises:
            FileNotFoundError: the repo fs does not know the path and there
                is no DVC fs to fall back to (or the DVC fs raises it too).
        """
        fs, dvc_fs = self._get_fs_pair(path_info)

        try:
            return fs.info(path_info)
        except FileNotFoundError:
            # `dvc_fs` is None for repos without a DVC fs; keep the original
            # error rather than failing on `None.info`.
            if not dvc_fs:
                raise

        return dvc_fs.info(path_info)
Exemplo n.º 4
0
class _RepoFileSystem(AbstractFileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcFileSystem()`.
    """

    PARAM_REPO_URL = "repo_url"
    PARAM_REPO_ROOT = "repo_root"
    PARAM_REV = "rev"
    PARAM_CACHE_DIR = "cache_dir"
    PARAM_CACHE_TYPES = "cache_types"
    PARAM_SUBREPOS = "subrepos"

    def __init__(
        self,
        repo: Optional["Repo"] = None,
        subrepos=False,
        repo_factory: RepoFactory = None,
        **kwargs,
    ):
        """Set up the per-repo bookkeeping for the main repo.

        Args:
            repo: main repo; when None, the repo and factory are obtained
                from ``_repo_from_fs_config`` using ``subrepos`` and
                ``kwargs``.
            subrepos: flag later consulted by ``_is_dvc_repo``.
            repo_factory: callable used to create subrepos; ``dvc.repo.Repo``
                is used when this is falsy.
            kwargs: configuration forwarded to ``_repo_from_fs_config``.
        """
        super().__init__()

        from dvc.utils.collections import PathStringTrie

        if repo is None:
            repo, repo_factory = self._repo_from_fs_config(subrepos=subrepos,
                                                           **kwargs)

        if repo_factory:
            self.repo_factory: RepoFactory = repo_factory
        else:
            from dvc.repo import Repo

            self.repo_factory = Repo

        self.path = Path(self.sep)
        self.repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self._root_dir: str = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = PathStringTrie()
        self._subrepos_trie[self._root_dir] = repo

        # Keeps a dvcfs instance of each repo, keyed by the repo root.
        self._dvcfss = {}
        if hasattr(repo, "dvc_dir"):
            self._dvcfss[self._root_dir] = DvcFileSystem(repo=repo)

    @property
    def repo_url(self):
        """URL of the main repo, or None when no repo is set."""
        return None if self.repo is None else self.repo.url

    @property
    def config(self):
        """Dict of the ``PARAM_*`` settings describing this filesystem."""
        settings = {}
        settings[self.PARAM_REPO_URL] = self.repo_url
        settings[self.PARAM_REPO_ROOT] = self.repo.root_dir
        settings[self.PARAM_REV] = getattr(self.repo.fs, "rev", None)
        settings[self.PARAM_CACHE_DIR] = os.path.abspath(
            self.repo.odb.local.cache_dir
        )
        settings[self.PARAM_CACHE_TYPES] = self.repo.odb.local.cache_types
        settings[self.PARAM_SUBREPOS] = self._traverse_subrepos
        return settings

    @classmethod
    def _repo_from_fs_config(
            cls, **config) -> Tuple["Repo", Optional["RepoFactory"]]:
        """Open a repo described by ``PARAM_*`` entries of ``config``.

        Either ``PARAM_REPO_URL`` or ``PARAM_REPO_ROOT`` must be set.

        Returns:
            ``(repo, factory)``; ``factory`` is None when no URL is given
            and an ``erepo_factory`` otherwise.
        """
        from dvc.external_repo import erepo_factory, external_repo
        from dvc.repo import Repo

        url = config.get(cls.PARAM_REPO_URL)
        root = config.get(cls.PARAM_REPO_ROOT)
        assert url or root

        def _open(*args, **kwargs):
            # NOTE: if original repo was an erepo (and has a URL),
            # we cannot use Repo.open() since it will skip erepo
            # cache/remote setup for local URLs
            if url is None:
                return Repo.open(*args, **kwargs)
            return external_repo(*args, **kwargs)

        # The cache section is only built when a cache dir is configured.
        cache_dir = config.get(cls.PARAM_CACHE_DIR)
        cache_config = ({} if not cache_dir else {
            "cache": {
                "dir": cache_dir,
                "type": config.get(cls.PARAM_CACHE_TYPES),
            }
        })
        repo_kwargs: dict = {
            "rev": config.get(cls.PARAM_REV),
            "subrepos": config.get(cls.PARAM_SUBREPOS, False),
            "uninitialized": True,
        }
        factory: Optional["RepoFactory"] = None
        if url is None:
            # Plain repo: the cache settings travel inside its config.
            repo_kwargs["config"] = cache_config
        else:
            # External repo: the cache dir is passed directly and the
            # factory gets the cache settings.
            repo_kwargs["cache_dir"] = cache_dir
            factory = erepo_factory(url, cache_config)

        # NOTE(review): the repo is returned from inside the `with` block,
        # so the context manager exits before the caller uses it - confirm
        # the repo stays usable afterwards.
        with _open(
                url if url else root,
                **repo_kwargs,
        ) as repo:
            return repo, factory

    def _get_repo(self, path: str) -> "Repo":
        """Return the repo that ``path`` belongs to.

        A path already present in the trie is answered directly.  Otherwise
        the directories between the longest known prefix and ``path`` are
        passed to ``_update`` (top-most first) and the trie is queried again.
        The main repo is the answer whenever nothing else is found.
        """
        known = self._subrepos_trie.get(path)
        if known:
            return known

        prefix, nearest = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return self.repo

        # Collect ``path`` and its parents up to (excluding) the prefix.
        pending = [path]
        for parent in self.repo.fs.path.parents(path):
            if parent == prefix:
                break
            pending.append(parent)
        pending.reverse()

        self._update(pending, starting_repo=nearest)
        return self._subrepos_trie.get(path) or self.repo

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Record the owning repo for each of ``dirs`` in the trie.

        ``dirs`` is processed in order.  Whenever ``_is_dvc_repo`` reports a
        directory as a DVC repo, a new repo sharing the main repo's ``fs``
        is created through ``self.repo_factory`` and becomes the owner of
        that directory and of the ones that follow; a ``DvcFileSystem`` for
        it is stored as well.
        """
        current = starting_repo
        for dir_path in dirs:
            if not self._is_dvc_repo(dir_path):
                self._subrepos_trie[dir_path] = current
                continue

            current = self.repo_factory(
                dir_path,
                fs=self.repo.fs,
                repo_factory=self.repo_factory,
            )
            self._dvcfss[current.root_dir] = DvcFileSystem(repo=current)
            self._subrepos_trie[dir_path] = current

    def _is_dvc_repo(self, dir_path):
        """Tell whether ``dir_path`` holds a ``Repo.DVC_DIR`` directory.

        Always False when subrepo traversal is disabled.
        """
        if self._traverse_subrepos:
            from dvc.repo import Repo

            return self.repo.fs.isdir(os.path.join(dir_path, Repo.DVC_DIR))
        return False

    def _get_fs_pair(
        self, path
    ) -> Tuple[Optional[FileSystem], Optional[str], Optional[DvcFileSystem],
               Optional[str], ]:
        """
        Returns ``(fs, fs_path, dvc_fs, dvc_path)`` for the repo the path
        falls in, using prefix.

        ``fs``/``fs_path`` address the path in the owning repo's fs and
        ``dvc_fs``/``dvc_path`` address it in that repo's DVC fs.  For an
        absolute path outside of the main repo root, ``fs`` and ``fs_path``
        are None and the main repo's ``dvcfs`` is returned with the path
        unchanged.  ``dvc_fs`` and ``dvc_path`` are None when no DVC fs is
        registered for the owning repo.
        """
        from dvc.utils import as_posix

        if os.path.isabs(path):
            # Absolute paths inside the main repo are made relative to its
            # root; anything else cannot be served by a repo fs.
            if self.repo.fs.path.isin_or_eq(path, self.repo.root_dir):
                path = self.repo.fs.path.relpath(path, self.repo.root_dir)
            else:
                return None, None, self.repo.dvcfs, path

        path = as_posix(path)

        # Drop a leading "." component.
        parts = self.path.parts(path)
        if parts and parts[0] == os.curdir:
            parts = parts[1:]

        fs_path = self.repo.fs.path.join(self.repo.root_dir, *parts)
        repo = self._get_repo(fs_path)
        fs = repo.fs

        # Components leading from the main repo root to the owning repo.
        repo_parts = fs.path.relparts(repo.root_dir, self.repo.root_dir)
        if repo_parts[0] == os.curdir:
            repo_parts = repo_parts[1:]

        # What remains after that prefix is the path inside the owning repo.
        dvc_parts = parts[len(repo_parts):]
        if dvc_parts and dvc_parts[0] == os.curdir:
            dvc_parts = dvc_parts[1:]

        dvc_fs = self._dvcfss.get(repo.root_dir)
        if dvc_fs:
            dvc_path = dvc_fs.path.join(*dvc_parts) if dvc_parts else ""
        else:
            dvc_path = None

        return fs, fs_path, dvc_fs, dvc_path

    def open(self, path, mode="r", encoding="utf-8", **kwargs):  # pylint: disable=arguments-renamed, arguments-differ
        """Open ``path`` from the repo fs, falling back to the DVC fs.

        The encoding is dropped for binary modes.  ``kwargs`` are forwarded
        to the DVC fs only.

        Raises:
            FileNotFoundError: the repo fs does not have the file and there
                is no DVC fs to fall back to (or the DVC fs raises it too).
        """
        if "b" in mode:
            encoding = None

        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)
        # `_get_fs_pair` returns no repo fs for absolute paths outside of
        # the repo; go straight to the DVC fs then, as `get_file` does.
        if fs:
            try:
                return fs.open(fs_path, mode=mode, encoding=encoding)
            except FileNotFoundError:
                if not dvc_fs:
                    raise

        return dvc_fs.open(dvc_path, mode=mode, encoding=encoding, **kwargs)

    def isdvc(self, path, **kwargs):
        """Delegate to the DVC fs ``isdvc``; False when there is no DVC fs."""
        dvc_fs, dvc_path = self._get_fs_pair(path)[2:]
        if dvc_fs is None:
            return False
        return dvc_fs.isdvc(dvc_path, **kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List the entries of ``path`` from both the DVC fs and the repo fs.

        Entry names of both filesystems are merged.  Unless the ``dvcfiles``
        kwarg is true, names accepted by ``is_valid_filename`` and the
        ``DvcIgnore.DVCIGNORE_FILE`` name are left out.  Entries whose
        ``info`` lookup raises FileNotFoundError are skipped.

        Returns:
            The list of info dicts when ``detail`` is true, otherwise the
            list of entry paths.
        """
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        repo = dvc_fs.repo if dvc_fs else self.repo
        dvcignore = repo.dvcignore
        ignore_subrepos = kwargs.get("ignore_subrepos", True)

        names = set()
        if dvc_fs:
            try:
                for entry in dvc_fs.ls(dvc_path, detail=False):
                    names.add(dvc_fs.path.name(entry))
            except FileNotFoundError:
                pass

        if fs:
            with suppress(FileNotFoundError, NotADirectoryError):
                for entry in dvcignore.ls(fs,
                                          fs_path,
                                          detail=False,
                                          ignore_subrepos=ignore_subrepos):
                    names.add(fs.path.name(entry))

        dvcfiles = kwargs.get("dvcfiles", False)

        def _is_listed(fname):
            from dvc.dvcfile import is_valid_filename
            from dvc.ignore import DvcIgnore

            if dvcfiles:
                return True

            return not (is_valid_filename(fname)
                        or fname == DvcIgnore.DVCIGNORE_FILE)

        infos = []
        paths = []
        for name in names:
            if not _is_listed(name):
                continue
            entry_path = self.path.join(path, name)
            try:
                entry_info = self.info(entry_path,
                                       ignore_subrepos=ignore_subrepos)
            except FileNotFoundError:
                continue
            infos.append(entry_info)
            paths.append(entry_path)

        return infos if detail else paths

    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
        """Copy ``rpath`` to the local ``lpath``.

        The repo fs is tried first when there is one; the DVC fs is used
        when the repo fs is absent or does not have the file.
        FileNotFoundError from the repo fs is re-raised when there is no
        DVC fs.
        """
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(rpath)

        if fs:
            try:
                fs.get_file(  # pylint: disable=protected-access
                    fs_path, lpath, callback=callback, **kwargs)
            except FileNotFoundError:
                if not dvc_fs:
                    raise
            else:
                return

        dvc_fs.get_file(  # pylint: disable=protected-access
            dvc_path, lpath, callback=callback, **kwargs)

    def info(self, path, **kwargs):
        """Return the merged repo fs / DVC fs info dict for ``path``.

        The ``ignore_subrepos`` kwarg (default True) is passed to the ignore
        check.  Repo fs info is discarded for ignored paths.  DVC info is
        discarded when some parent of the path is a non-directory in the
        repo fs.  The ``name`` entry of the result is set to ``path``.

        Raises:
            FileNotFoundError: neither filesystem has usable info.
            NotADirectoryError: raised by the repo fs while no DVC info is
                available.
        """
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        repo = dvc_fs.repo if dvc_fs else self.repo
        dvcignore = repo.dvcignore
        ignore_subrepos = kwargs.get("ignore_subrepos", True)

        dvc_info = None
        if dvc_fs:
            try:
                dvc_info = dvc_fs.info(dvc_path)
            except FileNotFoundError:
                pass

        fs_info = None
        if fs:
            try:
                fs_info = fs.info(fs_path)
                if dvcignore.is_ignored(fs,
                                        fs_path,
                                        ignore_subrepos=ignore_subrepos):
                    fs_info = None
            except (FileNotFoundError, NotADirectoryError):
                if not dvc_info:
                    raise

        # NOTE: if some parent in fs_path turns out to be a file, it means
        # that the whole repofs branch doesn't exist.
        if fs and not fs_info and dvc_info:
            for parent in fs.path.parents(fs_path):
                try:
                    if fs.info(parent)["type"] != "directory":
                        dvc_info = None
                        break
                except FileNotFoundError:
                    continue

        if not dvc_info and not fs_info:
            raise FileNotFoundError

        # Use the repo resolved above: `dvc_fs` is None for repos without a
        # DVC fs, so `dvc_fs.repo` would fail for plain repo-fs files.
        info = _merge_info(repo, fs_info, dvc_info)
        info["name"] = path
        return info

    def checksum(self, path):
        """Return the repo fs checksum of ``path``, else the DVC fs one.

        Raises:
            FileNotFoundError: the repo fs does not have the path and there
                is no DVC fs to fall back to (or the DVC fs raises it too).
        """
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        # `_get_fs_pair` returns no repo fs for absolute paths outside of
        # the repo, and no DVC fs for repos without one; guard both the way
        # `get_file` does.
        if fs:
            try:
                return fs.checksum(fs_path)
            except FileNotFoundError:
                if not dvc_fs:
                    raise

        return dvc_fs.checksum(dvc_path)
Exemplo n.º 5
0
class RepoTree(BaseTree):  # pylint:disable=abstract-method
    """DVC + git-tracked files tree.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcTree()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(self,
                 repo,
                 subrepos=False,
                 repo_factory: RepoFactory = None,
                 **kwargs):
        """Create a tree rooted at ``repo.root_dir``.

        ``kwargs`` are stored and passed to every ``DvcTree`` this
        instance creates.
        """
        super().__init__(repo, {"url": repo.root_dir})

        if repo_factory:
            self.repo_factory = repo_factory
        else:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = PathStringTrie()
        self._subrepos_trie[self.root_dir] = repo

        # Keep a dvctree instance of each repo.
        self._dvctrees = {}

        self._dvctree_configs = kwargs

        # Only repos exposing `dvc_dir` get a DvcTree.
        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)

    def _get_repo(self, path) -> Optional["Repo"]:
        """Return the repo that ``path`` belongs to, found by prefix.

        A path already present in the trie is answered directly. Otherwise
        every directory between the longest known prefix and ``path`` is
        checked for subrepos via ``_update`` and the lookup is repeated.
        Returns None when no known prefix covers the path.
        """
        known = self._subrepos_trie.get(path)
        if known:
            return known

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Gather `path` and its parents up to (excluding) the known prefix.
        pending = [path]
        for parent in PathInfo(path).parents:
            parent_path = parent.fspath
            if parent_path == prefix:
                break
            pending.append(parent_path)

        # Process outermost directory first.
        pending.reverse()
        self._update(pending, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Register each directory in ``dirs`` with the repo it belongs to.

        A directory that is itself a DVC repo switches the current repo
        (and gets its own ``DvcTree``) for itself and the entries after it.
        """
        current = starting_repo
        for dir_path in dirs:
            if self._is_dvc_repo(dir_path):
                current = self.repo_factory(dir_path)
                self._dvctrees[current.root_dir] = DvcTree(
                    current, **self._dvctree_configs)
            self._subrepos_trie[dir_path] = current

    def _is_dvc_repo(self, dir_path):
        """Tell whether ``dir_path`` holds a DVC dir (subrepo traversal only).

        Always False when subrepo traversal is disabled.
        """
        if self._traverse_subrepos:
            from dvc.repo import Repo

            dvc_dir = os.path.join(dir_path, Repo.DVC_DIR)
            # dvcignore will ignore subrepos, hence `use_dvcignore=False`
            return self._main_repo.tree.isdir(dvc_dir, use_dvcignore=False)
        return False

    def _get_tree_pair(self, path) -> Tuple[BaseTree, Optional[DvcTree]]:
        """Return ``(repo tree, dvc tree or None)`` for the repo owning ``path``."""
        abs_path = os.path.abspath(path)

        # A path outside of every known repo is served by the top-level repo.
        repo = self._get_repo(abs_path)
        if not repo:
            repo = self._main_repo

        dvc_tree = self._dvctrees.get(repo.root_dir)
        return repo.tree, dvc_tree

    @property
    def fetch(self):
        """True when a ``fetch`` option was passed at construction."""
        configs = self._dvctree_configs
        return "fetch" in configs

    @property
    def stream(self):
        """True when a ``stream`` option was passed at construction."""
        configs = self._dvctree_configs
        return "stream" in configs

    def open(self, path, mode="r", encoding="utf-8", **kwargs):  # pylint: disable=arguments-differ
        """Open ``path`` from the repo tree, falling back to the DVC tree.

        ``kwargs`` are only forwarded to the DVC tree fallback.
        """
        # Binary modes take no encoding.
        effective_encoding = None if "b" in mode else encoding

        tree, dvc_tree = self._get_tree_pair(path)
        path_info = PathInfo(path)
        try:
            return tree.open(path_info, mode=mode, encoding=effective_encoding)
        except FileNotFoundError:
            if not dvc_tree:
                raise

        return dvc_tree.open(path_info,
                             mode=mode,
                             encoding=effective_encoding,
                             **kwargs)

    def exists(self, path, use_dvcignore=True):  # pylint: disable=arguments-differ
        """Tell whether ``path`` exists in the repo tree or as a DVC entry.

        ``use_dvcignore`` is accepted but not read by this implementation.
        """
        tree, dvc_tree = self._get_tree_pair(path)

        if not dvc_tree:
            return tree.exists(path)

        if tree.exists(path):
            return True

        try:
            meta = dvc_tree.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # If the output's own path exists in the repo tree while `path`
        # does not, `path` is reported as missing.
        return not tree.exists(out.path_info)

    def isdir(self, path):  # pylint: disable=arguments-differ
        """Tell whether ``path`` is a directory in the repo tree or in DVC."""
        tree, dvc_tree = self._get_tree_pair(path)

        try:
            return stat.S_ISDIR(tree.stat(path).st_mode)
        except (OSError, ValueError):
            # from CPython's os.path.isdir()
            pass

        if not dvc_tree:
            return False

        try:
            meta = dvc_tree.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # The output exists in the repo tree but `path` could not be
        # stat'ed there, so `path` is not reported as a directory.
        return False if tree.exists(out.path_info) else meta.isdir

    def isdvc(self, path, **kwargs):
        """Delegate to the DVC tree's ``isdvc``; False when there is none."""
        dvc_tree = self._get_tree_pair(path)[1]
        if dvc_tree is None:
            return False
        return dvc_tree.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        """Tell whether ``path`` is a regular file in the repo tree or in DVC."""
        tree, dvc_tree = self._get_tree_pair(path)

        try:
            return stat.S_ISREG(tree.stat(path).st_mode)
        except (OSError, ValueError):
            # from CPython's os.path.isfile()
            pass

        if not dvc_tree:
            return False

        try:
            meta = dvc_tree.metadata(path)
        except FileNotFoundError:
            return False

        (out, ) = meta.outs
        assert len(meta.outs) == 1
        # The output exists in the repo tree but `path` could not be
        # stat'ed there, so `path` is not reported as a file.
        return False if tree.exists(out.path_info) else meta.isfile

    def isexec(self, path_info):
        """Return the exec flag, asking the DVC tree when it has the path."""
        tree, dvc_tree = self._get_tree_pair(path_info)
        in_dvc = dvc_tree and dvc_tree.exists(path_info)
        source = dvc_tree if in_dvc else tree
        return source.isexec(path_info)

    def stat(self, path):
        """Return the repo tree's stat result for ``path`` (DVC tree unused)."""
        repo_tree = self._get_tree_pair(path)[0]
        return repo_tree.stat(path)

    def _dvc_walk(self, walk):
        """Yield one entry from ``walk``, then recurse once per listed dir."""
        entry = next(walk, None)
        if entry is None:
            # walker exhausted
            return

        root, dirs, files = entry
        yield root, dirs, files
        # One recursive step for every directory reported at this level.
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

         NOTE: subrepo will only be discovered when walking if
         ignore_subrepos is set to False.
        """
        tree, dvc_tree = self._get_tree_pair(dir_path)
        skip_subrepos = not self._traverse_subrepos
        tree_walk = tree.walk(dir_path,
                              topdown=True,
                              ignore_subrepos=skip_subrepos)
        # Pair with a DVC walker only when this repo has a DVC tree.
        dvc_walk = (dvc_tree.walk(dir_path, topdown=True, **kwargs)
                    if dvc_tree else None)
        yield from self._walk(tree_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one level of a repo walker and an optional DVC walker.

        Takes one ``(root, dirs, files)`` entry from each walker, yields the
        merged entry, then recurses for every sub-directory.

        Args:
            repo_walk: walker over the repo tree (must be truthy).
            dvc_walk: walker over the DVC tree, or None for repo-only dirs.
            dvcfiles: if True, names accepted by ``is_valid_filename`` are
                kept in the files list; otherwise they are filtered out.
        """
        assert repo_walk
        try:
            # Without a DVC walker this level has no DVC dirs or files.
            _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else
                                       (None, [], []))
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent it from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        # (the lists are rewritten in place; presumably the walkers honour
        # in-place edits the way os.walk does in top-down mode)
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                # subrepos get fresh walkers of their own
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self,
             top,
             topdown=True,
             onerror=None,
             dvcfiles=False,
             follow_subrepos=None,
             **kwargs):  # pylint: disable=arguments-differ
        """Walk and merge both DVC and repo trees.

        Args:
            top: path to walk from
            topdown: if True, tree will be walked from top down. Only
                top-down walking is supported (asserted).
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.
            follow_subrepos: when not None, overrides the ``subrepos``
                setting given at construction for the repo tree walk.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        ignore_subrepos = not self._traverse_subrepos
        if follow_subrepos is not None:
            ignore_subrepos = not follow_subrepos

        tree, dvc_tree = self._get_tree_pair(top)
        repo_exists = tree.exists(top)
        repo_walk = tree.walk(
            top,
            topdown=topdown,
            onerror=onerror,
            ignore_subrepos=ignore_subrepos,
        )

        # No DVC tree, or `top` itself is a DVC output that is present in
        # the repo tree: only the repo tree needs to be walked.
        if not dvc_tree or (repo_exists and dvc_tree.isdvc(top)):
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        if not repo_exists:
            # `top` only exists in DVC, so there is nothing to merge. Stop
            # here instead of starting a second DVC walk and iterating a
            # repo directory that does not exist.
            yield from dvc_tree.walk(top, topdown=topdown, **kwargs)
            return

        dvc_walk = None
        if dvc_tree.exists(top):
            dvc_walk = dvc_tree.walk(top, topdown=topdown, **kwargs)

        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
        """Yield a ``PathInfo`` for every file found by ``walk(top)``."""
        for root, _, files in self.walk(top, **kwargs):
            yield from (PathInfo(root) / fname for fname in files)

    def get_dir_hash(self, path_info, follow_subrepos=None, **kwargs):  # pylint: disable=arguments-differ
        """Hash a directory from the repo tree, else from the DVC tree.

        Raises FileNotFoundError when the path is absent from the repo tree
        and there is no DVC tree to ask.
        """
        tree, dvc_tree = self._get_tree_pair(path_info)
        if not tree.exists(path_info):
            if dvc_tree:
                return dvc_tree.get_dir_hash(path_info, **kwargs)
            raise FileNotFoundError

        return super().get_dir_hash(path_info,
                                    follow_subrepos=follow_subrepos,
                                    **kwargs)

    def get_file_hash(self, path_info):
        """Return file checksum for specified path.

        If path_info is a DVC out, the pre-computed checksum for the file
        will be used. If path_info is a git file, MD5 will be computed for
        the git object.
        """
        if not self.exists(path_info):
            raise FileNotFoundError

        dvc_tree = self._get_tree_pair(path_info)[1]
        tracked = dvc_tree and dvc_tree.exists(path_info)
        if tracked:
            try:
                return dvc_tree.get_file_hash(path_info)
            except FileNotFoundError:
                # fall through and hash the file contents instead
                pass

        md5 = file_md5(path_info, self)[0]
        return HashInfo(self.PARAM_CHECKSUM, md5)

    def copytree(self, top, dest):
        """Copy file or directory ``top`` from this tree to local ``dest``.

        Raises FileNotFoundError if ``top`` does not exist in this tree.
        """
        top = PathInfo(top)
        dest = PathInfo(dest)

        def _copy_file(src_info, dest_info):
            with self.open(src_info, mode="rb") as fobj:
                copy_fobj_to_file(fobj, dest_info)

        if not self.exists(top):
            raise FileNotFoundError

        if self.isfile(top):
            makedirs(dest.parent, exist_ok=True)
            _copy_file(top, dest)
            return

        for walk_root, _, fnames in self.walk(top):
            src_dir = PathInfo(walk_root)
            # Mirror the directory layout below `top` under `dest`.
            dest_dir = dest / src_dir.relative_to(top)
            makedirs(dest_dir, exist_ok=True)
            for fname in fnames:
                _copy_file(src_dir / fname, dest_dir / fname)

    @property
    def hash_jobs(self):  # pylint: disable=invalid-overridden-method
        """``hash_jobs`` of the main repo's tree."""
        main_tree = self._main_repo.tree
        return main_tree.hash_jobs

    def metadata(self, path):
        """Return metadata for ``path``, combining DVC metadata and stat.

        Raises FileNotFoundError when neither source knows the path.
        """
        abspath = os.path.abspath(path)
        path_info = PathInfo(abspath)
        tree, dvc_tree = self._get_tree_pair(path_info)

        dvc_meta = None
        if dvc_tree:
            try:
                dvc_meta = dvc_tree.metadata(path_info)
            except FileNotFoundError:
                pass

        stat_result = None
        try:
            stat_result = tree.stat(path_info)
        except FileNotFoundError:
            pass

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        if dvc_meta:
            meta = dvc_meta
        else:
            meta = Metadata(
                path_info=path_info,
                repo=self._get_repo(abspath) or self._main_repo,
            )

        stat_says_dir = bool(stat_result) and stat.S_ISDIR(
            stat_result.st_mode)
        meta.isdir = meta.isdir or stat_says_dir

        # The exec bit comes from stat only when DVC has no metadata.
        if not dvc_meta:
            meta.is_exec = bool(stat_result) and is_exec(stat_result.st_mode)
        return meta
Exemplo n.º 6
0
class DvcIgnoreFilter:
    def __init__(self, tree, root_dir):
        """Seed the pattern trie with default ignores and load root patterns."""
        from dvc.repo import Repo

        defaults = [".hg/", ".git/", f"{Repo.DVC_DIR}/"]

        self.tree = tree
        self.root_dir = root_dir

        # Maps directory -> effective ignore patterns for that directory.
        self.ignores_trie_tree = PathStringTrie()
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            defaults, root_dir
        )

        # Maps directory -> set of child dir names detected as subrepos.
        self._ignored_subrepos = PathStringTrie()
        self._update(self.root_dir)

    def _update(self, dirname):
        """Compute and store the effective patterns for ``dirname``.

        Patterns inherited from the longest known prefix are merged with the
        directory's own ignore file, unless that file is itself ignored or
        missing, in which case the inherited patterns are stored as-is.
        """
        self._update_sub_repo(dirname)

        trie = self.ignores_trie_tree
        inherited = trie.longest_prefix(dirname).value
        file_is_ignored = inherited.matches(
            dirname, DvcIgnore.DVCIGNORE_FILE, False
        )

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not file_is_ignored and self.tree.exists(
            ignore_file_path, use_dvcignore=False
        ):
            effective = DvcIgnorePatterns.from_files(
                ignore_file_path, self.tree
            )
            if inherited:
                effective = DvcIgnorePatterns(
                    *merge_patterns(
                        inherited.pattern_list,
                        inherited.dirname,
                        effective.pattern_list,
                        effective.dirname,
                    )
                )
            trie[dirname] = effective
        elif inherited:
            trie[dirname] = inherited

    def _update_sub_repo(self, path):
        """Record ``path`` as a subrepo and add an ignore pattern for it.

        Does nothing for the root dir or for a directory without a DVC dir.
        """
        from dvc.repo import Repo

        if path == self.root_dir:
            return

        if not os.path.exists(os.path.join(path, Repo.DVC_DIR)):
            return

        parent, name = os.path.split(path)

        known = self._ignored_subrepos.get(parent, set())
        self._ignored_subrepos[parent] = known | {name}

        # Ignore pattern anchored at the parent, matching only this dir.
        subrepo_pattern = DvcIgnorePatterns(
            [PatternInfo(f"/{name}/", f"in sub_repo:{name}")], parent
        )
        inherited = self.ignores_trie_tree.longest_prefix(parent).value
        if not inherited:
            self.ignores_trie_tree[parent] = subrepo_pattern
            return

        self.ignores_trie_tree[parent] = DvcIgnorePatterns(
            *merge_patterns(
                inherited.pattern_list,
                inherited.dirname,
                subrepo_pattern.pattern_list,
                subrepo_pattern.dirname,
            )
        )

    def __call__(self, root, dirs, files, ignore_subrepos=True):
        for dname in dirs:
            self._update_sub_repo(os.path.join(root, dname))

        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            dirs, files = ignore_pattern(root, dirs, files)
            if not ignore_subrepos:
                dirs.extend(self._ignored_subrepos.get(root, []))
        return dirs, files

    def _get_trie_pattern(self, dirname):
        ignore_pattern = self.ignores_trie_tree.get(dirname)
        if ignore_pattern:
            return ignore_pattern

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        dirs = list(
            takewhile(
                lambda path: path != prefix,
                (parent.fspath for parent in PathInfo(dirname).parents),
            )
        )
        dirs.reverse()
        dirs.append(dirname)

        for parent in dirs:
            self._update(parent)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        if self._outside_repo(path):
            return False
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if ignore_pattern:
            return ignore_pattern.matches(dirname, basename, is_dir)
        return False

    def _is_subrepo(self, path):
        dirname, basename = os.path.split(os.path.normpath(path))
        return basename in self._ignored_subrepos.get(dirname, set())

    def is_ignored_dir(self, path, ignore_subrepos=True):
        path = os.path.abspath(path)
        if not ignore_subrepos:
            return not self._is_subrepo(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        path = os.path.abspath(path)
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        """Tell whether ``path`` lies outside of ``self.root_dir``."""
        # paths outside of the repo should be ignored
        rel = relpath(PathInfo(path), self.root_dir)
        if rel.startswith(".."):
            return True

        if os.name != "nt":
            return False

        # On Windows, additionally treat paths that share no common prefix
        # with the root dir as outside.
        # NOTE(review): presumably meant for paths on another drive; confirm.
        shared = os.path.commonprefix([os.path.abspath(rel), self.root_dir])
        return not shared

    def check_ignore(self, target):
        """Return match details for ``target``, or a no-match result."""
        full_target = os.path.abspath(target)
        if self._outside_repo(full_target):
            return _no_match(target)

        dirname, basename = os.path.split(os.path.normpath(full_target))
        pattern = self._get_trie_pattern(dirname)
        if not pattern:
            return _no_match(target)

        details = pattern.match_details(
            dirname, basename, os.path.isdir(full_target)
        )
        if details:
            return CheckIgnoreResult(target, True, details)
        return _no_match(target)

    def is_ignored(self, path):
        # NOTE: can't use self.check_ignore(path).match for now, see
        # https://github.com/iterative/dvc/issues/4555
        return self.is_ignored_dir(path) or self.is_ignored_file(path)
Exemplo n.º 7
0
Arquivo: repo.py Projeto: skshetry/dvc
class RepoFileSystem(FileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcFileSystem()`.
    """

    sep = os.sep

    scheme = "local"
    PARAM_CHECKSUM = "md5"
    PARAM_REPO_URL = "repo_url"
    PARAM_REPO_ROOT = "repo_root"
    PARAM_REV = "rev"
    PARAM_CACHE_DIR = "cache_dir"
    PARAM_CACHE_TYPES = "cache_types"
    PARAM_SUBREPOS = "subrepos"

    def __init__(
        self,
        repo: Optional["Repo"] = None,
        subrepos=False,
        repo_factory: RepoFactory = None,
        **kwargs,
    ):
        """Create the filesystem for ``repo``.

        When ``repo`` is None, both the repo and the repo factory are built
        from ``kwargs`` by ``_repo_from_fs_config``.
        """
        super().__init__()

        from dvc.utils.collections import PathStringTrie

        if repo is None:
            repo, repo_factory = self._repo_from_fs_config(
                subrepos=subrepos, **kwargs
            )

        if repo_factory:
            self.repo_factory = repo_factory
        else:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo

        self._main_repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self.root_dir: str = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = PathStringTrie()
        self._subrepos_trie[self.root_dir] = repo

        # Keep a dvcfs instance of each repo.
        self._dvcfss = {}

        # Only repos exposing `dvc_dir` get a DvcFileSystem.
        if hasattr(repo, "dvc_dir"):
            self._dvcfss[repo.root_dir] = DvcFileSystem(repo=repo)

    @property
    def repo_url(self):
        """URL of the main repo, or None when there is no main repo."""
        main_repo = self._main_repo
        return None if main_repo is None else main_repo.url

    @property
    def config(self):
        """Settings dict keyed by the ``PARAM_*`` names of this class."""
        main_repo = self._main_repo
        url = self.repo_url
        rev = getattr(main_repo.fs, "rev", None)
        cache_dir = os.path.abspath(main_repo.odb.local.cache_dir)
        cache_types = main_repo.odb.local.cache_types
        return {
            self.PARAM_REPO_URL: url,
            self.PARAM_REPO_ROOT: self.root_dir,
            self.PARAM_REV: rev,
            self.PARAM_CACHE_DIR: cache_dir,
            self.PARAM_CACHE_TYPES: cache_types,
            self.PARAM_SUBREPOS: self._traverse_subrepos,
        }

    @classmethod
    def _repo_from_fs_config(
        cls, **config
    ) -> Tuple["Repo", Optional["RepoFactory"]]:
        """Open a repo from a config dict keyed by the ``PARAM_*`` names.

        Returns ``(repo, factory)``; ``factory`` is None unless a repo URL
        is configured. The repo is returned from inside the ``with`` block,
        so the context manager's exit runs as this method returns.
        """
        from dvc.external_repo import erepo_factory, external_repo
        from dvc.repo import Repo

        url = config.get(cls.PARAM_REPO_URL)
        root = config.get(cls.PARAM_REPO_ROOT)
        assert url or root

        # NOTE: if original repo was an erepo (and has a URL),
        # we cannot use Repo.open() since it will skip erepo
        # cache/remote setup for local URLs
        opener = Repo.open if url is None else external_repo

        cache_dir = config.get(cls.PARAM_CACHE_DIR)
        if cache_dir:
            cache_config = {
                "cache": {
                    "dir": cache_dir,
                    "type": config.get(cls.PARAM_CACHE_TYPES),
                }
            }
        else:
            cache_config = {}

        repo_kwargs: dict = {
            "rev": config.get(cls.PARAM_REV),
            "subrepos": config.get(cls.PARAM_SUBREPOS, False),
            "uninitialized": True,
        }

        factory: Optional["RepoFactory"] = None
        if url is None:
            repo_kwargs["config"] = cache_config
        else:
            repo_kwargs["cache_dir"] = cache_dir
            factory = erepo_factory(url, cache_config)

        location = url if url else root
        with opener(location, **repo_kwargs) as repo:
            return repo, factory

    def _get_repo(self, path: str) -> Optional["Repo"]:
        """Return the repo that ``path`` belongs to, found by prefix.

        A path already present in the trie is answered directly. Otherwise
        every directory between the longest known prefix and ``path`` is
        checked for subrepos via ``_update`` and the lookup is repeated.
        Returns None when no known prefix covers the path.
        """
        known = self._subrepos_trie.get(path)
        if known:
            return known

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Gather `path` and its parents up to (excluding) the known prefix.
        pending = [path]
        for parent in self.path.parents(path):
            if parent == prefix:
                break
            pending.append(parent)

        # Process outermost directory first.
        pending.reverse()
        self._update(pending, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Register each directory in ``dirs`` with the repo it belongs to.

        A directory that is itself a DVC repo switches the current repo
        (and gets its own ``DvcFileSystem``) for itself and later entries.
        """
        current = starting_repo
        for dir_path in dirs:
            if self._is_dvc_repo(dir_path):
                # Subrepos share the main repo's fs and this repo factory.
                current = self.repo_factory(
                    dir_path,
                    fs=self._main_repo.fs,
                    repo_factory=self.repo_factory,
                )
                self._dvcfss[current.root_dir] = DvcFileSystem(repo=current)
            self._subrepos_trie[dir_path] = current

    def _is_dvc_repo(self, dir_path):
        """Tell whether ``dir_path`` holds a DVC dir (subrepo traversal only).

        Always False when subrepo traversal is disabled.
        """
        if self._traverse_subrepos:
            from dvc.repo import Repo

            dvc_dir = os.path.join(dir_path, Repo.DVC_DIR)
            return self._main_repo.fs.isdir(dvc_dir)
        return False

    def _get_fs_pair(
        self, path
    ) -> Tuple[FileSystem, Optional[DvcFileSystem], str]:
        """
        Returns a pair of fss based on repo the path falls in, using prefix.

        The third element is the path to use with the DVC filesystem: the
        part of ``path`` below the repo root when ``path`` is inside the
        repo, otherwise the absolute path unchanged.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_fs = self._dvcfss.get(repo.root_dir)

        root = repo.root_dir
        if path == root:
            dvc_path = ""
        elif path.startswith(root + os.sep):
            # Require a separator after the root so that a sibling such as
            # "<root>-other" is not mistaken for a path inside the repo.
            dvc_path = path[len(root) + 1 :]
        else:
            dvc_path = path

        return repo.fs, dvc_fs, dvc_path

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-renamed
        """Open ``path`` from the repo fs, falling back to the DVC fs.

        ``kwargs`` are only forwarded to the DVC fs fallback.
        """
        # Binary modes take no encoding.
        effective_encoding = None if "b" in mode else encoding

        fs, dvc_fs, dvc_path = self._get_fs_pair(path)
        try:
            return fs.open(path, mode=mode, encoding=effective_encoding)
        except FileNotFoundError:
            if not dvc_fs:
                raise

        return dvc_fs.open(
            dvc_path, mode=mode, encoding=effective_encoding, **kwargs
        )

    def exists(self, path) -> bool:
        """Tell whether ``path`` exists in the repo fs or in the DVC fs.

        Ignored paths do not exist, and a DVC-only path does not exist when
        one of its parents is a non-directory in the repo fs.
        """
        path = os.path.abspath(path)
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        if not dvc_fs:
            return fs.exists(path)

        if dvc_fs.repo.dvcignore.is_ignored(fs, path):
            return False

        if fs.exists(path):
            return True

        if not dvc_fs.exists(dvc_path):
            return False

        for parent in self.path.parents(path):
            try:
                parent_type = fs.info(parent)["type"]
            except FileNotFoundError:
                continue
            if parent_type != "directory":
                return False

        return True

    def isdir(self, path):  # pylint: disable=arguments-renamed
        """Tell whether ``path`` is a directory in the repo fs or in DVC."""
        path = os.path.abspath(path)
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_dir(path):
            return False

        try:
            return fs.info(path)["type"] == "directory"
        except (OSError, ValueError):
            # from CPython's os.path.isdir()
            pass

        if not dvc_fs:
            return False

        try:
            dvc_entry = dvc_fs.info(dvc_path)
        except FileNotFoundError:
            return False

        # A non-directory parent in the repo fs means the DVC entry is
        # not reachable.
        for parent in self.path.parents(path):
            try:
                parent_type = fs.info(parent)["type"]
            except FileNotFoundError:
                continue
            if parent_type != "directory":
                return False

        return dvc_entry["type"] == "directory"

    def isdvc(self, path, **kwargs):
        """Delegate to the DVC fs's ``isdvc``; False when there is none."""
        _, dvc_fs, dvc_path = self._get_fs_pair(path)
        if dvc_fs is None:
            return False
        return dvc_fs.isdvc(dvc_path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-renamed
        """Tell whether ``path`` is a file in the repo fs or in DVC."""
        path = os.path.abspath(path)
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_file(path):
            return False

        try:
            return fs.info(path)["type"] == "file"
        except (OSError, ValueError):
            # from CPython's os.path.isfile()
            pass

        if not dvc_fs:
            return False

        try:
            dvc_entry = dvc_fs.info(dvc_path)
        except FileNotFoundError:
            return False

        # A non-directory parent in the repo fs means the DVC entry is
        # not reachable.
        for parent in self.path.parents(path):
            try:
                parent_type = fs.info(parent)["type"]
            except FileNotFoundError:
                continue
            if parent_type != "directory":
                return False

        return dvc_entry["type"] == "file"

    def _dvc_walk(self, walk):
        """Yield one entry from ``walk``, then recurse once per listed dir."""
        entry = next(walk, None)
        if entry is None:
            # walker exhausted
            return

        root, dirs, files = entry
        yield root, dirs, files
        # One recursive step for every directory reported at this level.
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

        NOTE: subrepo will only be discovered when walking if
        ignore_subrepos is set to False.
        """
        fs, dvc_fs, dvc_path = self._get_fs_pair(dir_path)
        fs_walk = fs.walk(dir_path, topdown=True)
        # Pair with a DVC walker only when this repo has a DVC fs.
        dvc_walk = (
            _wrap_walk(dvc_fs, dvc_path, topdown=True, **kwargs)
            if dvc_fs
            else None
        )
        yield from self._walk(fs_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one level of a repo walker and an optional DVC walker.

        Takes one ``(root, dirs, files)`` entry from each walker, yields the
        merged entry, then recurses for every sub-directory.

        Args:
            repo_walk: walker over the repo fs (must be truthy).
            dvc_walk: walker over the DVC fs, or None for repo-only dirs.
            dvcfiles: if True, no file names are filtered out; otherwise
                names accepted by ``is_valid_filename`` and the dvcignore
                file are dropped.
        """
        from dvc.dvcfile import is_valid_filename
        from dvc.ignore import DvcIgnore

        assert repo_walk
        try:
            # Without a DVC walker this level has no DVC dirs or files.
            _, dvc_dirs, dvc_fnames = (
                next(dvc_walk) if dvc_walk else (None, [], [])
            )
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        def _func(fname):
            # Predicate deciding which file names are kept.
            if dvcfiles:
                return True

            return not (
                is_valid_filename(fname) or fname == DvcIgnore.DVCIGNORE_FILE
            )

        # merge file lists
        files = set(filter(_func, dvc_fnames + repo_fnames))

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent it from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        # (the lists are rewritten in place; presumably the walkers honour
        # in-place edits the way os.walk does in top-down mode)
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                # subrepos get fresh walkers of their own
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk and merge both DVC and repo fss.

        Args:
            top: path to walk from
            topdown: if True, fs will be walked from top down. Only
                top-down walking is supported (asserted).
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        # `_get_repo` returns None for paths outside of every known repo;
        # fall back to the main repo exactly like `_get_fs_pair` does, so
        # `repo.dvcignore` below cannot fail on None.
        repo = self._get_repo(os.path.abspath(top)) or self._main_repo
        dvcfiles = kwargs.pop("dvcfiles", False)

        fs, dvc_fs, dvc_path = self._get_fs_pair(top)
        repo_exists = fs.exists(top)

        repo_walk = repo.dvcignore.walk(
            fs, top, topdown=topdown, onerror=onerror, **kwargs
        )

        # No DVC fs, or `top` itself is a DVC output that is present in the
        # repo fs: only the repo fs needs to be walked.
        if not dvc_fs or (repo_exists and dvc_fs.isdvc(dvc_path)):
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        if not repo_exists:
            # `top` only exists in DVC, so there is nothing to merge. Stop
            # here instead of starting a second DVC walk and iterating a
            # repo directory that does not exist.
            yield from _wrap_walk(
                dvc_fs, dvc_path, topdown=topdown, onerror=onerror, **kwargs
            )
            return

        dvc_walk = None
        if dvc_fs.exists(dvc_path):
            dvc_walk = _wrap_walk(
                dvc_fs, dvc_path, topdown=topdown, onerror=onerror, **kwargs
            )

        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def find(self, path, prefix=None):
        """Yield the joined path of every file found by ``walk(path)``.

        ``prefix`` is accepted but not read by this implementation.
        """
        for root, _, files in self.walk(path):
            yield from (self.path.join(root, fname) for fname in files)

    def get_file(
        self, from_info, to_file, callback=DEFAULT_CALLBACK, **kwargs
    ):
        """Download ``from_info`` to ``to_file``, repo fs first, then DVC fs.

        Re-raises FileNotFoundError when the repo fs lacks the file and no
        DVC fs is available.
        """
        fs, dvc_fs, dvc_path = self._get_fs_pair(from_info)
        transfer_options = dict(kwargs, callback=callback)
        try:
            fs.get_file(  # pylint: disable=protected-access
                from_info, to_file, **transfer_options
            )
            return
        except FileNotFoundError:
            if not dvc_fs:
                raise

        dvc_fs.get_file(  # pylint: disable=protected-access
            dvc_path, to_file, **transfer_options
        )

    def info(self, path):
        """Return the info dict for ``path``, enriched with DVC details.

        When the repo fs has the path, its info dict is extended with the
        keys ``repo``, ``isout``, ``outs``, ``isdvc``, ``meta`` and
        ``isexec``. Otherwise the DVC fs info is returned with ``repo`` and
        ``isdvc`` set.

        Raises:
            FileNotFoundError: if neither the repo fs nor the DVC fs has
                the path.
        """
        from dvc.utils import is_exec

        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        # `dvc_fs` is None for repos without a DVC filesystem; use the main
        # repo then instead of dereferencing None.
        repo = dvc_fs.repo if dvc_fs else self._main_repo

        dvc_info = None
        if dvc_fs:
            try:
                dvc_info = dvc_fs.info(dvc_path)
            except FileNotFoundError:
                pass

        try:
            fs_info = fs.info(path)
        except FileNotFoundError:
            if not dvc_info:
                raise

            # Path is only known to DVC.
            dvc_info["repo"] = repo
            dvc_info["isdvc"] = True
            return dvc_info

        fs_info["repo"] = repo
        fs_info["isout"] = dvc_info.get("isout", False) if dvc_info else False
        fs_info["outs"] = dvc_info["outs"] if dvc_info else None
        fs_info["isdvc"] = dvc_info["isdvc"] if dvc_info else False
        fs_info["meta"] = dvc_info.get("meta") if dvc_info else None

        # DVC's exec flag wins; otherwise derive it from the file mode.
        isexec = False
        if dvc_info:
            isexec = dvc_info["isexec"]
        elif fs_info["type"] == "file":
            isexec = is_exec(fs_info["mode"])
        fs_info["isexec"] = isexec
        return fs_info

    def checksum(self, path):
        """Return the checksum of ``path``, preferring the repo fs.

        Falls back to the DVC filesystem when the repo fs does not have the
        path.

        Raises:
            FileNotFoundError: if the repo fs lacks the path and there is no
                DVC filesystem to fall back to (or it lacks it as well).
        """
        fs, dvc_fs, dvc_path = self._get_fs_pair(path)

        try:
            return fs.checksum(path)
        except FileNotFoundError:
            # Without a DVC fs there is nothing to fall back to; surface the
            # original error instead of failing with AttributeError on None.
            if not dvc_fs:
                raise

        return dvc_fs.checksum(dvc_path)
Exemplo n.º 8
0
class DvcIgnoreFilter:
    @staticmethod
    def _is_dvc_repo(root, directory):
        """Tell whether ``root/directory`` contains a DVC control directory.

        Returns ``True`` when ``<root>/<directory>/<Repo.DVC_DIR>`` exists
        and is a directory.
        """
        from dvc.repo import Repo

        control_dir = os.path.join(root, directory, Repo.DVC_DIR)
        return os.path.isdir(control_dir)

    def __init__(self, tree, root_dir):
        """Set up the pattern trie for the repository at ``root_dir``.

        The trie is seeded at ``root_dir`` with the built-in patterns
        (``.hg/``, ``.git/`` and the DVC control directory) and then
        updated for the root directory itself via ``_update``.

        Args:
            tree: object used to check for and read ignore files and to
                list directories (``exists`` / ``walk`` are called on it).
            root_dir: root directory of the repository.
        """
        from dvc.repo import Repo

        self.tree = tree
        self.root_dir = root_dir

        builtin_patterns = [".hg/", ".git/", f"{Repo.DVC_DIR}/"]
        trie = PathStringTrie()
        trie[root_dir] = DvcIgnorePatterns(builtin_patterns, root_dir)
        self.ignores_trie_tree = trie

        # Maps a directory to the names of the subrepos found directly in it.
        self._ignored_subrepos = PathStringTrie()
        self._update(self.root_dir)

    def _update(self, dirname):
        """Record in the trie the ignore patterns that govern ``dirname``.

        The patterns of the closest registered ancestor are looked up
        first.  When the ignore file of ``dirname``
        (``DvcIgnore.DVCIGNORE_FILE``) is not itself matched by those
        patterns and exists in the tree, its patterns are loaded and merged
        with the inherited ones; otherwise the inherited patterns are
        stored for ``dirname`` as they are.  Finally, the immediate
        subdirectories are handed to ``_update_sub_repo``.
        """
        trie = self.ignores_trie_tree
        inherited = trie.longest_prefix(dirname).value
        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)

        # An ignore file that is itself ignored must not contribute rules.
        if inherited.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False):
            has_own_rules = False
        else:
            has_own_rules = self.tree.exists(
                ignore_file_path, use_dvcignore=False
            )

        if has_own_rules:
            effective = DvcIgnorePatterns.from_files(
                ignore_file_path, self.tree
            )
            if inherited:
                effective = DvcIgnorePatterns(
                    *merge_patterns(
                        inherited.pattern_list,
                        inherited.dirname,
                        effective.pattern_list,
                        effective.dirname,
                    )
                )
            trie[dirname] = effective
        elif inherited:
            trie[dirname] = inherited

        # NOTE: tree doesn't have `listdir()`, so only the first entry
        # produced by `walk` is consumed.
        first_entry = next(
            iter(self.tree.walk(dirname, use_dvcignore=False)), None
        )
        if first_entry is not None:
            root, dirs, _ = first_entry
            self._update_sub_repo(root, dirs)

    def _update_sub_repo(self, root, dirs):
        for d in dirs:
            if self._is_dvc_repo(root, d):
                self._ignored_subrepos[root] = self._ignored_subrepos.get(
                    root, set()) | {d}
                new_pattern = DvcIgnorePatterns([f"/{d}/"], root)
                old_pattern = self.ignores_trie_tree.longest_prefix(root).value
                if old_pattern:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        *merge_patterns(
                            old_pattern.pattern_list,
                            old_pattern.dirname,
                            new_pattern.pattern_list,
                            new_pattern.dirname,
                        ))
                else:
                    self.ignores_trie_tree[root] = new_pattern

    def __call__(self, root, dirs, files, ignore_subrepos=True):
        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            dirs, files = ignore_pattern(root, dirs, files)
            if not ignore_subrepos:
                dirs.extend(self._ignored_subrepos.get(root, []))
        return dirs, files

    def _get_trie_pattern(self, dirname):
        ignore_pattern = self.ignores_trie_tree.get(dirname)
        if ignore_pattern:
            return ignore_pattern

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        dirs = list(
            takewhile(
                lambda path: path != prefix,
                (parent.fspath for parent in PathInfo(dirname).parents),
            ))
        dirs.reverse()
        dirs.append(dirname)

        for parent in dirs:
            self._update(parent)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        """Tell whether ``path`` is matched by the applicable patterns.

        Paths outside of the repo, and paths whose parent directory has no
        patterns, are never reported as ignored.

        Args:
            path: path to test.
            is_dir: whether ``path`` should be matched as a directory.
        """
        if self._outside_repo(path):
            return False

        parent, name = os.path.split(os.path.normpath(path))
        patterns = self._get_trie_pattern(parent)
        if not patterns:
            return False
        return patterns.matches(parent, name, is_dir)

    def _is_subrepo(self, path):
        dirname, basename = os.path.split(os.path.normpath(path))
        return basename in self._ignored_subrepos.get(dirname, set())

    def is_ignored_dir(self, path, ignore_subrepos=True):
        path = os.path.abspath(path)
        if not ignore_subrepos:
            return not self._is_subrepo(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        """Tell whether the file ``path`` is ignored.

        ``path`` is made absolute and then matched as a non-directory.
        """
        absolute = os.path.abspath(path)
        return self._is_ignored(absolute, False)

    def _outside_repo(self, path):
        """Tell whether ``path`` lies outside of ``self.root_dir``.

        The path is expressed relative to the repository root; it is
        outside when that relative form starts with a parent-directory
        component.  On Windows it is also outside when its absolute form
        shares no common prefix with the root.

        Returns:
            ``True`` when the path is outside of the repo, else ``False``.
        """
        # NOTE(review): ``relpath`` is defined elsewhere; this presumes it
        # returns a string with ``os.path.relpath`` semantics -- confirm.
        rel = relpath(PathInfo(path), self.root_dir)

        # Only a leading ".." *component* escapes the root.  A bare
        # ``startswith("..")`` would also flag in-repo entries whose name
        # merely begins with two dots (e.g. "..cache").
        parent_prefixes = tuple(
            os.pardir + sep for sep in (os.sep, os.altsep) if sep
        )
        if rel == os.pardir or rel.startswith(parent_prefixes):
            return True

        return os.name == "nt" and not os.path.commonprefix(
            [os.path.abspath(rel), self.root_dir]
        )

    def check_ignore(self, target):
        """Report whether ``target`` is ignored and by which patterns.

        Returns:
            ``CheckIgnoreResult(target, True, matches)`` when the patterns
            governing the parent directory of ``target`` yield match
            details, and ``CheckIgnoreResult(target, False, ["::"])`` in
            every other case (outside of the repo, no patterns, no match).
        """
        full_target = os.path.abspath(target)

        matches = None
        if not self._outside_repo(full_target):
            parent, name = os.path.split(os.path.normpath(full_target))
            patterns = self._get_trie_pattern(parent)
            if patterns:
                matches = patterns.match_details(
                    parent, name, os.path.isdir(full_target)
                )

        if matches:
            return CheckIgnoreResult(target, True, matches)
        return CheckIgnoreResult(target, False, ["::"])