Exemplo n.º 1
0
class RepoTree(BaseTree):  # pylint:disable=abstract-method
    """DVC + git-tracked files tree.

    Each path is resolved to the repo that contains it (the main repo or a
    lazily collected subrepo). Queries are then answered by combining that
    repo's regular tree with its `DvcTree`, when the repo has one.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcTree()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(self,
                 repo,
                 subrepos=False,
                 repo_factory: Optional[Callable[[str], "Repo"]] = None,
                 **kwargs):
        super().__init__(repo, {"url": repo.root_dir})

        if not repo_factory:
            # imported lazily, only when no factory was supplied
            from dvc.repo import Repo

            self.repo_factory = Repo
        else:
            self.repo_factory = repo_factory

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = StringTrie(separator=os.sep)
        self._subrepos_trie[self.root_dir] = repo

        # Keep a dvctree instance of each repo, keyed by the repo's root_dir.
        self._dvctrees = {}

        # Remembered so that subrepo DvcTrees get the same options.
        self._dvctree_configs = kwargs

        # A repo without a `dvc_dir` attribute gets no DvcTree.
        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)

    def _get_repo(self, path) -> Optional["Repo"]:
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's parents
        and then returns the appropriate one.

        Returns None when no collected path is a prefix of `path`.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Directories between the known prefix (exclusive) and `path`
        # (inclusive), ordered from the outermost to the innermost.
        parents = (parent.fspath for parent in PathInfo(path).parents)
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them.

        `dirs` is processed in order. Once a directory turns out to be a
        DVC repo, it and all following directories are mapped to that repo.
        """
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(d)
                self._dvctrees[repo.root_dir] = DvcTree(
                    repo, **self._dvctree_configs)
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo.

        Always False when subrepo traversal is disabled.
        """
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        # dvcignore will ignore subrepos, therefore using `use_dvcignore=False`
        return self._main_repo.tree.isdir(repo_path, use_dvcignore=False)

    def _get_tree_pair(self,
                       path) -> Tuple[Union["GitTree", "LocalTree"], DvcTree]:
        """
        Returns a pair of trees based on repo the path falls in, using prefix.

        The second element is None when no DvcTree is registered for the
        resolved repo.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_tree = self._dvctrees.get(repo.root_dir)
        return repo.tree, dvc_tree

    @property
    def fetch(self):
        """Truthiness of the `fetch` option given to the constructor.

        False when the option was not passed or was passed as a falsy value.
        """
        # Read the value, not the key's presence: `fetch=False` must not
        # count as enabled.
        return bool(self._dvctree_configs.get("fetch"))

    @property
    def stream(self):
        """Truthiness of the `stream` option given to the constructor.

        False when the option was not passed or was passed as a falsy value.
        """
        # Read the value, not the key's presence: `stream=False` must not
        # count as enabled.
        return bool(self._dvctree_configs.get("stream"))

    def open(self, path, mode="r", encoding="utf-8", **kwargs):  # pylint: disable=arguments-differ
        """Open `path` from the DvcTree if it exists there, else the repo tree.

        `encoding` is dropped for binary modes. Extra kwargs are only
        forwarded to the DvcTree.
        """
        if "b" in mode:
            encoding = None

        tree, dvc_tree = self._get_tree_pair(path)
        if dvc_tree and dvc_tree.exists(path):
            return dvc_tree.open(path, mode=mode, encoding=encoding, **kwargs)
        return tree.open(path, mode=mode, encoding=encoding)

    def exists(self, path, use_dvcignore=True):  # pylint: disable=arguments-differ
        """Return True if `path` exists in the repo tree or in the DvcTree.

        `use_dvcignore` is accepted but not used by this implementation.
        """
        tree, dvc_tree = self._get_tree_pair(path)
        # bool(): without a DvcTree the `and` would otherwise yield None
        return bool(tree.exists(path) or (dvc_tree and dvc_tree.exists(path)))

    def isdir(self, path):  # pylint: disable=arguments-differ
        """Return True if `path` is a directory in either tree."""
        tree, dvc_tree = self._get_tree_pair(path)
        # bool(): without a DvcTree the `and` would otherwise yield None
        return bool(tree.isdir(path) or (dvc_tree and dvc_tree.isdir(path)))

    def isdvc(self, path, **kwargs):
        """Return the DvcTree's `isdvc()` answer; False without a DvcTree."""
        _, dvc_tree = self._get_tree_pair(path)
        return dvc_tree is not None and dvc_tree.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        """Return True if `path` is a file in either tree."""
        tree, dvc_tree = self._get_tree_pair(path)
        # bool(): without a DvcTree the `and` would otherwise yield None
        return bool(tree.isfile(path) or (dvc_tree and dvc_tree.isfile(path)))

    def isexec(self, path):
        """Ask the DvcTree when it knows `path`, otherwise the repo tree."""
        tree, dvc_tree = self._get_tree_pair(path)
        if dvc_tree and dvc_tree.exists(path):
            return dvc_tree.isexec(path)
        return tree.isexec(path)

    def stat(self, path):
        """Return `stat()` of `path` from the repo tree (DvcTree is unused)."""
        tree, _ = self._get_tree_pair(path)
        return tree.stat(path)

    def _dvc_walk(self, walk):
        """Yield the next entry of `walk` and then the entries below it.

        Consumes one `(root, dirs, files)` item and recurses once per listed
        directory, so exactly one subtree of the walker is drained.
        """
        try:
            root, dirs, files = next(walk)
        except StopIteration:
            return
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

         NOTE: subrepo will only be discovered when walking if
         ignore_subrepos is set to False.
        """
        tree, dvc_tree = self._get_tree_pair(dir_path)
        tree_walk = tree.walk(dir_path,
                              topdown=True,
                              ignore_subrepos=not self._traverse_subrepos)
        if dvc_tree:
            dvc_walk = dvc_tree.walk(dir_path, topdown=True, **kwargs)
        else:
            dvc_walk = None
        yield from self._walk(tree_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one level of the repo walker and the (optional) DVC walker.

        Yields `(root, dirs, files)` for the current level and then recurses
        into every directory, advancing whichever walker(s) contain it.

        Args:
            repo_walk: top-down walker over the repo tree (required).
            dvc_walk: top-down walker over the DvcTree, or None.
            dvcfiles: if False, names for which `is_valid_filename()` is
                true are left out of the files list.
        """
        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else
                                       (None, [], []))
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

        for dirname in dirs:
            dir_path = os.path.join(repo_root, dirname)
            if self._is_dvc_repo(dir_path):
                # a subrepo is walked with its own pair of trees
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs):  # pylint: disable=arguments-differ
        """Walk and merge both DVC and repo trees.

        Args:
            top: path to walk from
            topdown: if True, tree will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.

        Only top-down walking is supported (asserted). When `top` is missing
        or is not a directory, `onerror` (if given) is called with the
        corresponding exception instance and nothing is yielded.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        tree, dvc_tree = self._get_tree_pair(top)
        dvc_exists = dvc_tree and dvc_tree.exists(top)
        repo_exists = tree.exists(top)
        if dvc_exists:
            dvc_walk = dvc_tree.walk(top, topdown=topdown, **kwargs)
            if repo_exists:
                repo_walk = tree.walk(
                    top,
                    topdown=topdown,
                    ignore_subrepos=not self._traverse_subrepos,
                )
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            else:
                # `top` only exists in DVC: nothing to merge with
                yield from dvc_walk
        else:
            repo_walk = tree.walk(
                top,
                topdown=topdown,
                onerror=onerror,
                ignore_subrepos=not self._traverse_subrepos,
            )
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
        """Yield a `PathInfo` for every file found by `walk(top, **kwargs)`."""
        for root, _, files in self.walk(top, **kwargs):
            for fname in files:
                yield PathInfo(root) / fname

    def get_file_hash(self, path_info):
        """Return file checksum for specified path.

        If path_info is a DVC out, the pre-computed checksum for the file
        will be used. If path_info is a git file, MD5 will be computed for
        the git object.

        Raises:
            FileNotFoundError: if the path exists in neither tree.
        """
        if not self.exists(path_info):
            raise FileNotFoundError
        _, dvc_tree = self._get_tree_pair(path_info)
        if dvc_tree and dvc_tree.exists(path_info):
            try:
                return dvc_tree.get_file_hash(path_info)
            except OutputNotFoundError:
                # fall through to computing the hash from file contents
                pass
        return self.PARAM_CHECKSUM, file_md5(path_info, self)[0]

    def copytree(self, top, dest):
        """Copy the file or directory tree at `top` to `dest`.

        Contents are read through `self.open()`, so both DVC-tracked and
        repo-tracked files are copied.

        Raises:
            FileNotFoundError: if `top` exists in neither tree.
        """
        top = PathInfo(top)
        dest = PathInfo(dest)

        if not self.exists(top):
            raise FileNotFoundError

        if self.isfile(top):
            makedirs(dest.parent, exist_ok=True)
            with self.open(top, mode="rb") as fobj:
                copy_fobj_to_file(fobj, dest)
            return

        for root, _, files in self.walk(top):
            root = PathInfo(root)
            # mirror the directory layout below `top` under `dest`
            dest_dir = dest / root.relative_to(top)
            makedirs(dest_dir, exist_ok=True)
            for fname in files:
                src = root / fname
                with self.open(src, mode="rb") as fobj:
                    copy_fobj_to_file(fobj, dest_dir / fname)

    @property
    def hash_jobs(self):  # pylint: disable=invalid-overridden-method
        """`hash_jobs` of the main repo's tree."""
        return self._main_repo.tree.hash_jobs

    def metadata(self, path):
        """Return metadata for `path`, combining DVC and repo tree info.

        The DvcTree's metadata is used when available, otherwise a new
        `Metadata` is created. `isdir` is additionally set when the repo
        tree's stat reports a directory. `is_exec` is filled from the stat
        mode only when there is no DVC metadata.

        Raises:
            FileNotFoundError: if neither tree knows the path.
        """
        path_info = PathInfo(os.path.abspath(path))
        tree, dvc_tree = self._get_tree_pair(path_info)

        dvc_meta = None
        if dvc_tree:
            with suppress(OutputNotFoundError):
                dvc_meta = dvc_tree.metadata(path_info)

        stat_result = None
        with suppress(FileNotFoundError):
            stat_result = tree.stat(path_info)

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        meta = dvc_meta or Metadata(path_info=path_info)

        isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or isdir

        if not dvc_meta:
            meta.is_exec = bool(stat_result) and is_exec(stat_result.st_mode)
        return meta
Exemplo n.º 2
0
class DvcIgnoreFilter:
    """Filter that decides which paths below `root_dir` are ignored.

    Ignore patterns are kept per directory in a trie keyed by path and are
    collected lazily, the first time a directory is queried.
    """

    @staticmethod
    def _is_dvc_repo(root, directory):
        """Return True if `root/directory` contains a DVC directory."""
        from dvc.repo import Repo

        return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

    def __init__(self, tree, root_dir):
        """Seed the trie with the default patterns and scan `root_dir`.

        Args:
            tree: tree object used to check for and read ignore files and to
                list directories.
            root_dir: root of the repo; paths outside of it count as ignored.
        """
        from dvc.repo import Repo

        default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

        self.tree = tree
        self.root_dir = root_dir
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            default_ignore_patterns, root_dir)
        self._update(self.root_dir)

    def _update(self, dirname):
        """Collect the patterns that apply to `dirname` into the trie.

        Starts from the patterns of the closest already-collected prefix.
        If `dirname` has an ignore file that is not itself ignored, its
        patterns are merged in; otherwise the inherited patterns are reused.
        Finally the immediate subdirectories are checked for subrepos.
        """
        old_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
        matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False)

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not matches and self.tree.exists(ignore_file_path,
                                            use_dvcignore=False):
            new_pattern = DvcIgnorePatterns.from_files(ignore_file_path,
                                                       self.tree)
            if old_pattern:
                self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        new_pattern.pattern_list,
                        new_pattern.dirname,
                    ))
            else:
                self.ignores_trie_tree[dirname] = new_pattern
        elif old_pattern:
            self.ignores_trie_tree[dirname] = old_pattern

        # NOTE: using `walk` + `break` because tree doesn't have `listdir()`
        for root, dirs, _ in self.tree.walk(dirname, use_dvcignore=False):
            self._update_sub_repo(root, dirs)
            break

    def _update_sub_repo(self, root, dirs):
        """Add an ignore pattern for every subdirectory that is a DVC repo.

        Each subrepo `d` contributes the pattern `/d/` anchored at `root`,
        merged with whatever patterns already apply to `root`.
        """
        for d in dirs:
            if not self._is_dvc_repo(root, d):
                continue

            subrepo_pattern = ["/{}/".format(d)]
            old_pattern = self.ignores_trie_tree.longest_prefix(root).value
            if old_pattern:
                self.ignores_trie_tree[root] = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        subrepo_pattern,
                        root,
                    ))
            else:
                self.ignores_trie_tree[root] = DvcIgnorePatterns(
                    subrepo_pattern, root)

    def __call__(self, root, dirs, files):
        """Filter one walk entry through the patterns for `root`.

        Returns the result of calling the pattern object with the same
        arguments, or `(dirs, files)` unchanged when no patterns apply.
        """
        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            return ignore_pattern(root, dirs, files)
        return dirs, files

    def _get_trie_pattern(self, dirname):
        """Return the patterns for `dirname`, collecting them if needed.

        Returns None when no collected path is a prefix of `dirname`.
        """
        ignore_pattern = self.ignores_trie_tree.get(dirname)
        if ignore_pattern:
            return ignore_pattern

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        # Ancestors between the known prefix (exclusive) and `dirname`,
        # outermost first, followed by `dirname` itself.
        dirs = list(
            takewhile(
                lambda path: path != prefix,
                (parent.fspath for parent in PathInfo(dirname).parents),
            ))
        dirs.reverse()
        dirs.append(dirname)

        for parent in dirs:
            self._update(parent)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        """Return True if `path` is outside the repo or matches a pattern."""
        if self._outside_repo(path):
            return True
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if ignore_pattern:
            return ignore_pattern.matches(dirname, basename, is_dir)
        return False

    def is_ignored_dir(self, path):
        """Return True if the directory `path` is ignored.

        The repo root itself is never ignored.
        """
        path = os.path.abspath(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        """Return True if the file `path` is ignored."""
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        """Return True if `path` does not lie under `root_dir`."""
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)
        # Compare whole components, not a raw string prefix: a name such as
        # `..cache` starts with two dots but is still inside the repo.
        escapes_root = (path == os.pardir
                        or path.startswith(os.pardir + os.sep))
        # NOTE(review): the Windows check presumably covers paths on another
        # drive, for which no relative path exists - confirm against relpath.
        if escapes_root or (
                os.name == "nt" and not os.path.commonprefix(
                    [os.path.abspath(path), self.root_dir])):
            return True
        return False