Пример #1
0
    def _update_sub_repo(self, path, ignore_trie: Trie):
        from dvc.repo import Repo

        if path == self.root_dir:
            return

        dvc_dir = self.fs.path.join(path, Repo.DVC_DIR)
        if not self.fs.exists(dvc_dir):
            return

        root, dname = self.fs.path.split(path)
        key = self._get_key(root)
        pattern_info = PatternInfo(f"/{dname}/", f"in sub_repo:{dname}")
        new_pattern = DvcIgnorePatterns([pattern_info], root, self.fs.sep)
        old_pattern = ignore_trie.longest_prefix(key).value
        if old_pattern:
            plist, prefix = merge_patterns(
                self.fs.path.flavour,
                old_pattern.pattern_list,
                old_pattern.dirname,
                new_pattern.pattern_list,
                new_pattern.dirname,
            )
            ignore_trie[key] = DvcIgnorePatterns(plist, prefix, self.fs.sep)
        else:
            ignore_trie[key] = new_pattern
Пример #2
0
class JiebaTokenizer(object):
    def __init__(self, user_dict: Union[str, Iterable] = None):
        self.t = posseg.POSTokenizer()
        self.t.initialize()
        self.trie = Trie()
        if user_dict:
            self.load_user_dict(user_dict)

    def load_user_dict(self, user_dict: Union[str, Iterable] = None):
        if isinstance(user_dict, str):
            with open(user_dict) as fin:
                user_dict = [line.strip('\r\n') for line in fin]
        seg_dict = {}
        for line in user_dict:
            line = line.strip()
            if not line:
                continue
            arr = line.split('=', 1)
            key = arr[0].strip()
            if len(arr) > 1 and len(arr[1].strip()) > 0:
                value = [x for x in arr[1].strip().split()]
                assert len(key) == sum(len(x) for x in value)
            else:
                value = [key]
            seg_dict[key] = value
            self.t.tokenizer.add_word(key, 100)
        # jieba一定会切分英文串,对于英文还需要trie树来合并一下
        for key, value in seg_dict.items():
            self.trie[self.t.tokenizer.lcut(key)] = value

    def tokenize(self, s: str) -> List[Token]:
        res = []
        term_list = list(self.t.cut(s))
        text_list = [x.word for x in term_list]
        idx = 0
        while idx < len(text_list):
            m = self.trie.longest_prefix(text_list[idx:])
            if m:
                res.extend(Token(x, 'nz') for x in m.value)
                idx += len(m.key)
            else:
                res.append(Token(term_list[idx].word, term_list[idx].flag))
                idx += 1
        return res
Пример #3
0
    def _update_trie(self, dirname: str, trie: Trie) -> None:
        key = self._get_key(dirname)
        old_pattern = trie.longest_prefix(key).value
        matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False)

        path = self.fs.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not matches and self.fs.exists(path):
            name = self.fs.path.relpath(path, self.root_dir)
            new_pattern = DvcIgnorePatterns.from_file(path, self.fs, name)
            if old_pattern:
                plist, prefix = merge_patterns(
                    self.fs.path.flavour,
                    old_pattern.pattern_list,
                    old_pattern.dirname,
                    new_pattern.pattern_list,
                    new_pattern.dirname,
                )
                trie[key] = DvcIgnorePatterns(plist, prefix, self.fs.sep)
            else:
                trie[key] = new_pattern
        elif old_pattern:
            trie[key] = old_pattern
Пример #4
0
Файл: dvc.py Проект: pmrowla/dvc
class _DvcFileSystem(AbstractFileSystem):  # pylint:disable=abstract-method
    """DVC + git-tracked files fs.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DataFileSystem()`.
    """

    root_marker = "/"

    PARAM_REPO_URL = "repo_url"
    PARAM_REPO_ROOT = "repo_root"
    PARAM_REV = "rev"
    PARAM_CACHE_DIR = "cache_dir"
    PARAM_CACHE_TYPES = "cache_types"
    PARAM_SUBREPOS = "subrepos"

    def __init__(
        self,
        repo: Optional["Repo"] = None,
        subrepos=False,
        repo_factory: RepoFactory = None,
        **kwargs,
    ):
        super().__init__()

        from pygtrie import Trie

        if repo is None:
            repo, repo_factory = self._repo_from_fs_config(subrepos=subrepos,
                                                           **kwargs)

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory: RepoFactory = Repo
        else:
            self.repo_factory = repo_factory

        def _getcwd():
            relparts = ()
            if repo.fs.path.isin(repo.fs.path.getcwd(), repo.root_dir):
                relparts = repo.fs.path.relparts(repo.fs.path.getcwd(),
                                                 repo.root_dir)
            return self.root_marker + self.sep.join(relparts)

        self.path = Path(self.sep, getcwd=_getcwd)
        self.repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self._traverse_subrepos = subrepos

        self._subrepos_trie = Trie()
        """Keeps track of each and every path with the corresponding repo."""

        key = self._get_key(self.repo.root_dir)
        self._subrepos_trie[key] = repo

        self._datafss = {}
        """Keep a datafs instance of each repo."""

        if hasattr(repo, "dvc_dir"):
            self._datafss[key] = DataFileSystem(repo=repo)

    def _get_key(self, path):
        parts = self.repo.fs.path.relparts(path, self.repo.root_dir)
        if parts == (".", ):
            parts = ()
        return parts

    @property
    def repo_url(self):
        if self.repo is None:
            return None
        return self.repo.url

    @property
    def config(self):
        return {
            self.PARAM_REPO_URL: self.repo_url,
            self.PARAM_REPO_ROOT: self.repo.root_dir,
            self.PARAM_REV: getattr(self.repo.fs, "rev", None),
            self.PARAM_CACHE_DIR:
            os.path.abspath(self.repo.odb.local.cache_dir),
            self.PARAM_CACHE_TYPES: self.repo.odb.local.cache_types,
            self.PARAM_SUBREPOS: self._traverse_subrepos,
        }

    @classmethod
    def _repo_from_fs_config(
            cls, **config) -> Tuple["Repo", Optional["RepoFactory"]]:
        from dvc.external_repo import erepo_factory, external_repo
        from dvc.repo import Repo

        url = config.get(cls.PARAM_REPO_URL)
        root = config.get(cls.PARAM_REPO_ROOT)
        assert url or root

        def _open(*args, **kwargs):
            # NOTE: if original repo was an erepo (and has a URL),
            # we cannot use Repo.open() since it will skip erepo
            # cache/remote setup for local URLs
            if url is None:
                return Repo.open(*args, **kwargs)
            return external_repo(*args, **kwargs)

        cache_dir = config.get(cls.PARAM_CACHE_DIR)
        cache_config = ({} if not cache_dir else {
            "cache": {
                "dir": cache_dir,
                "type": config.get(cls.PARAM_CACHE_TYPES),
            }
        })
        repo_kwargs: dict = {
            "rev": config.get(cls.PARAM_REV),
            "subrepos": config.get(cls.PARAM_SUBREPOS, False),
            "uninitialized": True,
        }
        factory: Optional["RepoFactory"] = None
        if url is None:
            repo_kwargs["config"] = cache_config
        else:
            repo_kwargs["cache_dir"] = cache_dir
            factory = erepo_factory(url, root, cache_config)

        with _open(
                url if url else root,
                **repo_kwargs,
        ) as repo:
            return repo, factory

    def _get_repo(self, path: str) -> "Repo":
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's parents
        and then returns the appropriate one.
        """
        if not self.repo.fs.path.isin_or_eq(path, self.repo.root_dir):
            # outside of repo
            return self.repo

        key = self._get_key(path)
        repo = self._subrepos_trie.get(key)
        if repo:
            return repo

        prefix_key, repo = self._subrepos_trie.longest_prefix(key)
        prefix = self.repo.fs.path.join(
            self.repo.root_dir,
            *prefix_key,  # pylint: disable=not-an-iterable
        )

        parents = (parent for parent in self.repo.fs.path.parents(path))
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(key) or self.repo

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            key = self._get_key(d)
            if self._is_dvc_repo(d):
                repo = self.repo_factory(
                    d,
                    fs=self.repo.fs,
                    scm=self.repo.scm,
                    repo_factory=self.repo_factory,
                )
                self._datafss[key] = DataFileSystem(repo=repo)
            self._subrepos_trie[key] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo."""
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = self.repo.fs.path.join(dir_path, Repo.DVC_DIR)
        return self.repo.fs.isdir(repo_path)

    def _get_fs_pair(
        self, path
    ) -> Tuple[Optional[FileSystem], Optional[str], Optional[DataFileSystem],
               Optional[str], ]:
        """
        Returns a pair of fss based on repo the path falls in, using prefix.
        """
        parts = self.path.relparts(path, self.root_marker)
        if parts and parts[0] == os.curdir:
            parts = parts[1:]

        fs_path = self.repo.fs.path.join(self.repo.root_dir, *parts)
        repo = self._get_repo(fs_path)
        fs = repo.fs

        repo_parts = fs.path.relparts(repo.root_dir, self.repo.root_dir)
        if repo_parts[0] == os.curdir:
            repo_parts = repo_parts[1:]

        dvc_parts = parts[len(repo_parts):]
        if dvc_parts and dvc_parts[0] == os.curdir:
            dvc_parts = dvc_parts[1:]

        key = self._get_key(repo.root_dir)
        dvc_fs = self._datafss.get(key)
        if dvc_fs:
            dvc_path = dvc_fs.path.join(*dvc_parts) if dvc_parts else ""
        else:
            dvc_path = None

        return fs, fs_path, dvc_fs, dvc_path

    def open(self, path, mode="r", encoding="utf-8", **kwargs):  # pylint: disable=arguments-renamed, arguments-differ
        if "b" in mode:
            encoding = None

        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)
        try:
            return fs.open(fs_path, mode=mode, encoding=encoding)
        except FileNotFoundError:
            if not dvc_fs:
                raise

        return dvc_fs.open(dvc_path, mode=mode, encoding=encoding, **kwargs)

    def isdvc(self, path, **kwargs):
        _, _, dvc_fs, dvc_path = self._get_fs_pair(path)
        return dvc_fs is not None and dvc_fs.isdvc(dvc_path, **kwargs)

    def ls(  # pylint: disable=arguments-differ
            self,
            path,
            detail=True,
            dvc_only=False,
            **kwargs):
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        repo = dvc_fs.repo if dvc_fs else self.repo
        dvcignore = repo.dvcignore
        ignore_subrepos = kwargs.get("ignore_subrepos", True)

        names = set()
        if dvc_fs:
            with suppress(FileNotFoundError):
                for entry in dvc_fs.ls(dvc_path, detail=False):
                    names.add(dvc_fs.path.name(entry))

        if not dvc_only and fs:
            try:
                for entry in dvcignore.ls(fs,
                                          fs_path,
                                          detail=False,
                                          ignore_subrepos=ignore_subrepos):
                    names.add(fs.path.name(entry))
            except (FileNotFoundError, NotADirectoryError):
                pass

        dvcfiles = kwargs.get("dvcfiles", False)

        def _func(fname):
            from dvc.dvcfile import is_valid_filename
            from dvc.ignore import DvcIgnore

            if dvcfiles:
                return True

            return not (is_valid_filename(fname)
                        or fname == DvcIgnore.DVCIGNORE_FILE)

        names = filter(_func, names)

        infos = []
        paths = []
        for name in names:
            entry_path = self.path.join(path, name)
            try:
                info = self.info(entry_path, ignore_subrepos=ignore_subrepos)
            except FileNotFoundError:
                continue
            infos.append(info)
            paths.append(entry_path)

        if not detail:
            return paths

        return infos

    def get_file(  # pylint: disable=arguments-differ
            self,
            rpath,
            lpath,
            callback=DEFAULT_CALLBACK,
            **kwargs):
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(rpath)

        if fs:
            try:
                fs.get_file(fs_path, lpath, callback=callback, **kwargs)
                return
            except FileNotFoundError:
                if not dvc_fs:
                    raise
        dvc_fs.get_file(dvc_path, lpath, callback=callback, **kwargs)

    def info(self, path, **kwargs):
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        repo = dvc_fs.repo if dvc_fs else self.repo
        dvcignore = repo.dvcignore
        ignore_subrepos = kwargs.get("ignore_subrepos", True)

        dvc_info = None
        if dvc_fs:
            try:
                dvc_info = dvc_fs.info(dvc_path)
            except FileNotFoundError:
                pass

        fs_info = None
        if fs:
            try:
                fs_info = fs.info(fs_path)
                if dvcignore.is_ignored(fs,
                                        fs_path,
                                        ignore_subrepos=ignore_subrepos):
                    fs_info = None
            except (FileNotFoundError, NotADirectoryError):
                if not dvc_info:
                    raise

        # NOTE: if some parent in fs_path turns out to be a file, it means
        # that the whole repofs branch doesn't exist.
        if fs and not fs_info and dvc_info:
            for parent in fs.path.parents(fs_path):
                try:
                    if fs.info(parent)["type"] != "directory":
                        dvc_info = None
                        break
                except FileNotFoundError:
                    continue

        if not dvc_info and not fs_info:
            raise FileNotFoundError

        info = _merge_info(repo, fs_info, dvc_info)
        info["name"] = path
        return info

    def checksum(self, path):
        fs, fs_path, dvc_fs, dvc_path = self._get_fs_pair(path)

        try:
            return fs.checksum(fs_path)
        except FileNotFoundError:
            return dvc_fs.checksum(dvc_path)