示例#1
0
def test_suffix_matching():
    """Exercise ``Trie.match`` on a trie built with ``use_suffix=True``.

    The expectations below are exactly those encoded by the assertions:
    without a delimiter both spellings match, while with ``delimiter=" "``
    only the text whose final word is separated by a space matches.
    """
    suffix_trie = Trie(word_list=ENGLISH_WORD_LIST, use_suffix=True)

    # No delimiter given: the glued spelling is accepted as well.
    for text in ("je parle a lot", "je parlealot"):
        assert suffix_trie.match(text)

    # With a space delimiter the space-separated text still matches...
    assert suffix_trie.match("je parle a lot", delimiter=" ")

    # ...but neither the French ending nor the glued spelling does.
    for text in ("je parle beaucoup", "je parlealot"):
        assert not suffix_trie.match(text, delimiter=" ")
示例#2
0
def test_simple_word_matching():
    """Check exact-word lookups and that lookup cost does not grow with the list.

    The same batch of ``contains`` calls is timed against a trie built from
    the first 10 words and against a trie built from the full word list; the
    second run must take less than twice as long as the first one.
    """
    t = Trie(word_list=ENGLISH_WORD_LIST[:10])

    # perf_counter() is monotonic and uses the highest-resolution clock
    # available. time.time() can go backwards or, with a coarse clock, report
    # a zero duration, which would make the final comparison fail spuriously.
    start = time.perf_counter()
    for _j in range(10000):
        assert t.contains("across")
        # "whom" is not among the first 10 words
        assert not t.contains("whom")
        assert not t.contains("bonjour")
        # A known word followed by an extra character is not a word
        assert not t.contains("acrosst")
    test1_time = time.perf_counter() - start

    t = Trie(word_list=ENGLISH_WORD_LIST)
    start = time.perf_counter()
    for _j in range(10000):
        assert t.contains("across")
        # "whom" is present once the full list is loaded
        assert t.contains("whom")
        assert not t.contains("bonjour")
        assert not t.contains("acrosst")
    test2_time = time.perf_counter() - start

    # Using the full word list should not impact the search time
    assert test2_time < 2 * test1_time
示例#3
0
文件: fs.py 项目: pierretr/e3-core
def sync_tree(
    source: str,
    target: str,
    ignore: Optional[str | Sequence[str]] = None,
    file_list: Optional[list[str]] = None,
    delete: bool = True,
    preserve_timestamps: bool = True,
    delete_ignore: bool = False,
) -> tuple[list[str], list[str]]:
    """Synchronize the files and directories between two directories.

    :param source: the directory from where the files and directories
        need to be copied
    :param target: the target directory
    :param ignore: glob pattern or list of files or directories to ignore,
        if the name starts with `/` then only the path is taken into
        account from the root of the source (or target) directory.
        If the ignore value contains a glob pattern, it is taken in account
        only if it doesn't contain a /, since for now the filtering
        is not segmented by '/'.
    :param file_list: list of files to synchronize, if None synchronize all
        files. Note that if file in the list is a directory then the complete
        content of that directory is included. Note also that ignore list
        takes precedence over file_list.
    :param delete: if True, remove files from target if they do not exist
        in source
    :param preserve_timestamps: if True preserve original timestamps.
        If False updated files get their timestamps set to current time.
    :param delete_ignore: if True files that are explicitly ignored
        are deleted. Note delete should be set to True in that case.

    :return: a tuple ``(updated, deleted)`` holding the target paths that
        were updated (copied or created) and the target paths that were
        removed
    :raise FSError: if source does not exist
    """
    # Some structure used when walking the trees to be synched
    FilesInfo = namedtuple("FilesInfo", ["rel_path", "source", "target"])

    # The basename in the FileInfo structure is used to compare casing of
    # source and destination.
    FileInfo = namedtuple("FileInfo", ["path", "stat", "basename"])

    # Normalize casing function for path comparison. On win32 path_key
    # returns a lower case version of the path so that comparisons ignore
    # casing; on other platforms the path is returned unchanged. The return
    # value can be used for path comparisons.
    if sys.platform == "win32":

        def path_key(p: str) -> str:
            return p.lower()

    else:

        def path_key(p: str) -> str:
            return p

    # normalize the list of file to synchronize (forward slashes only and
    # no trailing slash)
    norm_file_list = None
    if file_list is not None:
        norm_file_list = [
            wf.replace("\\", "/").rstrip("/") for wf in file_list
        ]

    # normalize ignore patterns
    if ignore is not None:
        ignore = [ignore] if isinstance(ignore, str) else ignore
        norm_ignore_list = [fn.replace("\\", "/") for fn in ignore]

        ignore_path_suffixes = Trie(use_suffix=True, match_delimiter="/")
        ignore_path_prefixes = Trie(match_delimiter="/")

        ignore_base_regexp_list = []
        ignore_base_regexp: Optional[re.Pattern[str]] = None

        for pattern in norm_ignore_list:
            pk = path_key(pattern)
            if "/" not in pk:
                # This is a regexp on the basename using fnmatch.
                ignore_base_regexp_list.append(fnmatch.translate(pk))
            elif pattern.startswith("/"):
                # An absolute path
                ignore_path_prefixes.add(pk)
            else:
                # A relative path
                ignore_path_suffixes.add(pk)

        if ignore_base_regexp_list:
            # A single alternation so that one match call checks all the
            # basename patterns.
            ignore_base_regexp = re.compile("|".join(ignore_base_regexp_list))

    def is_in_ignore_list(p: str) -> bool:
        """Check if a file should be ignored.

        :param p: path relative to source directory (note it starts with a /)

        :return: True if the path matches one of the ignore patterns
        """
        if ignore is None:
            return False

        pk = path_key(p)

        return (
            ignore_path_prefixes.match(pk)
            or ignore_path_suffixes.match(pk)
            or (
                ignore_base_regexp is not None
                and bool(ignore_base_regexp.match(os.path.basename(pk)))
            )
        )

    def is_in_file_list(p: str) -> bool:
        """Check if a file should be included.

        :param p: path relative to source directory (note it starts with a /)

        :return: True if in the list of file to include
        """
        if file_list is None:
            return True
        if TYPE_CHECKING:
            assert norm_file_list is not None

        pk = path_key(p)

        # p is included when it is an entry of the list, when it is located
        # below an entry of the list, or when it is a parent directory of an
        # entry of the list (the walk must go through it to reach the entry).
        # The generator yields the condition itself so that any() does not
        # depend on the truthiness of the entry.
        return any(
            path_key(f) == pk[1:]
            or pk.startswith(path_key("/" + f + "/"))
            or path_key(f).startswith(pk[1:] + "/")
            for f in norm_file_list
        )

    def isdir(fi: FileInfo) -> bool:
        """Check if a file is a directory.

        :param fi: a FileInfo namedtuple

        :return: True if fi is a directory
        """
        return fi.stat is not None and stat.S_ISDIR(fi.stat.st_mode)

    def islink(fi: FileInfo) -> bool:
        """Check if a file is a link.

        :param fi: a FileInfo namedtuple

        :return: True if fi is a symbolic link
        """
        return fi.stat is not None and stat.S_ISLNK(fi.stat.st_mode)

    def isfile(fi: FileInfo) -> bool:
        """Check if a file is a regular file.

        :param fi: a FileInfo namedtuple
        :return: True if fi is a regular file
        """
        return fi.stat is not None and stat.S_ISREG(fi.stat.st_mode)

    def cmp_files(src: FileInfo, dst: FileInfo) -> bool:
        """Fast compare two files.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object

        :return: True if both files have the same content
        """
        bufsize = 8 * 1024
        with open(src.path, "rb") as fp1, open(dst.path, "rb") as fp2:
            while True:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return False

                # A short read means the end of both files has been reached
                # (the chunks are equal, so they have the same length).
                if len(b1) < bufsize:
                    return True

    def need_update(src: FileInfo, dst: FileInfo) -> bool:
        """Check if dst file should be updated.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object

        :return: True if we should update dst
        """
        # when not preserving timestamps we cannot rely on the timestamps to
        # check if a file is up-to-date. In that case do a full content
        # comparison as last check.
        return (dst.stat is None or
                stat.S_IFMT(src.stat.st_mode) != stat.S_IFMT(dst.stat.st_mode)
                or (preserve_timestamps
                    and abs(src.stat.st_mtime - dst.stat.st_mtime) > 0.001)
                or src.stat.st_size != dst.stat.st_size
                or (not preserve_timestamps and isfile(src)
                    and not cmp_files(src, dst))
                or src.basename != dst.basename)

    def copystat(src: FileInfo, dst: FileInfo) -> None:
        """Update attribute of dst file with src attributes.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object
        """
        if islink(src):  # windows: no cover
            # For symbolic links only the l* variants are used, and only
            # when the platform provides them.
            mode = stat.S_IMODE(src.stat.st_mode)
            if hasattr(os, "lchmod"):
                os.lchmod(dst.path, mode)

            if hasattr(os, "lchflags") and hasattr(src.stat, "st_flags"):
                try:
                    os.lchflags(dst.path, src.stat.st_flags)
                except OSError as why:  # defensive code
                    import errno

                    # Only an "operation not supported" error is ignored
                    if (not hasattr(errno, "EOPNOTSUPP")
                            or why.errno != errno.EOPNOTSUPP):
                        raise
        else:
            mode = stat.S_IMODE(src.stat.st_mode)
            if hasattr(os, "utime"):
                if preserve_timestamps:
                    os.utime(dst.path, (src.stat.st_atime, src.stat.st_mtime))
                else:
                    # Set access and modification times to the current time
                    os.utime(dst.path, None)
            if hasattr(os, "chmod"):
                os.chmod(dst.path, mode)
            if hasattr(os, "chflags") and hasattr(src.stat, "st_flags"):
                try:
                    os.chflags(dst.path, src.stat.st_flags)
                except OSError as why:  # defensive code
                    import errno

                    # Only an "operation not supported" error is ignored
                    if (not hasattr(errno, "EOPNOTSUPP")
                            or why.errno != errno.EOPNOTSUPP):
                        raise

    def safe_copy(src: FileInfo, dst: FileInfo) -> None:
        """Copy src file into dst preserving all attributes.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object
        """
        if islink(src):  # windows: no cover
            linkto = os.readlink(src.path)
            if not islink(dst) or os.readlink(dst.path) != linkto:
                if dst.stat is not None:
                    rm(dst.path, recursive=True, glob=False)
                os.symlink(linkto, dst.path)
            copystat(src, dst)
        else:
            if isdir(dst):
                # dst directory will be replaced by a file having the same
                # content as 'src'
                rm(dst.path, recursive=True, glob=False)
            elif islink(dst):
                # dst symlink will be replaced by a file having the same
                #  content as 'src'
                rm(dst.path, recursive=False, glob=False)

            try:
                if dst.basename != src.basename:
                    # Casing differs: remove the target and recreate it with
                    # the basename of the source.
                    rm(dst.path, glob=False)
                    dst = FileInfo(
                        os.path.join(os.path.dirname(dst.path), src.basename),
                        None,
                        src.basename,
                    )

                with open(src.path, "rb") as fsrc:
                    with open(dst.path, "wb") as fdst:
                        shutil.copyfileobj(fsrc, fdst)
            except OSError:
                # The copy failed: remove the target and retry once
                rm(dst.path, glob=False)
                with open(src.path, "rb") as fsrc:
                    with open(dst.path, "wb") as fdst:
                        shutil.copyfileobj(fsrc, fdst)
            copystat(src, dst)

    def safe_mkdir(src: FileInfo, dst: FileInfo) -> None:
        """Create a directory modifying parent directory permissions if needed.

        :param src: the source directory (its basename gives the final
            casing of the created directory)
        :param dst: directory to create
        """
        if isfile(dst) or islink(dst):
            rm(dst.path, glob=False)

        try:
            # Final dirname with right casing
            if dst.basename != src.basename:
                dest_dir = os.path.join(os.path.dirname(dst.path),
                                        src.basename)
            else:
                dest_dir = dst.path

            if isdir(dst):
                # For directories in case of non-matching casing just do a rename
                # This ensure sync_tree is efficient in case content of the directory
                # is similar between src and dst.
                if dst.basename != src.basename:
                    os.rename(dst.path, dest_dir)
            else:
                os.makedirs(dest_dir)
        except OSError:
            # in case of error, change the parent directory
            # permissions and retry. The permissions will be then
            # set correctly at the end of rsync.
            e3.os.fs.chmod("a+wx", os.path.dirname(dst.path))

            if isdir(dst):
                if dst.basename != src.basename:
                    os.rename(dst.path, dest_dir)
            else:
                os.makedirs(dest_dir)

    def walk(root_dir: str,
             target_root_dir: str,
             entry: Optional[FilesInfo] = None) -> Iterable[FilesInfo]:
        """Walk through source and target file trees.

        :param root_dir: path to source tree
        :param target_root_dir: path to target tree
        :param entry: a FilesInfo object (used internally for the recursion)

        :return: an iterator that iterate over the relevant FilesInfo object
        """
        if entry is None:
            # lstat the target root directly instead of guarding it with
            # os.path.exists: exists() follows symbolic links, so a dangling
            # symlink would be reported as missing and the later directory
            # creation would fail on the leftover link. Any failure to stat
            # the target is still treated as "target does not exist".
            try:
                target_stat = os.lstat(target_root_dir)
            except (OSError, ValueError):
                target_stat = None

            entry = FilesInfo(
                "",
                FileInfo(root_dir, os.lstat(root_dir), ""),
                FileInfo(target_root_dir, target_stat, ""),
            )
            yield entry

        # Map normalized name -> real name for the source directory content
        try:
            source_names = {
                path_key(k): k
                for k in os.listdir(entry.source.path)
            }
        except Exception:  # defensive code
            e3.log.debug("cannot get sources list", exc_info=True)
            # Don't crash in case a source directory cannot be read
            return

        # Same mapping for the target directory content (if any)
        target_names = {}
        if isdir(entry.target):
            try:
                target_names = {
                    path_key(k): k
                    for k in os.listdir(entry.target.path)
                }
            except Exception:
                e3.log.debug("cannot get targets list", exc_info=True)
                target_names = {}

        all_names = set(source_names.keys()) | set(target_names.keys())

        result = []
        for name in all_names:
            rel_path = f"{entry.rel_path}/{name}"

            # Use the real name found on each side when it exists, otherwise
            # fall back to the normalized name.
            source_full_path = os.path.join(entry.source.path,
                                            source_names.get(name, name))
            target_full_path = os.path.join(entry.target.path,
                                            target_names.get(name, name))
            source_stat = None
            target_stat = None

            if name in source_names:
                source_stat = os.lstat(source_full_path)

            source_file = FileInfo(source_full_path, source_stat,
                                   os.path.basename(source_full_path))

            if name in target_names:
                target_stat = os.lstat(target_full_path)

            target_file = FileInfo(target_full_path, target_stat,
                                   os.path.basename(target_full_path))

            result.append(FilesInfo(rel_path, source_file, target_file))

        for el in result:
            if is_in_ignore_list(el.rel_path):
                logger.debug("ignore %s", el.rel_path)
                if delete_ignore:
                    # Report the entry as absent from the source so that the
                    # caller considers the target for deletion.
                    yield FilesInfo(
                        el.rel_path,
                        FileInfo(el.source.path, None, el.source.basename),
                        el.target,
                    )
            elif is_in_file_list(el.rel_path):
                yield el
                if isdir(el.source):
                    yield from walk(root_dir, target_root_dir, el)
            else:
                # Not selected by file_list: report the entry as absent
                # from the source.
                yield FilesInfo(
                    el.rel_path,
                    FileInfo(el.source.path, None, el.source.basename),
                    el.target,
                )

    source_top = os.path.normpath(source).rstrip(os.path.sep)
    target_top = os.path.normpath(target).rstrip(os.path.sep)
    copystat_dir_list = []

    logger.debug(
        "sync_tree %s -> %s [delete=%s, preserve_stmp=%s]",
        source,
        target,
        delete,
        preserve_timestamps,
    )

    if not os.path.exists(source):
        raise FSError(origin="sync_tree", message=f"{source} does not exist")

    # Keep track of deleted and updated files
    deleted_list: list[str] = []
    updated_list: list[str] = []

    for wf in walk(source_top, target_top):
        if wf.source.stat is None and wf.target.stat is not None:
            # Entry that exist only in the target file tree. Check if we
            # should delete it
            if delete:
                rm(wf.target.path, recursive=True, glob=False)
                deleted_list.append(wf.target.path)
        else:
            # At this stage we have an element to synchronize in
            # the source tree.
            if need_update(wf.source, wf.target):
                if isfile(wf.source) or islink(wf.source):
                    safe_copy(wf.source, wf.target)
                    updated_list.append(wf.target.path)
                elif isdir(wf.source):
                    safe_mkdir(wf.source, wf.target)
                    updated_list.append(wf.target.path)
                    # Attributes of directories are applied at the end, once
                    # their content has been copied.
                    copystat_dir_list.append((wf.source, wf.target))

    # Adjust directory permissions once all files have been copied
    for d in copystat_dir_list:
        copystat(d[0], d[1])

    return updated_list, deleted_list
示例#4
0
def test_prefix_matching():
    """Exercise ``Trie.match`` on a trie built with the default settings.

    The expectations below are exactly those encoded by the assertions: a
    text starting with a known word matches, and with ``delimiter=" "`` the
    word must be followed by a space or be the whole text.
    """
    prefix_trie = Trie(word_list=ENGLISH_WORD_LIST)

    # No delimiter given
    assert prefix_trie.match("across l'univers")

    # With a space delimiter: word followed by a space, then word alone
    for text in ("across l'univers", "across"):
        assert prefix_trie.match(text, delimiter=" ")

    # The word glued to the following text must not match
    assert not prefix_trie.match("acrossl'univers", delimiter=" ")