def test_suffix_matching():
    """Exercise ``Trie.match`` on a suffix trie, with and without a delimiter."""
    trie = Trie(word_list=ENGLISH_WORD_LIST, use_suffix=True)

    # Without a delimiter both sentences are expected to match.
    for sentence in ("je parle a lot", "je parlealot"):
        assert trie.match(sentence)

    # With " " as delimiter only the sentence whose last word is
    # separated by a space is expected to match.
    assert trie.match("je parle a lot", delimiter=" ")
    for sentence in ("je parle beaucoup", "je parlealot"):
        assert not trie.match(sentence, delimiter=" ")
def test_simple_word_matching():
    """Check exact word lookup and that lookup time is independent of list size.

    The same 10000 lookups are timed first on a trie built from the first
    10 words of ENGLISH_WORD_LIST, then on a trie built from the complete
    list. The second run must take less than twice the time of the first.
    """
    # time.perf_counter is used rather than time.time: it is monotonic and
    # has the best available resolution, so the measured durations cannot
    # be skewed by a system clock adjustment.
    t = Trie(word_list=ENGLISH_WORD_LIST[:10])
    start = time.perf_counter()
    for _j in range(10000):
        assert t.contains("across")
        assert not t.contains("whom")
        assert not t.contains("bonjour")
        assert not t.contains("acrosst")
    test1_time = time.perf_counter() - start

    t = Trie(word_list=ENGLISH_WORD_LIST)
    start = time.perf_counter()
    for _j in range(10000):
        assert t.contains("across")
        assert t.contains("whom")
        assert not t.contains("bonjour")
        assert not t.contains("acrosst")
    test2_time = time.perf_counter() - start

    # Using the complete word list should not impact the search time
    assert test2_time < 2 * test1_time
def sync_tree(
    source: str,
    target: str,
    ignore: Optional[str | Sequence[str]] = None,
    file_list: Optional[list[str]] = None,
    delete: bool = True,
    preserve_timestamps: bool = True,
    delete_ignore: bool = False,
) -> tuple[list[str], list[str]]:
    """Synchronize the files and directories between two directories.

    :param source: the directory from where the files and directories
        need to be copied
    :param target: the target directory
    :param ignore: glob pattern or list of files or directories to ignore,
        if the name starts with `/` then only the path is taken into
        account from the root of the source (or target) directory.
        If the ignore value contains a glob pattern, it is taken in account
        only if it doesn't contain a /, since for now the filtering is not
        segmented by '/'.
    :param file_list: list of files to synchronize, if None (the default)
        synchronize all files. Note that if file in the list is a directory
        then the complete content of that directory is included. Note also
        that ignore list takes precedence over file_list.
    :param delete: if True, remove files from target if they do not exist
        in source
    :param preserve_timestamps: if True preserve original timestamps.
        If False updated files get their timestamps set to current time.
    :param delete_ignore: if True files that are explicitly ignored
        are deleted. Note delete should be set to True in that case.
    :return: a tuple ``(updated, deleted)`` holding the target paths that
        have been respectively updated and deleted
    :raise FSError: if source does not exist
    """
    # Some structure used when walking the trees to be synched
    FilesInfo = namedtuple("FilesInfo", ["rel_path", "source", "target"])

    # The basename in the FileInfo structure is used to compare casing of
    # source and destination.
    FileInfo = namedtuple("FileInfo", ["path", "stat", "basename"])

    # Normalize casing function for path comparison. On Windows
    # (sys.platform == "win32") path_key returns the path in lower case so
    # that comparisons ignore casing; on other platforms the path is
    # returned unchanged. The return value can be used for path
    # comparisons.
    if sys.platform == "win32":

        def path_key(p: str) -> str:
            return p.lower()

    else:

        def path_key(p: str) -> str:
            return p

    # Normalize the list of files to synchronize. The comparison keys are
    # computed once here rather than on each call to is_in_file_list.
    norm_file_keys: Optional[list[str]] = None
    if file_list is not None:
        norm_file_keys = [
            path_key(wf.replace("\\", "/").rstrip("/")) for wf in file_list
        ]

    # normalize ignore patterns
    if ignore is not None:
        ignore = [ignore] if isinstance(ignore, str) else ignore
        norm_ignore_list = [fn.replace("\\", "/") for fn in ignore]

        ignore_path_suffixes = Trie(use_suffix=True, match_delimiter="/")
        ignore_path_prefixes = Trie(match_delimiter="/")
        ignore_base_regexp_list = []
        ignore_base_regexp: Optional[re.Pattern[str]] = None

        for pattern in norm_ignore_list:
            pk = path_key(pattern)
            if "/" not in pk:
                # This is a regexp on the basename using fnmatch.
                ignore_base_regexp_list.append(fnmatch.translate(pk))
            elif pattern.startswith("/"):
                # An absolute path
                ignore_path_prefixes.add(pk)
            else:
                # A relative path
                ignore_path_suffixes.add(pk)

        if ignore_base_regexp_list:
            ignore_base_regexp = re.compile("|".join(ignore_base_regexp_list))

    def is_in_ignore_list(p: str) -> bool:
        """Check if a file should be ignored.

        :param p: path relative to source directory (note it starts with
            a /)
        :return: True if the path matches one of the ignore patterns
        """
        if ignore is None:
            return False

        pk = path_key(p)
        return bool(
            ignore_path_prefixes.match(pk)
            or ignore_path_suffixes.match(pk)
            or (
                ignore_base_regexp is not None
                and ignore_base_regexp.match(os.path.basename(pk)) is not None
            )
        )

    def is_in_file_list(p: str) -> bool:
        """Check if a file should be included.

        :param p: path relative to source directory (note it starts with
            a /)
        :return: True if in the list of file to include
        """
        if file_list is None:
            return True
        if TYPE_CHECKING:
            assert norm_file_keys is not None

        pk = path_key(p)
        rel_key = pk[1:]
        # A path is selected when it is an entry of the list, when it is
        # located under an entry of the list, or when it is a parent
        # directory of an entry of the list.
        return any(
            fk == rel_key
            or pk.startswith("/" + fk + "/")
            or fk.startswith(rel_key + "/")
            for fk in norm_file_keys
        )

    def isdir(fi: FileInfo) -> bool:
        """Check if a file is a directory.

        :param fi: a FileInfo namedtuple
        :return: True if fi is a directory
        """
        return fi.stat is not None and stat.S_ISDIR(fi.stat.st_mode)

    def islink(fi: FileInfo) -> bool:
        """Check if a file is a link.

        :param fi: a FileInfo namedtuple
        :return: True if fi is a symbolic link
        """
        return fi.stat is not None and stat.S_ISLNK(fi.stat.st_mode)

    def isfile(fi: FileInfo) -> bool:
        """Check if a file is a regular file.

        :param fi: a FileInfo namedtuple
        :return: True if fi is a regular file
        """
        return fi.stat is not None and stat.S_ISREG(fi.stat.st_mode)

    def cmp_files(src: FileInfo, dst: FileInfo) -> bool:
        """Fast compare two files.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object
        :return: True if both files have the same content
        """
        bufsize = 8 * 1024
        with open(src.path, "rb") as fp1, open(dst.path, "rb") as fp2:
            while True:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return False
                if len(b1) < bufsize:
                    # Short read: end of file reached on both sides
                    return True

    def need_update(src: FileInfo, dst: FileInfo) -> bool:
        """Check if dst file should updated.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object
        :return: True if we should update dst
        """
        # when not preserving timestamps we cannot rely on the timestamps to
        # check if a file is up-to-date. In that case do a full content
        # comparison as last check.
        return (
            dst.stat is None
            or stat.S_IFMT(src.stat.st_mode) != stat.S_IFMT(dst.stat.st_mode)
            or (
                preserve_timestamps
                and abs(src.stat.st_mtime - dst.stat.st_mtime) > 0.001
            )
            or src.stat.st_size != dst.stat.st_size
            or (not preserve_timestamps and isfile(src) and not cmp_files(src, dst))
            or src.basename != dst.basename
        )

    def copystat(src: FileInfo, dst: FileInfo) -> None:
        """Update attribute of dst file with src attributes.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object
        """
        if islink(src):  # windows: no cover
            mode = stat.S_IMODE(src.stat.st_mode)
            if hasattr(os, "lchmod"):
                os.lchmod(dst.path, mode)

            if hasattr(os, "lchflags") and hasattr(src.stat, "st_flags"):
                try:
                    os.lchflags(dst.path, src.stat.st_flags)
                except OSError as why:  # defensive code
                    import errno

                    # Only an "operation not supported" error is tolerated
                    if (
                        not hasattr(errno, "EOPNOTSUPP")
                        or why.errno != errno.EOPNOTSUPP
                    ):
                        raise
        else:
            mode = stat.S_IMODE(src.stat.st_mode)
            if hasattr(os, "utime"):
                if preserve_timestamps:
                    os.utime(dst.path, (src.stat.st_atime, src.stat.st_mtime))
                else:
                    os.utime(dst.path, None)
            if hasattr(os, "chmod"):
                os.chmod(dst.path, mode)
            if hasattr(os, "chflags") and hasattr(src.stat, "st_flags"):
                try:
                    os.chflags(dst.path, src.stat.st_flags)
                except OSError as why:  # defensive code
                    import errno

                    # Only an "operation not supported" error is tolerated
                    if (
                        not hasattr(errno, "EOPNOTSUPP")
                        or why.errno != errno.EOPNOTSUPP
                    ):
                        raise

    def safe_copy(src: FileInfo, dst: FileInfo) -> None:
        """Copy src file into dst preserving all attributes.

        :param src: the source FileInfo object
        :param dst: the target FileInfo object
        """
        if islink(src):  # windows: no cover
            linkto = os.readlink(src.path)
            if not islink(dst) or os.readlink(dst.path) != linkto:
                if dst.stat is not None:
                    rm(dst.path, recursive=True, glob=False)
                os.symlink(linkto, dst.path)
            copystat(src, dst)
        else:
            if isdir(dst):
                # dst directory will be replaced by a file having the same
                # content as 'src'
                rm(dst.path, recursive=True, glob=False)
            elif islink(dst):
                # dst symlink will be replaced by a file having the same
                # content as 'src'
                rm(dst.path, recursive=False, glob=False)

            try:
                if dst.basename != src.basename:
                    # Casing differs: remove the target and recreate it
                    # using the basename of the source.
                    rm(dst.path, glob=False)
                    dst = FileInfo(
                        os.path.join(os.path.dirname(dst.path), src.basename),
                        None,
                        src.basename,
                    )

                with open(src.path, "rb") as fsrc, open(dst.path, "wb") as fdst:
                    shutil.copyfileobj(fsrc, fdst)
            except OSError:
                # Remove the target and try the copy a second time
                rm(dst.path, glob=False)
                with open(src.path, "rb") as fsrc, open(dst.path, "wb") as fdst:
                    shutil.copyfileobj(fsrc, fdst)
            copystat(src, dst)

    def safe_mkdir(src: FileInfo, dst: FileInfo) -> None:
        """Create a directory modifying parent directory permissions if needed.

        :param src: the source directory, its basename gives the casing to
            use for the created directory
        :param dst: directory to create
        """
        if isfile(dst) or islink(dst):
            rm(dst.path, glob=False)

        # Final dirname with right casing
        if dst.basename != src.basename:
            dest_dir = os.path.join(os.path.dirname(dst.path), src.basename)
        else:
            dest_dir = dst.path

        try:
            if isdir(dst):
                # For directories in case of non-matching casing just do a rename
                # This ensure sync_tree is efficient in case content of the directory
                # is similar between src and dst.
                if dst.basename != src.basename:
                    os.rename(dst.path, dest_dir)
            else:
                os.makedirs(dest_dir)
        except OSError:
            # in case of error to change parent directory
            # permissions. The permissions will be then
            # set correctly at the end of rsync.
            e3.os.fs.chmod("a+wx", os.path.dirname(dst.path))
            if isdir(dst):
                if dst.basename != src.basename:
                    os.rename(dst.path, dest_dir)
            else:
                os.makedirs(dest_dir)

    def walk(
        root_dir: str, target_root_dir: str, entry: Optional[FilesInfo] = None
    ) -> Iterable[FilesInfo]:
        """Walk through source and target file trees.

        :param root_dir: path to source tree
        :param target_root_dir: path to target tree
        :param entry: a FilesInfo object (used internally for the recursion)
        :return: an iterator that iterate over the relevant FilesInfo object
        """
        if entry is None:
            target_stat = None
            if os.path.exists(target_root_dir):
                target_stat = os.lstat(target_root_dir)

            entry = FilesInfo(
                "",
                FileInfo(root_dir, os.lstat(root_dir), ""),
                FileInfo(target_root_dir, target_stat, ""),
            )
            yield entry

        try:
            source_names = {path_key(k): k for k in os.listdir(entry.source.path)}
        except Exception:  # defensive code
            e3.log.debug("cannot get sources list", exc_info=True)
            # Don't crash in case a source directory cannot be read
            return

        target_names = {}
        if isdir(entry.target):
            try:
                target_names = {
                    path_key(k): k for k in os.listdir(entry.target.path)
                }
            except Exception:
                e3.log.debug("cannot get targets list", exc_info=True)
                target_names = {}

        # Names are indexed by their path_key so that entries differing
        # only by casing are paired on Windows.
        all_names = set(source_names) | set(target_names)

        result = []
        for name in all_names:
            rel_path = f"{entry.rel_path}/{name}"

            source_full_path = os.path.join(
                entry.source.path, source_names.get(name, name)
            )
            target_full_path = os.path.join(
                entry.target.path, target_names.get(name, name)
            )
            source_stat = None
            target_stat = None

            if name in source_names:
                source_stat = os.lstat(source_full_path)

            source_file = FileInfo(
                source_full_path, source_stat, os.path.basename(source_full_path)
            )

            if name in target_names:
                target_stat = os.lstat(target_full_path)

            target_file = FileInfo(
                target_full_path, target_stat, os.path.basename(target_full_path)
            )
            result.append(FilesInfo(rel_path, source_file, target_file))

        for el in result:
            if is_in_ignore_list(el.rel_path):
                logger.debug("ignore %s", el.rel_path)
                if delete_ignore:
                    # Report the entry with no source stat so that the
                    # caller handles it as a target-only entry.
                    yield FilesInfo(
                        el.rel_path,
                        FileInfo(el.source.path, None, el.source.basename),
                        el.target,
                    )
            elif is_in_file_list(el.rel_path):
                yield el
                if isdir(el.source):
                    yield from walk(root_dir, target_root_dir, el)
            else:
                # Not selected by file_list: report the entry with no
                # source stat.
                yield FilesInfo(
                    el.rel_path,
                    FileInfo(el.source.path, None, el.source.basename),
                    el.target,
                )

    source_top = os.path.normpath(source).rstrip(os.path.sep)
    target_top = os.path.normpath(target).rstrip(os.path.sep)
    copystat_dir_list = []

    logger.debug(
        "sync_tree %s -> %s [delete=%s, preserve_stmp=%s]",
        source,
        target,
        delete,
        preserve_timestamps,
    )

    if not os.path.exists(source):
        raise FSError(origin="sync_tree", message=f"{source} does not exist")

    # Keep track of deleted and updated files
    deleted_list: list[str] = []
    updated_list: list[str] = []

    for wf in walk(source_top, target_top):
        if wf.source.stat is None and wf.target.stat is not None:
            # Entry that exist only in the target file tree. Check if we
            # should delete it
            if delete:
                rm(wf.target.path, recursive=True, glob=False)
                deleted_list.append(wf.target.path)
        else:
            # At this stage we have an element to synchronize in
            # the source tree.
            if need_update(wf.source, wf.target):
                if isfile(wf.source) or islink(wf.source):
                    safe_copy(wf.source, wf.target)
                    updated_list.append(wf.target.path)
                elif isdir(wf.source):
                    safe_mkdir(wf.source, wf.target)
                    updated_list.append(wf.target.path)
                    copystat_dir_list.append((wf.source, wf.target))

    # Adjust directory permissions once all files have been copied
    for src_dir, dst_dir in copystat_dir_list:
        copystat(src_dir, dst_dir)

    return updated_list, deleted_list
def test_prefix_matching():
    """Exercise ``Trie.match`` on a prefix trie, with and without a delimiter."""
    trie = Trie(word_list=ENGLISH_WORD_LIST)
    sentence = "across l'univers"

    # The sentence is expected to match both without a delimiter and
    # with " " as delimiter.
    assert trie.match(sentence)
    assert trie.match(sentence, delimiter=" ")

    # The bare word is expected to match with a delimiter, whereas the
    # word glued to the following text is not.
    assert trie.match("across", delimiter=" ")
    assert not trie.match("acrossl'univers", delimiter=" ")