def hashlist_generate(srcpath, opts, source_mode=True, existing_hashlist=None):
    '''
    Generate the hashlist for the given path.

    srcpath - the top-level directory
    opts - the optparse options dict

    Return a list of FileHash objects representing objects in the srcpath
    filesystem.

    If opts.trim_path is True, strip the srcpath from the filename in the
    hashlist. This makes it easier to work with relative paths.

    opts.no_ignore_dirs and opts.no_ignore_files disable the default
    behaviour, which is to ignore common dirs (CVS, .git, .svn) and files
    (*~, *.swp).
    '''
    log.debug("hashlist_generate: srcpath %s source_mode %s",
              srcpath, source_mode)

    if os.path.exists(srcpath):
        if not os.path.isdir(srcpath):
            raise NonDirFoundAtDirLocationError(
                "'%s' found but is not a directory" % srcpath)
    else:
        os.mkdir(srcpath)

    # If we're the target, defer reading file contents until we're asked to
    # compare. That way, if we're trusting the mtime, we may not have to
    # read the file at all.
    if opts.always_checksum:
        defer_fs_read = False
    else:
        defer_fs_read = True

    lookup_existing = None
    source_extramsg = ''

    # If we have an existing hashfile, also defer reading the file, as we
    # may be able to avoid that if we trust mtimes.
    if existing_hashlist is not None:
        lookup_existing = hashlist_to_dict(existing_hashlist)
        defer_fs_read = True
        source_extramsg = ' (with cache)'

    hashlist = get_hashlist(opts)

    if not opts.quiet:
        if source_mode:
            print("Scanning source filesystem%s" % (source_extramsg))
        else:
            print("Comparing local filesystem to signature file%s" %
                  (source_extramsg))

    if opts.progress and source_mode:
        verb = "Add"
    else:
        verb = "Scan"

    re_globmatch = re.compile(r'[*?\[\]]')

    if opts.exclude_dir:
        excdirs = set(
            [d for d in opts.exclude_dir if not re_globmatch.search(d)])
        excdirs_glob = set([d for d in opts.exclude_dir if d not in excdirs])
    else:
        excdirs = set()
        excdirs_glob = set()

    ##
    # Walk the filesystem.
    ##
    for root, dirs, files in os.walk(srcpath):

        relroot = root[len(srcpath) + 1:]

        if log.isEnabledFor(logging.DEBUG):
            log.debug("os.walk: root %s dirs %s files %s", root, dirs, files)

        # See if the directory list can be pruned.
        # XXX refactor.
        if not opts.no_ignore_dirs:
            copydirs = dirs[:]  # Don't iterate over a list we may change.
            for dirname in copydirs:
                fulldirname = os.path.join(relroot, dirname)
                for di in dirignore:
                    # dirignore is a regex anchored to the start - need to
                    # use the short dirname, e.g. 'CVS', as opposed to the
                    # long dirname, 'stuff/CVS'.
                    if di.search(dirname):
                        if source_mode and opts.verbose:
                            print("Skipping ignore-able dir %s" % dirname)
                        dirs.remove(dirname)
                        log.debug("Exclude dir '%s' full path '%s'",
                                  dirname, fulldirname)
                        break   # Don't remove() twice if two patterns match.

        # Likewise, handle the user's exclusions. This makes the assumption
        # that the list of exclusions will not be much larger than the list
        # of directories.
        if opts.exclude_dir:
            done_skip = False
            # Don't iterate over a list we'll be changing inside the loop.
            copydirs = dirs[:]
            for dirname in copydirs:
                fulldirname = os.path.join(relroot, dirname)
                if is_dir_excluded(fulldirname, excdirs, excdirs_glob):
                    log.debug("Exclude dir '%s' full path '%s'",
                              dirname, fulldirname)
                    dirs.remove(dirname)
                    done_skip = True

            if done_skip:
                log.debug("dirs now %s", dirs)

        # Handle directories.
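        # Directories get hashlist entries of their own, so empty
        # directories and directory metadata travel in the signature
        # alongside file contents.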
        for n, dirname in enumerate(dirs, start=1):
            fpath = os.path.join(root, dirname)
            fh = FileHash.init_from_file(fpath, trim=opts.trim_path,
                                         root=srcpath,
                                         defer_read=defer_fs_read)
            if opts.progress:
                print("D: %s dir %s (dir-in-dir %d/%d)" %
                      (verb, fpath, n, len(dirs)))
            elif opts.verbose:
                print("%s dir: %s" % (verb, fpath))

            hashlist.append(fh)

        files.sort()
        for n, filename in enumerate(files, start=1):
            fpath = os.path.join(root, filename)

            # Don't include hashfiles or lockfiles.
            if is_hashfile(filename, custom_hashfile=opts.hash_file,
                           guess_sigfiles=opts.guess_sigfiles):
                log.debug("Skipping hash file or lock '%s'", filename)
                continue

            skipped = False

            if not opts.no_ignore_files:
                for fi in fileignore:
                    if fi.search(fpath):
                        if source_mode and opts.verbose:
                            print("Ignore: %s" % fpath)
                        skipped = True
                        break

            if skipped:
                continue

            log.debug("Add file: %s", fpath)
            if opts.progress:
                print("F: %s [dir %s] file %s (file-in-dir %d/%d)" %
                      (verb, root, filename, n, len(files)))
            elif opts.verbose:
                print("%s file: %s" % (verb, fpath))

            fh = FileHash.init_from_file(fpath, trim=opts.trim_path,
                                         root=srcpath,
                                         defer_read=defer_fs_read)

            if not opts.always_checksum and fh.is_file:
                # Attempt to bypass the checksum, if the old HSYNC.SIG
                # has it.
                do_checksum = True
                if lookup_existing is not None:
                    if fh.fpath in lookup_existing:
                        oldfh = lookup_existing[fh.fpath]
                        log.debug("'%s': Found old entry (%s)",
                                  fh.fpath, repr(oldfh))
                        if fh.safe_to_skip(oldfh):
                            do_checksum = False
                            fh.inherit_attributes(oldfh)

                if do_checksum:
                    log.debug("'%s': fall back to reading file", fh.fpath)
                    fh.read_file_contents()

            log.debug("'%s': Adding to hash list", fh.fpath)
            assert fh.hashstr != fh.notsethash
            hashlist.append(fh)

    if opts.scan_debug:
        _scan_debug(hashlist)

    log.debug("hashlist_generate: entries %d", len(hashlist))
    return hashlist
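

def _example_hashlist_generate_usage(srcpath='/srv/content'):
    '''
    Illustrative only: a minimal sketch of driving hashlist_generate()
    from code rather than from the CLI. optparse.Values is used because
    the docstring above says `opts` comes from optparse; every default
    below is an assumption made for this example, not necessarily the
    tool's real CLI default.
    '''
    from optparse import Values

    opts = Values(dict(
        trim_path=True,          # store paths relative to srcpath
        always_checksum=False,   # trust mtimes where safe_to_skip() allows
        quiet=True, progress=False, verbose=False,
        no_ignore_dirs=False,    # keep the default CVS/.git/.svn pruning
        no_ignore_files=False,   # keep the default *~ / *.swp pruning
        exclude_dir=[],          # user exclusions; globs allowed
        hash_file=None,          # no custom signature filename (assumed)
        guess_sigfiles=False,
        scan_debug=False,
    ))
    return hashlist_generate(srcpath, opts)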


def hashlist_check(dstpath, src_hashlist, opts, existing_hashlist=None,
                   opportunistic_write=False, opwrite_path=None,
                   source_side=False):
    '''
    Check the dstpath against the provided hashlist.

    Return a tuple (needed, notneeded, dst_hashlist), where needed is a
    list of filepaths that need to be fetched, notneeded is a list of
    filepaths present on the target but not in the source, so they may
    be removed, and dst_hashlist is a list of FileHash objects for the
    destination path.
    '''
    log.debug("hashlist_check():")

    src_fdict = hashlist_to_dict(src_hashlist)

    # Take the simple road. Generate a hashlist for the destination.
    dst_hashlist = hashlist_generate(dstpath, opts, source_mode=False,
                                     existing_hashlist=existing_hashlist)

    no_compress = False
    if source_side:
        no_compress = True

    if opportunistic_write:
        assert opwrite_path is not None
        sigfile_write(dst_hashlist, opwrite_path, opts, use_tmp=True,
                      verb='Caching scanned', no_compress=no_compress)

    dst_fdict = hashlist_to_dict(dst_hashlist)

    re_globmatch = re.compile(r'[*?\[\]]')

    if opts.exclude_dir:
        direx = set(
            [d for d in opts.exclude_dir if not re_globmatch.search(d)])
        direx_glob = set([d for d in opts.exclude_dir if d not in direx])
    else:
        direx = set()
        direx_glob = set()

    # Now compare the two dictionaries.
    needed = get_hashlist(opts)
    excluded_dirs = set()

    mapper = UidGidMapper()
    if opts.set_user:
        mapper.set_default_name(opts.set_user)
    if opts.set_group:
        mapper.set_default_group(opts.set_group)

    for fpath, fh in src_fdict.iteritems():

        # Accumulate the total byte count for transfer statistics.
        if not fh.is_dir and fh.size_is_known:
            opts.stats.bytes_total += fh.size

        assert fpath == fh.fpath

        # Process exclusions.
        filename = os.path.basename(fpath)
        if filename != '' and \
                is_hashfile(filename, custom_hashfile=opts.hash_file,
                            guess_sigfiles=opts.guess_sigfiles):
            log.debug("needed: skipping hash file or lock '%s'", filename)
            continue

        if is_path_pre_excluded(fpath, excluded_dirs):
            continue

        if fh.is_dir:
            if is_dir_excluded(fpath, direx, direx_glob, excluded_dirs):
                log.debug("Dir '%s' excluded", fpath)
                continue

        # If the user overrode ownership, set that up here.
        if opts.set_user:
            fh.uid = mapper.default_uid
            fh.user = mapper.default_name
        if opts.set_group:
            fh.gid = mapper.default_gid
            fh.group = mapper.default_group

        if fpath in dst_fdict:
            if not src_fdict[fpath].compare(
                    dst_fdict[fpath],
                    ignore_mode=opts.ignore_mode,
                    trust_mtime=(not opts.always_checksum)):
                log.debug("%s: needed", fpath)
                # Store a reference to the object at the destination.
                # This can be used to update the dest's HSYNC.SIG file
                # and save on rechecks.
                fh.associated_dest_object = dst_fdict[fpath]
                needed.append(fh)
        else:
            log.debug("%s: needed", fpath)
            fh.dest_missing = True
            needed.append(fh)

    not_needed = get_hashlist(opts)
    for fpath, fh in dst_fdict.iteritems():
        filename = os.path.basename(fpath)
        if filename != '' and \
                is_hashfile(filename, custom_hashfile=opts.hash_file,
                            guess_sigfiles=opts.guess_sigfiles):
            log.debug("not_needed: skipping hash file or lock '%s'",
                      filename)
            continue

        if fpath not in src_fdict:
            log.debug("%s: not found in source", fpath)
            not_needed.append(fh)

    if opts.check_debug:
        _check_debug(needed, not_needed)

    return (needed, not_needed, dst_hashlist)
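

def _example_hashlist_check_usage(srcpath='/srv/content',
                                  dstpath='/srv/mirror'):
    '''
    Illustrative only: a minimal sketch of a scan-and-compare pass using
    the two functions above. The opts attributes extend the generate-side
    example with the compare-side flags hashlist_check() reads; the stats
    object is assumed to be anything with a writable bytes_total counter.
    None of these defaults are guaranteed to match the real CLI.
    '''
    from optparse import Values

    class _Stats(object):
        bytes_total = 0

    opts = Values(dict(
        trim_path=True, always_checksum=False,
        quiet=True, progress=False, verbose=False,
        no_ignore_dirs=False, no_ignore_files=False,
        exclude_dir=[], hash_file=None, guess_sigfiles=False,
        scan_debug=False, check_debug=False,
        ignore_mode=False,              # compare permission bits too
        set_user=None, set_group=None,  # no ownership overrides
        stats=_Stats(),
    ))

    src_hl = hashlist_generate(srcpath, opts)
    needed, not_needed, dst_hl = hashlist_check(dstpath, src_hl, opts)
    for fh in needed:
        print("fetch: %s" % fh.fpath)
    for fh in not_needed:
        print("may remove: %s" % fh.fpath)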