def match(self):
    """Search the database for duplicates of each match target.

    Iterates the (target, hash, db) tuples produced by
    self.hash(targets=self.settings.match_targets), looks up the stored
    row for each target and walks the rows returned by
    db.path_select_duplicates() for the same hash and size.

    When self.settings.match_verify is set, each local (non-remote)
    candidate is lstat'ed and re-hashed before use, and is skipped if it
    can no longer be read, is not a regular file, cannot be hashed, or no
    longer has the target's hash and size.

    NOTE(review): only the first part of this method is visible here; the
    handler for the outer ``try`` and whatever consumes ``matches`` are
    not shown, so the overall result of the method is not documented.
    """
    # Record matches (hash,size) pairs that were already processed, so each
    # group of identical files is handled only once.
    matches_done = set()

    # Iterate over targets
    try:
        # NOTE: the loop variable `hash` shadows the builtin; it is not read
        # in the visible part of this method.
        for target, hash, db in self.hash(targets=self.settings.match_targets):
            # Not used in the visible part of this method.
            matches = []

            # Find row data
            target_data = db.path_get_prime(target.true)
            if not target_data:
                log.debug(PREFIX_SKIP + "%r (unable to find data)" % target.user)
                continue

            # Already reported?
            if (target_data.hash, target_data.size) in matches_done:
                log.debug(PREFIX_SKIP + "%r (already reported match)" % target.user)
                continue
            matches_done.add((target_data.hash, target_data.size))

            # Search for duplicates
            for match_data in db.path_select_duplicates(
                path=target_data.path, hash=target_data.hash, size=target_data.size
            ):
                if not match_data.is_remote:  # local
                    if self.settings.match_verify:
                        # Re-stat the candidate; entries that can no longer
                        # be stat'ed are skipped rather than reported.
                        try:
                            match_stat = os.lstat(match_data.path)
                        except OSError, ex:
                            log.debug(PREFIX_SKIP + "%r (unable to lstat match): %s" % (match_data.path, ex))
                            continue
                        except IOError, ex:
                            log.debug(PREFIX_SKIP + "%r (unable to lstat match): %s" % (match_data.path, ex))
                            continue

                        if not stat.S_ISREG(match_stat.st_mode):
                            log.debug(PREFIX_SKIP + "%r (not a regular file)" % match_data.path)
                            continue  # No longer a regular file

                        # Verify/update hash: ask the database first and only
                        # rebuild (and store) the hash when it returns None.
                        match_hash = db.path_hash(match_data.path, match_stat)
                        if match_hash == None:
                            match_hash = build_hash(Walker.Target(match_data.path, match_data.path, match_stat))
                            if match_hash != None:
                                db.path_setstat(match_data.path, match_stat, match_hash)

                        if match_hash == None:
                            log.debug(PREFIX_SKIP + "%r (unable to determine hash)" % match_data.path)
                            continue

                        # update data with the freshly observed hash/size/mtime
                        match_data = match_data._replace(
                            hash=match_hash, size=match_stat.st_size, time=match_stat.st_mtime, mark=db.mark
                        )

                        if (match_data.hash != target_data.hash) or (match_data.size != target_data.size):
                            log.debug(PREFIX_SKIP + "%r (files no longer match)" % match_data.path)
                            continue  # skip if its no longer identical

                    # NOTE(review): the nesting level of this call was
                    # reconstructed from collapsed source - confirm against
                    # the original file.
                    log.verbose("comp %s" % match_data.path)
def hash(self, targets=None):
    """Walk the targets and yield every file whose hash is available.

    This is a generator. For each target produced by the walker, the hash
    is taken from the database when db.path_hash() returns one; otherwise
    it is computed with build_hash() and, if that succeeds, stored with
    db.path_setstat(). Targets for which no hash can be obtained are
    silently skipped.

    Database changes are committed in batches while walking, and once
    more when the generator finishes, is closed, or raises.

    Args:
        targets: optional collection of targets to walk. When falsy,
            self.settings.hash_targets is used instead.

    Yields:
        (target, target_hash, db) tuples, where db is the opened
        HashDatabase2 instance used for the lookup.
    """
    # Setup logging
    log.setLevel(self.settings.verbosity)

    # Display configuration
    log.debug("%s" % self.settings)

    # Setup database
    log.debug("* setup database's...")
    db = HashDatabase2(self.settings.database)
    db.extend_locals(self.settings.databases_locals)
    if not db.open():
        # Nothing can be hashed without a database: the generator ends
        # without yielding anything.
        return

    # Setup the walker
    log.debug("* setup walker...")
    walker = Walker()
    walker.walk_depth = self.settings.walk_depth
    walker.extend_targets(targets or self.settings.hash_targets)
    walker.extend_skip_fstypes(self.settings.skip_fstypes)
    walker.extend_skip_paths(self.settings.skip_paths)
    walker.extend_skip_names(self.settings.skip_names)
    walker.extend_skip_dirnames(self.settings.skip_dirnames)
    walker.extend_skip_filenames(self.settings.skip_filenames)
    walker.skip_mounts = self.settings.skip_mounts
    walker.skip_binds = self.settings.skip_binds
    walker.skip_symlinks = self.settings.skip_symlinks

    log.debug("* walk...")
    try:
        start_time = time.time()
        start_changes = db.connection.total_changes
        for target in walker.walk():
            # Prefer the hash already known to the database; rebuild and
            # store it only when the database has none.
            target_hash = db.path_hash(target.true, target.stat)
            if target_hash is None:
                target_hash = build_hash(target)
                if target_hash is not None:
                    db.path_setstat(target.true, target.stat, target_hash)

            if target_hash is not None:
                yield (target, target_hash, db)

            # Only commit every so often since we are limited by disk speed.
            # This check must run on every iteration, including those where
            # no hash was available, so it is kept outside the branches above.
            now = time.time()
            if ((now - start_time) >= THRESHHOLD_TIMEDELTA) or (
                (db.connection.total_changes - start_changes) > THRESHHOLD_CHANGEDELTA
            ):
                log.debug("* committing changes...")
                db.connection.commit()
                start_time = time.time()
                start_changes = db.connection.total_changes
    finally:
        # Flush whatever is still pending, also when the consumer stops
        # iterating early or an exception propagates.
        log.debug("* committing changes...")
        db.connection.commit()