def schema(self):
    """Yield the database schema as lines of SQL text.

    Emits a comment header listing the user tables, the CREATE statement of
    each table, then the list of attached databases (PRAGMA database_list).
    Yields nothing if the database cannot be opened.
    """
    # Setup logging
    log.setLevel(self.settings.verbosity)
    # Display configuration
    log.debug("%s" % self.settings)
    # Setup database
    log.debug("* setup database's...")
    db = HashDatabase2(self.settings.database)
    if not db.open():
        return
    # Fetch the user tables once and reuse the rows; the original code ran
    # this identical query twice (once for names, once for sql).
    tables = db.connection.execute(
        "SELECT name,sql FROM sqlite_master WHERE (type='table') AND (substr(name,1,7) <> 'sqlite_') ORDER BY name"
    ).fetchall()
    yield "-- [Tables]"
    yield "-- " + ", ".join(row["name"] for row in tables)
    for row in tables:
        yield row["sql"]
    yield "-- [Databases]"
    for row in db.connection.execute("PRAGMA database_list"):
        # row[1] = database name, row[2] = file path (PRAGMA database_list layout)
        yield "-- %s: %s" % (row[1], row[2])
def path_select_duplicates(self, path, hash, size):
    """Yield RowDetailed records for entries that share (hash, size) with
    the given path but are stored under a different path.

    Searches across the primary database plus all attached local and
    remote sources via build_selects.
    """
    path = hashdb_sql.normalize(path)
    # One parameterised SELECT per source; the same filter set applies to all.
    duplicate_filters = [
        'hash = :hash',
        'size = :size',
        'path <> :path',
    ]
    for select, params in self.build_selects(
        sources=[None, self.locals, self.remotes],
        columns=HashDatabase2.RowDetailed._fields,
        filters=duplicate_filters
    ):
        # Bind the lookup values into the statement's parameter map.
        params.update(path=path, size=size, hash=hash)
        log.debug('-- select')
        log.debug(select)
        log.debug('-- paramaters')
        log.debug(params)
        for row in self.connection.execute(select, params):
            yield HashDatabase2.RowDetailed(**row)
def walk(self):
    '''Walks the directory tree specified by targets, yielding all accessible
    regular files as Walker.Target objects'''
    # compile all the information required to walk the targets
    targets = set(self._targets)
    # Skip predicates compiled from the configured glob lists; presumably
    # falsy when no globs are configured (see the `if fskip_x and ...`
    # call sites) -- confirm against build_fskip_globs.
    fskip_fstype = self.build_fskip_globs(self._skip_fstypes)
    fskip_path = self.build_fskip_globs(self._skip_paths)
    fskip_name = self.build_fskip_globs(self._skip_names)
    fskip_dirname = self.build_fskip_globs(self._skip_dirnames)
    fskip_filename = self.build_fskip_globs(self._skip_filenames)
    fskip_access = None
    skip_binds = self._skip_binds
    skip_mounts = self._skip_mounts
    skip_symlinks = self._skip_symlinks
    is_linuxy = False
    mounts = MountEntries()
    if platform.system() != 'Windows':
        is_linuxy = True

        def fskip_access(target):
            # Returns True when the permission bits alone show the effective
            # user cannot read `target` (owner / group / other chain).
            access = stat.S_IMODE(target.stat.st_mode)
            if access & (stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH) == 0:
                # usr, grp, oth : no access
                return True
            elif target.stat.st_uid == euid:
                # usr
                if access & stat.S_IRUSR == 0:
                    return True
            elif access & (stat.S_IRGRP | stat.S_IROTH) == 0:
                # grp, oth : no access
                return True
            elif target.stat.st_gid in groups:
                # grp
                if access & stat.S_IRGRP == 0:
                    return True
            elif access & (stat.S_IROTH) == 0:
                # oth : no access
                return True
            # Note that this may return some erronious negatives.
            # The objective is to try to avoid access errors, not prevent them entirely
            # Note that we could also use os.access, but that would cause additional
            # os.stat calls, and we want speed
            return False

        # Cache the process credentials into the function's globals
        # (Python 2 `func_globals`) so the predicate reads `euid`/`groups`
        # without an os.* call per file.
        fskip_access.func_globals['uid'] = os.getuid()
        fskip_access.func_globals['euid'] = os.geteuid()
        fskip_access.func_globals['gid'] = os.getgid()
        fskip_access.func_globals['egid'] = os.getegid()
        fskip_access.func_globals['groups'] = os.getgroups()
        if os.geteuid() == 0:
            fskip_access = None  # No need to skip things if we are root
    try:
        # `todo` holds (directory Target, node chain) pairs still to expand;
        # `dirs` buffers a directory's children when walking depth-first.
        todo = deque()
        dirs = deque()
        if self.walk_depth:
            # depth-first: collect children, then push the batch on the front
            fappend = dirs.appendleft

            def fdone():
                todo.extendleft(dirs)
                dirs.clear()
        else:
            # breadth-first: children go straight on the back of the queue
            fappend = todo.append

            def fdone():
                pass

        for target in targets:
            log.verbose(PREFIX_ROOT + '%s (root)' % target.user)
            if stat.S_ISREG(target.stat.st_mode):
                # a root that is itself a regular file is yielded directly
                yield target
            if stat.S_ISDIR(target.stat.st_mode):
                # seed the queue; the node chain records every
                # (st_ino, st_dev) on the path for loop detection
                todo.clear()
                todo.append((target, [(target.stat.st_ino, target.stat.st_dev)]))
                while True:
                    try:
                        target, nodes = todo.popleft()
                    except IndexError, _:
                        break  # Reached the last element in the list
                    try:
                        filelist = os.listdir(target.true)
                        filelist.sort()
                    except OSError, ex:
                        log.warning('warning: Unable to list target %r: %s' % (target.user, ex))
                        continue
                    for name in filelist:
                        # Target(true path, user-visible path, stat); the
                        # stat field is filled in below by _replace.
                        child = Walker.Target(
                            os.path.join(target.true, name),
                            os.path.join(target.user, name),
                            None)
                        # skip name?
                        if fskip_name and fskip_name(name):
                            if log.is_debug:
                                log.debug(PREFIX_SKIP + '%s (skip_name)' % child.user)
                            continue
                        # skip path?
                        if fskip_path and fskip_path(child.user):
                            if log.is_debug:
                                log.debug(PREFIX_SKIP + '%s (skip_path)' % child.user)
                            continue
                        # stat
                        try:
                            child = child._replace(
                                stat=os.lstat(child.true))
                        except OSError, ex:
                            log.warning('warning: Unable to lstat %r: %s' % (child.user, ex))
                            if log.is_debug:
                                log.debug(PREFIX_SKIP + '%r (failed lstat)' % child.user)
                            continue
                        # recursive loop?
                        if (child.stat.st_ino, child.stat.st_dev) in nodes:
                            log.debug(
                                PREFIX_SKIP + '%r (loop chain detected)' % child.user)
                            continue
                        # check access
                        if fskip_access and fskip_access(child):
                            log.debug(PREFIX_SKIP + '%r (no access)' % child.user)
                            continue
                        # resolve symlinks
                        if stat.S_ISLNK(child.stat.st_mode):
                            if skip_symlinks:
                                log.debug(
                                    PREFIX_SKIP + '%r (skip_symlinks)' % child.user)
                                continue
                            log.debug(PREFIX_SYM + '%s (sym link)' % child.user)
                            try:
                                # re-resolve the true path then re-stat the link target
                                child = child._replace(
                                    true=mounts.truepath(child.true))
                                child = child._replace(
                                    stat=os.lstat(child.true))
                            except OSError, ex:
                                log.warning(
                                    'warning: Unable to read symlink target %r: %s' % (child.user, ex))
                                if log.is_debug:
                                    log.debug(
                                        PREFIX_SKIP + '%r (failed to read symlink target)' % child.user)
                                continue
                            # recursive loop?
                            if (child.stat.st_ino, child.stat.st_dev) in nodes:
                                log.debug(PREFIX_SKIP + '%r (loop chain detected)' % child.user)
                                continue
                            # check access
                            if fskip_access != None and fskip_access(
                                    child):
                                log.debug(PREFIX_SKIP + '%r (no access)' % child.user)
                                continue
                            # Need to recalculate child.true/..
                            parent_stat = None
                        else:
                            parent_stat = target.stat
                        # regular file?
                        if stat.S_ISREG(child.stat.st_mode):
                            # skip filename?
                            if fskip_filename and fskip_filename(name):
                                if log.is_debug:
                                    log.debug(
                                        PREFIX_SKIP + '%r (skip_filename)' % child.user)
                                continue
                            if log.is_verbose:
                                log.verbose(
                                    PREFIX_REG + '%s (regular file)' % child.user)
                            yield child
                            continue
                        # directory?
                        if stat.S_ISDIR(child.stat.st_mode):
                            # skip dirname?
                            if fskip_dirname and fskip_dirname(name):
                                if log.is_debug:
                                    log.debug(
                                        PREFIX_SKIP + '%s (skip_dirname)' % child.user)
                                continue
                            # is bind? ToDo: Should this check be in a loop for bind chains?
                            if mounts.is_bind(child.true):
                                # skip binds?
                                if skip_binds:
                                    if log.is_debug:
                                        log.debug(
                                            PREFIX_SKIP + '%s (skip_binds)' % child.user)
                                    continue
                                log.debug(PREFIX_BIND + '%s (bind mount)' % child.user)
                                try:
                                    child = child._replace(
                                        true=mounts.truepath(child.true))
                                    child = child._replace(
                                        stat=os.lstat(child.true))
                                except OSError, ex:
                                    log.warning(
                                        'warning: Unable to read bind target %r: %s' % (child.user, ex))
                                    if log.is_debug:
                                        # NOTE(review): format has one %r but two
                                        # args -- would raise TypeError if reached
                                        log.debug(
                                            PREFIX_SKIP + '%r (failed to read bind target)' % (child.user, ex))
                                    continue
                                # recursive loop?
                                if (child.stat.st_ino, child.stat.st_dev) in nodes:
                                    log.debug(PREFIX_SKIP + '%r (loop chain detected)' % child.user)
                                    continue
                                # check access
                                if fskip_access != None and fskip_access(
                                        child):
                                    log.debug(
                                        PREFIX_SKIP + '%r (no access)' % child.user)
                                    continue
                                parent_stat = None
                            # get parent stat
                            if is_linuxy and parent_stat == None:
                                try:
                                    parent_stat = os.lstat(
                                        os.path.join(child.true, '..'))
                                except OSError, ex:
                                    log.warning(
                                        'warning: Unable to read parent %r: %s' % (os.path.join(child.user, '..'), ex))
                            # is mount?
                            keep = None  # NOTE(review): assigned but never read
                            # mount point heuristic: device changes across the
                            # parent boundary, or parent and child share an inode
                            if (is_linuxy and (parent_stat != None))\
                                    and ((parent_stat.st_dev != child.stat.st_dev)\
                                    or (parent_stat.st_ino == child.stat.st_ino)):
                                # skip mounts?
                                if skip_mounts:
                                    log.debug(
                                        PREFIX_SKIP + '%r (skip_mounts)' % child.user)
                                    continue
                                # skip fstype?
                                if fskip_fstype:
                                    # find fstype, updating mount points if required
                                    fstype = mounts.get_fstype(child.true)
                                    ##if fstype == None:
                                    ##    mounts = MountEntries()
                                    ##    fstype = mounts.get_fstype(child.true)
                                    if fstype == None:
                                        log.warning(
                                            'warning: Unable to resolve mount fstype %r' % child.user)
                                        log.debug(
                                            PREFIX_SKIP + '%s (failed to resolve mount fstype)' % child.user)
                                        continue
                                    # NOTE(review): no-op; likely a remnant of the
                                    # commented-out mount re-read above
                                    mounts = mounts
                                    if fskip_fstype(fstype.type):
                                        if log.is_debug:
                                            log.debug(PREFIX_SKIP + '%s (skip_fstype)' % child.user)
                                        continue
                            # directory
                            if log.is_verbose:
                                log.verbose(PREFIX_DIR + '%s (directory)' % child.user)
                            # put directory in the todo list
                            fappend((child, nodes + [(child.stat.st_ino, child.stat.st_dev)]))
                            continue
                        # neither regular file nor directory (socket, fifo, ...)
                        if log.is_debug:
                            log.debug(PREFIX_SKIP + '%s (unknown st_mode %r)' % (child.user, child.stat))
                    # flush depth-first buffer (no-op for breadth-first)
                    fdone()
    except Exception, ex:
        log.exception('Unexpected exception!')
#!/usr/bin/python from hashdb_output import log try: from hashdb_mntent import mntent, setmntent, getmntent_r, endmntent import ctypes except: log.debug('failed to load mntent stubs') setmntent = lambda filename: None import os from collections import namedtuple MountEntry = namedtuple('MountEntry', 'fsname dir type opts freq passno') def enum_mntent(filename): stream = setmntent(filename) if not stream: return [] try: buf = ctypes.create_string_buffer(4095) mount = mntent() results = [] while getmntent_r(stream, mount, ctypes.byref(buf), ctypes.sizeof(buf) - 1): results.append(MountEntry( mount.mnt_fsname, mount.mnt_dir, mount.mnt_type, set(mount.mnt_opts.split(',')),
def match(self): # Record matches (hash,size) pairs matches_done = set() # Iterate over targets try: for target, hash, db in self.hash(targets=self.settings.match_targets): matches = [] # Find row data target_data = db.path_get_prime(target.true) if not target_data: log.debug(PREFIX_SKIP + "%r (unable to find data)" % target.user) continue # Already reported? if (target_data.hash, target_data.size) in matches_done: log.debug(PREFIX_SKIP + "%r (already reported match)" % target.user) continue matches_done.add((target_data.hash, target_data.size)) # Search for duplicates for match_data in db.path_select_duplicates( path=target_data.path, hash=target_data.hash, size=target_data.size ): if not match_data.is_remote: # local if self.settings.match_verify: try: match_stat = os.lstat(match_data.path) except OSError, ex: log.debug(PREFIX_SKIP + "%r (unable to lstat match): %s" % (match_data.path, ex)) continue except IOError, ex: log.debug(PREFIX_SKIP + "%r (unable to lstat match): %s" % (match_data.path, ex)) continue if not stat.S_ISREG(match_stat.st_mode): log.debug(PREFIX_SKIP + "%r (not a regular file)" % match_data.path) continue # No longer a regular file # Verify/update hash match_hash = db.path_hash(match_data.path, match_stat) if match_hash == None: match_hash = build_hash(Walker.Target(match_data.path, match_data.path, match_stat)) if match_hash != None: db.path_setstat(match_data.path, match_stat, match_hash) if match_hash == None: log.debug(PREFIX_SKIP + "%r (unable to determine hash)" % match_data.path) continue # update data match_data = match_data._replace( hash=match_hash, size=match_stat.st_size, time=match_stat.st_mtime, mark=db.mark ) if (match_data.hash != target_data.hash) or (match_data.size != target_data.size): log.debug(PREFIX_SKIP + "%r (files no longer match)" % match_data.path) continue # skip if its no longer identical log.verbose("comp %s" % match_data.path)
def hash(self, targets=None):
    """Walk the configured (or given) targets, ensure each regular file has a
    hash recorded in the database, and yield (target, hash, db) triples.

    Commits periodically during the walk and always commits once more on exit.
    """
    # Setup logging
    log.setLevel(self.settings.verbosity)
    # Display configuration
    log.debug("%s" % self.settings)
    # Setup database
    log.debug("* setup database's...")
    db = HashDatabase2(self.settings.database)
    db.extend_locals(self.settings.databases_locals)
    if not db.open():
        return
    # Setup the walker
    log.debug("* setup walker...")
    walker = Walker()
    walker.walk_depth = self.settings.walk_depth
    walker.extend_targets(targets if targets else self.settings.hash_targets)
    walker.extend_skip_fstypes(self.settings.skip_fstypes)
    walker.extend_skip_paths(self.settings.skip_paths)
    walker.extend_skip_names(self.settings.skip_names)
    walker.extend_skip_dirnames(self.settings.skip_dirnames)
    walker.extend_skip_filenames(self.settings.skip_filenames)
    walker.skip_mounts = self.settings.skip_mounts
    walker.skip_binds = self.settings.skip_binds
    walker.skip_symlinks = self.settings.skip_symlinks
    log.debug("* walk...")
    try:
        last_commit_time = time.time()
        changes_at_commit = db.connection.total_changes
        for target in walker.walk():
            # Reuse a stored hash when the stat data still matches;
            # otherwise compute it and record it with the current stat.
            file_hash = db.path_hash(target.true, target.stat)
            if file_hash is None:
                file_hash = build_hash(target)
                if file_hash is not None:
                    db.path_setstat(target.true, target.stat, file_hash)
            if file_hash is not None:
                yield (target, file_hash, db)
            # Only commit every so often since we are limited by disk speed
            elapsed = time.time() - last_commit_time
            pending = db.connection.total_changes - changes_at_commit
            if elapsed >= THRESHHOLD_TIMEDELTA or pending > THRESHHOLD_CHANGEDELTA:
                log.debug("* committing changes...")
                db.connection.commit()
                last_commit_time = time.time()
                changes_at_commit = db.connection.total_changes
    finally:
        log.debug("* committing changes...")
        db.connection.commit()
def view(self):
    """Yield (HashRowData, db) pairs for every database row under the
    configured targets, most recent mark first per path."""
    # Setup logging
    log.setLevel(self.setting_verbosity)
    # Display configuration
    display_settings(self.settings, log.debug)
    # Setup database
    log.debug("* setup database's...")
    db = HashDatabase(self.setting_database)
    db.add_combines(self.setting_combine)
    if not db.open():
        return
    # Read mounts (for truepath)
    mounts = MountEntries()
    # Build a query string, filtering on targets
    targets = [mounts.truepath(t) for t in self.setting_targets]
    qfilters = []
    qargmap = {}
    # A filesystem-root target matches every row, so no filter is emitted.
    if not any(root in targets for root in ("/", "\\", "//", "\\\\")):
        for i, target in enumerate(targets):
            target = mounts.truepath(target)
            name = "t%02d" % i
            # Match the path exactly, or any path underneath it.
            qfilters.append(
                r"""(path = :%(name)s) OR (substr(path, 1, :%(name)s_len + 1) = :%(name)s || '/')""" % {"name": name}
            )
            qargmap[name] = target
            qargmap[name + "_len"] = len(target)
    if qfilters:
        qfilter = r"""WHERE """ + r""" OR """.join(qfilters)
    else:
        qfilter = r""""""
    if self.setting_walk_depth:
        qorder = r""" ORDER BY path, mark DESC """
    else:
        qorder = r""" ORDER BY count_components(path), path, mark DESC """
    query = (
        r""" SELECT * FROM combinedtab """
        + qfilter
        + qorder
    )
    # yield all results as a HashRowData blob (don't expose the underlying row)
    for row in db.connection.execute(query, qargmap):
        record = HashRowData(
            path=row["path"],
            hash=row["hash"],
            mark=row["mark"],
            time=row["time"],
            size=row["size"],
        )
        yield (record, db)