def find_dupes(self, dupes_only=True):
    """
    Interactively report duplicate groups, biggest space consumers first.

    An interactive routine similar to 'dupseek.pl' which implements a greedy
    algorithm, pointing out the duplicate groups using the most space.

    For every entry of ``self.hash_dict`` the size of the first listed file
    (joined onto ``self.root_path``) is read; entries whose first file cannot
    be stat'ed are reported and handed to ``self.del_file_hash``.  The
    remaining groups are sorted by ``num_dupes * file_size`` (descending),
    printed, and the user is prompted after each group.

    :param dupes_only: when true, groups holding a single path are skipped.
    :returns: None.  Returns early when the user answers ``q`` or when
        standard input reaches end-of-file.
    """
    # need to sort by size multiplied by number of duplicates
    print('Calculating file sizes...')
    space_hogs = []
    # Iterate over a snapshot: del_file_hash() is called inside the loop and
    # presumably edits hash_dict (its body is not visible here).
    for key, paths in list(self.hash_dict.items()):
        path0 = os.path.join(self.root_path, paths[0])
        try:
            size = os.path.getsize(path0)
        except OSError:
            print('File "%s" is missing' % path0)
            # NOTE(review): this passes the joined path, whereas
            # delete_dups_in_dest passes the root-relative path to
            # del_file_hash -- confirm which form it expects.
            self.del_file_hash(path0)
            continue
        space_hogs.append(
            DupeRecord(key=key, file_size=size, num_dupes=len(paths)))

    space_hogs.sort(
        key=lambda record: record.num_dupes * record.file_size,
        reverse=True)

    for item in space_hogs:
        if dupes_only and item.num_dupes == 1:
            continue
        print('%s occupied in %d copies of size %s with hash %s...' % (
            progressbar.humanize_bytes(item.num_dupes * item.file_size),
            item.num_dupes,
            progressbar.humanize_bytes(item.file_size),
            format_hash(item.key)))
        for i, path in enumerate(self.hash_dict[item.key]):
            print('%5d: "%s"' % (i, path))

        # The listing above is numbered 0 .. num_dupes - 1, so the prompt
        # must advertise the same upper bound.
        last_index = item.num_dupes - 1
        while True:
            print('\n[return] to continue, [q] to quit\n'
                  '[k0...k%d] keep one file and remove the rest' % last_index)
            # if system has symbolic links
            if platform.system() != 'Windows':
                print('[l0...l%d] keep one file and substitute the rest '
                      'with symbolic links\n' % last_index)

            raw = sys.stdin.readline()
            if not raw:
                # End of input: nobody is left to answer, so stop instead of
                # re-prompting forever.
                return
            response = raw.strip()
            if response == 'q':
                return
            # An empty answer means "continue"; anything else is delegated to
            # check_resp_valid (defined elsewhere; assumed to return a truthy
            # value for an acceptable k<i>/l<i> choice -- TODO confirm).
            if response == '' or check_resp_valid(response, item.num_dupes):
                # TODO: a k<i>/l<i> selection is accepted but not acted on
                # here; no removal or linking happens in this routine.
                break
            print('Response not recognized')
def delete_dups_in_dest(source, dest, act=False, prompt=False, verbose=False,
                        min_size=None):
    """
    Delete files in dest HashMap that are duplicates of files in source
    HashMap.

    A destination file counts as a duplicate when its hash key is also
    present in ``source.hash_dict``.  Missing files are reported and skipped;
    empty files, and files smaller than ``min_size`` bytes when ``min_size``
    is set, are skipped silently.

    :param source: map whose ``hash_dict`` holds the reference hashes.
    :param dest: map whose files are examined; its ``root_path`` is joined
        onto each stored path, and ``dest.save()`` is called at the end.
    :param act: when true, remove the duplicates (and a parent directory left
        empty); when false, only report them.
    :param prompt: when true (and ``act`` is true), ask for confirmation
        before each removal; anything not starting with ``y`` skips the file.
    :param verbose: when true, also print every matching source path.
    :param min_size: optional minimum file size in bytes.
    :returns: None.
    """
    # The builtin input() of Python 2 evaluates what the user types as an
    # expression; read the answer as plain text on either major version.
    try:
        read_answer = raw_input
    except NameError:
        read_answer = input

    found_dup = 0
    found_size = 0
    # Snapshots: del_file_hash() below presumably edits hash_dict and/or the
    # path list being walked (its body is not visible here); iterating the
    # live list while it shrinks would skip entries.
    for key, path_list in list(dest.hash_dict.items()):
        if key not in source.hash_dict:
            continue
        for rel_path in list(path_list):
            path = os.path.join(dest.root_path, rel_path)
            if not os.path.isfile(path):
                print('"%s" does not exist' % path)
                continue
            size = os.path.getsize(path)
            if size == 0:
                # If it's an empty file?
                continue
            if min_size and size < min_size:
                continue
            print('%s duplicate of "%s" at "%s" (%s)' % (
                'Removing' if act else 'Found',
                source.hash_dict[key][0],
                path,
                progressbar.humanize_bytes(size)))
            if verbose:
                print('Matches: ' + str(source.hash_dict[key]))
            if act:
                if prompt and not read_answer('OK? ').lower().startswith('y'):
                    # Declined: leave the file alone and do not count it.
                    continue
                os.remove(path)
                # delete the parent directory if it is empty
                parent = os.path.dirname(path)
                if not os.listdir(parent):
                    # recursively remove empty directories
                    os.removedirs(parent)
                dest.del_file_hash(rel_path)
            # Count only after the decision so that "Deleted N" reflects
            # files that were really removed.
            found_dup += 1
            found_size += size
    print('%s %d duplicate files (%s) in destination' % (
        'Deleted' if act else 'Found',
        found_dup,
        progressbar.humanize_bytes(found_size)))
    dest.save()
def findDupes(HashMap, dupesOnly=True):
    """
    Interactively report duplicate groups, biggest space consumers first.

    An interactive routine similar to 'dupseek.pl' which implements a greedy
    algorithm, pointing out the duplicate groups using the most space.

    The size of the first path of every ``HashMap.hashDict`` entry is read;
    entries whose first file cannot be stat'ed are reported and passed to
    ``HashMap._delFile``.  The remaining groups are sorted by
    ``numDupes * fileSize`` (descending), printed, and the user is prompted
    after each one.

    :param HashMap: object providing ``hashDict`` (hash -> list of paths) and
        ``_delFile(path)``.
    :param dupesOnly: when true, groups holding a single path are skipped.
    :returns: None.  Returns early when the user answers ``q`` or when
        standard input reaches end-of-file.
    """
    def checkResponseValid(response, numDupes):
        """Return True for '', 'q', or k<i>/l<i> with 0 <= i < numDupes."""
        if response in ('', 'q'):
            return True
        matches = re.match(r"([kl])(\d+)$", response)
        # The captured index is text; convert it before the range check.
        return bool(matches) and int(matches.group(2)) < numDupes

    # need to sort by size multiplied by number of duplicates
    print("Calculating file sizes...")
    spaceHogs = []
    # Iterate over a snapshot: _delFile() is called inside the loop and
    # presumably edits hashDict (its body is not visible here).
    for key, paths in list(HashMap.hashDict.items()):
        try:
            size = os.path.getsize(paths[0])
        except OSError:
            print("File '%s' is missing" % paths[0])
            HashMap._delFile(paths[0])
            continue
        spaceHogs.append(
            dupeRecord(key=key, fileSize=size, numDupes=len(paths)))

    spaceHogs.sort(key=lambda record: record.numDupes * record.fileSize,
                   reverse=True)

    for item in spaceHogs:
        if dupesOnly and item.numDupes == 1:
            continue
        print(" ".join([
            progressbar.humanize_bytes(item.numDupes * item.fileSize),
            str(item.numDupes),
            progressbar.humanize_bytes(item.fileSize),
            repr(HashMap.hashDict[item.key])]))

        # Valid indices run from 0 to numDupes - 1; advertise exactly that.
        lastIndex = item.numDupes - 1
        while True:
            print("\n[return] to continue, [q] to quit\n"
                  "[k0...k%d] keep one file and remove the rest" % lastIndex)
            print("[l0...l%d] keep one file and substitute the rest "
                  "with symbolic links\n" % lastIndex)

            raw = sys.stdin.readline()
            response = raw.strip()
            if not raw or response == 'q':
                # 'q' quits; end-of-file also stops, since re-prompting
                # without any input would loop forever.
                return
            if checkResponseValid(response, item.numDupes):
                # TODO: a k<i>/l<i> selection is accepted but not acted on
                # here; no removal or linking happens in this routine.
                break
            print("Response not recognized")
def deleteDupsInDest(sourceMap, destMap, act=False, prompt=False, verbose=False):
    """
    Report or delete files under destMap that duplicate files in sourceMap.

    A destination file counts as a duplicate when its hash key is also
    present in ``sourceMap.hashDict``.  Empty files are skipped.  An OSError
    raised while examining or removing a file is reported as
    "File vanished" and that file is skipped.

    :param sourceMap: map whose ``hashDict`` holds the reference hashes.
    :param destMap: map whose files are examined; ``destMap.rootPath`` is
        joined onto each stored path.
    :param act: when true, remove the duplicates (and a parent directory left
        empty); when false, only report them.
    :param prompt: when true (and ``act`` is true), ask "OK? " before each
        removal; an answer not starting with ``y`` skips the file.
    :param verbose: when true, also print every matching source path.
    :returns: None.
    """
    foundDup = 0
    foundSize = 0
    for key, relPaths in destMap.hashDict.items():
        if key not in sourceMap.hashDict:
            continue
        for relPath in relPaths:
            path = os.path.join(destMap.rootPath, relPath)
            try:
                size = os.path.getsize(path)
                if size == 0:
                    # If it's an empty file?
                    continue
                print("%s duplicate of '%s' at '%s' (%s)" % (
                    "Removing" if act else "Found",
                    sourceMap.hashDict[key][0],
                    path,
                    progressbar.humanize_bytes(size)))
                if verbose:
                    print("Matches: " + str(sourceMap.hashDict[key]))
                if act:
                    if prompt:
                        # Explicit write + flush so the question is visible
                        # before blocking on stdin.
                        sys.stdout.write("OK? ")
                        sys.stdout.flush()
                        if not sys.stdin.readline().lower().startswith('y'):
                            # Declined: keep the file and do not count it.
                            continue
                    os.remove(path)
                    # delete the parent directory if it is empty
                    parent = os.path.dirname(path)
                    if not os.listdir(parent):
                        # os.removedirs recursively removes empty directories
                        os.removedirs(parent)
            except OSError as e:
                print("File vanished: " + str(e))
                continue
            # Count only files that got this far, so "Deleted N" never
            # includes declined or failed removals.
            foundDup += 1
            foundSize += size
    print("%s %d duplicate files (%s) in destination" % (
        "Deleted" if act else "Found",
        foundDup,
        progressbar.humanize_bytes(foundSize)))