예제 #1
0
    def find_dupes(self, dupes_only=True):
        """
        An interactive routine similar to 'dupseek.pl' which implements a
        greedy algorithm, pointing out the duplicate groups using the most
        space.
        """
        # need to sort by size multiplied by number of duplicates
        print 'Calculating file sizes...'
        space_hogs = []
        for key, paths in self.hash_dict.items():
            path0 = os.path.join(self.root_path, paths[0])
            try:
                size = os.path.getsize(path0)
                space_hogs.append(
                    DupeRecord(key=key, file_size=size, num_dupes=len(paths)))
            except OSError:
                print 'File "%s" is missing' % path0
                self.del_file_hash(path0)
                continue
        space_hogs = sorted(
            space_hogs,
            key=lambda record: record.num_dupes * record.file_size,
            reverse=True)

        for item in space_hogs:
            if dupes_only and item.num_dupes == 1:
                continue
            print '%s occupied in %d copies of size %s with hash %s...' % (
                progressbar.humanize_bytes(item.num_dupes*item.file_size),
                item.num_dupes,
                progressbar.humanize_bytes(item.file_size),
                format_hash(item.key)
                )
            for i, path in enumerate(self.hash_dict[item.key]):
                print '%5d: "%s"' % (i, path)


            while True:
                print '\n', \
                        '[return] to continue, [q] to quit\n', \
                        '[k0...k'+str(item.num_dupes)+'] '\
                        'keep one file and remove the rest'
                # if system has symbolic links
                if platform.system() != 'Windows':
                    print '[l0...l'+str(item.num_dupes)+'] '\
                        'keep one file and substitute the rest with '\
                        'symbolic links\n'

                response = sys.stdin.readline()
                if check_resp_valid(response.strip(), item.num_dupes):
                    print 'Response not recognized'
                    break
예제 #2
0
def delete_dups_in_dest(source, dest, act=False, prompt=False,
                        verbose=False, min_size=None):
    """
    Delete files in dest HashMap that are duplicates of files in source
    HashMap.
    """
    found_dup = 0
    found_size = 0

    for key, path_list in dest.hash_dict.items():
        if key not in source.hash_dict:
            continue

        for rel_path in path_list:
            path = os.path.join(dest.root_path, rel_path)
            if not os.path.isfile(path):
                print '"%s" does not exist' % path
                continue
            if os.path.getsize(path) == 0:
                # If it's an empty file?
                continue
            if min_size and os.stat(path).st_size < min_size:
                continue
            found_dup += 1
            found_size += os.path.getsize(path)
            print '%s duplicate of "%s" at "%s" (%s)' % (
                'Removing' if act else 'Found',
                source.hash_dict[key][0],
                path,
                progressbar.humanize_bytes(os.path.getsize(path)))
            if verbose:
                print 'Matches: ' + str(source.hash_dict[key])
            if act:
                if prompt:
                    if not input('OK? ').lower().startswith('y'):
                        continue
                os.remove(path)
                # delete the parent directory if it is empty
                if not os.listdir(os.path.dirname(path)):
                    # recursively remove empty directories
                    os.removedirs(os.path.dirname(path))
                dest.del_file_hash(rel_path)

    print '%s %d duplicate files (%s) in destination' % (
        'Deleted' if act else 'Found',
        found_dup,
        progressbar.humanize_bytes(found_size))
    dest.save()
예제 #3
0
def findDupes(HashMap, dupesOnly=True):
    "An interactive routine similar to 'dupseek.pl' which implements a greedy algorithm, pointing out the duplicate groups using the most space. "
    # need to sort by size multiplied by number of duplicates
    print "Calculating file sizes..."
    spaceHogs = []
    for key, paths in HashMap.hashDict.items():
        try:
            size = os.path.getsize(paths[0])
            spaceHogs.append( dupeRecord(key=key, fileSize=size, numDupes=len(paths)) )
        except OSError:
            print "File '%s' is missing" % paths[0]
            HashMap._delFile(paths[0])
            continue
    spaceHogs = sorted(spaceHogs, key=lambda record: record.numDupes * record.fileSize, reverse=True)

    for item in spaceHogs:
        if (dupesOnly and item.numDupes == 1):
            continue
        print " ".join([progressbar.humanize_bytes(item.numDupes*item.fileSize), str(item.numDupes), progressbar.humanize_bytes(item.fileSize), repr(HashMap.hashDict[item.key])])


        def checkResponseValid (response, numDupes):
            matches = re.match("(\S+)(\d+)", response)
            if (response == ""):
                return True
            if (not matches):
                return False
            if (response in ['q', '']):
                return True
            if (matches.group(1) in ['k', 'l'] \
                    and matches.group(2) in range(numDupes)):
                return False

            return True

        while(True):
            print "\n", \
                    "[return] to continue, [q] to quit\n", \
                    "[k0...k"+str(item.numDupes)+"] keep one file and remove the rest"
            # if system has symbolic links
            if ( True ):
                print "[l0...l"+str(item.numDupes)+"] keep one file and substitute the rest with symbolic links\n"

            response = sys.stdin.readline()
            if (checkResponseValid(response.strip(), item.numDupes)):
                print "Response not recognized"
                break
예제 #4
0
def deleteDupsInDest (sourceMap, destMap, act=False, prompt=False, verbose=False):
    foundDup = 0
    foundSize = 0

    #rf = open('hashpatch.report', 'a')

    for key, val in destMap.hashDict.items():
        if (key in sourceMap.hashDict):

            for path in val:
                try:
                    path = os.path.join(destMap.rootPath, path)
                    if (os.path.getsize(path) == 0):
                        # If it's an empty file?
                        continue
                    foundDup += 1
                    foundSize += os.path.getsize(path)
                    print "%s duplicate of '%s' at '%s' (%s)" % ("Removing" if act else "Found", sourceMap.hashDict[key][0], path, progressbar.humanize_bytes(os.path.getsize(path)))
                    if (verbose):
                        print "Matches: " + str(sourceMap.hashDict[key])
                    if act:
                        if (prompt):
                            print "OK? ",
                        if (not prompt or sys.stdin.readline().lower().startswith('y')):
                            os.remove(path)
                            # delete the parent directory if it is empty
                            if (not os.listdir(os.path.dirname(path))):
                                # This is a function that recursively removes empty directories
                                os.removedirs(os.path.dirname(path))
                except OSError as e:
                    print "File vanished: " + str(e)

    print "%s %d duplicate files (%s) in destination" % ("Deleted" if act else "Found", foundDup, progressbar.humanize_bytes(foundSize))