def main(options, arguments):
    """Run the single action selected by the parsed command-line options.

    ``options`` is read for the attributes ``device``, ``usemmap``, ``view``,
    ``check``, ``fingerprints`` and ``bump``.  ``arguments`` is accepted for
    the caller's convenience but is not used here.

    Side effect: ``options.usemmap`` is replaced by ``0`` when it is ``None``.

    The first option that is set wins, in this order: ``view``, ``check``,
    ``fingerprints``, ``bump``.  When none of them is set, or when
    ``options.device`` names an unsupported device, ``usage()`` is called.
    """
    # Pick the memory backend; Kmem is used when no device was requested.
    if options.device is None or options.device == '/dev/kmem':
        mmemory = Kmem()
    elif options.device == '/dev/mem':
        mmemory = Mem()
    else:
        usage()
        # No backend could be selected, so none of the actions below can run.
        # Returning here avoids a NameError on ``mmemory`` should usage()
        # return instead of exiting.
        return

    if options.usemmap is None:
        options.usemmap = 0

    if options.view is not None:
        if options.view == 'tasks':
            ttasks = GVTasks(mmemory, options.usemmap)
            ttasks.viewTasks()
        elif options.view == 'syscalls':
            mysyscalls = GVSyscalls(mmemory, options.usemmap)
            mysyscalls.viewSyscalls()
        elif options.view == 'networks':
            nnetworks = GVNetworks(mmemory, options.usemmap)
            nnetworks.viewNetworks()
    elif options.check is not None:
        if options.check == 'tasks':
            ttasks = GVTasks(mmemory, options.usemmap)
            ttasks.checkViewTasks()
        elif options.check == 'networks':
            nnetworks = GVNetworks(mmemory, options.usemmap)
            nnetworks.checkViewNetworks()
    elif options.fingerprints is not None:
        # fingerprints[1] selects the mode; fingerprints[0] is handed to the
        # selected Fingerprints method unchanged.
        ffingerprints = Fingerprints(mmemory)
        if options.fingerprints[1] == 'create':
            ffingerprints.doFingerprints(options.fingerprints[0])
        elif options.fingerprints[1] == 'check':
            ffingerprints.checkFingerprints(options.fingerprints[0])
    elif options.bump is not None:
        # bump[0] is parsed as base 16 and bump[1] as a decimal integer --
        # presumably a start address and a size, with bump[2] the output
        # target; confirm against the memory class's dump().
        # Parse before opening so bad input never touches the device.
        start = int(options.bump[0], 16)
        size = int(options.bump[1])
        mmemory.open("r", options.usemmap)
        try:
            mmemory.dump(start, size, options.bump[2])
        finally:
            # Release the device even when the dump fails.
            mmemory.close()
    else:
        usage()
def compare(files, threshold):
    """Fingerprint copies of ``files`` and print the duplicate lookup for each.

    Every path in ``files`` is copied into a freshly created temporary
    directory (whose path is printed first).  The copies are fingerprinted
    through ``utils.apply_async(calculate, ...)`` and collected in a
    ``Fingerprints`` instance built with ``threshold``.  Finally, for each
    copy, the value returned by ``get_duplicates`` is printed.

    The temporary directory is not removed by this function.
    """
    import glob
    import tempfile

    work_dir = tempfile.mkdtemp()
    print(work_dir)

    for source_path in files:
        shutil.copy2(source_path, work_dir)
    copies = glob.glob(os.path.join(work_dir, '*'))

    index = Fingerprints(output_dir=work_dir, threshold=threshold)
    # Publish the instance as the module-level ``fingerprints`` name;
    # presumably get_duplicates() reads it from there -- confirm.
    globals()['fingerprints'] = index

    jobs = []
    for copy_path in copies:
        jobs.append((work_dir, copy_path))

    for computed in utils.apply_async(calculate, jobs):
        index.add(computed)
    index.uncompress()

    for copy_path in copies:
        print(get_duplicates(index.find_hash(copy_path)))
def process(sources, dest, threshold, ignore, clean, normalize, cpus):
    """Fingerprint the files under ``sources`` and copy the best of each
    duplicate group into ``dest``.

    Parameters:
        sources:   iterable of paths; normalised with ``utils.normalize_paths``
                   and walked with ``utils.walker``.
        dest:      output directory.  It is added to the ignore list, holds
                   the ``.duplicates.json`` cache, and receives the copied
                   files plus a ``duplicates/<n>/`` folder per group that has
                   non-exact matches.
        threshold: passed to ``Fingerprints`` and stored in the cache; a
                   cache written with a different threshold is discarded.
        ignore:    iterable of paths to skip while walking.
        clean:     forwarded to ``utils.normalize_image``.
        normalize: when truthy, every file is first passed through
                   ``utils.normalize_image`` and the returned list replaces
                   the file list.
        cpus:      forwarded to ``utils.apply_async``.

    Exits the interpreter (``SystemExit``) when no files are found.
    """
    # Local import: only needed for the early exit below.  ``quit()`` is
    # injected by the ``site`` module and is not guaranteed to exist.
    import sys

    sources = utils.normalize_paths(list(sources))
    ignore = utils.normalize_paths(list(ignore) + [dest])
    fingerprints = Fingerprints(output_dir=dest, threshold=threshold, ignore=ignore)
    duplicates_file = os.path.join(dest, ".duplicates.json")

    print("Max distance of similarity: {0}".format(fingerprints.max_distance))

    # To report progress, collect every file we will touch up front.
    files = []
    for source in sources:
        files += utils.walker(source, lambda x: x, ignore=ignore)

    print("Processing {0} files.".format(len(files)))
    if not files:
        print("No files found.")
        sys.exit()

    if normalize:
        print("Normalizing images...")
        files = utils.apply_async(
            utils.normalize_image,
            [(fingerprints._backup_dir, filepath, clean) for filepath in files],
            cpus=cpus
        )

    # ---- Phase 1: fingerprint every file that has no stored hash yet ----
    need_fingerprint = [
        (fingerprints._backup_dir, filepath)
        for filepath in files
        if not fingerprints.find_hash(filepath)
    ]
    if need_fingerprint:
        def save_fingerprints(results):
            for r in results:
                fingerprints.add(r)
            fingerprints.save()
            print("Saved.")

        print("Calculating fingerprints for {0} files...".format(len(need_fingerprint)))
        results = utils.apply_async(
            calculate,
            need_fingerprint,
            aborted_callback=save_fingerprints,
            cpus=cpus
        )
        save_fingerprints(results)

    fingerprints.uncompress()

    # ---- Phase 2: load (or rebuild) the duplicate information ----
    output = {"threshold": threshold, "duplicates": {}}
    if os.path.exists(duplicates_file):
        try:
            with open(duplicates_file, 'r') as fp:
                loaded = json.load(fp)
        except ValueError:
            # The cache can be left truncated when a save is interrupted;
            # recompute instead of crashing.
            print("Ignoring unreadable {0}".format(duplicates_file))
        else:
            if isinstance(loaded, dict) and loaded.get("threshold") == threshold:
                output = loaded

    # setdefault (not get) so the mapping we fill in is the one that gets
    # written back, even if the loaded cache lacked a "duplicates" key.
    output_duplicates = output.setdefault("duplicates", {})

    # We need to recalculate *all* duplicate information if there is a new
    # file added since the last time we ran, since it might be a duplicate
    # of anything.
    # TODO: handle ctrl-c again
    if set(files) - set(output_duplicates):
        need_duplicate = files
        output_duplicates.clear()
    else:
        # Every file already has an entry, so there is nothing to look up.
        need_duplicate = []

    if need_duplicate:
        print("Looking for duplicates for {0} files...".format(len(need_duplicate)))

        def save_duplicates(results):
            for image_hash, duplicates in results:
                if not image_hash:
                    continue
                output_duplicates[image_hash.path] = duplicates
            with open(duplicates_file, "w") as fp:
                json.dump(output, fp)
            print("Saved.")

        def setup(fingerprints):
            """
            hack: store the fingerprints into globals for each process
            so the worker function can pull them from globals()
            """
            globals()['fingerprints'] = fingerprints

        results = utils.apply_async(
            get_duplicates,
            [(fingerprints.find_hash(filepath),) for filepath in need_duplicate],
            pool_args=dict(
                initializer=setup,
                initargs=(fingerprints,)
            ),
            aborted_callback=save_duplicates,
            cpus=cpus
        )
        save_duplicates(results)

    # ---- Phase 3: copy the best image of every group into dest ----
    keys = sorted(output_duplicates, key=get_file_date)
    seen_files = set()  # membership checks only, so a set keeps this O(1)
    count = 0
    duplicates_count = 0
    duplicates_dest = os.path.join(dest, "duplicates")

    for filepath in keys:
        if filepath in seen_files:
            # Already handled as a member of an earlier group.
            continue

        duplicates = output_duplicates[filepath]
        count += 1
        seen_files.add(filepath)
        seen_files.update(duplicate_path for _, duplicate_path in duplicates)

        best_distance, best_image = get_best_match([(0, filepath)] + duplicates)
        new_filepath = get_new_image_path(best_image, count, dest)
        print("Copying {0} to {1}".format(best_image, new_filepath))
        shutil.copy2(best_image, new_filepath)

        if not duplicates:
            continue

        duplicates_count += 1
        filedest = os.path.join(duplicates_dest, str(duplicates_count))
        candidates = [(best_distance, filepath)] + duplicates
        for idx, (distance, candidate) in enumerate(candidates, start=1):
            # only copy non-exact matches for spot checking
            if distance > 0 and candidate != best_image:
                if not os.path.isdir(filedest):
                    os.makedirs(filedest)
                shutil.copy2(
                    candidate,
                    get_new_image_path(
                        candidate, idx, filedest,
                        # Pre-fills the distance and leaves "{0}{1}" in place
                        # for get_new_image_path to fill in.
                        "{0}_{1}".format(distance, "{0}{1}")
                    )
                )

        # The folder only exists if at least one non-exact match was copied;
        # in that case put the kept image next to them for comparison.
        if os.path.isdir(filedest):
            shutil.copy2(
                new_filepath,
                get_new_image_path(new_filepath, None, filedest, "kept{1}")
            )