class TestKFuzzy(unittest.TestCase):
    """Unit tests for ``CKoretFuzzyHashing``.

    Every test hashes ``self.buf`` (or a multiple of it) and checks either a
    hard-coded expected key or a structural property of the produced hash.
    """

    kfd = None
    buf = ""

    def setUp(self):
        """Create a fresh hasher and the shared input buffer for each test."""
        self.kfd = CKoretFuzzyHashing()
        # Byte values 0..254 (255 is excluded by the range), each repeated
        # 512 times.  The hard-coded keys in the tests are compared against
        # hashes of this exact buffer, so its layout must not change.
        self.buf = "".join([chr(c) * 512 for c in range(0, 255)])

    def testDHA(self):
        """Default hashing algorithm (DHA)."""
        key = "AgIEBAQEBgYGBggICAgKCgoKDAwMDA4O;BAQGBggICgoMDA4OEBASEhQUFhYYGBoa;+/v5+ff39fXz8/Hx7+/t7evr6enn5+Xl"
        self.assertEqual(key, self.kfd.hash_bytes(self.buf))

    def testFHA(self):
        """Hash produced when ``algorithm`` is set to ``_fast_hash``."""
        key = "IAIEBggKDA4QEhQWGBocHg;AgQGCAoMDhASFBYYGhweICIkJigqLC4w;vb/Bw8XHycvNz9HT1dfZ293f4ePl5+nr"
        self.kfd.algorithm = self.kfd._fast_hash
        self.assertEqual(key, self.kfd.hash_bytes(self.buf))

    def testSimplifiedKey(self):
        """Hash of sixteen copies of the buffer with the ``simplified`` algorithm.

        This test used to be named ``testSimplified`` and was silently
        replaced by the later method of the same name, so it never ran.
        NOTE(review): because it never ran, the expected key has not been
        exercised recently -- confirm it is still current.
        """
        key = "/v4DA/39Bgb8/AkJ+/sMDPr6Dw/5+RIS;AYB//n/+fv1+/X38ffx8+3z7e/p7+nr5;+3z8ff19/X7+fv5/f4ABgAGBAoECggOC"
        self.kfd.algorithm = self.kfd.simplified
        self.assertEqual(key, self.kfd.hash_bytes(self.buf * 16))

    def testOutputSize(self):
        """The hash length is three parts of ``output_size`` plus two separators."""
        self.kfd.algorithm = None
        expected = self.kfd.output_size * 3 + 2
        self.assertEqual(len(self.kfd.hash_bytes(self.buf)), expected)

    def testNullBlocks(self):
        """With ``reduce_errors`` set, a leading run of NUL bytes yields no "AA"."""
        data = "\x00" * 8192 + self.buf
        self.kfd.algorithm = None
        self.kfd.reduce_errors = True
        digest = self.kfd.hash_bytes(data)
        self.assertTrue("AA" not in digest)

    def testSimplified(self):
        """The ``simplified`` and ``_fast_hash`` results for the same input stay close."""
        data = self.buf + self.buf
        self.kfd.algorithm = self.kfd.simplified
        simplified_hash = self.kfd.hash_bytes(data)
        self.kfd.algorithm = self.kfd._fast_hash
        fast_hash = self.kfd.hash_bytes(data)

        full_length = self.kfd.output_size * 3 + 2
        distance = self.kfd.edit_distance(simplified_hash, fast_hash)
        self.assertTrue(full_length - distance < 16)
class TestKFuzzy(unittest.TestCase):
    """Test case exercising the hashing algorithms of ``CKoretFuzzyHashing``.

    ``setUp`` builds one reference buffer; the individual tests hash it and
    compare the result with a fixed key or check a property of the output.
    """

    kfd = None
    buf = ""

    def setUp(self):
        """Instantiate the hasher and build the reference buffer."""
        self.kfd = CKoretFuzzyHashing()
        # 255 runs of 512 identical characters, for character codes 0..254.
        # The fixed keys used by the tests are checked against hashes of this
        # buffer, so it has to be reproduced exactly.
        self.buf = "".join([chr(code) * 512 for code in range(0, 255)])

    def testDHA(self):
        """Default hashing algorithm (DHA)."""
        expected = "AgIEBAQEBgYGBggICAgKCgoKDAwMDA4O;BAQGBggICgoMDA4OEBASEhQUFhYYGBoa;+/v5+ff39fXz8/Hx7+/t7evr6enn5+Xl"
        self.assertEqual(expected, self.kfd.hash_bytes(self.buf))

    def testFHA(self):
        """Result of hashing the buffer after selecting ``_fast_hash``."""
        expected = "IAIEBggKDA4QEhQWGBocHg;AgQGCAoMDhASFBYYGhweICIkJigqLC4w;vb/Bw8XHycvNz9HT1dfZ293f4ePl5+nr"
        self.kfd.algorithm = self.kfd._fast_hash
        self.assertEqual(expected, self.kfd.hash_bytes(self.buf))

    def testSimplifiedKey(self):
        """Result of hashing ``buf * 16`` after selecting ``simplified``.

        Formerly a first ``testSimplified`` definition that was overwritten
        by the second one further down and therefore never executed; it has
        been renamed so that both tests run.
        NOTE(review): the expected key was effectively untested before this
        rename -- verify it against the current implementation.
        """
        expected = "/v4DA/39Bgb8/AkJ+/sMDPr6Dw/5+RIS;AYB//n/+fv1+/X38ffx8+3z7e/p7+nr5;+3z8ff19/X7+fv5/f4ABgAGBAoECggOC"
        self.kfd.algorithm = self.kfd.simplified
        self.assertEqual(expected, self.kfd.hash_bytes(self.buf * 16))

    def testOutputSize(self):
        """Hash length equals ``output_size * 3 + 2``."""
        self.kfd.algorithm = None
        produced = len(self.kfd.hash_bytes(self.buf))
        self.assertEqual(produced, self.kfd.output_size * 3 + 2)

    def testNullBlocks(self):
        """No "AA" appears in the hash of 8192 NUL bytes plus the buffer
        when ``reduce_errors`` is enabled."""
        padded = "\x00" * 8192 + self.buf
        self.kfd.algorithm = None
        self.kfd.reduce_errors = True
        result = self.kfd.hash_bytes(padded)
        self.assertTrue("AA" not in result)

    def testSimplified(self):
        """``simplified`` and ``_fast_hash`` outputs differ by a bounded edit distance."""
        doubled = self.buf + self.buf
        self.kfd.algorithm = self.kfd.simplified
        first = self.kfd.hash_bytes(doubled)
        self.kfd.algorithm = self.kfd._fast_hash
        second = self.kfd.hash_bytes(doubled)

        total_length = (self.kfd.output_size * 3) + 2
        self.assertTrue(total_length - self.kfd.edit_distance(first, second) < 16)
class CDeepToad:
    """Cluster, print or compare files by their fuzzy hashes.

    Each file is hashed with ``CKoretFuzzyHashing.hash_file``, whose result is
    split on ``;`` into three signatures.  Depending on the mode flags the
    signatures are printed (``just_print``), stored per file for a pairwise
    comparison (``just_compare``), checked against a similars file
    (``print_similars``), or used to put the file into a group whose key is
    within ``edit_distance`` of the signature.
    """

    def __init__(self):
        """Configure the hasher and reset all options to their defaults."""
        self.kfd = CKoretFuzzyHashing()
        self.kfd.bsize = 512
        self.kfd.output_size = 32
        self.kfd.ignore_range = 2
        self.kfd.big_file_size = 1024 * 1024 * 32

        # In clustering mode: signature -> list of file names.
        # In just_print / just_compare mode: file name -> tuple of signatures.
        self.groups = {}
        # Not used by any method of this class.
        self.ingroups = {}
        # When non-empty, only files with one of these extensions are hashed.
        self.extensions = []
        # Files with one of these extensions are always skipped.
        self.ignore_extensions = []
        self.edit_distance = MAX_EDIT_DISTANCE
        # Maximum number of files to hash; 0 means no limit.
        self.maximum = 0
        self.aggresive = False
        self.just_print = False
        self.just_compare = False
        self.print_similars = False
        self.output_dir = None
        # Path read by compareSimilars(); must be set before enabling
        # print_similars.
        self.similars_file = None

    def cluster(self, hashes, filename):
        """Record ``filename`` under the group(s) matching its ``hashes``.

        In ``just_print`` / ``just_compare`` mode the hashes are simply stored
        under the file name.  Otherwise each hash is compared with every
        existing group key; the file joins the first group whose key is within
        ``self.edit_distance``, or a group keyed by the hash itself when no
        key is close enough.
        """
        if self.just_print or self.just_compare:
            self.groups[filename] = hashes
            return

        for signature in hashes:
            for key in self.groups:
                # Check for maximum edit distance
                if self.kfd.edit_distance(key, signature) <= self.edit_distance:
                    self.groups[key].append(filename)
                    break
            else:
                # No existing group is close enough: file it under its own hash.
                self.groups.setdefault(signature, []).append(filename)

    def compareSimilars(self, hashes, filename):
        """Print the ``;``-separated fields of every line of ``similars_file``.

        ``hashes`` and ``filename`` are accepted but not used yet.

        Raises:
            ValueError: if ``self.similars_file`` has not been set.
        """
        if not self.similars_file:
            raise ValueError("similars_file is not set")

        with open(self.similars_file, "r") as handle:
            for line in handle:
                # rstrip removes "\n", "\r\n" and "\r" endings alike.
                similar_hashes = line.rstrip("\r\n").split(";")
                print(similar_hashes)

    def hashFile(self, filename):
        """Hash ``filename`` and dispatch the signatures according to the mode.

        Any error other than ``KeyboardInterrupt`` is reported on stderr and
        swallowed so that one unreadable file does not stop a directory run.
        """
        try:
            s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
            if self.just_print:
                print("%s;%s;%s;%s" % (s1, s2, s3, filename))
            elif self.print_similars:
                self.compareSimilars((s1, s2, s3), filename)
            else:
                self.cluster((s1, s2, s3), filename)
        except KeyboardInterrupt:
            raise
        except Exception:
            # Exception (not a bare except) so SystemExit still propagates.
            sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
            sys.stderr.flush()

    def printReportHeader(self):
        """Print the column header used by the ``just_print`` output."""
        print("Signature;Simple Signature;Reverse Signature;Filename")

    def clusterDirectory(self, path, output_dir):
        """Walk ``path`` recursively and hash every file that passes the filters.

        ``output_dir`` is accepted for interface compatibility and not used
        here.  At most ``self.maximum`` files are hashed (0 means unlimited);
        files rejected by the extension filters do not count towards that
        limit.  Unless ``just_print`` is set, a progress message is rewritten
        in place on stderr.
        """
        last_size = 0
        total = 0
        if self.just_print:
            self.printReportHeader()

        for root, dirs, files in os.walk(path):
            for name in files:
                if self.maximum != 0 and total >= self.maximum:
                    break

                extension = os.path.splitext(name)[1]
                if extension in self.ignore_extensions:
                    continue
                if self.extensions and extension not in self.extensions:
                    continue

                # Count only files that are actually going to be hashed.
                total += 1
                full_path = os.path.join(root, name)

                if not self.just_print:
                    # Erase the previous progress message before writing the
                    # new one: back up, overwrite with spaces, back up again.
                    sys.stderr.write("\b" * last_size + " " * last_size + "\b" * last_size)
                    sys.stderr.flush()
                    message = "Processing file %s ..." % full_path
                    sys.stderr.write(message)
                    last_size = len(message)
                    sys.stderr.flush()

                self.hashFile(full_path)

            if self.maximum != 0 and total >= self.maximum:
                break

        if total > 0:
            sys.stderr.write("\n")
            sys.stderr.flush()

    def sortByCount(self):
        """Return the groups ordered from largest to smallest, without repeats.

        Groups are ordered by member count, ties broken by key, both
        descending.  A file is kept only in the first group (in that order)
        that lists it; groups left empty, and the group keyed by the empty
        string, are dropped.  The result is an ``OrderedDict`` so that the
        ordering survives iteration on interpreters whose ``dict`` is
        unordered.
        """
        from collections import OrderedDict

        # First, sort by count of elements
        ranking = sorted(
            ((len(members), key) for key, members in self.groups.items() if key != ""),
            reverse=True,
        )

        # Create the new dict with only non empty groups
        ordered = OrderedDict()
        seen = set()
        for _count, key in ranking:
            fresh = []
            for element in self.groups[key]:
                if element not in seen:
                    seen.add(element)
                    fresh.append(element)
            if fresh:
                ordered[key] = fresh
        return ordered

    def printHashes(self):
        """Print the header and one ``sig;sig;sig;filename`` line per file.

        Expects ``self.groups`` in its file name -> signatures form.
        """
        self.printReportHeader()
        for filename, hashes in self.groups.items():
            print("%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], filename))

    def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
        """Report file ``x`` as matching ``y`` if any pair of hashes is similar.

        Empty hashes are skipped.  For each remaining pair the similarity is
        ``(len(hx) - edit_distance) * 100 / len(hx)``; the first pair above
        33% prints a message, records ``dones[y] = x`` and ends the search.

        Returns:
            The ``dones`` dict that was passed in, possibly updated.
        """
        for hx in hashesx:
            if hx == "":
                continue
            for hy in hashesy:
                if hy == "":
                    continue
                matching = len(hx) - self.kfd.edit_distance(hx, hy)
                percent = matching * 100.00 / len(hx)
                if percent > 33:
                    print("File '%s' matches '%s' (%0.2f%%)" % (x, y, percent))
                    dones[y] = x
                    return dones
        return dones

    def compareHashes(self):
        """Compare every pair of files stored in ``self.groups`` and print matches.

        A pair is skipped when ``dones`` already records it in the opposite
        direction.
        NOTE(review): ``dones`` keeps a single partner per file, so a file
        matching several others can still be reported from both sides.
        """
        dones = {}
        for x in self.groups:
            for y in self.groups:
                if x == y:
                    continue
                if x in dones and dones[x] == y:
                    continue
                dones = self.compareAndReportHashes(
                    x, y, self.groups[x], self.groups[y], dones
                )

    def printReport(self):
        """Print the final report for the current mode.

        ``just_print`` has nothing left to print at this point;
        ``just_compare`` runs the pairwise comparison; otherwise one
        ``key;filename`` line is printed per clustered file, largest group
        first.
        """
        if self.just_print:
            return

        if self.just_compare:
            self.compareHashes()
            return

        for key, members in self.sortByCount().items():
            for element in members:
                print("%s;%s" % (key, element))

    def copySamples(self, out_dir):
        """Copy every clustered file into ``out_dir/set<N>``, one set per group.

        ``out_dir`` is created when missing.  Sets are numbered from 1 in the
        order returned by ``sortByCount``; files keep their base name.
        """
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        for index, (key, members) in enumerate(self.sortByCount().items(), 1):
            # Build the set directory under the out_dir argument (the
            # directory just created), not under self.output_dir.
            set_dir = os.path.join(out_dir, "set" + str(index))
            os.mkdir(set_dir)
            for element in members:
                shutil.copy(element, os.path.join(set_dir, os.path.basename(element)))