Python CKoretFuzzyHashing 예제들, kfuzzy.CKoretFuzzyHashing Python 예제들

예제 #1

0

파일 보기

파일: simple_macro_parser.py 프로젝트: wgwjifeng/pigaios

    def create_enums(self, d):
        """Bucket the keys of ``d`` by fuzzy hash and render each bucket as enum text.

        Each key is hashed with a ``CKoretFuzzyHashing`` instance configured
        with ``bsize = 1`` and ``output_size = 8``; keys whose first two hash
        components are equal share a bucket.  Buckets holding a single key
        are skipped.

        Returns a dict mapping each enum name (from ``self.get_enum_name``)
        to the text of an ``enum NAME { ... };`` declaration whose member
        lines are sorted.  The ``DEFAULT_ENUM`` entry is always created and
        starts out as an empty list.
        """
        hasher = CKoretFuzzyHashing()
        hasher.bsize = 1
        hasher.output_size = 8

        # "<hash1>-<hash2>" -> list of keys that produced that pair
        buckets = {}
        for name in d:
            first, second, _ = hasher.hash_bytes(name).split(";")
            buckets.setdefault("%s-%s" % (first, second), []).append(name)

        enums = {DEFAULT_ENUM: []}
        for names in buckets.values():
            if len(names) == 1:
                continue

            enum_name = self.get_enum_name(names)
            members = sorted(
                "  %s = %s, " % (member, str(d[member])) for member in names
            )
            body = ["enum %s {" % enum_name] + members + ["};"]
            enums[enum_name] = "\n".join(body)

        return enums

예제 #2

0

파일 보기

 def __init__(self):
     """Create the fuzzy hasher and reset every option to its default."""
     # Hasher configuration: 512-byte blocks, 32-char output, 32 MiB
     # "big file" threshold.
     hasher = CKoretFuzzyHashing()
     hasher.bsize = 512
     hasher.output_size = 32
     hasher.ignore_range = 2
     hasher.big_file_size = 32 * 1024 * 1024
     self.kfd = hasher

     # Working state
     self.groups, self.ingroups = {}, {}

     # File selection
     self.extensions, self.ignore_extensions = [], []
     self.maximum = 0

     # Matching
     self.edit_distance = MAX_EDIT_DISTANCE
     self.aggresive = False

     # Output modes
     self.just_print = False
     self.just_compare = False
     self.print_similars = False
     self.output_dir = None

예제 #3

0

파일 보기

파일: test.py 프로젝트: pombredanne/deeptoad-1

class TestKFuzzy(unittest.TestCase):
    """Tests for ``CKoretFuzzyHashing`` run against a synthetic buffer.

    ``setUp`` builds a buffer made of the characters 0..254, each repeated
    512 times, and a fresh hasher for every test.
    """

    kfd = None
    buf = ""

    def setUp(self):
        """Create a fresh hasher and the sample buffer."""
        self.kfd = CKoretFuzzyHashing()
        # join() instead of repeated += ; range() behaves the same as
        # xrange() here and exists on both Python 2 and 3.
        self.buf = "".join([chr(c) * 512 for c in range(0, 255)])

    def testDHA(self):
        """Default hashing algorithm (DHA) yields the recorded hash."""
        key = "AgIEBAQEBgYGBggICAgKCgoKDAwMDA4O;BAQGBggICgoMDA4OEBASEhQUFhYYGBoa;+/v5+ff39fXz8/Hx7+/t7evr6enn5+Xl"
        result = self.kfd.hash_bytes(self.buf)
        # assertEqual reports both values on failure, unlike assert_(a == b)
        self.assertEqual(key, result)

    def testFHA(self):
        """The ``_fast_hash`` algorithm yields the recorded hash."""
        key = "IAIEBggKDA4QEhQWGBocHg;AgQGCAoMDhASFBYYGhweICIkJigqLC4w;vb/Bw8XHycvNz9HT1dfZ293f4ePl5+nr"
        self.kfd.algorithm = self.kfd._fast_hash
        result = self.kfd.hash_bytes(self.buf)
        self.assertEqual(key, result)

    def testSimplifiedKnownHash(self):
        """The ``simplified`` algorithm yields the recorded hash for buf * 16.

        NOTE(review): this test used to be named ``testSimplified`` and was
        shadowed by a later method of the same name, so it never ran.  The
        recorded key has therefore not been exercised -- confirm it.
        """
        key = "/v4DA/39Bgb8/AkJ+/sMDPr6Dw/5+RIS;AYB//n/+fv1+/X38ffx8+3z7e/p7+nr5;+3z8ff19/X7+fv5/f4ABgAGBAoECggOC"
        buf = self.buf * 16
        self.kfd.algorithm = self.kfd.simplified
        result = self.kfd.hash_bytes(buf)
        self.assertEqual(key, result)

    def testOutputSize(self):
        """With ``algorithm = None`` the hash is ``output_size * 3 + 2`` long."""
        self.kfd.algorithm = None
        length = len(self.kfd.hash_bytes(self.buf))
        # Three components; the + 2 presumably covers the two ";" separators.
        expected = self.kfd.output_size * 3 + 2
        self.assertEqual(length, expected)

    def testNullBlocks(self):
        """With ``reduce_errors`` on, 8192 leading NUL bytes leave no "AA" in the hash."""
        buf = "\x00" * 8192
        buf += self.buf
        self.kfd.algorithm = None
        self.kfd.reduce_errors = True
        h = self.kfd.hash_bytes(buf)
        self.assertTrue("AA" not in h)

    def testSimplified(self):
        """Compare the ``simplified`` and ``_fast_hash`` hashes of the same data.

        The full hash length minus the edit distance between both hashes
        must stay below 16.
        """
        self.kfd.algorithm = self.kfd.simplified
        h = self.kfd.hash_bytes(self.buf + self.buf)
        self.kfd.algorithm = self.kfd._fast_hash
        h2 = self.kfd.hash_bytes(self.buf + self.buf)
        full_length = (self.kfd.output_size * 3) + 2
        self.assertTrue(full_length - self.kfd.edit_distance(h, h2) < 16)

예제 #4

0

파일 보기

파일: test.py 프로젝트: kai5263499/deeptoad

class TestKFuzzy(unittest.TestCase):
    """Tests for ``CKoretFuzzyHashing`` run against a synthetic buffer.

    ``setUp`` builds a buffer made of the characters 0..254, each repeated
    512 times, and a fresh hasher for every test.
    """

    kfd = None
    buf = ""

    def setUp(self):
        """Create a fresh hasher and the sample buffer."""
        self.kfd = CKoretFuzzyHashing()
        # join() instead of repeated += ; range() behaves the same as
        # xrange() here and exists on both Python 2 and 3.
        self.buf = "".join([chr(c) * 512 for c in range(0, 255)])

    def testDHA(self):
        """Default hashing algorithm (DHA) yields the recorded hash."""
        key = "AgIEBAQEBgYGBggICAgKCgoKDAwMDA4O;BAQGBggICgoMDA4OEBASEhQUFhYYGBoa;+/v5+ff39fXz8/Hx7+/t7evr6enn5+Xl"
        result = self.kfd.hash_bytes(self.buf)
        # assertEqual reports both values on failure, unlike assert_(a == b)
        self.assertEqual(key, result)

    def testFHA(self):
        """The ``_fast_hash`` algorithm yields the recorded hash."""
        key = "IAIEBggKDA4QEhQWGBocHg;AgQGCAoMDhASFBYYGhweICIkJigqLC4w;vb/Bw8XHycvNz9HT1dfZ293f4ePl5+nr"
        self.kfd.algorithm = self.kfd._fast_hash
        result = self.kfd.hash_bytes(self.buf)
        self.assertEqual(key, result)

    def testSimplifiedKnownHash(self):
        """The ``simplified`` algorithm yields the recorded hash for buf * 16.

        NOTE(review): this test used to be named ``testSimplified`` and was
        shadowed by a later method of the same name, so it never ran.  The
        recorded key has therefore not been exercised -- confirm it.
        """
        key = "/v4DA/39Bgb8/AkJ+/sMDPr6Dw/5+RIS;AYB//n/+fv1+/X38ffx8+3z7e/p7+nr5;+3z8ff19/X7+fv5/f4ABgAGBAoECggOC"
        buf = self.buf * 16
        self.kfd.algorithm = self.kfd.simplified
        result = self.kfd.hash_bytes(buf)
        self.assertEqual(key, result)

    def testOutputSize(self):
        """With ``algorithm = None`` the hash is ``output_size * 3 + 2`` long."""
        self.kfd.algorithm = None
        length = len(self.kfd.hash_bytes(self.buf))
        # Three components; the + 2 presumably covers the two ";" separators.
        expected = self.kfd.output_size * 3 + 2
        self.assertEqual(length, expected)

    def testNullBlocks(self):
        """With ``reduce_errors`` on, 8192 leading NUL bytes leave no "AA" in the hash."""
        buf = "\x00" * 8192
        buf += self.buf
        self.kfd.algorithm = None
        self.kfd.reduce_errors = True
        h = self.kfd.hash_bytes(buf)
        self.assertTrue("AA" not in h)

    def testSimplified(self):
        """Compare the ``simplified`` and ``_fast_hash`` hashes of the same data.

        The full hash length minus the edit distance between both hashes
        must stay below 16.
        """
        self.kfd.algorithm = self.kfd.simplified
        h = self.kfd.hash_bytes(self.buf + self.buf)
        self.kfd.algorithm = self.kfd._fast_hash
        h2 = self.kfd.hash_bytes(self.buf + self.buf)
        full_length = (self.kfd.output_size * 3) + 2
        self.assertTrue(full_length - self.kfd.edit_distance(h, h2) < 16)

예제 #5

0

파일 보기

파일: simple_macro_parser.py 프로젝트: xiaobo996/pigaios

  def create_enums(self, d):
    """Bucket the keys of ``d`` by fuzzy hash and render each bucket as enum text.

    Each key is hashed with a ``CKoretFuzzyHashing`` instance configured
    with ``bsize = 1`` and ``output_size = 8``; keys whose first two hash
    components are equal share a bucket.  Buckets holding a single key are
    skipped.

    Value formatting: a ``decimal.Decimal`` whose engineering string has no
    "." and parses as an integer is written as ``0x%08x``; every other value
    is written with ``str()``.

    Returns a dict mapping each enum name (from ``self.get_enum_name``) to
    the text of an ``enum NAME { ... };`` declaration whose member lines are
    sorted.  The ``DEFAULT_ENUM`` entry is always created and starts out as
    an empty list.
    """
    kfh = CKoretFuzzyHashing()
    kfh.bsize = 1
    kfh.output_size = 8

    # "<hash1>-<hash2>" -> list of keys that produced that pair
    fuzzy_hashes = {}
    for key in d:
      hash1, hash2, _ = kfh.hash_bytes(key).split(";")
      fuzzy_hashes.setdefault("%s-%s" % (hash1, hash2), []).append(key)

    enums = {DEFAULT_ENUM: []}
    for names in fuzzy_hashes.values():
      if len(names) == 1:
        continue

      enum_name = self.get_enum_name(names)
      tmp = []
      for element in names:
        raw = d[element]
        value = None
        if isinstance(raw, decimal.Decimal):
          eng_str = raw.to_eng_string()
          if "." not in eng_str:
            try:
              # int() promotes to long automatically on Python 2
              value = "0x%08x" % int(eng_str)
            except ValueError:
              # Strings such as "1E+3" or "NaN" contain no "." but are
              # not integer literals; use the plain string form below
              # instead of aborting the whole run.
              value = None

        if value is None:
          value = str(raw)
        tmp.append("  %s = %s, " % (element, value))

      tmp.sort()
      tmp.insert(0, "enum %s {" % enum_name)
      tmp.append("};")
      enums[enum_name] = "\n".join(tmp)

    return enums

예제 #6

0

파일 보기

파일: nightmare_frontend.py 프로젝트: ssatanss/nightmare

def find_original_file(db, id):
  """Find the template file whose fuzzy hash matches a crash sample.

  The sample hash for ``id`` is read from the ``samples`` table, the sample
  itself is expected at ``<SAMPLES_PATH>/crashes/<sample_hash>``, and the
  candidates are the regular files directly inside
  ``<TEMPLATES_PATH>/<project subfolder>``.

  Parameters:
    db: database handle offering ``select`` and ``query``.
    id: value matched against ``sample_id``.

  Returns:
    ``(original_file, path)`` where ``path`` is the crash sample and
    ``original_file`` is the first candidate sharing at least one of the
    three hash components with it, or None when no candidate does.

  Raises:
    Exception: the sample, a configuration value, the project or one of
      the paths is missing, or the sample is a ZIP archive whose comment
      is "NIGHTMARE".
  """
  query_vars = {"id": id}

  where = "sample_id = $id"
  res = list(db.select("samples", what="sample_hash", where=where, vars=query_vars))
  if len(res) == 0:
    raise Exception("Invalid crash identifier")
  sample_hash = res[0].sample_hash

  res = list(db.select("config", what="value", where="name='SAMPLES_PATH'"))
  if len(res) == 0:
    raise Exception("Invalid configuration value for 'SAMPLES_PATH'")

  path = os.path.join(res[0].value, "crashes")
  path = os.path.join(path, sample_hash)
  if not os.path.exists(path):
    raise Exception("Crash sample does not exists! %s" % path)

  # Close the handle right away instead of leaving it to the garbage
  # collector.
  with open(path, "rb") as sample:
    magic = sample.read(3)

  # Bytes literals: the file is read in binary mode.  On Python 2 these are
  # plain str, so the comparisons are unchanged there.
  if magic == b"PK\x03":
    z = ZipFile(path, "r")
    try:
      cmt = z.comment
    finally:
      z.close()
    if cmt == b"NIGHTMARE":
      raise Exception("Cannot find the original sample for ZIP archives created by Nightmare, sorry.")

  res = list(db.select("config", what="value", where="name = 'TEMPLATES_PATH'"))
  if len(res) == 0:
    raise Exception("Invalid configuration value for 'TEMPLATES_PATH'")
  templates_path = res[0].value

  sql = """select p.subfolder subfolder
             from projects p,
                  crashes c
            where c.sample_id = $id
              and p.project_id = c.project_id"""
  res = list(db.query(sql, vars=query_vars))
  if len(res) == 0:
    raise Exception("Cannot find the project associated to the crash identifier")

  project_path = os.path.join(templates_path, res[0].subfolder)
  if not os.path.exists(project_path):
    raise Exception("Cannot find path '%s'" % project_path)

  kfh = CKoretFuzzyHashing()
  kfh.bsize = 16
  h1, h2, h3 = kfh.hash_file(path).split(";")

  original_file = None
  for f in os.listdir(project_path):
    filename = os.path.join(project_path, f)
    if not os.path.isfile(filename):
      continue

    tmp1, tmp2, tmp3 = kfh.hash_file(filename).split(";")
    # One matching component is enough.  The former "all three match"
    # branch did exactly the same thing and is covered by this test.
    if h1 == tmp1 or h2 == tmp2 or h3 == tmp3:
      original_file = filename
      break

  return original_file, path

예제 #7

0

파일 보기

파일: test.py 프로젝트: kai5263499/deeptoad

 def setUp(self):
     """Create a fresh hasher and a buffer of chars 0..254, 512 of each."""
     self.kfd = CKoretFuzzyHashing()
     pieces = [chr(code) * 512 for code in xrange(0, 255)]
     self.buf = "".join(pieces)

예제 #8

0

파일 보기

class CDeepToad:
    """Hash files with ``CKoretFuzzyHashing`` and cluster, print or compare them.

    ``groups`` has two layouts depending on the mode:

    * clustering (default): signature -> list of filenames;
    * ``just_print`` / ``just_compare``: filename -> tuple of signatures.
    """

    def __init__(self):
        """Create the hasher and reset every option to its default."""
        self.kfd = CKoretFuzzyHashing()
        self.kfd.bsize = 512
        self.kfd.output_size = 32
        self.kfd.ignore_range = 2
        self.kfd.big_file_size = 1024*1024*32

        self.groups = {}
        self.ingroups = {}
        self.extensions = []
        self.ignore_extensions = []
        self.edit_distance = MAX_EDIT_DISTANCE
        self.maximum = 0
        self.aggresive = False
        self.just_print = False
        self.just_compare = False
        self.print_similars = False
        self.output_dir = None
        # Path opened by compareSimilars(); the caller must set it before
        # enabling print_similars.
        self.similars_file = None

    def cluster(self, hashes, filename):
        """Record ``filename`` under every signature in ``hashes``.

        In ``just_print`` / ``just_compare`` mode the hashes are simply
        stored under the filename.  Otherwise each signature is attached to
        the first existing group whose key lies within ``self.edit_distance``
        of it, or to a group keyed by the signature itself.
        """
        if self.just_print or self.just_compare:
            self.groups[filename] = hashes
            return

        for signature in hashes:
            for key in self.groups:
                # Check for maximum edit distance
                if self.kfd.edit_distance(key, signature) <= self.edit_distance:
                    self.groups[key].append(filename)
                    break
            else:
                # No existing group is close enough
                self.groups.setdefault(signature, []).append(filename)

    def compareSimilars(self, hashes, filename):
        """Print the ";"-separated fields of every line of ``self.similars_file``.

        ``hashes`` and ``filename`` are accepted but not used by the current
        implementation.
        """
        with open(self.similars_file, "rb") as handle:
            for line in handle.readlines():
                # rstrip with both characters also handles CRLF endings,
                # which strip("\r").strip("\n") left as a trailing "\r".
                line = line.rstrip(b"\r\n")
                similar_hashes = line.split(b";")
                print(similar_hashes)

    def hashFile(self, filename):
        """Hash one file and dispatch the three signatures per the active mode.

        Errors are reported on stderr and otherwise ignored so that a single
        unreadable file does not abort a directory walk; KeyboardInterrupt
        is re-raised.
        """
        try:
            s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
            if self.just_print:
                print("%s;%s;%s;%s" % (s1, s2, s3, filename))
            elif self.print_similars:
                self.compareSimilars((s1, s2, s3), filename)
            else:
                self.cluster((s1, s2, s3), filename)
        except KeyboardInterrupt:
            raise
        except Exception:
            # Exception rather than a bare except, so SystemExit still
            # propagates.
            sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
            sys.stderr.flush()

    def printReportHeader(self):
        """Print the column header used by the per-file signature listing."""
        print("Signature;Simple Signature;Reverse Signature;Filename")

    def clusterDirectory(self, path, output_dir):
        """Walk ``path`` recursively and hash every accepted file.

        A file is skipped when its extension is in ``ignore_extensions`` or,
        if ``extensions`` is non-empty, when its extension is not listed
        there.  When ``maximum`` is non-zero the walk stops once that many
        files have been visited; skipped files count towards the limit.

        ``output_dir`` is part of the signature but is not used here.
        """
        last_size = 0
        total = 0
        if self.just_print:
            self.printReportHeader()

        for root, dirs, files in os.walk(path):
            for name in files:
                if self.maximum != 0 and total >= self.maximum:
                    break
                total += 1

                extension = os.path.splitext(name)[1]
                if extension in self.ignore_extensions:
                    continue
                if self.extensions and extension not in self.extensions:
                    continue

                filename = os.path.join(root, name)
                if not self.just_print:
                    # Wipe the previous progress message in place before
                    # writing the new one.
                    sys.stderr.write("\b"*last_size + " "*last_size + "\b"*last_size)
                    sys.stderr.flush()
                    message = "Processing file %s ..." % filename
                    sys.stderr.write(message)
                    last_size = len(message)
                    sys.stderr.flush()

                self.hashFile(filename)

            if self.maximum != 0 and total >= self.maximum:
                break

        if total > 0:
            sys.stderr.write("\n")
            sys.stderr.flush()

    def sortByCount(self):
        """Return the groups ordered by size, largest first, without repeats.

        Groups are ranked by ``(member count, key)`` in descending order; the
        empty-string key is ignored.  An element is kept only in the first
        (largest) group that lists it, and groups left empty are dropped.

        Returns an ``OrderedDict`` (a ``dict`` subclass) so the ranking
        survives on interpreters whose plain dicts are unordered.
        """
        from collections import OrderedDict

        ranking = [(len(members), key)
                   for key, members in self.groups.items() if key != ""]
        ranking.sort(reverse=True)

        outgrp = OrderedDict()
        dones = set()
        for _, key in ranking:
            unique = []
            for element in self.groups[key]:
                if element not in dones:
                    unique.append(element)
                    dones.add(element)

            # Keep only non empty groups
            if unique:
                outgrp[key] = unique

        return outgrp

    def printHashes(self):
        """Print the header and one ``sig;sig;sig;filename`` line per entry.

        Expects the filename -> signatures layout of ``groups``.
        """
        self.printReportHeader()
        for x in self.groups:
            hashes = self.groups[x]
            print("%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], x))

    def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
        """Report whether file ``x`` matches file ``y``.

        Every non-empty signature of ``x`` is compared with every non-empty
        signature of ``y``.  On the first pair whose similarity
        ``(len(hx) - edit distance) * 100 / len(hx)`` exceeds 33 the match is
        printed, ``dones[y] = x`` is recorded and the search stops.

        Returns ``dones`` (the same object, possibly updated).
        """
        for hx in hashesx:
            if hx == "":
                continue

            for hy in hashesy:
                if hy == "":
                    continue

                dis = self.kfd.edit_distance(hx, hy)
                dis = len(hx) - dis
                percent = dis*100.00/len(hx)

                if percent > 33:
                    print("File '%s' matches '%s' (%0.2f%%)" % (x, y, percent))
                    dones[y] = x
                    return dones

        return dones

    def compareHashes(self):
        """Compare every pair of files in ``groups`` and print the matches.

        A pair (x, y) is skipped when the match was already reported in the
        opposite direction, i.e. when ``dones[x] == y``.
        """
        dones = {}

        for x in self.groups:
            for y in self.groups:
                if x == y:
                    continue
                if x in dones and dones[x] == y:
                    continue

                dones = self.compareAndReportHashes(
                    x, y, self.groups[x], self.groups[y], dones)

    def printReport(self):
        """Print the final report for the active mode.

        Nothing is printed in ``just_print`` mode (lines were emitted while
        hashing).  In ``just_compare`` mode the pairwise matches are printed.
        Otherwise one ``signature;filename`` line is printed per file,
        largest groups first, each file at most once.
        """
        if self.just_print:
            return

        if self.just_compare:
            self.compareHashes()
            return

        already = set()
        for x in self.sortByCount():
            for element in self.groups[x]:
                if element not in already:
                    print("%s;%s" % (x, element))
                    already.add(element)

    def copySamples(self, out_dir):
        """Copy every clustered file into ``out_dir/setN``, one directory per group.

        ``out_dir`` is created when missing.  Groups are visited largest
        first, directories are numbered from 1 in that order, and each file
        is copied once, into the first group that lists it.
        """
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        base_dir_name = "set"
        sets = {}
        i = 1

        already = set()
        for x in self.sortByCount():
            for element in self.groups[x]:
                if element in already:
                    continue

                if x not in sets:
                    # Build the set directory under the out_dir argument
                    # (the directory ensured above), not self.output_dir.
                    set_dir = os.path.join(out_dir, base_dir_name + str(i))
                    i += 1
                    os.mkdir(set_dir)
                    sets[x] = set_dir

                new_path = os.path.join(sets[x], os.path.basename(element))
                shutil.copy(element, new_path)
                already.add(element)

예제 #9

0

파일 보기

파일: test.py 프로젝트: pombredanne/deeptoad-1

 def setUp(self):
     """Create a fresh hasher and a buffer of chars 0..254, 512 of each."""
     self.kfd = CKoretFuzzyHashing()
     pieces = [chr(code) * 512 for code in xrange(0, 255)]
     self.buf = "".join(pieces)