def getTextualHash(self, driver):
    """returns a textual hash of the url passed"""
    # get the html source
    html_source = driver.page_source
    # create and update our nilsimsa object with the source
    nilsimsaObj = Nilsimsa()
    nilsimsaObj.update(html_source)
    return nilsimsaObj.hexdigest()
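# Minimal sketch (not from the original code) showing that the incremental
# update() call above is equivalent to passing the data straight to the
# constructor; the sample HTML string is invented for illustration.
from nilsimsa import Nilsimsa

sample_source = "<html><body>example page</body></html>"
incremental = Nilsimsa()
incremental.update(sample_source)
assert incremental.hexdigest() == Nilsimsa(sample_source).hexdigest()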
def test_nilsimsa():
    """
    tests the nilsimsa hash by choosing a random test file, computing its
    nilsimsa digest, and comparing it to the true value stored in the
    pickled sid_to_nil dictionary
    """
    fname = random.choice(dircache.listdir(test_data_dir))
    with open(os.path.join(test_data_dir, fname), "rb") as f:
        nil = Nilsimsa(f.read())
    assert nil.hexdigest() == sid_to_nil[fname.split(".")[0]]
def ToHash(h1, h2, classes):
    # compute several fuzzy hashes of h1, falling back to 'None' on failure
    try:
        th = tlsh.hash(h1)
    except Exception:
        th = 'None'
    try:
        sh = fuzzyhashlib.sdhash(h1).hexdigest().rstrip()
    except Exception:
        sh = 'None'
    try:
        nil = Nilsimsa(h1).hexdigest()
    except Exception:
        nil = 'None'
    try:
        ss = fuzzyhashlib.ssdeep(h1).hexdigest()
    except Exception:
        ss = 'None'
    ch = []
    if classes is not None:
        for name, content in classes:
            try:
                cnil = Nilsimsa(content).hexdigest()
            except Exception:
                cnil = 'None'
            try:
                css = fuzzyhashlib.ssdeep(content).hexdigest()
            except Exception:
                css = 'None'
            try:
                # sdhash requires at least 512 bytes of input
                csh = 'None'
                if len(content) >= 512:
                    csh = fuzzyhashlib.sdhash(content).hexdigest().rstrip()
            except Exception:
                csh = 'None'
            try:
                # tlsh requires at least 256 bytes of input
                cth = 'None'
                if len(content) >= 256:
                    cth = tlsh.hash(content)
            except Exception:
                cth = 'None'
            ch.append((name, cth, csh, cnil, css))
    return th, sh, nil, ss, ch
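# Hedged usage sketch for ToHash: the byte strings and class list below are
# invented, and it assumes the tlsh, fuzzyhashlib, and nilsimsa packages are
# importable. Note that h2 is accepted but unused, and any hash that fails
# (e.g. input too short for sdhash) comes back as the string 'None'.
sample = b"x" * 1024
sample_classes = [("ClassA", b"y" * 1024)]
th, sh, nil, ss, ch = ToHash(sample, None, sample_classes)
print(nil)    # nilsimsa hexdigest of the whole input
print(ch[0])  # per-class tuple: (name, tlsh, sdhash, nilsimsa, ssdeep)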
def __init__(self, string, parent, parent_filename, lines, type):
    self.string = string                    # string dump of TC/KW from parser
    self.parent = parent                    # TC/KW name
    self.parent_filename = parent_filename  # parent filename where TC/KW is located
    self.lines = lines                      # line numbers of TC/KW
    self.type = type                        # type - KW or TC
    self.digest = Nilsimsa(self.string)     # nilsimsa hash of the dump
def getNilsimsaHash(self, url, call_phantom=True):
    if call_phantom:
        self.setUpGetter(url)
    # if no output file exists, the page failed to load
    if not os.path.isfile("{}-output.txt".format(self.id)):
        return -1
    # create a nilsimsa object from the saved page source
    try:
        with open("{}-output.txt".format(self.id), "rb") as f:
            nilsimsaObj = Nilsimsa(f.read())
        self.nilsimsa_hash = nilsimsaObj.hexdigest()
    except Exception as e:
        print(e)
    finally:
        # always remove the old file, even if an exception is thrown
        os.remove("{}-output.txt".format(self.id))
    return self.nilsimsa_hash
def get_hash(text, hash_function="ssdeep"):
    """
    Generates hashed text using one of several available hashing functions.

    :param text: The string to hash
    :type text: str
    :param hash_function: The specific algorithm to use; options are ``'nilsimsa'``, ``'md5'``, and ``'ssdeep'`` \
    (default)
    :type hash_function: str
    :return: A hashed representation of the provided string
    :rtype: str

    .. note:: The string will be passed through :py:func:`pewtils.decode_text`, and the returned value will be \
    used instead of the original value if it runs successfully, in order to ensure consistent hashing in both \
    Python 2 and 3. By default, the function uses the :py:mod:`ssdeep` algorithm, which generates \
    context-sensitive hashes that are useful for computing document similarities at scale.

    .. note:: Using ``hash_function='ssdeep'`` requires the :py:mod:`ssdeep` library, which is not installed by \
    default because certain operating systems require additional system libraries to be installed first. For \
    help installing ssdeep, refer to the installation section of the pewtils documentation, which provides \
    OS-specific instructions.

    Usage::

        from pewtils import get_hash

        >>> text = 'test_string'
        >>> get_hash(text)
        '3:HI2:Hl'

    """
    decoded_text = decode_text(text).encode("utf8").strip()
    # use a truthiness check so the fallback also works on Python 3,
    # where encode() returns bytes and `bytes == ""` is always False
    if not decoded_text:
        decoded_text = text
    text = decoded_text
    if hash_function == "nilsimsa":
        from nilsimsa import Nilsimsa

        hashed = Nilsimsa(text).hexdigest()
    elif hash_function == "md5":
        hashed = md5(text).hexdigest()
    else:
        try:
            import ssdeep
        except ImportError:
            raise Exception(
                "To use get_hash with hash_function='ssdeep', you need to install the ssdeep package. "
                "Try running: `BUILD_LIB=1 pip install ssdeep`. If you encounter installation problems, "
                "refer to the pewtils documentation for troubleshooting help."
            )
        hashed = ssdeep.hash(text)
    return hashed
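# Hedged demonstration of the 'nilsimsa' branch of get_hash, mirroring the
# compare_digests call pattern used elsewhere in this collection: scores
# range from -128 to 128, with 128 meaning identical digests. The example
# sentences are invented.
from nilsimsa import compare_digests

d1 = get_hash("the quick brown fox jumps over the lazy dog", hash_function="nilsimsa")
d2 = get_hash("the quick brown fox jumped over the lazy dog", hash_function="nilsimsa")
print(compare_digests(d1, d2))  # close to 128 for near-identical text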
def test_compatability():
    """
    tests compatibility with the deprecated version by comparing the
    nilsimsa digests of 5 randomly selected documents from the test corpus
    and asserting that both implementations give the same hexdigest
    """
    names = dircache.listdir(test_data_dir)
    fnames = set([random.choice(names) for i in range(5)])
    for fname in fnames:
        with open(os.path.join(test_data_dir, fname), "rb") as f:
            text = f.read()
        assert Nilsimsa(text).hexdigest() == orig_Nilsimsa(text).hexdigest()
def test_nilsimsa_speed():
    """computes the nilsimsa hash for all test files and prints the speed"""
    corpus = []
    for fname in listdir(test_data_dir):
        with open(os.path.join(test_data_dir, fname), "rb") as f:
            corpus.append(f.read())
    start = time.time()
    for text in corpus:
        Nilsimsa(text)
    elapsed = time.time() - start
    print("%d in %f --> %f per second" % (
        len(corpus), elapsed, len(corpus) / elapsed))
def get_nilsimsa():
    dbconn = MySQLdb.connect('10.141.221.73', 'root', 'root', 'fdroid')
    dbcursor = dbconn.cursor()
    # sql = 'select block_id,block_code from fdroid.cc_block where detection_id=1 and detection_tp = "20150101--20150131"'
    sql = 'select detection_tp,block_id,block_code from fdroid.cc_block where detection_id=1'
    dbcursor.execute(sql)
    f = open('nilsimsa1.txt', 'w')
    start = time.clock()
    for i in dbcursor.fetchall():
        f.write(str(i[0]) + ' ' + str(i[1]) + ' ' + Nilsimsa(i[2]).hexdigest() + '\n')
        print i[1]
    end = time.clock()
    f.write(str(end - start))
    f.close()
    dbcursor.close()
    dbconn.close()
def nilsimsa_hash(text):
    # Nilsimsa operates on bytes, so encode unicode input first (Python 2)
    if isinstance(text, unicode):
        text = text.encode('utf8')
    return Nilsimsa(text).hexdigest()
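# Sketch of why nilsimsa_hash encodes unicode first (Python 2): a unicode
# string and its UTF-8 byte encoding hash to the same digest, so mixed
# str/unicode inputs are handled consistently. The sample string is invented.
assert nilsimsa_hash(u'caf\xe9') == nilsimsa_hash(u'caf\xe9'.encode('utf8'))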
import extractToken
import getCodeFragment
import os
try:
    import cPickle as pickle
except ImportError:
    import pickle
from nilsimsa import Nilsimsa, compare_digests

test_data_dir = os.path.join(os.path.dirname(__file__), "nilsimsa\\test_data\\")
test_data = "test_dict.p"
test_dict = os.path.join(test_data_dir, test_data)
sid_to_nil = pickle.load(open(test_dict, "rb"))
# print sid_to_nil

nil = Nilsimsa('0' * 64)
s1 = nil.hexdigest()
nil = Nilsimsa('0' * 63 + '1')
s2 = nil.hexdigest()
print s1, s2
print compare_digests(s1, s2)

# for i in range(1, 30):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     s2 = Nilsimsa(cloneGroup[1]).hexdigest()
#     # print s1, s2
#     print compare_digests(s1, s2)
#     if compare_digests(s1, s2) < 0:
#         getCodeFragment.printCloneClass('1.2.txt', i)
def get_ads(base_url):
    c = conn.cursor()
    page = download.get(base_url + "/search/cpg")
    for p in page.select(".row"):
        pid = p['data-pid']
        a_tag = p.find('a', class_='hdrlnk')
        ad_href = a_tag['href']
        ad_title = a_tag.text
        dt = p.find('time')['datetime']
        dt = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M")
        dt = int(dt.strftime("%s"))
        c.execute("SELECT * FROM ad WHERE id = ?", (pid, ))
        row = c.fetchone()
        if row is None:
            url = ad_href
            if not ad_href.startswith('http'):
                url = base_url + ad_href
            time.sleep(0.5)
            ad = download.get(url)
            print url
            ad_text = ad.find(id='postingbody')
            if ad_text is None:
                if ad.find(id='has_been_removed'):
                    continue
                else:
                    # raising a string literal is invalid; raise an exception
                    raise ValueError("malformed body")
            ad_text = ad_text.text.strip()
            ad_text = filter(lambda x: x in string.printable, ad_text)
            # locality-sensitive hash of the ad body for near-duplicate detection
            nilsimsa = Nilsimsa(ad_text)
            lshash = nilsimsa.hexdigest()
            # c.execute("SELECT * FROM ad")
            # row = c.fetchone()
            # while row:
            #     diff = nilsimsa.compare(row[4], True)
            #     if diff < 10:
            #         print diff
            #         print cache.get("text:" + row[0])
            #         print "----"
            #         print ad_text
            #         sys.exit()
            seen = generate_word_counts(ad_text)
            cache.write("text:" + pid, ad_text)
            row = (pid, url, ad_title, dt, lshash)
            c.execute(
                "INSERT INTO ad (id, url, title, posted, lshash) " +
                "VALUES (?,?,?,?,?)", row)
            for word in seen:
                if word not in stopwords:
                    row = (pid, word, seen[word])
                    c.execute(
                        "INSERT INTO tf (id, word, cnt) " +
                        "VALUES (?,?,?)", row)
    conn.commit()
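# Hedged sketch (Python 2, matching the function above) of how the stored
# lshash column could drive near-duplicate detection, in the spirit of the
# commented-out block; find_near_duplicates and the threshold of 118 are
# invented, and it assumes Nilsimsa.compare() returns a similarity score
# where 128 means identical digests.
def find_near_duplicates(nilsimsa, cursor, threshold=118):
    cursor.execute("SELECT id, lshash FROM ad")
    for ad_id, lshash in cursor.fetchall():
        if nilsimsa.compare(lshash, True) >= threshold:
            yield ad_id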
        },
    }
    if data['last-modified']:
        try:
            last_modified = int(
                datetime.datetime(
                    *eut.parsedate(data['last-modified'])[:6]).strftime('%s'))
        except Exception, exc:
            logger.info('failed to parse last-modified=%r',
                        data['last-modified'])
            last_modified = 0
    else:
        last_modified = 0
    doc_id = md5(data['content-location']).hexdigest()
    content_hash = Nilsimsa(data['body']).hexdigest()
    file_id = (doc_id, last_modified, content_hash)
    file_id_str = '%s-%d-%s' % file_id

    kvlclient.setup_namespace(highlights_kvlayer_tables)
    if data['store'] is False:
        kvlclient.delete('files', (file_id[0], ))
        kvlclient.delete('highlights', (file_id[0], ))
        logger.info('cleared all store records related to doc_id=%r',
                    file_id[0])
    else:
        # storing is allowed
        payload_strs = list(kvlclient.get('highlights', file_id))
        if payload_strs and payload_strs[0][1]:
            payload_str = payload_strs[0][1]
            try:
                payload = json.loads(payload_str)
def calc_nilsimsa(self, gold_surface_form, comp_surface_form):
    # compare_digests scores the similarity of the two nilsimsa digests
    nil_0 = Nilsimsa(gold_surface_form)
    nil_1 = Nilsimsa(comp_surface_form)
    nil = compare_digests(nil_0.hexdigest(), nil_1.hexdigest())
    return nil
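# Hedged usage sketch mirroring the call pattern above: compare_digests
# scores two digests on a scale of -128 to 128, where 128 means identical.
# The surface forms are invented examples.
score = compare_digests(Nilsimsa("Barack Obama").hexdigest(),
                        Nilsimsa("Barrack Obama").hexdigest())
print(score)  # close to 128 for near-identical strings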
def compute_hash(self, text):
    from nilsimsa import Nilsimsa

    result = Nilsimsa(data=text)
    return str(result.hexdigest())
def test_unicode():
    """ensures that feeding unicode to Nilsimsa behaves gracefully"""
    # use the 8-digit \U escape so the literal is the intended single
    # character U+1F631 (face screaming in fear), not u'\u1F63' plus '1'
    nil = Nilsimsa(u'\U0001F631')
    assert nil.hexdigest()
def get_nilsimsa(string):
    return Nilsimsa(string).hexdigest()
    []   # ssdeep
]
compare_results = [
    [],  # nilsimsa
    [],  # tlsh
    []   # ssdeep
]

for filename in os.listdir('.'):
    if filename.startswith("prog"):
        with open(filename, "rb") as file:
            file_data = file.read()
            # nilsimsa
            hashes[0].append(Nilsimsa(file_data))
            # tlsh
            hashes[1].append((filename, tlsh.hash(file_data)))
            # ssdeep
            output = subprocess.Popen(["ssdeep.exe", filename],
                                      stdout=subprocess.PIPE).communicate()[0]
            hashes[2].append((filename, output))
            # hashes[2].append((filename, str(output.splitlines()[2]).split("\'", 1)[1].split(",", 1)[0]))

print("")
print("nilsimsa (different 0 - 128 similar)")
for e in hashes[0]:
    print(str(e.hexdigest()))
    compare_results[0].append(hashes[0][0].compare(e.hexdigest(), True))
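# Hedged continuation sketch: py-tlsh provides tlsh.diff(), which scores two
# TLSH hashes with 0 meaning identical and larger values meaning more
# different. This mirrors the nilsimsa loop above and assumes every
# (filename, hash) tuple stored earlier holds a valid TLSH hash.
print("")
print("tlsh (identical 0 - larger more different)")
for name, th in hashes[1]:
    print(th)
    compare_results[1].append(tlsh.diff(hashes[1][0][1], th))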