def resort(companies, artifacts, news):
    """Assign each artifact and each news item to its nearest company.

    Nearness is the Hamming distance between the SimHash of the item's
    text and the SimHash of the company's text.

    :param companies: iterable of company objects (text via getComStr)
    :param artifacts: iterable of artifact objects (text via getArtiStr)
    :param news: iterable of news objects (text via getNewsStr)
    :return: {'artifact': {artifact: company}, 'news': {news: company}}
    """
    # Hash every company ONCE up front. The original recomputed every
    # company's SimHash inside both item loops, i.e. O((A+N)*C) Simhash
    # constructions; this is O(A + N + C).
    company_hashes = [
        (c, simhash.Simhash(simhash.get_features(getComStr(c))))
        for c in companies
    ]

    def nearest_company(text):
        # Return the company whose SimHash is closest to `text`'s SimHash.
        item_hash = simhash.Simhash(simhash.get_features(text))
        minDistance = 99999999
        minCompany = ''
        for company, company_hash in company_hashes:
            distance = item_hash.distance(company_hash)
            if distance < minDistance:
                minDistance = distance
                minCompany = company
        return minCompany

    dic = {'artifact': {}, 'news': {}}
    for a in artifacts:
        dic['artifact'][a] = nearest_company(getArtiStr(a))
    for n in news:
        dic['news'][n] = nearest_company(getNewsStr(n))
    return dic
def get_simhash_dis(str1, str2):
    """Return a combined similarity score between two texts.

    Averages three measures: SimHash similarity (1 - Hamming distance
    normalized over the 64-bit fingerprint), Levenshtein ratio, and
    Jaro similarity.
    """
    hamming = simhash.Simhash(str1).distance(simhash.Simhash(str2))
    dis_simhash = 1 - hamming / 64
    dis_ratio = Levenshtein.ratio(str1, str2)
    dis_jaro = Levenshtein.jaro(str1, str2)
    return (dis_simhash + dis_ratio + dis_jaro) / 3
def sketchesSimhash(sketches):
    """Build a 64-bit SimHash for each sketch.

    :param sketches: mapping of sketch -> feature payload (asProb)
    :return: dict mapping each sketch to its Simhash object
    """
    # .items() instead of the Python-2-only .iteritems(), so this also
    # runs on Python 3; a dict comprehension replaces the manual loop.
    return {sketch: simhash.Simhash(asProb, f=64)
            for sketch, asProb in sketches.items()}
def compute_simhash(self):
    """Compute and store (as hex string) the SimHash of this object's full text.

    The hashed text is the main statement (enunciado) followed by every
    question's statement and answer, joined by newlines.
    """
    parts = [self.enunciado]
    for question in self.cuestiones:
        parts.append(question.enunciado)
        parts.append(question.respuesta)
    self.simhash = "%x" % simhash.Simhash("\n".join(parts)).value
def __init__(
        self, setURLIter, maxFileBytes, setStemmer=UT.Stemmer_killPunct()
):
    """Set up the parser's collaborators, counters and per-page scratch state.

    :param setURLIter: iterable yielding the URLs to process
    :param maxFileBytes: size threshold (bytes) for the output files
    :param setStemmer: stemmer applied to page text.
        NOTE(review): the default is evaluated once at definition time, so
        every instance created without an explicit stemmer shares the same
        Stemmer_killPunct object — confirm that is intended.
    """
    self.urlIter = setURLIter
    self.stemmer = setStemmer
    self.simhash = SH.Simhash()
    # File writing
    self.writer = UT.FileWriter(True)  # True turns logging on
    self.contentWriter = UT.FileWriter()  # For unlogged content
    # Passing None makes uq resume after last uq id
    # Defeat this by saving a 1 to uqgen.txt
    # Call uq.saveState() after parseAll()
    self.uq = UT.UQGen(None)
    self.maxFile = maxFileBytes
    self.bytesWritten = 0  # running total of bytes written to output
    # Logging
    self.badList = []  # presumably pages/URLs that failed — confirm against callers
    # Current data (scratch state for the page being processed)
    self.currTitle = ""
    self.currRaw = None
    self.currContent = None
    self.procContent = None
    self.currAttr = None
def get_simhash_value(contents):
    """Return the SimHash value (as a decimal string) of concatenated contents.

    :param contents: iterable of dicts, each with a "content" text field;
        blank/whitespace-only entries are skipped
    """
    # str.join instead of repeated `main = main + ...` — the original loop
    # was quadratic in the number of content chunks.
    main = "".join(
        c["content"] for c in contents if c["content"].strip() != "")
    a = simhash.Simhash(simhash.get_features(main))
    # logger.info("*****%s", a.value)
    return str(a.value)
def get_distance(self, newsId1, newsId2):
    """Return the SimHash Hamming distance between two stored news articles.

    :param newsId1: Mongo _id of the first article
    :param newsId2: Mongo _id of the second article
    :return: integer Hamming distance between the two article fingerprints
    """

    def article_simhash(contents):
        # One SimHash over all non-blank content chunks of an article.
        main = "".join(
            c["content"] for c in contents if c["content"].strip() != "")
        return simhash.Simhash(simhash.get_features(main))

    news1 = self.collection.find_one({'_id': newsId1})
    news2 = self.collection.find_one({'_id': newsId2})
    # BUG FIX: the original re-hashed str(simhash.value) — i.e. it built a
    # SimHash of the *decimal string* of a SimHash — which destroys the
    # similarity property entirely. Compare the article SimHashes directly.
    return article_simhash(news1['contents']).distance(
        article_simhash(news2['contents']))
def hash_simhash(text):
    """Calculates the SimHash value for the string passed in.

    :param text: the string to fingerprint
    :return: An integer value (the 64-bit SimHash value divided by 100)
    """
    fingerprint = simhash.Simhash(text)
    return int(fingerprint.value / 100)
def process():
    """Record the indices of all near-duplicate texts (SimHash distance < 3).

    Compares every pair in the global `texts`; indices of texts involved in
    at least one near-duplicate pair are appended to the global `finallist`,
    and the global `count` tracks the number of pairs compared.
    """
    global count
    global finallist
    # Hash each text ONCE. The original rebuilt Simhash(texts[i]) and
    # Simhash(texts[j]) inside the O(n^2) pair loop, recomputing every
    # fingerprint n times.
    hashes = [simhash.Simhash(t) for t in texts]
    for i in range(len(hashes)):
        for j in range(i + 1, len(hashes)):
            count += 1
            if hashes[i].distance(hashes[j]) < 3:
                if i not in finallist:
                    finallist.append(i)
                if j not in finallist:
                    finallist.append(j)
    # print(...) form is valid on both Python 2 and 3 for a single argument.
    print(finallist)
def check_same_act(act):
    """Return True if a stored activity looks like a duplicate of `act`.

    Candidates share type, beginDate, endDate and city; two activities are
    considered the same when the Hamming distance between their SimHash
    values is below 6.
    """
    v = long(act["simhashValue"])
    acts1 = list(
        collection_news.find({
            "type": 60002,
            "beginDate": act["beginDate"],
            "endDate": act["endDate"],
            "city": act["city"]
        }))
    for act1 in acts1:
        # BUG FIX: the original guard tested act["simhashValue"] (the input
        # document) instead of the candidate act1, so a candidate whose
        # simhashValue is None slipped through to long(None) below.
        # Also replaced the Python-2-only has_key() with `in`.
        if "simhashValue" not in act1 or act1["simhashValue"] is None:
            continue
        logger.info("same title: %s", act["title"])
        v1 = long(act1["simhashValue"])
        dis = simhash.Simhash(v).distance(simhash.Simhash(v1))
        if dis < 6:
            logger.info("Same act!!! %s, %s, %s, %s, %s", dis, act["title"],
                        act1["title"], act["link"], act1["link"])
            return True
    return False
def _detect_internal(self, fpath):
    """Classify one file as webshell or normal via SimHash near-duplicate lookup.

    :param fpath: path of the file to inspect
    :return: dict mapping the file path to True (webshell match found)
        or False (no match)
    """
    with open(fpath, 'rb') as fp:
        fingerprint = simhash.Simhash(self._load_features_from_file(fp))
        matches = self.detector.get_near_dups(fingerprint)
        is_webshell = len(matches) > 0
        if is_webshell:
            message = '[Webshell] > {0} with matches: {1}'.format(
                fpath, matches)
            self.cmdx_logger.warning(message)
            self.file_logger.warning(message + '\r\n')
        else:
            message = 'NormalPage > {0}.'.format(fpath)
            self.cmdx_logger.info(message)
        return {fpath: is_webshell}
def add_url(self, url, **kwargs):
    """Register a URL in the bloom filter and, if distance-based
    deduplication is enabled, in the SimHash index (unless a near-duplicate
    is already indexed)."""
    raw_path = urlparse(url).path or "/"
    normalized = self._prehandle_path(raw_path)
    final_url = self._concat_url(url, normalized, **kwargs)
    if final_url in self.bfilter:
        return
    self.bfilter.add(final_url)
    if self.distance:
        fingerprint = simhash.Simhash(final_url)
        if not self._simindex.get_near_dups(fingerprint):
            self._simindex.add(uuid.uuid4(), fingerprint)
def url_is_duplicate(self, url, **kwargs):
    """Return True if this URL was seen before (exact bloom-filter hit, or a
    SimHash near-duplicate when distance-based dedup is enabled)."""
    raw_path = urlparse(url).path or "/"
    normalized = self._prehandle_path(raw_path)
    final_url = self._concat_url(url, normalized, **kwargs)
    # Guard clauses instead of the original nested if/else pyramid.
    if final_url in self.bfilter:
        return True
    if not self.distance:
        return False
    fingerprint = simhash.Simhash(final_url)
    near_dups = self._simindex.get_near_dups(fingerprint)
    return bool(near_dups)
def run(self):
    """Worker loop: pull URLs from the frontier until it is exhausted.

    For each URL: download, scrape, store the page in the report (if it is
    non-empty and not a SimHash near-duplicate), enqueue newly discovered
    links, and mark the URL complete.
    """
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            self.frontier.end_thread()
            # Last thread out prints the final crawl report.
            if self.frontier.threadCount == 0:
                self.report.print_report()
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls, tokens = scraper(tbd_url, resp)
        # checks to make sure page is not empty and not a duplicate (using simhash)
        if not tokens == '' and not self.frontier.simhashIndex.get_near_dups(
                simhash.Simhash(tokens)):
            self.report.store_report(tbd_url, tokens)
            self.frontier.add_simhash(tbd_url, tokens)
        # NOTE(review): links are enqueued even for duplicate/empty pages —
        # confirm this matches the intended crawl policy.
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
return text # In[21]: extracted["norm_text"] = extracted.text.apply(normalize_text) # In[22]: extracted.head() # In[25]: extracted.iloc[48].norm_text # In[29]: def get_features(s): width = 3 s = s.lower() s = re.sub(r'[^\w]+', '', s) return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))] # In[30]: extracted.content_hash = extracted.norm_text.apply( lambda X: simhash.Simhash(get_features(X)).value) extracted.head()
# Benchmark fixture: build NUM_SAMPLES random word-samples from TEXT and
# index their SimHashes, for comparing indexed lookup vs. a linear scan.
_WRE = re.compile(r'\b\S+\b')
words = _WRE.findall(TEXT)

MIN_SAMPLE_WORDS = 10
MAX_SAMPLE_WORDS = 25
NUM_SAMPLES = 1000
TOLERANCE = 6  # <--- This strongly affects performance!
SEARCHES = 100

samples = [
    random.sample(words, random.randint(MIN_SAMPLE_WORDS, MAX_SAMPLE_WORDS))
    for _ in range(NUM_SAMPLES)
]
simhashes = [simhash.Simhash(s) for s in samples]
shi = simhash.SimhashIndex(
    [(' '.join(s), sh) for s, sh in zip(samples, simhashes)], k=TOLERANCE)
shi.log.setLevel(logging.ERROR)


def test_dummy():
    """Brute-force baseline: linear scan for samples within TOLERANCE
    of one randomly chosen sample's SimHash."""
    result = []
    rsh = random.choice(simhashes)
    for j in range(len(samples)):
        if rsh.distance(simhashes[j]) <= TOLERANCE:
            result.append(samples[j])
    return result
def ham_dist(text1, text2):
    """Return the Hamming distance between the 8-bit SimHashes of two texts."""
    first = simhash.Simhash(text1, f=8)
    second = simhash.Simhash(text2, f=8)
    return first.distance(second)
def sim(text):
    """Return the SimHash of a string, built from its character n-gram features.

    The parameter was renamed from `str`, which shadowed the builtin type
    (callers use it positionally, so the rename is safe).
    """
    return simhash.Simhash(simhash.get_features(text))
def createRegisterChain(self, p, ea):
    """Extract per-register mnemonic chains from the function at `ea` and
    fingerprint each chain with SimHash (and MinHash for longer chains).

    For every basic block of the function, builds a map of
    register-operand -> list of instruction mnemonics that touch it, then
    stores a fingerprint document per register into MongoDB (BinAuthor /
    Choice18). Python 2 / legacy IDA API (startEA, GetOpnd, ...).

    :param p: truthy flag — per-block chains are only built when set
    :param ea: any address inside the function to analyze
    """
    f = idaapi.FlowChart(idaapi.get_func(ea))
    functionName = idaapi.get_func_name(ea)
    client = MongoClient('localhost', 27017)
    db = client.BinAuthor
    collection = db.Choice18
    # Ensure a per-function bucket in the instance-level chain map.
    if idaapi.get_func_name(ea) not in self.functionRegisterChains.keys():
        self.functionRegisterChains[idaapi.get_func_name(ea)] = {}
    for block in f:
        if p:
            registerChain = {}
            for address in Heads(block.startEA, block.endEA):
                # Operand 0 is a register (GetOpType == 1): append this
                # instruction's mnemonic to both the function-wide and the
                # per-block chain for that register.
                if idc.GetOpType(address, 0) == 1 and idc.GetOpnd(address, 0) != "":
                    if idc.GetOpnd(address, 0) not in self.functionRegisterChains[
                            idaapi.get_func_name(ea)].keys():
                        self.functionRegisterChains[idaapi.get_func_name(
                            ea)][idc.GetOpnd(address, 0)] = [idc.GetMnem(address)]
                    else:
                        self.functionRegisterChains[idaapi.get_func_name(
                            ea)][idc.GetOpnd(address, 0)].append(
                                idc.GetMnem(address))
                    if idc.GetOpnd(address, 0) not in registerChain.keys():
                        registerChain[idc.GetOpnd(
                            address, 0)] = [idc.GetMnem(address)]
                    else:
                        registerChain[idc.GetOpnd(address, 0)].append(
                            idc.GetMnem(address))
                # Same bookkeeping when operand 1 is a register.
                if idc.GetOpType(address, 1) == 1 and idc.GetOpnd(address, 1) != "":
                    if idc.GetOpnd(address, 1) not in self.functionRegisterChains[
                            idaapi.get_func_name(ea)].keys():
                        self.functionRegisterChains[idaapi.get_func_name(
                            ea)][idc.GetOpnd(address, 1)] = [idc.GetMnem(address)]
                    else:
                        self.functionRegisterChains[idaapi.get_func_name(
                            ea)][idc.GetOpnd(address, 1)].append(
                                idc.GetMnem(address))
                    if idc.GetOpnd(address, 1) not in registerChain.keys():
                        registerChain[idc.GetOpnd(
                            address, 1)] = [idc.GetMnem(address)]
                    else:
                        registerChain[idc.GetOpnd(address, 1)].append(
                            idc.GetMnem(address))
            # Fingerprint each register's chain for this block.
            for register in registerChain.keys():
                # fingerPrint = "<register> <mnemonic> <mnemonic> ..."
                fingerPrint = str(register)
                functionMinhashes = {}
                functionMinhashes["FunctionName"] = functionName
                functionMinhashes["FileName"] = self.fileName
                functionMinhashes["FileMD5"] = self.fileMD5
                functionMinhashes["Author Name"] = self.authorName
                functionMinhashes["BlockStartEA"] = block.startEA
                functionMinhashes["register"] = register
                functionMinhashes["registerChain"] = registerChain[register]
                counter = 0  # number of mnemonics folded into the fingerprint
                for instruction in registerChain[register]:
                    fingerPrint += " " + str(instruction)
                    counter += 1
                functionMinhashes["SimHashSignature"] = str(
                    simhash.Simhash(fingerPrint).value)
                self.simhashList.append(
                    [counter, simhash.Simhash(fingerPrint).value])
                # Chains of >= 6 tokens also get a MinHash signature and are
                # persisted to MongoDB.
                if len(fingerPrint.split(" ")) >= 6:
                    self.registerChainMinhash.append([
                        fingerPrint,
                        minhash.minHash(
                            minhash.createShingles(fingerPrint))
                    ])
                    functionMinhashes[
                        "MinHashSignature"] = minhash.minHash(
                            minhash.createShingles(fingerPrint))
                    collection.insert(functionMinhashes)
                else:
                    # NOTE(review): short chains record only the fingerprint
                    # (single-element entry) — confirm this one-element list
                    # is intended and not a truncated source.
                    self.registerChainMinhash.append([
                        fingerPrint,
                    ])
def main():
    """Build a SimHash of TEXT with a custom tokenizer and hash function.

    Decodes TEXT from UTF-8 (Python 2 `unicode`), splits features with the
    RE_WORD pattern, and uses 64-bit murmur3 (mmh3.hash64) instead of
    Simhash's default hash. The resulting object is discarded — this is
    presumably a benchmark/demo entry point; confirm against callers.
    """
    simhash.Simhash(
        unicode(TEXT, 'utf-8'),
        reg=RE_WORD,
        hashfunc=lambda x: mmh3.hash64(x)[0])
def hash_ad_creative_text(text):
    """Return the integer SimHash value of an ad creative's text features."""
    features = _get_features(text)
    return simhash.Simhash(features).value
seed = 5381 for i in s: seed = ((seed << 5) + seed) + ord(i) return ctypes.c_long(seed).value def convert_n_bytes(n, b): bits = b * 8 return (n + 2**(bits - 1)) % 2**bits - 2**(bits - 1) def convert_4_bytes(n): return convert_n_bytes(n, 4) def get_hashcode(s): h = 0 n = len(s) for i, c in enumerate(s): h = h + ord(c) * 31**(n - 1 - i) return convert_4_bytes(h) print(sh.Simhash(s1).distance(sh.Simhash(s2))) print(sh.Simhash(s1).value) print(sh.Simhash(s2).value) print(djb_hash(s1)) print(djb_hash(s2)) print(get_hashcode(s1)) print(get_hashcode(s2))
def _load_sample_with_features(self, fpath):
    """Fingerprint one sample file's features and record it in the blacklist."""
    with open(fpath, 'rb') as fp:
        features = self._load_features_from_file(fp)
        fingerprint = simhash.Simhash(features)
        self.black_list.add((fpath, fingerprint))
def add_simhash(self, url, page):
    """Thread-safely index the SimHash of a page's text under its URL."""
    with self.lock:
        self.simhashIndex.add(url, simhash.Simhash(page))
return False if __name__ == "__main__": acts = list( collection_news.find({ "type": 60002, "title": "微链投递“直通车”,助力高效融资!(9月份超值福利)" })) aa = 0 for act in acts: if act.has_key("simhashValue") is False or act["simhashValue"] is None: contents = get_contents(act["_id"]) #logger.info(contents) if contents is not None: a = simhash.Simhash(simhash.get_features(contents)) logger.info("*****%s, value: %s", act["title"], a.value) v = a.value collection_news.update_one( {"_id": act["_id"]}, {"$set": { "simhashValue": str(a.value) }}) else: logger.info("No content for title: %s", act["title"]) continue else: #continue v = long(act["simhashValue"]) acts1 = list( collection_news.find({