def simhash_similarity(text1, text2):
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # Approximate hash length taken from the binary string (includes the '0b' prefix)
    max_hashbit = max(len(bin(aa_simhash.value)), len(bin(bb_simhash.value)))
    distance = aa_simhash.distance(bb_simhash)
    similar = 1 - distance / max_hashbit
    return similar
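# Hedged usage sketch (example strings are assumptions, not from the original
# source): the score is 1.0 when the two hashes are identical and decreases as
# the Hamming distance grows.
print(simhash_similarity('How are you? I am fine. Thanks.',
                         'How are you? I am fine. Thank you.'))
print(simhash_similarity('How are you? I am fine. Thanks.',
                         'Something completely unrelated.'))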
def get_simHash(file_path):
    """
    Purpose: tokenize each segment of the file and compute its Simhash.
    Returns: the Simhash index object, the raw data, and any error.
    """
    simhash_list = []
    data = None
    file_type = os.path.splitext(file_path)[1]  # get the file extension
    if file_type in text_type:
        data, data_dic, err = rf.Filer().Test_Split(file_path)  # split plain-text documents
        for i, v in data_dic.items():
            words = jieba.cut(v)
            smValue = Simhash(words)  # compute the simhash value
            simhash_list.append((i, smValue))
    elif file_type in doc_type:
        data, data_dic, err = rf.Filer().Docx_Split(file_path)  # split Word documents
        for i, v in data_dic.items():
            words = jieba.cut(v)
            smValue = Simhash(words)
            simhash_list.append((i, smValue))
    elif file_type in code_type:
        data, data_dic, err = rf.Filer().Code_Split(file_path)  # split source-code files
        for i, v in data_dic.items():
            words = re.split(r"[|,|.|;|\?|!|,|。|;|!|>|(|)|:|%|\s]\s*", v.strip())
            smValue = Simhash(words)
            simhash_list.append((i, smValue))
    else:
        err = "unsupported file type"
    return get_simHashindex(simhash_list), data, err
def test_equality_comparison(self):
    a = Simhash('My name is John')
    b = Simhash('My name is John')
    c = Simhash('My name actually is Jane')
    self.assertEqual(a, b, 'A should equal B')
    self.assertNotEqual(a, c, 'A should not equal C')
def gensim_simhash(content, test_news):
    # Load the accumulated stopwords
    stopwords = load_stopwords()
    # Tokenize each corpus document and remove stopwords
    x = [[word for word in line.split() if word not in stopwords] for line in content]
    # Tokenize the query document and remove stopwords
    test_news = [word for word in test_news.split() if word not in stopwords]
    # Compute the simhash of the query document
    test_news_hash = Simhash(test_news)
    sim = []
    # Walk the corpus and compute the Hamming distance to each document
    for news in x:
        news_hash = Simhash(news)
        score = test_news_hash.distance(news_hash)
        sim.append(score)
    # Print the six closest documents (smallest distance first)
    for index, score in sorted(enumerate(sim), key=lambda item: item[1])[:6]:
        print("index:%d similarities:%f content:%s" % (index, score, content[index]))
def itemSimilarity(data_json):
    item_item_count = dict()
    item_count = dict()
    # Count, for every pair of items, the number of users they have in common
    for train_list in data_json:
        for item_count1 in train_list:
            if item_count1[0] not in item_count.keys():
                item_count[item_count1[0]] = 0.0
            item_count[item_count1[0]] += item_count1[1]
            for item_count2 in train_list:
                if item_count1 == item_count2:
                    continue
                if item_count1[0] not in item_item_count.keys():
                    item_item_count[item_count1[0]] = dict()
                if item_count2[0] not in item_item_count[item_count1[0]]:
                    item_item_count[item_count1[0]][item_count2[0]] = 0.0
                item_item_count[item_count1[0]][item_count2[0]] += item_count1[1] + item_count2[1]
    UserSimi2arr = dict()
    for i, related_items in item_item_count.items():
        for j, cij in related_items.items():
            if i not in UserSimi2arr:
                UserSimi2arr[i] = dict()
            UserSimi2arr[i][j] = 1000 * cij / (
                math.sqrt(item_count[i] * item_count[j]) *
                (Simhash(i).distance(Simhash(j))))
    return UserSimi2arr
def main():
    if request.method == 'POST':
        document = request.json['json']
        # check if redis is empty
        if len(r.keys()) == 0:
            r.set(str(Simhash(document).value), document)
            return {
                "message": "The redis cache was empty. Just populated it with your document"
            }
        # if redis is not empty
        else:
            # look up near-duplicate documents already stored in the simhash index
            s1 = Simhash(document)
            i = index.get_near_dups(s1)
            index.add(str(s1.value), s1)
            simdocs = []
            for num in i:
                simdocs.append(r.get(num))
            r.set(str(s1.value), document)
            return {"docs": simdocs}
    return {"status": "meme"}
def print_event(cpu, data, size):
    t = time.time()
    event = b["events"].event(data)
    data = b["events_data_long"][c_int(0)]
    args = [data.execve_arg0]
    argc = data.execve_argc
    if argc >= 2:
        args.append(data.execve_arg1)
    if argc >= 3:
        args.append(data.execve_arg2)
    if argc >= 4:
        args.append(data.execve_arg3)
    ppid = int(event.ppid)
    exe = get_exe_best_effort(event.pid)
    pexe = get_exe_best_effort(ppid)
    values = [
        str(event.ns),
        Simhash(exe).value,
        Simhash(pexe).value,
        Simhash(event.comm).value,
        event.syscall,
        event.cred,
        event.pid,
        event.ppid,
        event.ip,
        event.fp,
        event.sp,
        Simhash(args).value,
        str(b64encode("annotation:%s_%s_%s_%s_%s" % (str(t), exe, pexe, event.comm, args)))
    ]
    writer.writerow(values)
def test2():
    # po = Pool(10)
    for dirpath, dirnames, filenames in os.walk(driver_path):
        for filename in filenames:
            index = filenames.index(filename)
            print('index', index)
            file_path1 = dirpath + '/' + filename
            cont = news_process(file_path1)
            simhash1 = Simhash(cont)
            print(file_path1)
            key1 = num10_to2_sys(simhash1.value)
            print(key1)
            for i in filenames[:index]:
                file_path2 = dirpath + '/' + i
                cont2 = news_process(file_path2)
                simhash2 = Simhash(cont2)
                # key2 = num10_to2_sys(simhash2.value)
                # a = hammingDis(key1, key2)
                # print('Hamming distance', a)
                # print(file_path1)
                # print(key1)
                # print(file_path2)
                # print(key2)
                key2 = simhash1.distance(simhash2)
                print('Hamming distance', key2)
                print(file_path1)
                print(simhash1.value)
                print(file_path2)
                print(simhash2.value)
def calculate_simhash(features_dict, simhash_size, hashfunc=None):
    """Calculate simhash for features in a dict.

    `features_dict` contains data like {'text': weight}
    """
    if hashfunc:
        return Simhash(features_dict, simhash_size, hashfunc=hashfunc).value
    return Simhash(features_dict, simhash_size).value
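# Minimal usage sketch (the token weights below are assumptions for
# illustration): weighted feature dicts can be compared by XOR-ing the two
# returned values and counting the differing bits.
doc_a = {'simhash': 3, 'near': 2, 'duplicate': 1}
doc_b = {'simhash': 3, 'near': 1, 'duplicate': 1}
hash_a = calculate_simhash(doc_a, 64)
hash_b = calculate_simhash(doc_b, 64)
print(bin(hash_a ^ hash_b).count('1'))  # Hamming distance between the two values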
def view_simhash_value():
    print("%x" % Simhash(get_features("How are you? I am fine. Thanks.")).value)
    print("%x" % Simhash(get_features("How are u? I am fine. Thanks.")).value)
    print("%x" % Simhash(get_features("How r you?I am fine. Thanks.")).value)
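# `get_features` is not defined in this snippet. A common implementation is a
# character-shingling tokenizer along the lines of the simhash project's README;
# treat the sketch below as an assumption, not the original helper.
import re

def get_features(s, width=3):
    # Lowercase, strip non-word characters, then emit overlapping character n-grams.
    s = re.sub(r'[^\w]+', '', s.lower())
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]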
def get_xls_data():
    # Load the data
    data = pd.read_excel("./data/排重案例样本0706.xlsx",
                         names=["content1", "content2"],
                         sheetname=[0])
    content_ls_1 = [(x, y) for x, y in enumerate(data[0]["content1"]) if y]
    content_ls_2 = [(x, y) for x, y in enumerate(data[0]["content2"]) if y]
    content_ls = []
    for x in content_ls_1:
        for y in content_ls_2:
            if x[0] == y[0]:
                content_ls.append((x[1], y[1]))
    # Compare each pair of texts
    print("corpus size: " + str(len(content_ls)))
    similarity_length = 0
    for x in content_ls:
        distance = Simhash(x[0]).distance(Simhash(x[1]))
        if distance <= 25:
            print(distance)
            print("similar texts: " + x[0] + " ||||| " + x[1])
            print("==================")
            similarity_length = similarity_length + 1
    print("number of similar pairs: " + str(similarity_length))
    print("similarity detection rate: %s%%" % (similarity_length / len(content_ls) * 100))
def itemSimilarity(train_dict):
    '''
    Compute item-to-item similarity.
    @param train_dict  training data set as a dict
    @return UserSimi2arr  a two-level dict recording the similarity scores
    '''
    item_item_count = dict()
    item_count = dict()
    # Count, for every pair of items, the number of users they have in common
    for user, items_weight in train_dict.items():
        for id1, count1 in items_weight:
            if id1 not in item_count:
                item_count[id1] = count1
            item_count[id1] += count1
            for id2, count2 in items_weight:
                if id1 == id2:
                    continue
                if id1 not in item_item_count:
                    item_item_count[id1] = dict()
                if id2 not in item_item_count[id1]:
                    item_item_count[id1][id2] = count1 + count2
                item_item_count[id1][id2] += count1 + count2
    UserSimi2arr = dict()
    for i, related_items in item_item_count.items():
        for j, cij in related_items.items():
            if i not in UserSimi2arr:
                UserSimi2arr[i] = dict()
            UserSimi2arr[i][j] = 1000 * cij / (
                math.sqrt(item_count[i] * item_count[j]) *
                (Simhash(i).distance(Simhash(j)) ** 2))
    return UserSimi2arr
def get_simlar_text(self, text1, text2):
    '''
    1. Text similarity comparison.
    2. Uses simhash for the analysis.
    :param text1:
    :param text2:
    :return:
    '''
    new_simhash = SimHash()
    hash_first = new_simhash.getHash(text1)   # compute the hash value
    hash_second = new_simhash.getHash(text2)
    text_first_hash = Simhash(hash_first)
    text_second_hash = Simhash(hash_second)
    distance = text_first_hash.distance(text_second_hash)
    max_hashbit = max(len(bin(text_first_hash.value)), len(bin(text_second_hash.value)))
    if max_hashbit == 0:
        return 0
    else:
        similar = 1 - distance / max_hashbit
        return similar
def sim_hash(file1, file2):
    str1 = open(file1, 'r').read()
    str1 = re.sub(r'#.*\n', "", str1)
    str1 = re.sub(r'//.*\n', "", str1)
    str1 = re.sub(r'/\*.*\*/', "", str1)
    str1 = re.sub(r'var\d+', 'var', str1)
    str1 = re.sub(r'func\d+', 'func', str1)
    str2 = open(file2, 'r').read()
    str2 = re.sub(r'#.*\n', "", str2)
    str2 = re.sub(r'//.*\n', "", str2)
    str2 = re.sub(r'/\*.*\*/', "", str2)
    str2 = re.sub(r'var\d+', 'var', str2)
    str2 = re.sub(r'func\d+', 'func', str2)
    l1 = re.split(r'[\s]', str1)
    l2 = re.split(r'[\s]', str2)
    dist1 = find_lcsubstr(l1, l2)
    sim1 = dist1 / len(l1)
    dist2 = len(find_lcseque(str1.split(), str2.split()))
    sim2 = dist2 / len(str1.split())
    dist3 = Simhash(str1.split('\n')).distance(Simhash(str2.split('\n')))
    sim3 = 1 - dist3 / 64
    # if sim3 >= 0.92:
    #     sim3 = sim3 * 3
    sim = (sim1 * 5 + sim2 + sim3 * 3) / 9
    print(sim1, sim3, sim2)
    return sim
def tokenize(self, database, url: str, resp) -> bool:
    # beautifulsoup will parse all the html
    soup = BeautifulSoup(resp.raw_response.content, features="lxml")
    # get_text returns all the text in this page
    all_text = soup.get_text()
    if all_text != "":
        if Simhash(all_text).value in self.simHash.keys():
            database.robotTXT += 1
            return False
        self.simHash[Simhash(all_text).value] = 1
        database.addUniqueUrl(url)
        words = nltk.tokenize.word_tokenize(all_text)
        # count the total words in this page
        wordcount = 0
        for word in words:
            wordcount += 1
            if word.isalnum() and word not in Tokenizer._StopWord and not word.isdigit():
                database.updateCommonword(word)
        database.updateLongestpage(url, wordcount)
        # update the number of subdomains in the database
        if "ics.uci.edu" in url and urlparse(url)[1] != "ics.uci.edu":
            database.updateSubDomain(urlparse(url)[0] + "://" + urlparse(url)[1])
    return True
def K_Simhash(ksimhash, vs):
    sim_list = []
    for ks in ksimhash[1:]:
        sim_list.append(min([Simhash(v).distance(Simhash(ks)) for v in vs]))
    return median(sim_list)
def extract_next_links(url, resp):
    if resp.status != 200 or not resp.raw_response:
        return []
    res = set()
    this_path = urlparse(url).path
    this_scheme = urlparse(url).scheme
    html = resp.raw_response.content
    for link in BeautifulSoup(html, parse_only=SoupStrainer('a'), features='html.parser'):
        if link.has_attr('href'):
            # clean up links
            site = ""
            if link['href'] == '/':  # same site
                continue
            if link['href'].startswith("//"):  # same domain
                site = link['href'][2:]
            elif link['href'].startswith('/') and urlparse(url).netloc not in link['href']:
                site = url + link['href']
            else:
                site = link['href']
            if not urlparse(site).scheme:
                site = this_scheme + "://" + site
            defrag_site = urldefrag(site.strip())[0]
            new_path = urlparse(defrag_site).path
            if Simhash(this_path).distance(Simhash(new_path)) > 16:
                res.add(defrag_site)
    return list(res)
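# Small illustration (the URLs are assumptions, not from the original crawler)
# of the path filter above: links whose URL-path simhash is within 16 bits of
# the current page's path are treated as near-duplicates and skipped.
from simhash import Simhash
from urllib.parse import urlparse

current = urlparse("https://example.com/news/2023/article-1").path
candidate = urlparse("https://example.com/news/2023/article-2").path
print(Simhash(current).distance(Simhash(candidate)))  # small distance -> likely filtered out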
def cloest_topic(base_topic, base_url, query):
    # get corpus on nist
    if base_url not in corpus_dict.keys():
        corpus_dict[base_url] = requests.get(base_url).content
    base_corpus = BeautifulSoup(corpus_dict[base_url], 'html.parser').get_text()
    # key: value = link: similarity
    sim_res = {}
    if query not in gquery_dict.keys():
        gquery_dict[query] = google(query, my_api_key, my_cse_id)
    results = gquery_dict[query]
    if len(results) == 0:
        return '', 100000000
    try:
        for cmp_link in results:
            if cmp_link not in corpus_dict.keys():
                corpus_dict[cmp_link] = requests.get(cmp_link).content
            cmp_corpus = BeautifulSoup(corpus_dict[cmp_link], 'html.parser').get_text()
            res = Simhash(base_corpus).distance(Simhash(cmp_corpus))
            sim_res[cmp_link] = res
    except:
        sim_res = {}
    if len(sim_res) == 0:
        return '', 100000000
    cloest_link = min(sim_res.items(), key=lambda x: x[1])[0]
    cloest_dist = sim_res[cloest_link]
    return cloest_link, cloest_dist
def similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: the similarity of the two texts
    """
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # print the simhash values in binary
    # print(bin(aa_simhash.value))
    # print(bin(bb_simhash.value))
    # Hamming distance
    # distance = aa_simhash.distance(bb_simhash)
    # print(distance)
    a = float(aa_simhash.value)
    b = float(bb_simhash.value)
    if a > b:
        similar = b / a
    else:
        similar = a / b
    return similar
def simhash_remove_similar(news_list):
    result_list = []
    # Compare the simhash values of every pair of news items
    len_news_list = len(news_list)
    for i in range(len_news_list):
        news_i_id = news_list[i]['news_id']
        news_i_news_content = accord_news_id_get_content_list(news_i_id)['news_content']
        sim_hash1 = Simhash(news_i_news_content)
        for j in range(i + 1, len_news_list):
            # Skip items that have already been marked for deletion
            if 'del' in news_list[j]:
                continue
            news_j_id = news_list[j]['news_id']
            news_j_news_content = accord_news_id_get_content_list(news_j_id)['news_content']
            sim_hash2 = Simhash(news_j_news_content)
            # If the Hamming distance is within the threshold, keep only the earlier item
            if sim_hash1.distance(sim_hash2) <= SIMHASH_DISTINCT:
                # mark this item as a duplicate
                news_list[j]['del'] = 'yes'
    for news in news_list:
        if 'del' not in news:
            result_list.append(news)
    return result_list
def search_duplicate():
    data = load_pk(datafile)
    index = load_pk(indexfile)
    begin, middle, end = data[1000], data[500000], data[-1000]
    print("1,000th is %r, 500,000th is %r, -1,000th is %r." % (begin, middle, end))

    st = time.perf_counter()
    ind = data.index(begin)
    print("list.index(item) method for begin takes %.6f sec" % (time.perf_counter() - st,))
    st = time.perf_counter()
    res = index.get_near_dups(Simhash(begin))
    print("simhash index method for begin takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    ind = data.index(middle)
    print("list.index(item) method for middle takes %.6f sec" % (time.perf_counter() - st,))
    st = time.perf_counter()
    res = index.get_near_dups(Simhash(middle))
    print("simhash index method for middle takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    ind = data.index(end)
    print("list.index(item) method for end takes %.6f sec" % (time.perf_counter() - st,))
    st = time.perf_counter()
    res = index.get_near_dups(Simhash(end))
    print("simhash index method for end takes %.6f sec" % (time.perf_counter() - st,))
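# Minimal sketch (assumed; the pickled index above is not shown being built) of
# constructing a SimhashIndex and querying it: k is the maximum Hamming
# distance treated as a near-duplicate.
from simhash import Simhash, SimhashIndex

docs = {'1': 'How are you? I am fine.',
        '2': 'How are you, i am fine.',
        '3': 'Totally different text.'}
objs = [(doc_id, Simhash(text)) for doc_id, text in docs.items()]
near_dup_index = SimhashIndex(objs, k=3)
print(near_dup_index.get_near_dups(Simhash('How are you? I am fine?')))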
def test_sparse_features(self):
    data = [
        'How are you? I Am fine. blar blar blar blar blar Thanks.',
        'How are you i am fine. blar blar blar blar blar than',
        'This is simhash test.',
        'How are you i am fine. blar blar blar blar blar thank1'
    ]
    vec = TfidfVectorizer()
    D = vec.fit_transform(data)
    voc = dict((i, w) for w, i in vec.vocabulary_.items())

    # Verify that the distance between data[0] and data[1] is smaller than the
    # distance between data[2] and data[3]
    shs = []
    for i in range(D.shape[0]):
        Di = D.getrow(i)
        # features as a list of (token, weight) tuples
        features = zip([voc[j] for j in Di.indices], Di.data)
        shs.append(Simhash(features))
    self.assertNotEqual(0, shs[0].distance(shs[1]))
    self.assertNotEqual(0, shs[2].distance(shs[3]))
    self.assertLess(shs[0].distance(shs[1]), shs[2].distance(shs[3]))

    # features as a token -> weight dict
    D0 = D.getrow(0)
    dict_features = dict(zip([voc[j] for j in D0.indices], D0.data))
    self.assertEqual(17583409636488780916, Simhash(dict_features).value)

    # the sparse and non-sparse features should obviously yield different results
    self.assertNotEqual(Simhash(dict_features).value, Simhash(data[0]).value)
def get_sim_simhash(self, text1, text2, f_num=64):
    a_simhash = Simhash(text1, f=f_num)
    b_simhash = Simhash(text2, f=f_num)
    max_hashbit = max(len(bin(a_simhash.value)), len(bin(b_simhash.value)))
    distance = a_simhash.distance(b_simhash)
    sim = 1 - distance / max_hashbit
    return sim
def test_large_inputs(self):
    """
    Test code paths for dealing with feature lists larger than batch_size,
    and weights larger than large_weight_cutoff.
    """
    many_features = [str(i) for i in range(int(Simhash.batch_size * 2.5))]
    many_features_large_weights = [(f, Simhash.large_weight_cutoff * i)
                                   for i, f in enumerate(many_features)]
    self.assertEqual(7984652473404407437, Simhash(many_features).value)
    self.assertEqual(3372825719632739723, Simhash(many_features_large_weights).value)
def calcTitleHashFeats(title1, title2, featVector):
    if title1 is None or title2 is None or title1 == '' or title2 == '':
        featVector.append(1)
        return
    title1 = '%x' % Simhash(get_features(normalize(title1))).value
    title2 = '%x' % Simhash(get_features(normalize(title2))).value
    t2 = distance.nlevenshtein(title1, title2)
    featVector.append(t2)
def calcAbstractHashFeats(abstract1, abstract2, featVector):
    if abstract1 is None or abstract2 is None or abstract1 == '' or abstract2 == '':
        featVector.append(1)
        return
    abstract1 = '%x' % Simhash(get_features(abstract1)).value
    abstract2 = '%x' % Simhash(get_features(abstract2)).value
    t2 = distance.nlevenshtein(abstract1, abstract2)
    featVector.append(t2)
def simhash_method(doc1, doc2, threshold):
    distance = Simhash(doc1).distance(Simhash(doc2))
    # print(distance)
    # check whether the documents are as similar as they should be
    if distance < threshold:
        result = 1
    else:
        result = 0
    return result
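# Hedged usage sketch (the threshold and the example strings are assumptions;
# the original caller's values are not shown): the function returns 1 when the
# documents are flagged as near-duplicates and 0 otherwise.
print(simhash_method('the quick brown fox jumps over the lazy dog',
                     'the quick brown fox jumped over the lazy dog', threshold=10))
print(simhash_method('the quick brown fox jumps over the lazy dog',
                     'an entirely unrelated piece of text about simhash', threshold=10))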
def test_segtree(self):
    sh = Simhash('How are you? I am fine. Thanks. And you?')
    self.assertEqual(sh.value, 6460565663990245323)
    sh2 = Simhash('How old are you ? :-) I am fine. Thanks. And you?')
    self.assertEqual(sh.distance(sh2), 8)
    sh3 = Simhash(sh2)
    self.assertEqual(sh2.distance(sh3), 0)
def test_distance(self):
    sh = Simhash('How are you? I AM fine. Thanks. And you?')
    sh2 = Simhash('How old are you ? :-) i am fine. Thanks. And you?')
    self.assertTrue(sh.distance(sh2) > 0)
    sh3 = Simhash(sh2)
    self.assertEqual(sh2.distance(sh3), 0)
    self.assertNotEqual(Simhash('1').distance(Simhash('2')), 0)
def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }
    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            sims = index.get_near_dups(Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' +
                               entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()
def isPageTooSimilar(pageTextString, pageHashes):
    pageHash = Simhash(pageTextString)
    minDist = 100000000
    skipPage = False
    for hashedPage in pageHashes:
        if pageHash.distance(hashedPage) < 3:
            skipPage = True
            break
    else:
        # no existing page was close enough, so remember this page's hash
        pageHashes.add(pageHash)
    return skipPage
def hanming_distance(s1, s2):
    if type(s1) == str and type(s2) == str:
        hanmingdistance = bin(
            int(hex(Simhash(get_features(s1)).value), 16) ^
            int(hex(Simhash(get_features(s2)).value), 16)).count('1')
    elif type(s1) == int and type(s2) == int:
        hanmingdistance = bin(int(hex(s1), 16) ^ int(hex(s2), 16)).count('1')
    else:
        print('s1 and s2 must be of the same data type!')
    # return hanming_distance
    return 1 - hanmingdistance / 64
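# Hedged usage sketch (assumed; `get_features` is whatever tokenizer the
# surrounding module defines): the helper accepts either two raw strings or two
# precomputed 64-bit simhash values and returns a similarity in [0, 1].
print(hanming_distance('How are you? I am fine.', 'How are you, I am fine!'))
h1 = Simhash(get_features('How are you? I am fine.')).value
h2 = Simhash(get_features('Completely different content here.')).value
print(hanming_distance(h1, h2))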
def shrinkdup(self):
    pkv = Simhash('')
    shdist = 10
    for content in self.sortedlog:
        if not content or content.strip() == '':
            continue
        try:
            ret = self._extract_content(content)
            hv = Simhash(ret)
            # keep the entry only if it is far enough from the last kept entry
            if pkv.distance(hv) > shdist:
                self.shrinklogs.append(content)
                pkv = hv
        except Exception as e:
            print(content)
            print(e)
    if len(self.shrinklogs) > 0:
        filen = self.filen + '-shrunk'
        with open(filen, 'w') as f:
            for v in self.shrinklogs:
                f.write('%s' % (v))
def simHashLabel(user1filepath, user2filepath, user1Floder, user2Floder, num_floder):
    ans = 0.0
    for i in range(num_floder):
        labeluser1 = ''
        labeluser2 = ''
        tempmax1 = 0
        tempmax2 = 0
        f1 = open(user1filepath + user1Floder[i] + os.sep + 'RCed_stoppoint.txt')
        for line in f1:
            labeluser1 += line.split(',')[4]
            labeluser1 += ','
            tempmax1 += 1
        f2 = open(user2filepath + user2Floder[i] + os.sep + 'RCed_stoppoint.txt')
        for line in f2:
            labeluser2 += line.split(',')[4]
            labeluser2 += ','
            tempmax2 += 1
        sh1 = Simhash(u'%s' % labeluser1)
        sh2 = Simhash(u'%s' % labeluser2)
        maxlen = tempmax1 if tempmax1 >= tempmax2 else tempmax2
        ans += sh1.distance(sh2) / maxlen
    return ans
def test_chinese(self):
    self.maxDiff = None
    sh1 = Simhash(u'你好 世界! 呼噜。')
    sh2 = Simhash(u'你好,世界 呼噜')
    sh4 = Simhash(u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.')
    sh5 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than')
    sh6 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
    self.assertEqual(sh1.distance(sh2), 0)
    self.assertTrue(sh4.distance(sh6) < 3)
    self.assertTrue(sh5.distance(sh6) < 3)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Joshua
# @E-Mail: [email protected]
# @Date: 2015-02-11 12:32:00
# @About demo.py

import re

from simhash import Simhash
from simhash import SimhashIndex

from data import news_lists_1, news_lists_2

# For Chinese text, the simhash result is the same whether or not the text is segmented first.
for i, news in enumerate(news_lists_1):
    x = Simhash(news_lists_1[i]['content'], f=64)
    # y = Simhash('hello')
    # x = Simhash('Hi')
    y = Simhash(news_lists_2[i]['content'], f=64)
    print('1.simhash:', x.value)
    print('2.simhash:', y.value)
    print('distance:', x.distance(y))
    print('similarity:', (64 - x.distance(y)) / 64)
    print(news_lists_1[i]['title'])
def test_chinese(self):
    sh1 = Simhash(u'你好 世界! 呼噜。')
    sh2 = Simhash(u'你好,世界 呼噜')
    # self.assertEqual(sh1._features, [])
    self.assertEqual(sh1.distance(sh2), 0)
def generate_simhash(self, tokens):
    # Generate a Simhash from spaCy tokens.
    sh = Simhash(u'', f=self.hash_size)  # silly interface...
    sh.build_by_features(tokens)
    return sh
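# Minimal usage sketch (assumed; `self.hash_size` would typically be 64):
# build_by_features accepts an iterable of string features, so pre-tokenized
# text can be hashed without going through Simhash's built-in tokenizer.
from simhash import Simhash

tokens = ['simhash', 'near', 'duplicate', 'detection']
sh = Simhash(u'', f=64)
sh.build_by_features(tokens)
print(sh.value)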