Example #1
def gensim_simhash(content, test_news):

    # load the accumulated stopwords
    stopwords = load_stopwords()

    # tokenize and remove stopwords from the corpus
    x = [[word for word in line.split() if word not in stopwords] for line in content]

    # tokenize and remove stopwords from the query document
    test_news = [word for word in test_news.split() if word not in stopwords]

    # compute the simhash of the query document
    test_news_hash = Simhash(test_news)

    sim = []
    # walk the corpus and compute each document's simhash
    for news in x:
        news_hash = Simhash(news)
        score = test_news_hash.distance(news_hash)
        sim.append(score)

    # print the six closest documents (smallest Hamming distance)
    for index, score in sorted(enumerate(sim), key=lambda item: item[1])[:6]:
        print("index:%d similarities:%f content:%s" % (index, score, content[index]))
Example #2
def simhash_similarity(text1, text2):
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    max_hashbit = max(len(bin(aa_simhash.value)), len(bin(bb_simhash.value)))
    distance = aa_simhash.distance(bb_simhash)
    similar = 1 - distance / max_hashbit
    return similar
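Note that len(bin(...)) counts the '0b' prefix and drops leading zero bits, so the denominator above varies per input. A minimal sketch of a fixed-denominator variant, assuming the library's default 64-bit fingerprints:

from simhash import Simhash

def simhash_similarity_fixed(text1, text2, f=64):
    # normalize by the fingerprint width instead of the bin() string length
    distance = Simhash(text1, f=f).distance(Simhash(text2, f=f))
    return 1 - distance / f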
Example #3
def get_simHash(file_path):
    """
    Purpose: tokenize the text and compute its Simhash.
    Returns: a Simhash index object, the raw data, and an error (if any).
    """
    simhash_list = []
    data = None

    file_type = os.path.splitext(file_path)[1]  # get the file extension
    if file_type in text_type:
        data, data_dic, err = rf.Filer().Test_Split(file_path)  # split plain-text documents
        for i, v in data_dic.items():
            words = jieba.cut(v)
            smValue = Simhash(words)  # compute the simhash value
            simhash_list.append((i, smValue))
    elif file_type in doc_type:
        data, data_dic, err = rf.Filer().Docx_Split(file_path)  # split Word documents
        for i, v in data_dic.items():
            words = jieba.cut(v)
            smValue = Simhash(words)
            simhash_list.append((i, smValue))
    elif file_type in code_type:
        data, data_dic, err = rf.Filer().Code_Split(file_path)  # split source-code files
        for i, v in data_dic.items():
            # split on punctuation and whitespace (the '|' is a literal delimiter here)
            words = re.split(r"[|,.;?!,。;!>():%\s]\s*", v.strip())
            smValue = Simhash(words)
            simhash_list.append((i, smValue))
    else:
        err = "Unsupported file type"
    return get_simHashindex(simhash_list), data, err
Example #4
    def test_equality_comparison(self):
        a = Simhash('My name is John')
        b = Simhash('My name is John')
        c = Simhash('My name actually is Jane')

        self.assertEqual(a, b, 'A should equal B')
        self.assertNotEqual(a, c, 'A should not equal C')
Example #5
def gensim_simhash(content, test_news):

    # load the accumulated stopwords
    stopwords = load_stopwords()

    # tokenize and remove stopwords from the corpus
    x = [[word for word in line.split() if word not in stopwords]
         for line in content]

    # tokenize and remove stopwords from the query document
    test_news = [word for word in test_news.split() if word not in stopwords]

    # compute the simhash of the query document
    test_news_hash = Simhash(test_news)

    sim = []
    # walk the corpus and compute each document's simhash
    for news in x:
        news_hash = Simhash(news)
        score = test_news_hash.distance(news_hash)
        sim.append(score)

    # print the six closest documents (smallest Hamming distance)
    for index, score in sorted(enumerate(sim), key=lambda item: item[1])[:6]:
        print("index:%d similarities:%f content:%s" % (index, score,
                                                       content[index]))
Example #6
def itemSimilarity(data_json):
    item_item_count = dict()
    item_count = dict()

    # count the users shared by every two items
    for train_list in data_json:
        for item_count1 in train_list:
            if item_count1[0] not in item_count:
                item_count[item_count1[0]] = 0.0
            item_count[item_count1[0]] += item_count1[1]
            for item_count2 in train_list:
                if item_count1 == item_count2:
                    continue
                if item_count1[0] not in item_item_count:
                    item_item_count[item_count1[0]] = dict()
                if item_count2[0] not in item_item_count[item_count1[0]]:
                    item_item_count[item_count1[0]][item_count2[0]] = 0.0
                item_item_count[item_count1[0]][
                    item_count2[0]] += item_count1[1] + item_count2[1]

    UserSimi2arr = dict()
    for i, related_items in item_item_count.items():
        for j, cij in related_items.items():
            if i not in UserSimi2arr:
                UserSimi2arr[i] = dict()

            # note: this divides by the simhash distance, which is 0 for
            # identical strings and would raise ZeroDivisionError
            UserSimi2arr[i][j] = 1000 * cij / (
                math.sqrt(item_count[i] * item_count[j]) *
                Simhash(i).distance(Simhash(j)))

    return UserSimi2arr
Example #7
def main():

    if request.method == 'POST':

        document = request.json['json']

        # check whether redis is empty
        if len(r.keys()) == 0:
            r.set(str(Simhash(document).value), document)
            return {
                "message":
                "The redis cache was empty. Just populated it with your document"
            }
        # redis is not empty
        else:

            # look up near-duplicates in the simhash index,
            # then register this document in both stores
            s1 = Simhash(document)
            i = index.get_near_dups(s1)
            index.add(str(s1.value), s1)
            simdocs = []
            for num in i:
                simdocs.append(r.get(num))
            r.set(str(s1.value), document)

            return {"docs": simdocs}

    return {"status": "meme"}
Example #8
def print_event(cpu, data, size):
    t = time.time()
    event = b["events"].event(data)
    # fetch the per-CPU argument buffer and collect up to four execve args
    data = b["events_data_long"][c_int(0)]
    args = [data.execve_arg0]
    argc = data.execve_argc
    if argc >= 2:
        args.append(data.execve_arg1)
    if argc >= 3:
        args.append(data.execve_arg2)
    if argc >= 4:
        args.append(data.execve_arg3)

    ppid = int(event.ppid)
    exe = get_exe_best_effort(event.pid)
    pexe = get_exe_best_effort(ppid)

    # hash the free-text fields so every row is a fixed-width numeric feature vector
    values = [
        str(event.ns),
        Simhash(exe).value,
        Simhash(pexe).value,
        Simhash(event.comm).value, event.syscall, event.cred, event.pid,
        event.ppid, event.ip, event.fp, event.sp,
        Simhash(args).value,
        str(
            b64encode("annotation:%s_%s_%s_%s_%s" %
                      (str(t), exe, pexe, event.comm, args)))
    ]

    writer.writerow(values)
Example #9
def test2():
    # po = Pool(10)
    for dirpath, dirnames, filenames in os.walk(driver_path):
        for filename in filenames:
            index = filenames.index(filename)
            print('index', index)
            file_path1 = dirpath + '/' + filename
            cont = news_process(file_path1)
            simhash1 = Simhash(cont)
            print(file_path1)
            key1 = num10_to2_sys(simhash1.value)
            print(key1)
            # compare against every earlier file in the directory
            for i in filenames[:index]:
                file_path2 = dirpath + '/' + i
                cont2 = news_process(file_path2)
                simhash2 = Simhash(cont2)
                # dist = hammingDis(key1, num10_to2_sys(simhash2.value))

                dist = simhash1.distance(simhash2)
                print('Hamming distance', dist)
                print(file_path1)
                print(simhash1.value)
                print(file_path2)
                print(simhash2.value)
Example #10
def calculate_simhash(features_dict, simhash_size, hashfunc=None):
    """Calculate simhash for features in a dict. `features_dict` contains data
    like {'text': weight}
    """
    if hashfunc:
        return Simhash(features_dict, simhash_size, hashfunc=hashfunc).value
    return Simhash(features_dict, simhash_size).value
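A minimal call sketch; md5_hash here is a stand-in that mirrors the library's default hash function (tokens arrive as bytes):

import hashlib

def md5_hash(token):
    # token is bytes; return an integer digest
    return int(hashlib.md5(token).hexdigest(), 16)

value = calculate_simhash({'how': 1.0, 'are': 2.0, 'you': 1.0}, 64,
                          hashfunc=md5_hash)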
Example #11
def view_simhash_value():
    print("%x" %
          Simhash(get_features("How are you? I am fine. Thanks.")).value)
    print("%x" %
          Simhash(get_features("How are u? I am fine.     Thanks.")).value)
    print("%x" %
          Simhash(get_features("How r you?I    am fine. Thanks.")).value)
Example #12
def get_xls_data():
    # load the data
    data = pd.read_excel("./data/排重案例样本0706.xlsx",
                         names=["content1", "content2"],
                         sheetname=[0])
    content_ls_1 = [(x, y) for x, y in enumerate(data[0]["content1"]) if y]
    content_ls_2 = [(x, y) for x, y in enumerate(data[0]["content2"]) if y]
    content_ls = []
    for x in content_ls_1:
        for y in content_ls_2:
            if x[0] == y[0]:
                content_ls.append((x[1], y[1]))

    # compare each pair of texts
    print("corpus size: " + str(len(content_ls)))
    similarity_length = 0
    for x in content_ls:
        distance = Simhash(x[0]).distance(Simhash(x[1]))
        if distance <= 25:
            print(distance)
            print("similar texts: " + x[0] + "    |||||   " + x[1])
            print("==================")
            similarity_length = similarity_length + 1

    print("similar pairs: " + str(similarity_length))
    # parenthesize before multiplying; the original repeated the formatted string 100 times
    print("similarity detection rate: %s%%" % (similarity_length / len(content_ls) * 100))
Example #13
def itemSimilarity(train_dict):
    ''' Compute item-to-item similarity.
        @param train_dict            training data as a dict
        @return UserSimilar2array    2-D matrix of user similarities
    '''
    item_item_count = dict()
    item_count = dict()

    # count the users shared by every two items
    for user, items_weight in train_dict.items():
        for id1, count1 in items_weight:
            if id1 not in item_count:
                # start from zero so the first occurrence is not counted twice
                item_count[id1] = 0
            item_count[id1] += count1
            for id2, count2 in items_weight:
                if id1 == id2:
                    continue
                if id1 not in item_item_count:
                    item_item_count[id1] = dict()
                if id2 not in item_item_count[id1]:
                    item_item_count[id1][id2] = 0
                item_item_count[id1][id2] += count1 + count2

    UserSimi2arr = dict()
    for i, related_items in item_item_count.items():
        for j, cij in related_items.items():
            if i not in UserSimi2arr:
                UserSimi2arr[i] = dict()

            # note: the simhash distance is 0 for identical strings,
            # which would raise ZeroDivisionError here
            UserSimi2arr[i][j] = 1000 * cij / (
                math.sqrt(item_count[i] * item_count[j]) *
                Simhash(i).distance(Simhash(j)) ** 2)

    return UserSimi2arr
Example #14
    def get_simlar_text(self, text1, text2):
        '''
        1. Text-similarity comparison.
        2. Uses simhash analysis.
        :param text1:
        :param text2:
        :return:
        '''
        new_simhash = SimHash()

        hash_first = new_simhash.getHash(text1)  # compute the hash value
        hash_second = new_simhash.getHash(text2)

        text_first_hash = Simhash(hash_first)
        text_second_hash = Simhash(hash_second)

        distance = text_first_hash.distance(text_second_hash)

        # note: len(bin(...)) includes the '0b' prefix, so this is not exactly 64
        max_hashbit = max(len(bin(text_first_hash.value)),
                          len(bin(text_second_hash.value)))

        if max_hashbit == 0:
            return 0
        else:
            similar = 1 - distance / max_hashbit
            return similar
Example #15
def sim_hash(file1, file2):
    # strip comments and normalize identifiers before comparing
    str1 = open(file1, 'r').read()
    str1 = re.sub(r'#.*\n', "", str1)
    str1 = re.sub(r'//.*\n', "", str1)
    str1 = re.sub(r'/\*.*\*/', "", str1)
    str1 = re.sub(r'var\d+', 'var', str1)
    str1 = re.sub(r'func\d+', 'func', str1)

    str2 = open(file2, 'r').read()
    str2 = re.sub(r'#.*\n', "", str2)
    str2 = re.sub(r'//.*\n', "", str2)
    str2 = re.sub(r'/\*.*\*/', "", str2)
    str2 = re.sub(r'var\d+', 'var', str2)
    str2 = re.sub(r'func\d+', 'func', str2)

    l1 = re.split(r'[\s]', str1)
    l2 = re.split(r'[\s]', str2)

    # longest common substring over the token lists
    dist1 = find_lcsubstr(l1, l2)
    sim1 = dist1 / len(l1)

    # longest common subsequence over whitespace-split tokens
    dist2 = len(find_lcseque(str1.split(), str2.split()))
    sim2 = dist2 / len(str1.split())

    # simhash over the files' lines, normalized by the 64-bit fingerprint width
    dist3 = Simhash(str1.split('\n')).distance(Simhash(str2.split('\n')))
    sim3 = 1 - dist3 / 64
    # if sim3 >= 0.92:
    #     sim3 = sim3 * 3

    # weighted blend of the three scores
    sim = (sim1 * 5 + sim2 + sim3 * 3) / 9
    print(sim1, sim3, sim2)
    return sim
Example #16
    def tokenize(self, database, url: str, resp) -> bool:
        # beautifulsoup parses all the html
        soup = BeautifulSoup(resp.raw_response.content, features="lxml")
        # get_text returns all the text on this page
        all_text = soup.get_text()
        if all_text != "":
            if Simhash(all_text).value in self.simHash:
                database.robotTXT += 1
                return False
            self.simHash[Simhash(all_text).value] = 1
            database.addUniqueUrl(url)
            words = nltk.tokenize.word_tokenize(all_text)
            # count the total words on this page
            wordcount = 0
            for word in words:
                wordcount += 1
                if (word.isalnum() and word not in Tokenizer._StopWord
                        and not word.isdigit()):
                    database.updateCommonword(word)
            database.updateLongestpage(url, wordcount)
            # update the number of subdomains in the database
            if "ics.uci.edu" in url and urlparse(url)[1] != "ics.uci.edu":
                database.updateSubDomain(
                    urlparse(url)[0] + "://" + urlparse(url)[1])
        return True
Example #17
def K_Simhash(ksimhash, vs):
    sim_list = []

    for ks in ksimhash[1:]:
        sim_list.append(min([Simhash(v).distance(Simhash(ks)) for v in vs]))

    return median(sim_list)
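median here is assumed to come from the standard library. A hypothetical call, in which the first entry of ksimhash is skipped as a header and each remaining entry is scored by its nearest candidate in vs (all strings below are illustrative):

from statistics import median  # assumed import for the helper above

score = K_Simhash(['header', 'foo bar baz', 'lorem ipsum dolor'],
                  ['foo bar qux', 'dolor sit amet'])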
Example #18
def extract_next_links(url, resp):
    if resp.status != 200 or not resp.raw_response:
        return []
    res = set()
    this_path = urlparse(url).path
    this_scheme = urlparse(url).scheme
    html = resp.raw_response.content
    for link in BeautifulSoup(html, parse_only=SoupStrainer('a'), features='html.parser'):
        if link.has_attr('href'):
            # clean up links
            site = ""
            if link['href'] == '/':  # same site
                continue
            if link['href'].startswith("//"):  # same domain
                site = link['href'][2:]
            elif link['href'].startswith('/') and urlparse(url).netloc not in link['href']:
                site = url + link['href']
            else:
                site = link['href']

            if not urlparse(site).scheme:
                site = this_scheme + "://" + site
            defrag_site = urldefrag(site.strip())[0]
            new_path = urlparse(defrag_site).path
            if Simhash(this_path).distance(Simhash(new_path)) > 16:
                res.add(defrag_site)
    return list(res)
Example #19
def closest_topic(base_topic, base_url, query):
    # get the corpus for the base url (cached)
    if base_url not in corpus_dict:
        corpus_dict[base_url] = requests.get(base_url).content
    base_corpus = BeautifulSoup(corpus_dict[base_url],
                                'html.parser').get_text()
    # key: value = link: similarity
    sim_res = {}
    if query not in gquery_dict:
        gquery_dict[query] = google(query, my_api_key, my_cse_id)
    results = gquery_dict[query]
    if len(results) == 0:
        return '', 100000000
    try:
        for cmp_link in results:
            if cmp_link not in corpus_dict:
                corpus_dict[cmp_link] = requests.get(cmp_link).content
            cmp_corpus = BeautifulSoup(corpus_dict[cmp_link],
                                       'html.parser').get_text()
            res = Simhash(base_corpus).distance(Simhash(cmp_corpus))
            sim_res[cmp_link] = res
    except Exception:
        sim_res = {}
    if len(sim_res) == 0:
        return '', 100000000
    closest_link = min(sim_res.items(), key=lambda x: x[1])[0]
    closest_dist = sim_res[closest_link]
    return closest_link, closest_dist
Example #20
def similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: the similarity of the two documents
    """
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)

    # print the simhash values in binary
    #print(bin(aa_simhash.value))
    #print(bin(bb_simhash.value))

    # Hamming distance
    #distance = aa_simhash.distance(bb_simhash)
    #print(distance)

    # ratio of the raw fingerprint values (smaller over larger)
    a = float(aa_simhash.value)
    b = float(bb_simhash.value)

    if a > b:
        similar = b / a
    else:
        similar = a / b

    return similar
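The value ratio above is not a reliable similarity: two near-identical texts can have numerically distant fingerprints even when almost all bits agree. A sketch of the usual Hamming-distance alternative, assuming 64-bit fingerprints:

from simhash import Simhash

def similarity_hamming(text1, text2, f=64):
    # fraction of fingerprint bits on which the two texts agree
    distance = Simhash(text1, f=f).distance(Simhash(text2, f=f))
    return (f - distance) / f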
Example #21
def simhash_remove_similar(news_list):
    result_list = []
    # compare the simhash values pairwise
    len_news_list = len(news_list)
    for i in range(len_news_list):
        news_i_id = news_list[i]['news_id']
        news_i_news_content = accord_news_id_get_content_list(
            news_i_id)['news_content']
        sim_hash1 = Simhash(news_i_news_content)
        for j in range(i + 1, len_news_list):
            # skip items that have already been flagged
            if 'del' in news_list[j]:
                continue
            news_j_id = news_list[j]['news_id']
            news_j_news_content = accord_news_id_get_content_list(
                news_j_id)['news_content']
            sim_hash2 = Simhash(news_j_news_content)
            # if the Hamming distance is within the threshold, keep only the first item
            if sim_hash1.distance(sim_hash2) <= SIMHASH_DISTINCT:
                # flag this news item for removal
                news_list[j]['del'] = 'yes'
    for news in news_list:
        if 'del' not in news:
            result_list.append(news)
    return result_list
Example #22
def search_duplicate():
    data = load_pk(datafile)
    index = load_pk(indexfile)

    begin, middle, end = data[1000], data[500000], data[-1000]
    print("1,000th is %r, 500,000th is %r, -1,000th is %r." % (
        begin, middle, end))

    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    st = time.perf_counter()
    ind = data.index(begin)
    print("list.index(item) method for begin takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    res = index.get_near_dups(Simhash(begin))
    print("simhash index method for begin takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    ind = data.index(middle)
    print("list.index(item) method for middle takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    res = index.get_near_dups(Simhash(middle))
    print("simhash index method for middle takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    ind = data.index(end)
    print("list.index(item) method for end takes %.6f sec" % (time.perf_counter() - st,))

    st = time.perf_counter()
    res = index.get_near_dups(Simhash(end))
    print("simhash index method for end takes %.6f sec" % (time.perf_counter() - st,))
Example #23
    def test_sparse_features(self):
        data = [
            'How are you? I Am fine. blar blar blar blar blar Thanks.',
            'How are you i am fine. blar blar blar blar blar than',
            'This is simhash test.',
            'How are you i am fine. blar blar blar blar blar thank1'
        ]
        vec = TfidfVectorizer()
        D = vec.fit_transform(data)
        voc = dict((i, w) for w, i in vec.vocabulary_.items())

        # Verify that distance between data[0] and data[1] is < than
        # data[2] and data[3]
        shs = []
        for i in range(D.shape[0]):
            Di = D.getrow(i)
            # features as a list of (token, weight) tuples
            features = zip([voc[j] for j in Di.indices], Di.data)
            shs.append(Simhash(features))
        self.assertNotEqual(0, shs[0].distance(shs[1]))
        self.assertNotEqual(0, shs[2].distance(shs[3]))
        self.assertLess(shs[0].distance(shs[1]), shs[2].distance(shs[3]))

        # features as token -> weight dicts
        D0 = D.getrow(0)
        dict_features = dict(zip([voc[j] for j in D0.indices], D0.data))
        self.assertEqual(17583409636488780916, Simhash(dict_features).value)

        # the sparse and non-sparse features should obviously yield
        # different results
        self.assertNotEqual(Simhash(dict_features).value,
                            Simhash(data[0]).value)
Example #24
    def get_sim_simhash(self, text1, text2, f_num=64):
        a_simhash = Simhash(text1, f=f_num)
        b_simhash = Simhash(text2, f=f_num)
        max_hashbit = max(len(bin(a_simhash.value)), len(bin(b_simhash.value)))
        distance = a_simhash.distance(b_simhash)
        sim = 1 - distance / max_hashbit
        return sim
Example #25
    def test_large_inputs(self):
        """Test code paths for feature lists larger than batch_size and
        weights larger than large_weight_cutoff."""
        many_features = [str(i) for i in range(int(Simhash.batch_size * 2.5))]
        many_features_large_weights = [(f, Simhash.large_weight_cutoff * i)
                                       for i, f in enumerate(many_features)]
        self.assertEqual(7984652473404407437, Simhash(many_features).value)
        self.assertEqual(3372825719632739723,
                         Simhash(many_features_large_weights).value)
Example #26
    def calcTitleHashFeats(title1, title2, featVector):
        if title1 is None or title2 is None or title1 == '' or title2 == '':
            featVector.append(1)
            return
        title1 = '%x' % Simhash(get_features(normalize(title1))).value
        title2 = '%x' % Simhash(get_features(normalize(title2))).value
        t2 = distance.nlevenshtein(title1, title2)
        featVector.append(t2)
Example #27
    def calcAbstractHashFeats(abstract1, abstract2, featVector):
        if abstract1 is None or abstract2 is None or abstract1 == '' or abstract2 == '':
            featVector.append(1)
            return
        abstract1 = '%x' % Simhash(get_features(abstract1)).value
        abstract2 = '%x' % Simhash(get_features(abstract2)).value
        t2 = distance.nlevenshtein(abstract1, abstract2)
        featVector.append(t2)
Example #28
def simhash_method(doc1, doc2, threshold):
    distance = Simhash(doc1).distance(Simhash(doc2))
    # print(distance)  # sanity check: confirm the distance matches expectations
    if distance < threshold:
        result = 1
    else:
        result = 0
    return result
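A hypothetical call; with the default 64-bit fingerprints, a threshold around 3 is a common choice for near-duplicate detection:

from simhash import Simhash

is_dup = simhash_method('How are you? I am fine. Thanks.',
                        'How are u? I am fine. Thanks.',
                        threshold=3)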
Example #29
    def test_segtree(self):
        sh = Simhash('How are you? I am fine. Thanks. And you?')
        self.assertEqual(sh.value, 6460565663990245323)

        sh2 = Simhash('How old are you ? :-) I am fine. Thanks. And you?')
        self.assertEqual(sh.distance(sh2), 8)

        sh3 = Simhash(sh2)
        self.assertEqual(sh2.distance(sh3), 0)
Example #30
    def test_distance(self):
        sh = Simhash('How are you? I AM fine. Thanks. And you?')
        sh2 = Simhash('How old are you ? :-) i am fine. Thanks. And you?')
        self.assertTrue(sh.distance(sh2) > 0)

        sh3 = Simhash(sh2)
        self.assertEqual(sh2.distance(sh3), 0)

        self.assertNotEqual(Simhash('1').distance(Simhash('2')), 0)
Example #31
def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }

    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) +
                               '\t' + entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()
Example #32
def isPageTooSimilar(pageTextString, pageHashes):
    pageHash = Simhash(pageTextString)
    skipPage = False
    for hashedPage in pageHashes:
        if pageHash.distance(hashedPage) < 3:
            skipPage = True
            break
    else:
        # no near-duplicate found: remember this page's hash
        pageHashes.add(pageHash)
    return skipPage
Example #33
def hanming_distance(s1, s2):
    if type(s1) == str and type(s2) == str:
        # XOR the two fingerprints and count the differing bits
        hamming_distance = bin(
            Simhash(get_features(s1)).value
            ^ Simhash(get_features(s2)).value).count('1')
    elif type(s1) == int and type(s2) == int:
        hamming_distance = bin(s1 ^ s2).count('1')
    else:
        raise TypeError('s1 and s2 must be of the same type!')
    # despite the name, this returns a similarity in [0, 1], not the raw distance
    return 1 - hamming_distance / 64
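The XOR-and-popcount above is exactly what Simhash.distance computes. A quick consistency check, reusing the get_features helper this example already assumes:

from simhash import Simhash

a = Simhash(get_features('How are you? I am fine.'))
b = Simhash(get_features('How are u? I am fine.'))
assert bin(a.value ^ b.value).count('1') == a.distance(b)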
Example #34
    def shrinkdup(self):
        pkv = Simhash('')
        shdist = 10

        for content in self.sortedlog:
            if not content or content.strip() == '':
                continue
            try:
                ret = self._extract_content(content)
                hv = Simhash(ret)
                # keep a line only if it is far enough from the previous kept line
                if pkv.distance(hv) > shdist:
                    self.shrinklogs.append(content)
                    pkv = hv
            except Exception as e:
                print(content)
                print(e)

        if len(self.shrinklogs) > 0:
            filen = self.filen + '-shrunk'
            with open(filen, 'w') as f:
                for v in self.shrinklogs:
                    f.write('%s' % (v))
Example #35
def simHashLabel(user1filepath, user2filepath, user1Floder, user2Floder, num_floder):
    ans = 0.0
    for i in range(num_floder):
        labeluser1 = ''
        labeluser2 = ''
        tempmax1 = 0
        tempmax2 = 0
        f1 = open(user1filepath + user1Floder[i] + os.sep + 'RCed_stoppoint.txt')
        for line in f1:
            labeluser1 += line.split(',')[4]
            labeluser1 += ','
            tempmax1 += 1
        f2 = open(user2filepath + user2Floder[i] + os.sep + 'RCed_stoppoint.txt')
        for line in f2:
            labeluser2 += line.split(',')[4]
            labeluser2 += ','
            tempmax2 += 1
        sh1 = Simhash(u'%s' % labeluser1)
        sh2 = Simhash(u'%s' % labeluser2)
        maxlen = tempmax1 if tempmax1 >= tempmax2 else tempmax2

        # accumulate the per-folder distance, normalized by the longer label sequence
        ans += sh1.distance(sh2) / maxlen

    return ans
Example #36
    def test_chinese(self):
        self.maxDiff = None

        sh1 = Simhash(u'你好 世界!  呼噜。')
        sh2 = Simhash(u'你好,世界 呼噜')

        sh4 = Simhash(u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.')
        sh5 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than')
        sh6 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

        self.assertEqual(sh1.distance(sh2), 0)

        self.assertTrue(sh4.distance(sh6) < 3)
        self.assertTrue(sh5.distance(sh6) < 3)
Example #37
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Joshua
# @E-Mail: [email protected]
# @Date:   2015-02-11 12:32:00
# @About demo.py

# import re
from simhash import Simhash
from simhash import SimhashIndex
from data import news_lists_1, news_lists_2



# Whether or not Chinese text is segmented first, simhash yields the same result.
for i, news in enumerate(news_lists_1):
    x = Simhash(news['content'], f=64)
    y = Simhash(news_lists_2[i]['content'], f=64)
    print('1.simhash:', x.value)
    print('2.simhash:', y.value)
    print('distance:', x.distance(y))
    print('similarity:', (64 - x.distance(y)) / 64)
    print(news['title'])

Example #38
    def test_chinese(self):
        sh1 = Simhash(u'你好 世界!  呼噜。')
        sh2 = Simhash(u'你好,世界 呼噜')

        #self.assertEqual(sh1._features, [])
        self.assertEqual(sh1.distance(sh2), 0)
Example #39
    def generate_simhash(self, tokens):
        # Generate a Simhash from spaCy tokens.
        sh = Simhash(u'', f=self.hash_size)  # silly interface...
        sh.build_by_features(tokens)
        return sh
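A hypothetical usage, assuming an instance dedup of the enclosing class with hash_size = 64 and a loaded spaCy pipeline (the model name is illustrative):

import spacy

nlp = spacy.load('en_core_web_sm')  # hypothetical model
doc = nlp('How are you? I am fine. Thanks.')
sh = dedup.generate_simhash([t.text for t in doc])
print(sh.value)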