Example #1
def gensim_simhash(content, test_news):

    # load the accumulated stopwords
    stopwords = load_stopwords()

    # tokenize each line and drop stopwords
    x = [[word for word in line.split() if word not in stopwords] for line in content]

    # tokenize the query document and drop stopwords
    test_news = [word for word in test_news.split() if word not in stopwords]

    # compute the simhash of the query document
    test_news_hash = Simhash(test_news)

    sim = []
    # walk the corpus, computing the simhash distance to the query
    for news in x:
        hash = Simhash(news)
        score = test_news_hash.distance(hash)
        sim.append(score)
        #print "add %d %f" % (index, score)

    for index, score in sorted(enumerate(sim), key=lambda item: item[1])[:6]:
        # print "index:%d similarities:%f" % (index, score)
        print "index:%d similarities:%f content:%s" % (index, score, content[index])
Example #2
    def test_segtree(self):
        sh = Simhash('How are you? I am fine. Thanks. And you?')
        self.assertEqual(sh.value, 6460565663990245323)

        sh2 = Simhash('How old are you ? :-) I am fine. Thanks. And you?')
        self.assertEqual(sh.distance(sh2), 8)

        sh3 = Simhash(sh2)
        self.assertEqual(sh2.distance(sh3), 0)
Example #3
    def test_distance(self):
        sh = Simhash('How are you? I AM fine. Thanks. And you?')
        sh2 = Simhash('How old are you ? :-) i am fine. Thanks. And you?')
        self.assertTrue(sh.distance(sh2) > 0)

        sh3 = Simhash(sh2)
        self.assertEqual(0, sh2.distance(sh3))

        self.assertNotEqual(0, Simhash('1').distance(Simhash('2')))
Example #4
    def test_distance(self):
        sh = Simhash('How are you? I AM fine. Thanks. And you?')
        sh2 = Simhash('How old are you ? :-) i am fine. Thanks. And you?')
        self.assertTrue(sh.distance(sh2) > 0)

        sh3 = Simhash(sh2)
        self.assertEqual(sh2.distance(sh3), 0)

        self.assertNotEqual(Simhash('1').distance(Simhash('2')), 0)
Example #5
    def test_chinese(self):
        self.maxDiff = None

        sh1 = Simhash(u'你好 世界!  呼噜。')
        sh2 = Simhash(u'你好,世界 呼噜')

        sh4 = Simhash(u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.')
        sh5 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than')
        sh6 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

        self.assertEqual(sh1.distance(sh2), 0)

        self.assertTrue(sh4.distance(sh6) < 3)
        self.assertTrue(sh5.distance(sh6) < 3)
Example #6
    def test_chinese(self):
        self.maxDiff = None

        sh1 = Simhash(u'你好 世界!  呼噜。')
        sh2 = Simhash(u'你好,世界 呼噜')

        sh4 = Simhash(u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.')
        sh5 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than')
        sh6 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

        self.assertEqual(0, sh1.distance(sh2))

        self.assertTrue(sh4.distance(sh6) < 3)
        self.assertTrue(sh5.distance(sh6) < 3)
Example #7
def gensim_simhash(content, test_news):

    # load the accumulated stopwords
    stopwords = load_stopwords()

    # tokenize each line and drop stopwords
    x = [[word for word in line.split() if word not in stopwords]
         for line in content]

    # tokenize the query document and drop stopwords
    test_news = [word for word in test_news.split() if word not in stopwords]

    # compute the simhash of the query document
    test_news_hash = Simhash(test_news)

    sim = []
    # walk the corpus, computing the simhash distance to the query
    for news in x:
        hash = Simhash(news)
        score = test_news_hash.distance(hash)
        sim.append(score)
        #print "add %d %f" % (index, score)

    for index, score in sorted(enumerate(sim), key=lambda item: item[1])[:6]:
        # print "index:%d similarities:%f" % (index, score)
        print "index:%d similarities:%f content:%s" % (index, score,
                                                       content[index])
Example #8
def simhash_similarity(text1, text2):
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    max_hashbit = max(len(bin(aa_simhash.value)), len(bin(bb_simhash.value)))
    distance = aa_simhash.distance(bb_simhash)
    similar = 1 - distance / max_hashbit
    return similar
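A note on the max_hashbit denominator used here and in several examples below: bin(value) includes the '0b' prefix and drops leading zeros, so the similarity scale shifts slightly from fingerprint to fingerprint. A minimal variant (my sketch, not taken from any of these examples) pins the denominator to the configured fingerprint width instead:

from simhash import Simhash

def simhash_similarity_fixed(text1, text2, f=64):
    # both fingerprints use the same width f, so the Hamming distance
    # always falls in the range [0, f]
    a_simhash = Simhash(text1, f=f)
    b_simhash = Simhash(text2, f=f)
    return 1 - a_simhash.distance(b_simhash) / f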
Example #9
def test2():
    # po = Pool(10)
    for dirpath, dirnames, filenames in os.walk(driver_path):
        for filename in filenames:
            index = filenames.index(filename)
            print 'index', index
            file_path1 = dirpath + '/' + filename
            cont = news_process(file_path1)
            simhash1 = Simhash(cont)
            print file_path1
            key1 = num10_to2_sys(simhash1.value)
            print key1
            for i in filenames[:index]:
                file_path2 = dirpath + '/' + i
                cont2 = news_process(file_path2)
                simhash2 = Simhash(cont2)
                # key2 = num10_to2_sys(simhash2.value)
                # a = hammingDis(key1, key2)
                # print 'Hamming distance', a
                # print file_path1
                # print key1
                # print file_path2
                # print key2

                key2 = simhash1.distance(simhash2)
                print 'Hamming distance', key2
                print file_path1
                print simhash1.value
                print file_path2
                print simhash2.value
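The nested loops in test2 re-read and re-hash every earlier file on each outer pass. A cheaper variant (a sketch, assuming the same driver_path and news_process helpers as above) hashes each file exactly once:

import os
from simhash import Simhash

def test2_cached():
    for dirpath, dirnames, filenames in os.walk(driver_path):
        # hash each file once instead of re-hashing it in the inner loop
        hashes = [Simhash(news_process(dirpath + '/' + name))
                  for name in filenames]
        for i in range(len(hashes)):
            for j in range(i):
                print('Hamming distance', hashes[i].distance(hashes[j]),
                      filenames[i], filenames[j])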
Example #10
    def get_simlar_text(self, text1, text2):
        '''
        1. text similarity comparison
        2. based on simhash
        :param text1:
        :param text2:
        :return:
        '''
        new_simhash = SimHash()

        hash_first = new_simhash.getHash(text1)  # compute the hash values
        hash_second = new_simhash.getHash(text2)

        text_first_hash = Simhash(hash_first)
        text_second_hash = Simhash(hash_second)

        distance = text_first_hash.distance(text_second_hash)

        max_hashbit = max(len(bin(text_first_hash.value)),
                          len(bin(text_second_hash.value)))

        if max_hashbit == 0:
            return 0
        else:
            similar = 1 - distance / max_hashbit
            return similar
Example #11
def simhash_remove_similar(news_list):
    result_list = []
    # compare simhash values pairwise
    len_news_list = len(news_list)
    for i in range(len_news_list):
        news_i_id = news_list[i]['news_id']
        news_i_news_content = accord_news_id_get_content_list(
            news_i_id)['news_content']
        sim_hash1 = Simhash(news_i_news_content)
        for j in range(i + 1, len_news_list):
            # skip items already marked for deletion
            if 'del' in news_list[j]:
                continue
            news_j_id = news_list[j]['news_id']
            news_j_news_content = accord_news_id_get_content_list(
                news_j_id)['news_content']
            sim_hash2 = Simhash(news_j_news_content)
            # if the Hamming distance between the two news items is within
            # the threshold, keep only the one that comes first
            if sim_hash1.distance(sim_hash2) <= SIMHASH_DISTINCT:
                # mark this news item for removal
                news_list[j]['del'] = 'yes'
    for news in news_list:
        if 'del' not in news:
            result_list.append(news)
    return result_list
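The pairwise loop above costs O(n²) distance checks. The simhash package also ships SimhashIndex (imported in Example #30 below), which buckets fingerprints by bit blocks so near-duplicate lookup is close to constant time per item. A sketch of the same keep-the-first rule on top of it, reusing the accord_news_id_get_content_list helper from the example; note that SimhashIndex splits the fingerprint into k+1 blocks, so it is meant for small k rather than the distance-10 threshold used above:

from simhash import Simhash, SimhashIndex

def index_remove_similar(news_list, k=3):
    # k is the largest Hamming distance still treated as a duplicate
    index = SimhashIndex([], k=k)
    result_list = []
    for news in news_list:
        content = accord_news_id_get_content_list(
            news['news_id'])['news_content']
        sh = Simhash(content)
        if index.get_near_dups(sh):
            continue  # a near-identical earlier item has already been kept
        index.add(str(news['news_id']), sh)
        result_list.append(news)
    return result_list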
Example #12
 def get_sim_simhash(self, text1, text2, f_num=64):
     a_simhash = Simhash(text1, f=f_num)
     b_simhash = Simhash(text2, f=f_num)
     max_hashbit = max(len(bin(a_simhash.value)), len(bin(b_simhash.value)))
     distance = a_simhash.distance(b_simhash)
     sim = 1 - distance / max_hashbit
     return sim
Example #13
    def test_chinese(self):
        self.maxDiff = None

        sh1 = Simhash(u'你好 世界!  呼噜。')
        sh2 = Simhash(u'你好,世界 呼噜')

        sh4 = Simhash(
            u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.'
        )
        sh5 = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than'
        )
        sh6 = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
        )

        self.assertEqual(sh1.distance(sh2), 0)

        self.assertTrue(sh4.distance(sh6) < 3)
        self.assertTrue(sh5.distance(sh6) < 3)
Example #14
def isPageTooSimilar(pageTextString, pageHashes):
    pageHash = Simhash(pageTextString)
    minDist = 100000000
    skipPage = False
    for hashedPage in pageHashes:
        if pageHash.distance(hashedPage) < 3:
            skipPage = True
            break
    else:
        pageHashes.add(pageHash)
    return skipPage
Example #15
    def simhash_distance(self, text1, text2):
        text1_hash = Simhash(text1)
        text2_hash = Simhash(text2)

        max_hashbit = max(len(bin(text1_hash.value)),
                          len(bin(text2_hash.value)))

        # Hamming distance
        distance = text1_hash.distance(text2_hash)
        similar = 1 - distance / max_hashbit

        return similar
Example #16
 def compare_data_simhash(self, data1, data2):
     """
     Approximate near-duplicate check on two texts using simhash.
     :param data1:
     :param data2:
     :return:
     """
     data1_sim = Simhash(data1)
     data2_sim = Simhash(data2)
     # Hamming distance
     dis = data1_sim.distance(data2_sim)
     if dis < 2:
         return True
     return False
Example #17
def simhash_similarity(text1, text2):
    a_simhash = Simhash(text1)
    b_simhash = Simhash(text2)
    print(a_simhash.value)
    print(b_simhash.value)
    max_hashbit = max(len(bin(a_simhash.value)), len(bin(b_simhash.value)))
    print(max_hashbit)

    # Hamming distance
    distance = a_simhash.distance(b_simhash)
    print(distance)
    similar = 1 - distance / max_hashbit
    return similar
Example #18
def sim_hash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: similarity of the two texts
    """
    aa_sim_hash = Simhash(text1)
    bb_sim_hash = Simhash(text2)
    max_hash_bit = max(len(bin(aa_sim_hash.value)),
                       len(bin(bb_sim_hash.value)))
    # Hamming distance
    distance = aa_sim_hash.distance(bb_sim_hash)
    similar = 1 - distance / max_hash_bit
    return similar
Example #19
def simhash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: similarity of the two texts
    """
    text1 = filter_html(text1)
    text2 = filter_html(text2)
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    max_hashbit = max(len(bin(aa_simhash.value)), len(bin(bb_simhash.value)))
    # Hamming distance
    distance = aa_simhash.distance(bb_simhash)
    # similarity
    similar = 1 - distance / max_hashbit
    return similar
Example #20
def simhash(sentence1: str, sentence2: str) -> float:
    """
    Compute the similarity of two texts.
    :param sentence1:
    :param sentence2:
    :return:
    """
    if sentence1 and sentence2:
        sentence1 = str(sentence1)
        sentence2 = str(sentence2)
        a_simhash = Simhash(sentence1)
        b_simhash = Simhash(sentence2)
        max_hashbit = max(len(bin(a_simhash.value)), len(bin(b_simhash.value)))
        # Hamming distance
        distance = a_simhash.distance(b_simhash)
        similar = 1 - distance / max_hashbit
        return similar
Example #21
def is_similar_page(res1, res2, radio=3):
    # use simhash to judge how similar two pages are
    if res1 is None or res2 is None:
        return False

    body1 = res1.body
    body2 = res2.body

    # this step is very slow; the split() call is probably the bottleneck
    simhash1 = Simhash(body1.split())
    simhash2 = Simhash(body2.split())

    calc_radio = simhash1.distance(simhash2)

    if calc_radio <= radio:
        return True
    return False
Example #22
    def is_content(self, text, word_count):
        if text and word_count >= 150:
            current_sim = Simhash(text)

            #first link, nothing in simhash set
            if len(self.simhashes) == 0:
                self.simhashes.add(current_sim)
                return True

            for x in self.simhashes:
                if current_sim.distance(x) <= 3:
                    print("duplicate detected")
                    return False
            self.simhashes.add(current_sim)
            return True
        else:
            print("low text count")
            return False
Example #23
def similarHash(text1, text2):
    '''
    Get the similarity of two texts.
    '''

    simhash = GetHash()

    hash1 = simhash.get_str_hash(text1)  # compute the hash
    hash2 = simhash.get_str_hash(text2)  # compute the hash

    t1_simhash = Simhash(hash1)
    t2_simhash = Simhash(hash2)

    distance = t1_simhash.distance(t2_simhash)
    max_hashbit = max(len(bin(t1_simhash.value)), len(bin(t2_simhash.value)))
    if max_hashbit == 0:
        return 0
    else:
        similar = 1 - distance / max_hashbit
        return similar
Example #24
def simhash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: similarity of the two texts
    """
    begin = time.time()
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # print(aa_simhash.value)
    # print(bb_simhash.value)

    max_hashbit = max(len(bin(aa_simhash.value)), len(bin(bb_simhash.value)))
    # Hamming distance
    distance = aa_simhash.distance(bb_simhash)

    similar = 1 - distance / max_hashbit

    print("pairwise comparison time: %f" % (time.time() - begin))
    return similar
Example #25
def simhash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: similarity of the two texts
    """
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    print(1, bin(aa_simhash.value))
    print(2, bin(bb_simhash.value))
    max_hashbit = max(len(bin(aa_simhash.value)), len(bin(bb_simhash.value)))

    print(max_hashbit)

    # Hamming distance
    distance = aa_simhash.distance(bb_simhash)
    print(distance)

    similar = 1 - distance / max_hashbit

    return similar
Example #26
    def shrinkdup(self):
        pkv = Simhash('')
        shdist = 10

        for content in self.sortedlog:
            if not content or content.strip() == '':
                continue
            try:
                ret = self._extract_content(content)
                hv = Simhash(ret)
                if pkv.distance(hv) > shdist:
                    self.shrinklogs.append(content)
                    pkv = hv
            except Exception as e:
                print content
                print e

        if len(self.shrinklogs) > 0:
            filen = self.filen + '-shrunk'
            with open(filen, 'w') as f:
                for v in self.shrinklogs:
                    f.write('%s' % (v))
Example #27
def findNeighborVectors(simHashIndex, matrix, vector, topk=10):
    hash = Simhash(get_features(vector), f=32)
    hashstr = bin(hash.value)

    nearVectorArray = []

    for num in range(4):  # for 32-bit fingerprints
        key = hashstr[num * 8 + 2:(num + 1) * 8 + 2]
        # if (simHashIndex[num].has_key(key)):
        #     nearVectorArray.extend(simHashIndex[num][key])
        nearVectorArray.extend(simHashIndex[num].get(int(key), []))

    sortedDict = {}
    for index in nearVectorArray:
        index = int(index)
        vec = matrix[index]
        # hash the candidate vector, not the query vector
        vechash = Simhash(get_features(vec), f=32)
        sortedDict[index] = hash.distance(vechash)

    # sort the candidates by distance and keep the topk closest indices
    sortindexs = [index for index, _ in
                  sorted(sortedDict.items(), key=lambda item: item[1])[:topk]]

    return sortindexs
Example #28
class FuncHash:
    def __init__(self, dict):
        super().__init__()
        self.path = dict.get("path")
        self.start = dict.get("start")
        self.stop = dict.get("stop")
        self.source = dict.get("source")
        self.startLoc = dict.get("startLoc")
        self.stopLoc = dict.get("stopLoc")
        self.blame = dict.get("blame")
        self.lineCount = len(self.source.split('\n'))

    def hashSource(self):
        self.simHash = Simhash(self.source)

    def distance(self, to):
        return self.simHash.distance(to.simHash)

    def _asdict(self):
        return dict(path=self.path,
                    start=self.start,
                    stop=self.stop,
                    source=self.source)
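A quick usage sketch for the class above; the paths and sources are made-up values:

funcs = [
    FuncHash({"path": "a.py", "start": 1, "stop": 3,
              "source": "def f(x):\n    return x + 1\n"}),
    FuncHash({"path": "b.py", "start": 7, "stop": 9,
              "source": "def g(y):\n    return y + 1\n"}),
]
for func in funcs:
    func.hashSource()  # must be called before distance()
print(funcs[0].distance(funcs[1]))  # a small distance suggests near-duplicates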
Example #29
def simHashLabel(user1filepath, user2filepath, user1Floder, user2Floder, num_floder):
    ans = 0.0
    for i in range(num_floder):
        labeluser1 = ''
        labeluser2 = ''
        tempmax1 = 0
        tempmax2 = 0
        f1 = open(user1filepath + user1Floder[i] + os.sep + 'RCed_stoppoint.txt')
        for line in f1:
            labeluser1 += line.split(',')[4]
            labeluser1 += ','
            tempmax1 += 1
        f2 = open(user2filepath + user2Floder[i] + os.sep + 'RCed_stoppoint.txt')
        for line in f2:
            labeluser2 += line.split(',')[4]
            labeluser2 += ','
            tempmax2 += 1
        sh1 = Simhash(u'%s' % labeluser1)
        sh2 = Simhash(u'%s' % labeluser2)
        maxlen = tempmax1 if tempmax1 >= tempmax2 else tempmax2

        ans += sh1.distance(sh2) / maxlen

    return ans
Example #30
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Joshua
# @E-Mail: [email protected]
# @Date:   2015-02-11 12:32:00
# @About demo.py

# import re
from simhash import Simhash
from simhash import SimhashIndex
from data import news_lists_1, news_lists_2



# For Chinese text, simhash gives the same result whether or not the text is
# word-segmented; a quick check follows this example.
for i, news in enumerate(news_lists_1):
    x = Simhash(news_lists_1[i]['content'], f=64)
    #y = Simhash('hello')
    #x = Simhash('Hi')
    y = Simhash(news_lists_2[i]['content'], f=64)
    print('1.simhash:', x.value)
    print('2.simhash:', y.value)
    print('distance:', x.distance(y))
    print('similarity:', (64 - x.distance(y)) / 64)
    print(news_lists_1[i]['title'])

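The claim in the comment above can be checked directly: Simhash's default tokenizer lowercases the text, strips non-word characters (including spaces), and slides a fixed-width window over what remains, so segmenting Chinese text with spaces does not change the feature set. A minimal check of my own, not part of demo.py:

from simhash import Simhash

segmented = Simhash(u'你好 世界 呼噜')
unsegmented = Simhash(u'你好世界呼噜')
assert segmented.distance(unsegmented) == 0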
Example #31
#coding:utf-8
__author__ = 'lym'
import re
from simhash import Simhash
print Simhash('aa').distance(Simhash('bb'))
print Simhash('aa').distance(Simhash('aa'))
sh1 = Simhash(u'你好 世界 呼噜')
sh2 = Simhash(u'你好 世界')
print(sh1.distance(sh2))
print ','.join('fff')
test = '的阿斯顿多少多少'
print "this is test1: %s" % test
Example #32
 def sim_hash(self, q, doc):
     '''the smaller the distance, the more similar'''
     q = self.fomrat_str(q)
     doc = self.fomrat_str(doc)
     s1, s2 = Simhash(q), Simhash(doc)
     return s1.distance(s2)
Example #33
# coding: utf8

from simhash import Simhash

s1 = Simhash('abcdefghijklmnopqrstuvwxyz', 128)
s2 = Simhash('abcdefghijklmnopqrstuvwayz', 128)
print s1.value
print s2.value
print s1.distance(s2)
Example #34
    def crawl(self, in_dir, num=10):
        # url: string
        # title: string
        # content: string
        # outlinks: list
        i = 0
        while (len(self.fetchedurls) <= num):
            # init new Webpage class
            webpage = Webpage()

            # pop a url (pop(0) would take the first element; default is the last)
            if (len(self.frontier) > 0):
                webpage.url = self.frontier.pop()
            else:
                print("empty frontier")
                break

            # fetch url and parse
            try:
                html = urlopen(webpage.url)
            except HTTPError:
                continue
            else:
                bsObj = BeautifulSoup(html, "html.parser")  # get a bs object

            # check the category
            div = bsObj.find(name='div', id='mw-normal-catlinks')
            tmp_category = ""
            try:
                contents = div.find_all(name='a')
            except AttributeError:
                continue
            else:
                for content in contents:
                    aText = content.get_text().lower()
                    tmp_category += aText
                # print(tmp_category)
                if (not any(category in tmp_category
                            for category in self.category_list)):
                    continue

            # fetch title
            webpage.title = str(bsObj.title).replace(" - Wikipedia</title>",
                                                     '').replace(
                                                         "<title>", "")

            # fetch content
            tmp_content = ""
            div = bsObj.find(name='div', id='mw-content-text')
            ps = div.find_all(name='p')
            for p in ps:
                pText = p.get_text()
                tmp_content += pText
            webpage.content = tmp_content

            if (not webpage.title or not webpage.content
                    or len(webpage.content) < 100):
                continue

            # skip pages whose content is too similar to an already-fetched page
            tmp_simhash = Simhash(webpage.content)
            if any(tmp_simhash.distance(h) <= 5 for h in self.hash):
                continue

            # satisfied url
            self.hash.append(tmp_simhash)
            self.fetchedurls.append(webpage.url)

            # fetch outlinks
            tmp_outlinks = []
            newurls = div.find_all('a',
                                   href=re.compile(r"^(/wiki/)((?!;)\S)*$"))
            for newurl in newurls:

                # obey the robots.txt
                if (newurl.attrs['href'].replace("\n", '') in self.rules):
                    continue

                myurl = "https://en.wikipedia.org" + newurl.attrs['href']

                # dup URL elim
                if myurl not in self.fetchedurls and myurl not in self.frontier:
                    self.frontier.append(myurl)
                    tmp_outlinks.append(myurl)
            webpage.outlinks = tmp_outlinks

            # write to file
            with open(in_dir + '/' + str(i), 'wb') as fwrite:
                pickle.dump(webpage, fwrite)

            i += 1
            if (i % 100 == 0):
                print(i)
        fopen = open("fetchedurls", "wb")
        pickle.dump(self.fetchedurls, fopen)
Example #35
def extract_next_links(url, resp):
    # If the raw_response exists, and the status is within 200 to 599, and is not 404 or 403,
    # then process the raw_response.content
    if resp:
        if not resp.raw_response == None:
            if resp.status >= 200 and resp.status <= 599:
                if resp.status == 404 or resp.status == 403:
                    return list()
                try:
                    # Get the HTML content and make it into a tree with lxml
                    parser = lxml.etree.HTMLParser(encoding='UTF-8')
                    tree = lxml.etree.parse(
                        io.StringIO(
                            resp.raw_response.content.decode(
                                encoding='UTF-8')), parser)

                    # String of all the text on the page
                    pageTextString = ""

                    # Check these tags for text
                    wantedTags = {
                        "p", "span", "blockquote", "code", "br", "a", "ol",
                        "ins", "sub", "sup", "h1", "h2", "h3", "h4", "h5",
                        "h6", "li", "ul", "title", "b", "strong", "em", "i",
                        "small", "sub", "sup", "ins", "del", "mark", "pre"
                    }

                    parsed = urlparse(url)

                    listofLinks = []
                    for elem in tree.iter():

                        if elem.tag in wantedTags:
                            if elem.text:
                                pageTextString += elem.text + " "

                        if elem.tag == "a" and "href" in elem.attrib:
                            link = elem.attrib["href"]
                            if len(link) == 0:
                                continue
                            if link == r"/" or link == parsed.netloc:
                                continue
                            elif link[0] == r"/":
                                link = parsed.netloc + link
                            elif link[0:2] == r"//":
                                link = "https:" + link

                            link = link.split('#')[0]
                            if "replytocom=" in link or "share=" in link:
                                link = link.split('?')[0]
                            listofLinks.append(link)

                    # If the distance between this page's hash and any other page
                    # is less than 3, return an empty list because this page is
                    # too similar to another page to be useful
                    pageHash = Simhash(pageTextString)
                    minDist = 100000000000
                    for hashedPage in hashes:
                        dist = pageHash.distance(hashedPage)
                        if dist < minDist:
                            minDist = dist
                        if dist <= 3:
                            return list()
                    hashes.add(pageHash)
                    print(minDist)

                    # Tokenize the page and put the resulting list in pageListofWords
                    pageListofWords = []
                    currWord = ""
                    for char in pageTextString:
                        try:
                            charOrd = ord(char)
                            # uppercase ASCII letters: ord('A') is 65, ord('Z') is 90
                            if (charOrd >= 65 and charOrd <= 90):
                                currWord += char.lower()
                            elif (charOrd >= 48
                                  and charOrd <= 57) or (charOrd >= 97
                                                         and charOrd <= 122):
                                currWord += char
                            else:
                                if currWord != "":
                                    if not currWord in stopWords and len(
                                            currWord) > 1:
                                        pageListofWords.append(currWord)
                                    currWord = ""
                        except:
                            continue

                    # If the number of words is less than 150, return an empty list
                    # because this page is not useful enough
                    pageWordCount = len(pageListofWords)
                    if pageWordCount < 150:
                        return list()

                    # If this page has more words than the current longest page,
                    # set this page as the new longest page
                    global longestPageWordCount
                    global longestPageURL
                    if pageWordCount > longestPageWordCount:
                        longestPageWordCount = pageWordCount
                        longestPageURL = url
                        print("New longest page: " + url + " " +
                              str(longestPageWordCount))

                    # Increase word counters by their occurrences on this page
                    for word in pageListofWords:
                        if word not in stopWords:
                            if word not in totalWordDict:
                                totalWordDict[word] = 1
                            else:
                                totalWordDict[word] += 1

                    return listofLinks

                # Prints an exception if the page has non-UTF-8 characters
                except Exception as e:
                    print(e)

    # There was no response, or no content, or a bad resp_status
    return list()
Example #36
    def test_chinese(self):
        sh1 = Simhash(u'你好 世界!  呼噜。')
        sh2 = Simhash(u'你好,世界 呼噜')

        #self.assertEqual(sh1._features, [])
        self.assertEqual(sh1.distance(sh2), 0)
Example #37
    s2 = """我们知道,在文本去重的时候,有很多方式,在文本与文本之间对比,如果是整篇对比,费时费力,有人就想到用什么东西代表每篇文章,如摘要,当然,对计算机来说,摘要和整篇的区别只是缩小了篇幅,所以又有人想到了采用关键字来对比。这样确实可以大大缩减我们对比的复杂性。那我们怎么得到一篇文章的关键字呢?一般采用词频(TF),但是只用词频,如中文出现类似“的”、“我们”之类的词语很多,应该怎么去掉这些词语呢,手动去掉实在是麻烦,于是可以结合逆向词频(IDF),这就是著名的TD-IDF,一种提取一个文章的关键词的算法。词频我们很好理解,一个词语在整篇文章中出现的次数与词语总个数之比。IDF又怎么算呢,假如一个词语,在我们所有文章中出现的频率都非常高(例如“的”在我们多个文本中出现的次数很多),我们就认为,这个词语不具有代表性,就可以降低其作用,也就是赋予其较小的权值。
    那这个权重,我们怎么计算呢,(这里敲公式比较麻烦,直接找来图片),如下图,分子代表文章总数,分母表示该词语在这些文章(|D|)出现的篇数。一般我们还会采取分母加一的方法,防止分母为0的情况出现,在这个比值之后取对数,就是IDF了。
    simhash是一种局部敏感hash。我们都知道什么是hash。那什么叫局部敏感呢,假定A、B具有一定的相似性,在hash之后,仍然能保持这种相似性,就称之为局部敏感hash。
    在上文中,我们得到一个文档的关键词,取得一篇文章关键词集合,又会降低对比效率,我们可以通过hash的方法,把上述得到的关键词集合hash成一串二进制,这样我们直接对比二进制数,看其相似性就可以得到两篇文档的相似性,在查看相似性的时候我们采用海明距离,即在对比二进制的时候,我们看其有多少位不同,就称海明距离为多少。在这里,我是将文章simhash得到一串64位的二进制,一般取海明距离为3作为阈值,即在64位二进制中,只有三位不同,我们就认为两个文档是相似的。当然了,这里可以根据自己的需求来设置阈值。
    就这样,我们把一篇文档用一个二进制代表了,也就是把一个文档hash之后得到一串二进制数的算法,称这个hash为simhash。
    具体simhash步骤如下:
    (1)将文档分词,取一个文章的TF-IDF权重最高的前20个词(feature)和权重(weight)。即一篇文档得到一个长度为20的(feature:weight)的集合。
    (2)对其中的词(feature),进行普通的哈希之后得到一个64为的二进制,得到长度为20的(hash : weight)的集合。
    (3)根据(2)中得到一串二进制数(hash)中相应位置是1是0,对相应位置取正值weight和负值weight。例如一个词进过(2)得到(010111:5)进过步骤(3)之后可以得到列表[-5,5,-5,5,5,5],即对一个文档,我们可以得到20个长度为64的列表[weight,-weight...weight]。
    """
import lshash
    # a1 = Simhash('How are you? I AM fine. Thanks. And you?')
    # a2 = Simhash('How old are you ? :-) i am fine. Thanks. And you?')
    a1 = Simhash(s1)
    a2 = Simhash(s2)
    ret = a1.distance(a2)
    print(ret)
    exit(0)




    usage = """  
            ftclass.py -r <training data set file> -t <test data set file> -m <model file>

            -r file        training data set file name
            -t file        test data set file name
            -m file        model file name
            -l label       separator between the keywords and the category label, "__label__"

            ftclass.py  -r train.txt  -t test.txt -m model.bin -c a,b,c,d,e  -l __|||__