Пример #1
0
def main():
    hashm = zerorpc.Client('tcp://yaha.v-find.com:5678')
    #load_hashes(hashm)
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    r = redis.Redis()
    corpus = []

    while True:
        try:
            # process queue as FIFO, change `blpop` to `brpop` to process as LIFO
            source, data = r.blpop(["groud_crawler:items", "followall:items"], timeout=20)
        except KeyboardInterrupt:
            print 'Exit'
            break
        except:
            #print 'No blpop', len(corpus)
            if len(corpus) > 0:
                sim_server.index(corpus)
                corpus = []
            continue
        try:
            #print source, type(data)
            item = json.loads(data)
        except:
            print 'Load json error'
            continue
        url = item['url']
        try:
            html = HtmlContent.objects.get(url=url)
            #Ignore the exists item, TODO use bloomfilter to ignore
            if html.status != 2:
                continue
        except:
            html = HtmlContent(url=url)
        try:
            html.title = item['title'][0:200]
            html.content = item['content']
            tokens = list(Tokenize(html.content))
            html.hash = hash_token(tokens)
            #html.hash = long(simhashpy(tokens))
            html.tags,html.summerize,html.classify = summarize(html.content)
            html.summerize = html.summerize[0:400]
            html.preview = item['preview']

            if find_duplicate(hashm, html.hash) != 0:
                #Mark as duplicate
                html.status = 1
            else:
                html.status = 0

            html.save()
            hashm.insert(html.hash)
            if html.status == 0:
                doc = {}
                doc['id'] = 'html_%d' % html.id
                doc['tokens'] = tokens
                corpus.append(doc)
                #print 'Append corpus', len(corpus), corpus[-1]['id']
                if len(corpus) >= CORPUS_LEN:
                    sim_server.index(corpus)
                    corpus = []

            #print 'Saved url %s' % html.url
        except:
            tb = traceback.format_exc()
            print 'Load json error', html.url, tb
Пример #2
0
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    dels = []
    for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
        dels.append('html_%d' % obj.id)
    sim_server.delete(dels)
#hash_test()

obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1,h2
print corpus.distance(h1,h2)
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1,h2
print corpus.distance(h1,h2)
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1,h2
print corpus.distance(h1,h2)

'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhash.hash_token(t1)
h2 = simhash.hash_token(t2)
Пример #3
0
    for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
        dels.append('html_%d' % obj.id)
    sim_server.delete(dels)


#hash_test()

obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)
'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhash.hash_token(t1)
h2 = simhash.hash_token(t2)
h2 = simhash.hash_token(t1)