import json
import traceback

import redis
import zerorpc
import Pyro4
from django.db.models import Q

# Project-local dependencies assumed importable from the surrounding project:
# the HtmlContent Django model, Tokenize, hash_token, simhashpy, summarize,
# find_duplicate, the simhash/corpus modules and the CORPUS_LEN batch size.


def main():
    # zerorpc server that stores the simhash fingerprints seen so far.
    hashm = zerorpc.Client('tcp://yaha.v-find.com:5678')
    #load_hashes(hashm)
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    r = redis.Redis()
    corpus = []
    while True:
        try:
            # Process the queues as FIFO; change blpop to brpop for LIFO.
            popped = r.blpop(['groud_crawler:items', 'followall:items'],
                             timeout=20)
        except KeyboardInterrupt:
            print 'Exit'
            break
        if popped is None:
            # blpop timed out with nothing queued; flush any documents
            # accumulated so far to the similarity server.
            if corpus:
                sim_server.index(corpus)
                corpus = []
            continue
        source, data = popped

        try:
            item = json.loads(data)
        except ValueError:
            print 'Load json error'
            continue

        url = item['url']
        try:
            html = HtmlContent.objects.get(url=url)
            # Skip items we have already processed;
            # TODO: use a bloom filter for this check.
            if html.status != 2:
                continue
        except HtmlContent.DoesNotExist:
            html = HtmlContent(url=url)

        try:
            html.title = item['title'][0:200]
            html.content = item['content']
            tokens = list(Tokenize(html.content))
            html.hash = hash_token(tokens)
            #html.hash = long(simhashpy(tokens))
            html.tags, html.summerize, html.classify = summarize(html.content)
            html.summerize = html.summerize[0:400]
            html.preview = item['preview']
            if find_duplicate(hashm, html.hash) != 0:
                # Mark as a near-duplicate of an already-seen document.
                html.status = 1
            else:
                html.status = 0
            html.save()
            hashm.insert(html.hash)
            if html.status == 0:
                # Queue unique documents for batch indexing.
                doc = {'id': 'html_%d' % html.id, 'tokens': tokens}
                corpus.append(doc)
                if len(corpus) >= CORPUS_LEN:
                    sim_server.index(corpus)
                    corpus = []
        except Exception:
            tb = traceback.format_exc()
            print 'Process item error', html.url, tb
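
# For reference, a minimal pure-Python sketch of the simhash scheme that
# hash_token / simhashpy above are assumed to implement (hypothetical helper,
# not the project's actual code): each token's hash votes on every bit
# position, and the sign of the running total fixes that bit of the 64-bit
# fingerprint, so documents sharing most tokens get nearby fingerprints.
def _simhash_sketch(tokens, bits=64):
    votes = [0] * bits
    for token in tokens:
        # Unweighted token hash; real implementations often weight tokens.
        h = hash(token) & ((1 << bits) - 1)
        for i in range(bits):
            votes[i] += 1 if (h >> i) & 1 else -1
    fingerprint = 0
    for i in range(bits):
        if votes[i] > 0:
            fingerprint |= 1 << i
    return fingerprint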
sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))

# Remove documents already marked as duplicates (status=1, non-empty content)
# from the similarity server's index.
dels = []
for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
    dels.append('html_%d' % obj.id)
sim_server.delete(dels)

#hash_test()

# Compare the available simhash implementations on the same pair of
# documents; `corpus` here refers to a project-local module whose
# distance() is assumed to return the distance between two fingerprints.
obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))

h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)

h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)

h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)

'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhash.hash_token(t1)
h2 = simhash.hash_token(t2)
'''
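
# corpus.distance above is assumed to be the Hamming distance between two
# fingerprints, which is the standard simhash similarity measure:
# near-duplicate documents yield a small distance. A minimal equivalent:
def _hamming_distance(h1, h2):
    # Count the bit positions where the two fingerprints differ.
    return bin(h1 ^ h2).count('1')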