# Imports reconstructed for this excerpt: the stdlib/Django/Pyro4 ones are
# certain from usage; the project-local paths below are assumptions and are
# left commented so as not to assert module names the excerpt never shows.
import urllib2

import Pyro4
from django.db.models import Q
from django.http import HttpResponse

#from .models import HtmlContent                       # assumed locations of the
#from .text import TextExtract, TextToHtml, Tokenize   # project-local helpers
#from .hashing import simhash, simhashpy, corpus       # used below


def hashdistance(str1, str2):
    """Fingerprint two strings with 64-bit simhash and print their distances."""
    hash1 = simhashpy(str1, 64)
    hash2 = simhashpy(str2, 64)
    #distance = 1 - hash1.similarity(hash2)
    #return hash1.similarity(hash2)
    print hash1, hash2
    print hash1.hamming_distance(hash2)
    print corpus.distance(hash1, hash2)
    corpus.insert(hash1)
    corpus.insert(hash2)
    print corpus.find_all(hash1)
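# Usage sketch (my addition, not in the original module): `corpus` is assumed
# to be the module-level simhash corpus referenced above. Since simhash is
# computed over the token set, the two near-duplicate strings used in the
# tests further down should give a Hamming distance of 0 or very close to it.
#
#   hashdistance('test love you', 'love you test')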
def proxy_task(id):
    try:
        html = HtmlContent.objects.get(pk=id)
        #print 'html content', html.content
        if html.status <= 1 and html.content != '':
            return html.status
    except HtmlContent.DoesNotExist:  # narrowed from a bare except
        return HttpResponse('not found')
    try:
        req = urllib2.Request(html.url)
        req.add_header('User-Agent', USER_AGENT)
        # handler that transparently accepts gzip/deflate responses
        encoding_support = ContentEncodingProcessor
        opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)
        #print 'requesting'
        proxied_request = opener.open(req, timeout=12)
        content = proxied_request.read()
        #print 'requested'
    except urllib2.HTTPError:
        html.status = 3
        print 'urllib2 error'
    else:
        try:
            ucontent = content.decode('utf-8')
        except UnicodeDecodeError:
            ucontent = content.decode('gbk', 'ignore')
        #print 'parsing'
        tx = TextExtract(ucontent)
        #print 'parsed'
        html.title = tx.title
        html.content = tx.content.strip()
        if tx.content == '':
            print 'Parse html error'
            html.status = 4
        else:
            html.status = 0
            html.hash = long(simhashpy(list(Tokenize(html.content))))
            html.tags, html.summerize = summarize(html.content)
            # fall back to rendering the extracted text when the preview is
            # too short once tags are stripped
            if len(html_remove.sub('', tx.preview)) < 250:
                html.preview = TextToHtml(tx.content)
            else:
                html.preview = tx.preview
        #print html.id, html.title, html.tags, html.summerize
    html.save()  # persist whichever status was reached, including fetch errors
    if html.status == 0:
        print 'begin sim_index'
        #sim_index(html)
    return html.status
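# Usage sketch (an assumption about how the task is driven; the original
# presumably calls it from a queue or cron job). The status codes come from
# the body above: 0 = fetched and parsed, 3 = HTTP error, 4 = parse error.
#
#   status = proxy_task(6870)
#   if status == 0:
#       print 'fetched and parsed ok'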
    c.insert(h)  # tail of the preceding hash_all() helper; its body is elided above

#hash_all()


def hash_test():
    # Drop stale documents from the similarity server for rows that
    # still have content.
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    dels = []
    for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
        dels.append('html_%d' % obj.id)
    sim_server.delete(dels)

#hash_test()

obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))

# Compare the simhash implementations (Python and, judging by the names,
# a native variant) on the same token lists.
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)

h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)

h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)

'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()