def hash_all():
    """Fingerprint every pending HtmlContent row and flag duplicates.

    Walks each non-empty HtmlContent with status <= 2, simhashes its
    tokenized content, marks the row as original (0) or duplicate (1)
    based on a corpus lookup, persists the fingerprint, and registers
    it in the corpus so later documents compare against it.

    NOTE(review): depends on a module-level corpus object ``c`` defined
    elsewhere in this file — confirm it is initialized before calling.
    """
    pending = HtmlContent.objects.filter(status__lte=2).filter(~Q(content=''))
    for page in pending:
        fingerprint = simhash.hash_tokenpy(list(Tokenize(page.content)))
        # find_duplicate(...) == 0 means no near-duplicate exists yet.
        page.status = 0 if find_duplicate(c, fingerprint) == 0 else 1
        page.hash = fingerprint
        page.save()
        c.insert(fingerprint)
def hash_all():
    # NOTE(review): duplicate re-definition of hash_all — an essentially
    # identical version appears earlier in this file.  At import time this
    # later definition silently replaces the earlier one; confirm which
    # copy is intended and remove the other.
    #
    # Fingerprint each non-empty HtmlContent with status <= 2, flag it as
    # original (0) or duplicate (1) via find_duplicate, store the hash,
    # and register it in the module-level corpus `c` (defined elsewhere).
    for obj in HtmlContent.objects.filter(
            status__lte=2).filter(~Q(content='')):
        h = simhash.hash_tokenpy(list(Tokenize(obj.content)))
        if find_duplicate(c, h) == 0:
            # no near-duplicate found in the corpus
            obj.status = 0
        else:
            obj.status = 1
        obj.hash = h
        obj.save()
        # register the fingerprint so later documents compare against it
        c.insert(h)
sim_server.delete(dels) #hash_test() obj1 = HtmlContent.objects.get(pk=6870) obj2 = HtmlContent.objects.get(pk=7024) token1 = list(Tokenize(obj1.content)) token2 = list(Tokenize(obj2.content)) h1 = simhashpy(token1, 64) h2 = simhashpy(token2, 64) print h1,h2 print corpus.distance(h1,h2) h1 = simhash.hash_token(token1) h2 = simhash.hash_token(token2) print h1,h2 print corpus.distance(h1,h2) h1 = simhash.hash_tokenpy(token1) h2 = simhash.hash_tokenpy(token2) print h1,h2 print corpus.distance(h1,h2) ''' str1 = 'test love you' str2 = 'love you test' t1 = str1.decode('utf-8').split() t2 = str2.decode('utf-8').split() h1 = simhash.hash_token(t1) h2 = simhash.hash_token(t2) h2 = simhash.hash_token(t1) print h1,h2 print corpus.distance(h1,h2) '''
#hash_test() obj1 = HtmlContent.objects.get(pk=6870) obj2 = HtmlContent.objects.get(pk=7024) token1 = list(Tokenize(obj1.content)) token2 = list(Tokenize(obj2.content)) h1 = simhashpy(token1, 64) h2 = simhashpy(token2, 64) print h1, h2 print corpus.distance(h1, h2) h1 = simhash.hash_token(token1) h2 = simhash.hash_token(token2) print h1, h2 print corpus.distance(h1, h2) h1 = simhash.hash_tokenpy(token1) h2 = simhash.hash_tokenpy(token2) print h1, h2 print corpus.distance(h1, h2) ''' str1 = 'test love you' str2 = 'love you test' t1 = str1.decode('utf-8').split() t2 = str2.decode('utf-8').split() h1 = simhash.hash_token(t1) h2 = simhash.hash_token(t2) h2 = simhash.hash_token(t1) print h1,h2 print corpus.distance(h1,h2) '''