/
simhash_test.py
102 lines (87 loc) · 3.79 KB
/
simhash_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: UTF-8 -*-
import sys, os
import numpy as np
from hashes.simhash import simhash as simhashpy
from hashes.nilsimsa import nilsimsa
from sklearn.metrics.pairwise import pairwise_distances as sk_pd
import simhash
# Module-level simhash.Corpus shared by hashdistance() and the ad-hoc
# comparisons further down (distance / insert / find_all calls).
# NOTE(review): the meaning of the (6, 3) arguments is not visible in this
# file -- confirm against the simhash binding before changing them.
corpus = simhash.Corpus(6,3)
def hashdistance(str1, str2):
    """Print a simhash comparison of two inputs and index both hashes.

    A ``simhashpy`` hash is built for each argument (64 is passed as the
    second argument, presumably the hash width in bits).  The function then
    prints, in order: both hashes on one line, ``hamming_distance`` between
    them, the distance reported by the module-level ``corpus``, and -- after
    both hashes have been inserted into ``corpus`` -- the result of
    ``corpus.find_all`` for the first hash.

    Nothing is returned; the function is used for its printed output and
    has the side effect of adding two entries to ``corpus``.
    """
    first, second = [simhashpy(text, 64) for text in (str1, str2)]

    # Single-argument print(...) reads the same under the Python 2 print
    # statement, so the two hashes are joined by hand with one space.
    print('%s %s' % (first, second))
    print(first.hamming_distance(second))
    print(corpus.distance(first, second))

    for digest in (first, second):
        corpus.insert(digest)
    print(corpus.find_all(first))
# Sample inputs for manual hashdistance() runs: strings[0] is one short
# sentence, strings[1] and strings[2] are long repeated passages that are
# near-duplicates of each other.
strings = ['n some cases it’s useful to restrict the number of features. CountVectorizer has a'
, ' CountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabulary'
, ' CountVectorizer has a constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits the vocabularyCountVectorizer has a max_features constructor argument that limits th has a max_features constructor that limits the vocabul']
#hashdistance(strings[1], strings[2])
# Put the Django project directory on sys.path and select its settings
# module; these lines sit ahead of the pull.models import, which presumably
# depends on both.
django_path = '/opt/projects/git_source/Similar'
# NOTE(review): index 13 looks arbitrary -- list.insert() treats an index past
# the end as an append, so on a shorter sys.path this entry lands last.
sys.path.insert(13, django_path)
os.environ['DJANGO_SETTINGS_MODULE'] = 'pull.settings'
from django.db.models import Count
from django.db.models import Q
# Star import; HtmlContent, used by the code below, presumably comes from here.
from pull.models import *
from cppjiebapy import Tokenize
# Repeats the simhashpy import already made near the top of the file.
from hashes.simhash import simhash as simhashpy
#import dse
#dse.patch_models(specific_models=[HtmlContent])
from bulkops import update_many
import Pyro4
import zerorpc
# zerorpc client given the endpoint tcp://localhost:5678; hash_all() calls
# find_first() (through find_duplicate) and insert() on it.
c = zerorpc.Client('tcp://localhost:5678')
def find_duplicate(hashm, hash):
    """Return element 1 of the first entry from ``hashm.find_first(hash)``.

    ``hashm`` is any object exposing ``find_first``; ``hash`` is passed to it
    unchanged.  The result is indexed as ``[0][1]`` with no guarding, so an
    empty result or a too-short first entry raises ``IndexError``.
    """
    first_entry = hashm.find_first(hash)[0]
    return first_entry[1]
def hash_all():
    """Hash every non-empty HtmlContent row with status <= 2 and index it.

    For each selected row the tokenized content is hashed with
    ``simhash.hash_tokenpy``.  The row's ``status`` becomes 0 when
    ``find_duplicate`` on the module-level client ``c`` returns 0 for that
    hash, and 1 otherwise.  The hash is then stored on the row, the row is
    saved, and only afterwards is the hash inserted into ``c`` -- so a row
    is never compared against its own freshly inserted hash.
    """
    pending = HtmlContent.objects.filter(status__lte=2).filter(~Q(content=''))
    for page in pending:
        digest = simhash.hash_tokenpy(list(Tokenize(page.content)))
        page.status = 0 if find_duplicate(c, digest) == 0 else 1
        page.hash = digest
        page.save()
        c.insert(digest)
#hash_all()
def hash_test():
    """Delete all status-1, non-empty HtmlContent rows from the sim server.

    Looks up ``'gensim.testserver'`` through the Pyro4 name server, builds
    one ``'html_<id>'`` key per matching row, and sends the whole list in a
    single ``delete`` call on the resulting proxy.
    """
    name_server = Pyro4.locateNS()
    sim_server = Pyro4.Proxy(name_server.lookup('gensim.testserver'))
    flagged = HtmlContent.objects.filter(status=1).filter(~Q(content=''))
    sim_server.delete(['html_%d' % row.id for row in flagged])
#hash_test()
# Ad-hoc comparison of two stored documents (pk 6870 and pk 7024): their
# tokenized content is hashed by three implementations in turn, and for each
# one the two hashes and their corpus.distance() are printed.
obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))

# Each entry is wrapped in a lambda so the underlying attribute is looked up
# only when its turn comes, exactly as in straight-line code.
_HASHERS = (
    lambda tokens: simhashpy(tokens, 64),
    lambda tokens: simhash.hash_token(tokens),
    lambda tokens: simhash.hash_tokenpy(tokens),
)
for _hasher in _HASHERS:
    h1 = _hasher(token1)
    h2 = _hasher(token2)
    # Single-argument print(...) reads the same under the Python 2 print
    # statement, so the pair is joined by hand with one space.
    print('%s %s' % (h1, h2))
    print(corpus.distance(h1, h2))
# Disabled experiment kept inside a bare string literal, so none of it runs.
# It hashes the whitespace-split words of two strings that contain the same
# words in a different order.  Note that h2 is assigned twice, the second
# time from t1, so as written it would compare t1's hash with itself.
'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhash.hash_token(t1)
h2 = simhash.hash_token(t2)
h2 = simhash.hash_token(t1)
print h1,h2
print corpus.distance(h1,h2)
'''