Пример #1
0
def simhash(words, doc_corpus):
    """Build one Simhash per document from weighted term features.

    Args:
        words: Sequence of tokenised documents; each document is an iterable
            of hashable, mutually sortable tokens.
        doc_corpus: One entry per document, aligned with ``words``. Each entry
            is an iterable of items whose ``[0]`` is an index into the
            vocabulary rebuilt below and whose ``[1]`` is the weight for that
            term. (Presumably a bag-of-words style corpus -- confirm against
            the caller that its ids follow the same ordering.)

    Returns:
        A list of ``Simhash`` objects, one per document, in input order.
    """
    # Rebuild the index -> token mapping used by ``doc_corpus``: documents are
    # visited in order and each contributes its not-yet-seen tokens in sorted
    # order. ``seen`` mirrors ``vocab`` so membership checks are O(1) instead
    # of scanning the growing list.
    vocab = []
    seen = set()
    for doc in words:
        fresh = sorted({token for token in doc if token not in seen})
        vocab.extend(fresh)
        seen.update(fresh)

    sims = []
    for doc, corpus in zip(words, doc_corpus):
        # Translate (vocabulary index, weight) items into (token, weight).
        features = [(vocab[entry[0]], entry[1]) for entry in corpus]
        sim = Simhash(doc)
        # Replace the fingerprint computed by the constructor with one built
        # from the weighted features.
        sim.build_by_features(features)
        sims.append(sim)
    return sims
Пример #2
0
from simhash import Simhash, SimhashIndex

# simhash: fingerprint two sentences that differ in a single word
# ("large" vs "big") and compare them.
str0 = ('The Apache Hadoop software library is a framework that allows '
        'for the distributed processing large data')
str1 = ('The Apache Hadoop software library is a framework that allows '
        'for the distributed processing big data')
sh0, sh1 = Simhash(str0), Simhash(str1)
print(sh0.distance(sh1))  # Hamming distance between the two fingerprints

# Rebuild both fingerprints from the same list of (feature, weight) pairs and
# compare them again.
features = [
    ('Apache', 10),
    ('Hadoop', 15),
    ('framework', 3),
    ('distributed', 10),
    ('data', 6),
]
sh0.build_by_features(features)
sh1.build_by_features(features)
print(sh0.distance(sh1))

# build a simhash index keyed by the stringified ids of ``data``
data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(doc_id), Simhash(text)) for doc_id, text in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
print(index.get_near_dups(s1))  # query before s1 itself is in the index
index.add('4', s1)
print(index.get_near_dups(s1))  # query again after adding s1 under id '4'
Пример #3
0
 def generate_simhash(self, tokens):
   """Generate a Simhash from Spacy tokens.

   ``tokens`` is handed unchanged to ``Simhash.build_by_features``;
   ``self.hash_size`` is passed as the constructor's ``f`` argument.
   """
   # The constructor requires an initial value, so start from an empty
   # string and then rebuild the fingerprint from the tokens.
   fingerprint = Simhash(u'', f=self.hash_size)
   fingerprint.build_by_features(tokens)
   return fingerprint