# coding: utf-8
from itertools import product
from collections import defaultdict

import numpy as np
import pulp
import nltk
import string
from gensim.models.keyedvectors import KeyedVectors
from nltk.corpus import stopwords
from scipy.spatial.distance import euclidean

# NOTE: nltk.word_tokenize and stopwords.words require the NLTK 'punkt'
# and 'stopwords' data packages to be downloaded beforehand.


class Similarity_score:
    def __init__(self, target_text, text_list, model_path):
        self.target_text = target_text
        self.text_list = text_list
        self.w2v_model = KeyedVectors.load(model_path)
    def result(self):
        """
        Use the Word Mover's Distance algorithm and the pre-trained word vectors to calculate similarity scores
        :return: the similarity scores between the target text and each comparable text in the text list
        """
        def tokens_to_fracdict(tokens):
            """
            Convert a token list into a normalized frequency distribution
            :param tokens: a list of tokens to handle
            :return: a dictionary of {token: fraction of the total token count}
            """
            cntdict = defaultdict(int)
            for token in tokens:
                cntdict[token] += 1
            totalcnt = sum(cntdict.values())
            return {token: float(cnt) / totalcnt for token, cnt in cntdict.items()}
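            # For example, tokens_to_fracdict(['no', 'rain', 'no', 'sun'])
            # returns {'no': 0.5, 'rain': 0.25, 'sun': 0.25}.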
        def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, lpFile=None):
            """
            Apply the Word Mover's Distance algorithm
            :param first_sent_tokens: tokens of the target text
            :param second_sent_tokens: tokens of the comparable text (each text in the text list)
            :param wvmodel: the pre-trained word vector model
            :param lpFile: an optional file path to store the LP formulation
            :return: a solved pulp LpProblem (its objective value is read out in the last part of result())
            """
            all_tokens = list(set(first_sent_tokens + second_sent_tokens))
            wordvecs = {}
            for token in all_tokens:
                try:
                    wordvecs[token] = wvmodel[token]
                except KeyError:
                    # Out-of-vocabulary tokens fall back to a zero vector so that
                    # euclidean() always receives two vectors of the same length.
                    wordvecs[token] = np.zeros(wvmodel.vector_size)
            first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
            second_sent_buckets = tokens_to_fracdict(second_sent_tokens)

            # T[token1, token2] is the amount of probability mass moved from
            # token1 (first sentence) to token2 (second sentence).
            T = pulp.LpVariable.dicts('T_matrix', list(product(all_tokens, all_tokens)), lowBound=0)
            prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
            # Objective: total transport cost, weighted by word-vector distances.
            prob += pulp.lpSum([T[token1, token2] * euclidean(wordvecs[token1], wordvecs[token2])
                                for token1, token2 in product(all_tokens, all_tokens)])
            # Marginal constraints: the mass arriving at each token of the second
            # sentence, and leaving each token of the first sentence, must match
            # the sentences' normalized token frequencies.
            for token2 in second_sent_buckets:
                prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets]) == second_sent_buckets[token2]
            for token1 in first_sent_buckets:
                prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets]) == first_sent_buckets[token1]
            if lpFile is not None:
                prob.writeLP(lpFile)
            prob.solve()
            return prob
        def normalize(text):
            """
            Lowercase the text, remove punctuation, tokenize, and drop stopwords
            :param text: the raw input string
            :return: a list of normalized words
            """
            stop_words = set(stopwords.words('english')) - set(['no', 'not', 'nor'])  # keep the negations 'no', 'not', and 'nor'
            remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
            res = nltk.word_tokenize(text.lower().translate(remove_punctuation_map))
            return [w for w in res if w not in stop_words]
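            # For example, normalize('The weather is not good!') returns
            # ['weather', 'not', 'good']: 'the' and 'is' are stopwords, while
            # 'not' survives because it was removed from the stopword set above.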
        res = []
        for text in self.text_list:
            prob = word_mover_distance_probspec(normalize(self.target_text),
                                                normalize(text),
                                                self.w2v_model)
            # Turn the distance into a similarity via an ad hoc linear scaling:
            # a WMD of 0 maps to similarity 1, and a WMD of 5 maps to 0.
            res.append(1 - pulp.value(prob.objective) / 5)
        return res  # the similarity scores based on the Word Mover's Distance algorithm
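

if __name__ == '__main__':
    # Minimal usage sketch. The model path and the texts below are hypothetical
    # placeholders: the KeyedVectors file must exist locally (e.g. produced by
    # gensim's model.save()), and the NLTK 'punkt' and 'stopwords' data
    # packages must already be downloaded.
    scorer = Similarity_score(
        target_text='The weather today is not good.',
        text_list=['Today the weather is bad.', 'I like reading books.'],
        model_path='word2vec.kv',  # hypothetical path to a saved KeyedVectors file
    )
    print(scorer.result())  # one similarity score per text in text_list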