/
similarity.py
52 lines (40 loc) · 1.09 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import enchant
from nltk.stem.porter import *
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
# Tokenizer used by tokenize() to split raw text into tokens.
t = TweetTokenizer()
# en_US spell-check dictionary; stem() uses it to drop words it does not recognize.
d = enchant.Dict("en_US")
# Porter stemmer that stem() applies to every word it keeps.
stemmer = PorterStemmer()
# NLTK's English stopword list, held as a set for fast membership tests in stem().
stopword = set(stopwords.words('english'))
def tokenize(text):
    """Split *text* into tokens using the module-level TweetTokenizer ``t``."""
    return t.tokenize(text)
def stem(text):
    """Return the stems of the meaningful words in *text*, in order.

    Each token produced by ``tokenize`` is lower-cased. A token is kept
    only if the module-level enchant dictionary ``d`` accepts it and it is
    not in the ``stopword`` set; kept tokens are reduced with ``stemmer``.
    Duplicates are preserved, so the result is a list rather than a set.
    """
    stems = []
    for token in tokenize(text):
        lowered = token.lower()
        # Dictionary check runs first, exactly as in the original guard order.
        if d.check(lowered) and lowered not in stopword:
            stems.append(stemmer.stem(lowered))
    return stems
def similarity(candidate1, candidate2):
    """Score how similar two texts are.

    Both texts are reduced to their sets of stems via ``stem`` and the two
    sets are compared with ``calc``.
    """
    return calc(set(stem(candidate1)), set(stem(candidate2)))
def calc(set1, set2):
    """Return the ratio of shared items to total distinct items (Jaccard index).

    Args:
        set1: First set of items.
        set2: Second set of items.

    Returns:
        ``len(intersection) / len(union)`` as a float. When both inputs are
        empty the union is empty too, and ``0.0`` is returned instead of
        dividing by zero.
    """
    union = set1.union(set2)
    if not union:
        # Return a float here as well so every code path yields the same type.
        return 0.0
    shared = set1.intersection(set2)
    # float() forces true division even under Python 2's integer "/".
    return float(len(shared)) / len(union)
def prepare(candidate):
    """Return the set of distinct stems that ``stem`` extracts from *candidate*."""
    stems = stem(candidate)
    return set(stems)
def main():
    """Print the similarity score of two hard-coded sample sentences."""
    a = 'yes they should they are humas to and if you love some on so much you get married and live together'
    b = 'Why not? It is a choice and everybody deserves to get married to the ones they love.'
    # Parenthesized single-argument print is valid on both Python 2 and 3;
    # the bare print statement is a syntax error on Python 3.
    print(similarity(a, b))


# Run the demo only when the module is executed as a script.
if __name__ == '__main__':
    main()