# ex_similar_docs.py
# 20180333 Seyoung Song
# rev. 20170818 Sunjoo Yoon
import nltk
from nltk.corpus import reuters
import pandas as pd
from pprint import pprint
from random import randrange
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
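# The Reuters corpus and the English stopword list must be available locally;
# on a fresh environment they can be fetched once (uncomment if needed):
# nltk.download('reuters')
# nltk.download('stopwords')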
def reuters_dataframe(n=9160):
    # build a frame of the first n Reuters articles that carry exactly one category
    def clean(words):
        # lower-case, then drop English stopwords and tokens shorter than three characters
        stopwords = set(nltk.corpus.stopwords.words('english'))
        words_lower = [w.lower() for w in words]
        return [w for w in words_lower if w not in stopwords and len(w) >= 3]

    def title(words):
        # use the first 20 words as a crude title
        return words[:20]

    fileids = [i for i in reuters.fileids() if len(reuters.categories(i)) == 1][:n]
    df = pd.DataFrame({'text': [' '.join(clean(reuters.words(i))) for i in fileids],
                       'category': [reuters.categories(i)[0] for i in fileids],
                       'title': [' '.join(title(reuters.words(i))) for i in fileids],
                       'fileids': fileids,
                       'words': [reuters.words(i) for i in fileids]})
    return df
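# A hypothetical quick sanity check, not part of the original run: each row is
# one single-category article, with one column per field built above.
#   df = reuters_dataframe(n=100)
#   print(df.shape)    # expected: (100, 5)
#   print(df.columns)  # text, category, title, fileids, words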
def cosine_similarity(data):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['text'])
    return linear_kernel(tfidf_matrix, tfidf_matrix)
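# NOTE: TfidfVectorizer L2-normalises each row by default (norm='l2'), so the
# plain dot product computed by linear_kernel equals the cosine similarity but
# is cheaper to compute. A hypothetical equivalence check, not part of the
# original run:
#   import numpy as np
#   from sklearn.metrics.pairwise import cosine_similarity as sk_cosine
#   m = TfidfVectorizer(stop_words='english').fit_transform(reuters_df['text'])
#   assert np.allclose(linear_kernel(m, m), sk_cosine(m, m))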
def doc_similarity(fileid1, fileid2, _data, _cosine_sim):
    # map dataframe row index <-> fileid, then look up the pairwise score
    idx_to_fileid = dict(_data['fileids'])
    fileid_to_idx = {v: k for k, v in idx_to_fileid.items()}
    idx1 = fileid_to_idx[fileid1]
    idx2 = fileid_to_idx[fileid2]
    return _cosine_sim[idx1][idx2]
def similar_docs(fileid, data, cosine_sim):
    idx_to_fileid = dict(data['fileids'])
    fileid_to_idx = {v: k for k, v in idx_to_fileid.items()}
    idx = fileid_to_idx[fileid]
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
    return [(idx_to_fileid[i[0]], i[1]) for i in sim_scores]
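# NOTE: a document is (in effect) its own nearest neighbour with cosine score
# ~1.0, so element 0 of the returned list is typically the query article
# itself; the tests below rely on this when they read ret[0] and ret[j + 1].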
reuters_df = reuters_dataframe()
cosine_sim = cosine_similarity(reuters_df)
# pprint(similar_docs(reuters_df.fileids[0], reuters_df, cosine_sim)[:10])
# print(doc_similarity(reuters_df.fileids[0], reuters_df.fileids[1], reuters_df, cosine_sim))
# TESTING 1: MANUALLY CHECK RECOMMENDED ARTICLES FOR HUMAN-VERIFIED SIMILARITY WITH ORIGINAL
titlewords = ["lt", "Lt", "ltd", "Ltd", "co", "Co"] # words not capitalised even in title
for i in range(5):
    random_number = randrange(len(reuters_df.fileids))
    # get recommendations for five random articles
    ret = similar_docs(reuters_df.fileids[random_number], reuters_df, cosine_sim)
    for j in range(4):
        # get the text of the original article and of the top three recommendations
        text = reuters.words(ret[j][0])
        # the title of each article is capitalised, so use that to recover the title
        # (the bounds check guards against an article whose words are all upper-case)
        k = 0
        title = ""
        while k < len(text) and (text[k].isupper() or text[k][0] in string.punctuation or text[k] in titlewords):
            title = title + text[k].lower() + " "
            k = k + 1
        title = title.rstrip()
        # get the tags of the article
        tags = reuters.categories(ret[j][0])
        # print the article titles and their tags
        if j == 0:
            print("ORIGINAL ARTICLE:", title, "TAGS:", tags)
        else:
            print("RELATED ARTICLE:", title, "TAGS:", tags)
# TESTING 2: AUTOMATICALLY CHECK RECOMMENDATIONS FOR SHARED TAGS WITH ORIGINAL ARTICLE
score = 0
for i in range(1000):
    # get recommendations for a thousand random articles
    random_number = randrange(len(reuters_df.fileids))
    ret = similar_docs(reuters_df.fileids[random_number], reuters_df, cosine_sim)
    original_tags = reuters.categories(ret[0][0])
    for j in range(3):
        # only consider the top three recommendations for each original article
        tags = reuters.categories(ret[j + 1][0])
        # count how many of them share at least one tag with the original article
        if any(tag in original_tags for tag in tags):
            score = score + 1
# print the result
print("3000 recommendations generated, three each for 1000 randomly-selected articles")
print(score, "of 3000 had at least one shared reuters tag with the original article")
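# A hypothetical extension, not part of the original output: report the hit
# rate as a percentage of the 3000 recommendations.
# print("hit rate: %.1f%%" % (100 * score / 3000))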