# tfidf.py -- TF-IDF document scoring with optional pseudo-relevance feedback.
import utils
import math
import sortedcontainers
import nltk
# Module-level cache: word -> document frequency (number of documents whose
# token list contains the word).  Filled lazily by find_term_freq_doc, so the
# expensive scan over all documents happens at most once per word.
doc_freqs={}
def average_length(doc_dict):
    """Return the average document length (in tokens) over the collection.

    doc_dict maps doc_id -> list of tokens.  The result is used for the
    length-normalisation ("tdf") part of the TF-IDF score.

    Returns 0.0 for an empty collection instead of raising
    ZeroDivisionError.
    """
    if not doc_dict:
        return 0.0
    total_length = sum(len(tokens) for tokens in doc_dict.values())
    return total_length / float(len(doc_dict))
def find_term_freq_doc(word, doc_dict):
    """Return the document frequency of *word*: the number of documents in
    doc_dict whose token list contains it.

    Results are memoised in the module-level ``doc_freqs`` dict so the O(N)
    scan over the collection runs at most once per word.
    NOTE(review): the cache is keyed on word only, so it assumes a single
    doc_dict per process run -- confirm if reused with several collections.
    """
    if word not in doc_freqs:
        # generator avoids materialising a throwaway list just to count
        doc_freqs[word] = sum(1 for tokens in doc_dict.values() if word in tokens)
    return doc_freqs[word]
def calculate_tfidf(query_text, doc_dict, average_doc_length, k):
    """Yield (doc_id, score) for every document scored against the query.

    Per document the score is the sum over distinct query words of
    tf_query * tdf * idf, where
        idf = log(|C| / dfw)                         (inverse doc frequency)
        tdf = tf_doc / (tf_doc + k * |D| / avgD)     (length-normalised tf)
    Query words that occur in no document contribute nothing.
    """
    collection_size = len(doc_dict)
    unique_query_words = set(query_text)
    for doc_id, doc_text in doc_dict.items():
        # length-normalisation term is constant per document -- hoist it
        length_penalty = k * len(doc_text) / average_doc_length
        score = 0
        for word in unique_query_words:
            dfw = find_term_freq_doc(word, doc_dict)
            if not dfw:
                # word absent from the whole collection: idf undefined, skip
                continue
            idf = math.log(float(collection_size) / dfw)
            tf_doc = doc_text.count(word)
            tdf = float(tf_doc) / (tf_doc + length_penalty)
            score += query_text.count(word) * tdf * idf
        yield doc_id, score
def standard_tfidf(query_dict, doc_dict, k=2):
    """Score every (query, document) pair with plain TF-IDF.

    Returns a dict mapping (query_id, doc_id) -> score.
    """
    average_doc_length = average_length(doc_dict)
    # one score per (query, document) pair
    return {
        (query_id, doc_id): tfidf_score
        for query_id, query_text in query_dict.items()
        for doc_id, tfidf_score in calculate_tfidf(
            query_text, doc_dict, average_doc_length, k)
    }
def find_ranking(top_ranking, doc_id, score, max_len):
    """Insert (doc_id, score) into *top_ranking*, keeping at most max_len
    of the highest-scoring entries.

    top_ranking is a sorted list ordered ascending by (score, doc_id), so
    the entry with the LOWEST score sits at index 0.  When the list
    overflows, drop that lowest entry -- the previous ``del top_ranking[-1]``
    evicted the highest-scoring document instead, the opposite of what the
    comment (and the feedback algorithm) intended.
    """
    top_ranking.add((doc_id, score))
    if len(top_ranking) > max_len:
        # ascending order: the worst (lowest-scoring) item is at the front
        del top_ranking[0]
    return top_ranking
def tfidf_pseudo_relevance_feedback(query_dict, doc_dict, k=2, num_docs=15, num_words=30):
    """TF-IDF with pseudo-relevance feedback (query expansion).

    For each query: score all documents, keep the num_docs top-scoring
    ones, pool their tokens, take the num_words most frequent pooled words,
    append them to the query, and rescore the collection with the expanded
    query.  Returns a dict mapping (query_id, doc_id) -> score.
    """
    average_doc_length = average_length(doc_dict)
    score_dict = {}
    for query_id in query_dict:
        query_text = query_dict[query_id]
        # Sorted ascending by (score, doc_id) so find_ranking can trim the
        # worst entry.  Index-based lambda replaces the Python-2-only
        # tuple-parameter unpacking, which is a SyntaxError on Python 3.
        top_ranking = sortedcontainers.SortedListWithKey(
            key=lambda item: (item[1], item[0]))
        for doc_id, tfidf_score in calculate_tfidf(query_text, doc_dict,
                                                   average_doc_length, k):
            top_ranking = find_ranking(top_ranking, doc_id, tfidf_score, num_docs)
        # pool every token from the top-ranked documents
        # TODO: improve
        top_doc_word_list = []
        for doc_id, _ in top_ranking:
            top_doc_word_list += doc_dict[doc_id]
        # most frequent expansion words; most_common() works on both
        # Python 2 and 3, whereas slicing FreqDist.keys() fails (and is
        # not frequency-ordered) on Python 3
        freq_dist = nltk.FreqDist(top_doc_word_list)
        best_words = [word for word, _ in freq_dist.most_common(num_words)]
        # rescore the whole collection with the expanded query
        new_query = query_text + best_words
        for doc_id, tfidf_score in calculate_tfidf(new_query, doc_dict,
                                                   average_doc_length, k):
            score_dict[(query_id, doc_id)] = tfidf_score
    return score_dict
if __name__ == "__main__":
    # load queries and documents (token lists keyed by id)
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')
    # plain TF-IDF run
    standard_tfidf_scores = standard_tfidf(query_dict, doc_dict)
    with open('results/tfidf.top', 'w') as output_file:
        # called for its side effect; the old code rebound the managed file
        # handle to write_result's return value, which was misleading
        utils.write_result(standard_tfidf_scores, output_file)
    # TF-IDF with pseudo-relevance feedback
    tfidf_with_prf_scores = tfidf_pseudo_relevance_feedback(query_dict, doc_dict)
    with open('results/best.top', 'w') as output_file:
        utils.write_result(tfidf_with_prf_scores, output_file)