/
csim_answer.py
154 lines (112 loc) · 3.85 KB
/
csim_answer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python
#
# Todo...make it so that this is a window based scheme and not based on sentence
# if the window is the best return the sentence that it came from
# we move through the sentence by incremementally moving our window to the right
# one word at a time
#
#
import math
import os
import re
import sys
from collections import defaultdict

import nltk
# Module-level NLP helpers shared by every function below.
# NOTE(review): PunktWordTokenizer was removed in newer NLTK releases;
# this code assumes an old NLTK -- confirm the installed version provides it.
tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
# Sentence splitter used to break the article into candidate sentences.
sent_tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
# Porter stemmer used to normalize tokens before building frequency vectors.
stemmer = nltk.PorterStemmer()
# NOTE(review): sim_list is never written or read anywhere in this file --
# it looks like dead state from an earlier design; candidate for removal.
sim_list = []
#-------------------------------------------------------------------------------
# main()
#
# Start Point
#
#-------------------------------------------------------------------------------
def main(strQuestion, articleFile):
    """Entry point: find and print the best answer sentence.

    strQuestion -- the question as a plain string.
    articleFile -- an open, readable file object containing the article.
    """
    csim_answer(strQuestion, articleFile)
#-------------------------------------------------------------------------------
# Cosine Similarity Answer
#
# This function will allow you to give it an article and a question and it will
# return the sentence from the file that is most likely to answer the given
# question.
#
#-------------------------------------------------------------------------------
def csim_answer(question, articleFile):
    """Print (and return) the sentence or 12-token window from the article
    that is most cosine-similar to the question.

    question    -- the question as a plain string.
    articleFile -- an open, readable file object containing the article.

    Sentences longer than the window are scanned with a sliding window so a
    long sentence is judged on its best-matching span, not its whole length.
    """
    # Bug fix: the original passed the raw question STRING to get_stems(),
    # which iterates it character by character, producing a vector of
    # single letters.  Tokenize into words first, as the sentence path does.
    q_vector = get_vector(get_stems(tokenizer.tokenize(question)))
    # Get the sentences from the file.
    article_sents = sent_tokenizer.tokenize(articleFile.read())
    WINDOW = 12  # sliding-window width in tokens (was hard-coded as 12/11)
    maxSim = 0
    maxAns = ""
    for sent in article_sents:
        sent_parts = tokenizer.tokenize(sent)
        stems = get_stems(sent_parts)
        if len(stems) > WINDOW:
            # Slide a WINDOW-token window across the sentence; report the
            # window text itself (stems and sent_parts are index-aligned
            # because the stemmer maps one token to one stem).
            for i in range(len(stems) - WINDOW + 1):
                cs = calc_sim(q_vector, get_vector(stems[i:i + WINDOW]))
                if cs > maxSim:
                    maxSim = cs
                    maxAns = " ".join(sent_parts[i:i + WINDOW])
        else:
            # Short sentence: score it whole and report the original text.
            cs = calc_sim(q_vector, get_vector(stems))
            if cs > maxSim:
                maxSim = cs
                maxAns = sent
    print(maxAns)
    # Also return the answer so programmatic callers need not capture stdout
    # (backward compatible: the original implicitly returned None).
    return maxAns
#-------------------------------------------------------------------------------
# get_stems(sent)
#
# This function will return an array of words which are the stemmed
# words from the sentence.
#
#-------------------------------------------------------------------------------
def get_stems(sent_parts):
    """Return the Porter stem of every token in sent_parts, in order."""
    return [stemmer.stem(token) for token in sent_parts]
#-------------------------------------------------------------------------------
# get_vector(stem_list)
#
# This function will return a nltk.defaultdict() which contains our vector
# representation of a given sentence given its stem_list
#
#-------------------------------------------------------------------------------
def get_vector(stem_list):
    """Return a term-frequency vector (stem -> count) for stem_list.

    Uses collections.defaultdict directly: nltk.defaultdict was only a
    legacy re-export of it and has been removed from modern NLTK releases.
    """
    vector = defaultdict(int)
    for stem in stem_list:
        vector[stem] += 1
    return vector
#-------------------------------------------------------------------------------
# calc_sim(q_vector, s_vector)
#
# This function will use cosine similarity to compute the similarity of two
# sentences.
#
#-------------------------------------------------------------------------------
def calc_sim(q_vector, s_vector):
    """Return the cosine similarity of two term-frequency vectors.

    q_vector, s_vector -- mappings from stem to count.
    Returns a float in [0, 1]; 0 when either vector is empty/zero.

    Bug fixes versus the original:
      * q_vector[word] on a defaultdict INSERTED zero entries for every
        sentence word, silently mutating the caller's question vector;
        we use .get(word, 0) instead.
      * The question norm was summed only over words that appear in the
        sentence, so the result was not a true cosine; each vector is now
        normalized over its own keys.
    """
    num = 0
    sden = 0
    for word, s_count in s_vector.items():
        num += s_count * q_vector.get(word, 0)
        sden += s_count ** 2
    qden = 0
    for q_count in q_vector.values():
        qden += q_count ** 2
    if qden * sden == 0:
        # One of the vectors is empty: similarity is undefined; treat as 0.
        return 0
    return num / (math.sqrt(qden) * math.sqrt(sden))
### If running as a console script ###
if __name__ == "__main__":
    # Bug fix: main() requires two arguments, so the original bare main()
    # call always raised TypeError.  Take them from the command line.
    if len(sys.argv) != 3:
        sys.stderr.write("usage: %s QUESTION ARTICLE_FILE\n" % sys.argv[0])
        sys.exit(1)
    with open(sys.argv[2]) as article:
        main(sys.argv[1], article)