-
Notifications
You must be signed in to change notification settings - Fork 0
/
question_answer_util.py
392 lines (362 loc) · 14.1 KB
/
question_answer_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
#!/usr/bin/env python
import sys
import nltk
import inflect
import re
import string
from textblob import TextBlob
import grammars as g
import util
import backup_answer as b
import parse as p
import sentence_edit_distance as edit
import np_util as n
# Module-level noun-phrase extractor shared by every TextBlob built in this file.
extractor = p.SuperNPExtractor()
def compare_phrases(q_phrase, t_phrase, uncommon):
    """Score how well the question phrase matches a candidate text phrase.

    q_phrase, t_phrase -- lists of (word, tag) pairs (TextBlob-style tags)
    uncommon           -- words considered rare in the source text; matches
                          on these are weighted much more heavily

    Returns (bigram_score, unigram_score, t_phrase) so callers can take
    max() over candidates and recover the winning phrase from the tuple.
    """
    lemmas_q = [w.lemmatize() for w, _ in q_phrase]
    lemmas_t = [w.lemmatize() for w, _ in t_phrase]

    def bigrams(seq):
        # Consecutive word pairs; empty when the sequence has < 2 words.
        return [seq[i:i + 2] for i in range(len(seq) - 1)]

    def weight(token):
        # Rare tokens are strong evidence: 3.5, plus 1 when title-cased.
        if token in uncommon:
            return 3.5 + (1 if token.istitle() else 0)
        return 1

    grams_t = bigrams(lemmas_t)
    bigram_score = sum(
        max(weight(a), weight(b))
        for a, b in bigrams(lemmas_q) if [a, b] in grams_t
    ) - float(len(lemmas_q)) + 1
    unigram_score = sum(
        weight(w) for w in lemmas_q if w in lemmas_t
    ) - float(len(lemmas_q))
    return (bigram_score, unigram_score, t_phrase)
def examine_rels(q, q_phrase, bestrels, uncommon, mode):
    """Inspect the best-matching relations and produce an answer string.

    q        -- the question TextBlob
    q_phrase -- list of (word, tag) pairs for the question part to match
    bestrels -- iterable of (score, relation) pairs; each relation is a
                list of (word, tag) pairs
    uncommon -- infrequent words used to weight phrase comparison
    mode     -- question type: 'IS', 'OBJECT', 'PERSON', 'GPE', 'DATETIME',
                'ABSTRACT', 'NUMBER' or 'VERB PHRASE'

    Returns an answer string, or "" when nothing could be extracted.
    """
    def best_relation(phrase):
        # Candidate relation that best matches `phrase`
        # (compare_phrases returns the relation as the tuple's 3rd element).
        return max([compare_phrases(phrase, relation, uncommon)
                    for best, relation in bestrels])

    def new_entities(rel):
        # Named entities present in the relation but absent from the question.
        q_ne = p.extract_named_entities(q)
        relblob = TextBlob(" ".join([word for word, tag in rel]))
        r_ne = p.extract_named_entities(relblob)
        return dict([(ne, tag) for ne, tag in r_ne.iteritems()
                     if ne not in q_ne.keys()])

    if mode == 'IS':
        comp1, comp2, rel = best_relation(q_phrase)
        print >> sys.stderr, comp1, comp2
        if comp1 >= -2 and comp2 >= 0:
            # "at most 2 unmatched bigrams and 1 unmatched unigram"
            return "Yes"
        elif comp1 >= -2 and comp2 >= -1:
            # An unmatched unigram: either a red herring or good enough,
            # unless it is a preposition, proper noun, or number.
            print >> sys.stderr, q_phrase
            print >> sys.stderr, rel
            for word, tag in q_phrase:
                if (word, tag) not in rel:
                    if tag in ['IN', 'NNP', 'CD']:
                        return "No"
            return "Yes"
    elif mode == 'OBJECT':
        nextidx = q.tokens.index('what') + 1
        nexttoken = q.tokens[nextidx]
        npsidx = sum([1 for i in n.idxs if i <= nextidx])
        subj = n.nps[npsidx]
        rest = q.tags[n.idxs[npsidx] + len(n.get_np_tags(subj, q)):]
        if nexttoken in ['is', 'was', 'do', 'does', 'will', 'did', 'can', 'must']:
            # "what VB NP VP" form (e.g. "what will Jake do").
            q_phrase = n.get_np_tags(subj, q) + [q.tags[nextidx]] + rest
            comp1, comp2, rel = best_relation(q_phrase)
            print >> sys.stderr, comp1, comp2
            relwords = [word for word, tag in rel]
            reltags = [tag for word, tag in rel]
            answerstart = 0
            if subj in relwords:
                # NOTE(review): guard tests the whole NP but the index uses
                # its first word -- only consistent for one-word subjects.
                answerstart = relwords.index(subj.split()[0]) + len(subj.split()) + 1
            answerend = answerstart
            # BUG FIX: guard both indexes so an answerstart past the end of
            # the relation or an empty `rest` cannot raise IndexError.
            if answerend < len(relwords) and rest and relwords[answerend] == rest[0][0]:
                answerstart += len(rest)
                # BUG FIX: .index() on a slice is slice-relative; offset by
                # answerstart so the final slice uses absolute positions.
                if ',' in reltags[answerstart:]:
                    answerend = answerstart + reltags[answerstart:].index(',')
                elif '.' in reltags[answerstart:]:
                    answerend = answerstart + reltags[answerstart:].index('.')
                else:
                    answerend = -1
            else:
                try:
                    answerend = answerstart + relwords[answerstart:].index(rest[0][0])
                except (ValueError, IndexError):  # narrowed from bare except
                    answerend = -1
            return " ".join(relwords[answerstart:answerend])
        else:
            comp1, comp2, rel = best_relation(q_phrase)
            diff = new_entities(rel)
            if len(diff) == 0:
                return ""
            # BUG FIX: `diff` is keyed by entity strings, so diff[0] raised
            # KeyError; return the first key, as the other branches do.
            return diff.keys()[0]
    elif mode == 'PERSON':
        comp1, comp2, rel = best_relation(q_phrase)
        diff = new_entities(rel)
        diff = dict([(ne, tag) for ne, tag in diff.iteritems()
                     if tag in ['PERSON', 'GPE']])
        if len(diff) == 0:
            return ""
        return diff.keys()[0]
    elif mode == 'GPE':
        comp1, comp2, rel = best_relation(q_phrase)
        diff = new_entities(rel)
        diff_gpe = dict([(ne, tag) for ne, tag in diff.iteritems()
                         if tag in ['GPE']])
        diff = dict([(ne, tag) for ne, tag in diff.iteritems()
                     if tag in ['OBJECT', 'PERSON']])
        # BUG FIX: original returned diff_gpe.keys()[0] when diff_gpe was
        # EMPTY (guaranteed IndexError); return a GPE when one exists.
        if len(diff_gpe) > 0:
            return diff_gpe.keys()[0]
        if len(diff) == 0:
            return ""
        nextidx = q.tokens.index('what') + 1
        relwords = [word for word, tag in rel]
        reltags = [tag for word, tag in rel]
        prp = ""
        next_words = []
        # Find the pronoun/preposition introducing the location phrase.
        # BUG FIX: the original used the slice-relative index as an absolute
        # one, and tested for 'PP' but then looked up 'PRP' (ValueError when
        # only 'PP' was present).  Preference order preserved: PRP, PP, IN.
        tail = reltags[nextidx:]
        for wanted in ['PRP', 'PP', 'IN']:
            if wanted in tail:
                prpidx = nextidx + tail.index(wanted)
                prp = relwords[prpidx]
                next_words = relwords[prpidx + 1:]
                break

        def find_ne(word):
            # Return `word` if it occurs inside any of the new entities.
            for e in diff.keys():
                if word in e.split():
                    return word
            return None
        for w in next_words:
            # BUG FIX: original passed the undefined name `word` here
            # (NameError); use the loop variable.
            ans = find_ne(w)
            if ans is not None:
                return " ".join([prp, ans])
    elif mode == 'DATETIME':
        comp1, comp2, rel = best_relation(q_phrase)
        diff = new_entities(rel)
        diff = dict([(ne, tag) for ne, tag in diff.iteritems()
                     if tag in ['DATETIME']])
        if len(diff) == 0:
            return ""
        return diff.keys()[0]
    elif mode == 'ABSTRACT':
        whywords = ['because', 'due', 'by', 'since']
        bestrels = [(best, rel) for best, rel in bestrels
                    if any([word in whywords for word, tag in rel])]
        if len(bestrels) == 0:
            return ""
        comp1, comp2, rel = best_relation(q_phrase)
        words = [word for word, tag in rel]
        tags = [tag for word, tag in rel]

        def find_whyword():
            # First causal marker present in the winning relation.
            # (BUG FIX: original took an ignored `rel` parameter.)
            for w in whywords:
                if w in words:
                    return w
            return None
        why = find_whyword()
        answerstart = words.index(why)
        answerend = -1
        if ',' in tags[answerstart:]:
            # BUG FIX: offset the slice-relative index by answerstart.
            answerend = answerstart + tags[answerstart:].index(',')
        return " ".join(words[answerstart:answerend])
    elif mode == 'NUMBER':
        comp1, comp2, rel = best_relation(q_phrase)
        words = [word for word, tag in rel]
        diff = new_entities(rel)
        diff = dict([(ne, tag) for ne, tag in diff.iteritems()
                     if tag in ['NUMBER']])
        ans = ""
        if len(diff) == 0:
            # No explicit number: fall back to quantifier words.
            if 'all' in words or 'every' in words:
                return 'all'
            if 'most' in words:
                return 'most'
            if 'some' in words:
                return 'some'
            if 'many' in words:
                return 'many'
            # NOTE(review): despite its name, `same` selected entities NOT
            # in the question (identical filter to `diff` above) -- possibly
            # meant to be `in`; preserved as written pending confirmation.
            same = new_entities(rel)
            if len(same) == 0:
                return ""
            # Guess the word two positions before the first new entity.
            idx = words.index(same.keys()[0].split()[0]) - 2
            ans = words[idx]
        else:
            # BUG FIX: `ans == diff.keys()[0]` compared instead of
            # assigning, leaving ans empty.
            ans = diff.keys()[0]
            # BUG FIX: `ans.isnumeric` without parentheses is a truthy bound
            # method; call it.  (Entities from TextBlob are unicode, which
            # has isnumeric() under Python 2 -- confirm.)
            if not ans.isnumeric():
                for d in diff.keys()[1:]:
                    if ans.isnumeric():
                        break
                    else:
                        ans += " " + d
        if ans == 'one':
            if 'every' in words or 'each' in words:
                return 'all'
        return ans
    elif mode == 'VERB PHRASE':
        comp1, comp2, rel = best_relation(q_phrase)
        words = [word for word, tag in rel]
        tags = [tag for word, tag in rel]
        if 'IN' not in tags:
            return ""
        answerstart = tags.index('IN')
        answerend = -1
        if ',' in tags[answerstart:]:
            # BUG FIX: offset the slice-relative index by answerstart.
            answerend = answerstart + tags[answerstart:].index(',')
        # BUG FIX: original referenced the misspelled `answerstarrt`
        # (NameError on every VERB PHRASE question).
        return " ".join(words[answerstart:answerend])
    # Nothing found here.
    return ""
def parse_first(q, database, uncommon, mode):
    """First answering strategy: match the question's subject noun phrase
    against the pre-extracted relation database and hand the candidate
    relations to examine_rels.

    Returns an answer string, "" when no matching relation exists, or
    "No" when no subject can be identified.
    """
    words = q.words.lower()
    nps = n.nps  # noun phrases cached by n.init_nps(q) before this call
    tags = q.tags
    if len(nps) == 0:
        print >> sys.stderr, "No subject found"
        return "No"
    subj = nps[0] #assuming subject is the first noun phrase
    print >> sys.stderr, "\tSubject:", subj
    first = words[0]
    if True:#first.lower() in ['is','was']: #is this everything?
        #question is an "is/was ___ NP/AP"
        #get index of the first word after the noun phrase
        loc = n.np_idx(subj, q)
        # +1 compensates for an apostrophe splitting into an extra token
        nexti = loc + len(subj.split()) + (1 if "'" in subj else 0)
        subj_tags = n.get_np_tags(subj, q)
        #get potential relations from database
        closest = n.get_similar_np(subj_tags, database)
        if closest == None:
            #retry with partial noun phrases
            closest = n.get_similar_np(subj_tags[0:1], database)
            nexti = nexti + 1 - len(subj_tags)
            if closest == None:
                closest = n.get_similar_np(subj_tags[0:2], database)
                nexti = nexti + 1
                if closest == None:
                    return ""
        q_phrase = tags[nexti:]
        print >> sys.stderr, q_phrase
        if len(q_phrase) <= 1:
            # degenerate tail -- fall back to the whole tagged question
            q_phrase = tags
        #construct possible relations
        rel_tags = [database[close][e]["relation"]
                    + database[close][e]["pos"]
                    for close in closest
                    for e in database[close]]
        #get the best, compare
        bestrels = edit.distance(q_phrase, rel_tags)
        return examine_rels(q, q_phrase, bestrels, uncommon, mode)
    elif first.lower() in ['does','did','will']:
        #question is a "does/did/will ___ VP"
        # NOTE(review): unreachable while the condition above is `if True:`
        return "No"
    return "No"
def parse_second(q, blob, uncommon, mode):
    """Second answering strategy: skip the relation database and compare
    the question phrase directly against every sufficiently long sentence
    of the source text `blob`.

    NOTE(review): indentation reconstructed from a whitespace-mangled
    source -- assumes the distance/examine_rels calls apply to every mode
    (otherwise the q_phrase default above the IS branch would be dead);
    confirm against the original layout.
    """
    sents = blob.sentences
    q_phrase = q.tags[2:]
    if mode == 'IS':
        # keep the verb but drop the first noun-phrase token
        q_phrase = q.tags[1:]
        q_phrase = q_phrase[:n.idxs[0]] + q_phrase[n.idxs[0] + 1:]
    bestrels = edit.distance(q_phrase, [s.tags for s in sents if len(s.tags) > 6])
    return examine_rels(q, q_phrase, bestrels, uncommon, mode)
    return ""  # unreachable fallback
def parse_question(question, database, raw):
    """Answer `question` about the text `raw`, trying in order: the
    relation database (parse_first), direct sentence comparison
    (parse_second), a backup heuristic (b.backup_answer), the question's
    first noun phrase, and finally a blind "Yes" guess.

    question -- the question as a plain string
    database -- relation database keyed by noun-phrase tags
    raw      -- the full source text the question is about
    """
    q = question
    # Lower-case the first character so tagging treats it like mid-sentence text.
    q = q[0:1].lower() + q[1:]
    q = TextBlob(q, np_extractor=extractor)
    n.init_nps(q)
    # Collect infrequent words (frequency strictly between 1 and 4) to
    # weight phrase matching; rare words are stronger match evidence.
    bigblob = TextBlob(raw, np_extractor=extractor)
    freqdict = bigblob.word_counts
    backwards = [(c, w) for w, c in freqdict.iteritems()]
    uncommon = [w for c, w in sorted(backwards) if 1 < c < 4]
    # Classify the question type.
    mode = q.words[0].upper()
    if mode in ['IS', 'WAS', 'DO', 'DOES', 'DID', 'WILL']:
        mode = 'IS'
    else:
        # Scan token suffixes for the first recognizable question type;
        # default to yes/no ('IS') when none is found.
        mode = None
        for i in xrange(0, len(q.tokens) - 1):
            mode = p.determine_question_type(q.tokens[i:])
            if mode != None:
                break
        if mode == None:
            mode = 'IS'
    print >> sys.stderr, mode
    # BUG FIX: the original bare `except:` clauses also swallowed
    # SystemExit/KeyboardInterrupt and hid every error silently; narrowed
    # to Exception and logged to stderr.
    try:
        first_attempt = parse_first(q, database, uncommon, mode)
    except Exception as e:
        print >> sys.stderr, "parse_first failed:", e
        first_attempt = ""
    if first_attempt != "":
        return first_attempt
    try:
        second_attempt = parse_second(q, bigblob, uncommon, mode)
    except Exception as e:
        print >> sys.stderr, "parse_second failed:", e
        second_attempt = ""
    if second_attempt != "":
        return second_attempt
    third_attempt = b.backup_answer(q, n.nps, raw)
    if third_attempt != "":
        return third_attempt
    if len(n.nps) > 0:
        return n.nps[0]
    else:
        return "Yes" #guess
if __name__ == "__main__":
    # Ad-hoc debugging harness: read one question from stdin, then dump the
    # intermediate NLP artifacts (noun phrases, words, tags, parse tree).
    q = raw_input("Ask a question\n")
    q = TextBlob(q, np_extractor=extractor)
    print q.noun_phrases
    noun_phrases, idxs = n.get_nps_from_blob(q)
    print noun_phrases
    print q.words
    first = noun_phrases[0]
    print n.get_np_tags(first, q)
    print q.tags
    print q.parse()
    #print p.extract_generic_relations(q)