#!/usr/bin/python
import sys
import re
from collections import defaultdict
import pickle
import time
import ranking
import os
import nltk
## unpickle the positional index built by the indexer
index = pickle.load(open("index", "rb"))
porter = nltk.PorterStemmer()
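## Assumed layout of the pickled index, inferred from how it is read below
## (a sketch, since the indexer itself is not part of this file):
##   index[term][docid] -> list of integer token positions of `term` in `docid`
## where terms are Porter-stemmed and docids are strings ('1' .. '1400').
## e.g. (hypothetical entry) index['flow']['42'] = [3, 17, 250].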
## process phrases in the query that are not negated -- store their information in the two result dictionaries: score_dict, pos_dict
def process_phrase(phrase_list, score_dict, pos_dict):
    for phrase in phrase_list: ## for each of the phrases entered in the query
        common_docid_set = set() ## stores the docids that contain all the phrase-words in the same doc
        phrase_words = phrase.strip().split() ## split the phrase into phrase-words
        for i in range(0, len(phrase_words)): ## stem the phrase-words, as the index is stemmed
            phrase_words[i] = porter.stem(phrase_words[i])
        if len(phrase_words) == 1: ## if the phrase has only one word
            words_list.append(phrase_words[0]) ## add it to the word list to process as a normal query word
        else:
            if phrase_words[0] in index: ## if the first phrase-word is in the index
                common_docid_set = set(index[phrase_words[0]].keys()) ## seed the common-docid set with its docids
            for i in range(1, len(phrase_words)):
                if phrase_words[i] in index: ## if the next phrase-word is in the index
                    common_docid_set = set(index[phrase_words[i]].keys()) & common_docid_set ## keep only the docids shared with the previous phrase-words
                else: ## a phrase-word missing from the index means the phrase occurs in no document
                    common_docid_set.clear() ## so empty the common-docid set and stop
                    break
            for docid in common_docid_set: ## for each doc containing all phrase-words, check whether they occur together as a phrase
                prev_pos_set = set(index[phrase_words[0]][docid]) ## candidate phrase start positions, seeded with the positions of the first phrase-word
                correct_pos_set = set()
                for j in range(1, len(phrase_words)):
                    curr_pos_set = set() ## reset for each phrase-word so positions do not leak between iterations
                    for pos in index[phrase_words[j]][docid]: ## for each position of the current phrase-word in the current doc
                        curr_pos_set.add(pos - j) ## shift back by j to align with the start position of the phrase
                    correct_pos_set = curr_pos_set & prev_pos_set ## keep only the start positions where the words appear consecutively
                    if len(correct_pos_set) == 0: ## no position in this doc where the words form the phrase
                        break ## move on to the next doc
                    else:
                        prev_pos_set = correct_pos_set ## carry the surviving start positions to the next phrase-word
                if len(correct_pos_set) == 0: ## an empty set means no phrase match in the current document
                    continue ## so continue to the next document
                else: ## otherwise record the match in the two result dictionaries
                    if docid not in score_dict: ## new docid entry
                        score = len(correct_pos_set) * len(phrase_words) ## score = no. of occurrences * no. of phrase-words
                        score_dict[docid] = score ## docid with its occurrence score
                        pos_dict[docid] = correct_pos_set ## start positions of the phrase occurrences
                    else: ## existing docid: accumulate occurrences and add the new phrase positions
                        score = score_dict[docid] + (len(correct_pos_set) * len(phrase_words)) ## add this phrase's score to the doc's previous score
                        score_dict[docid] = score
                        pos_dict[docid] = pos_dict[docid].union(correct_pos_set) ## merge this phrase's start positions for the same doc
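## Worked example of the position-shifting intersection above (hypothetical
## postings): for a two-word phrase, suppose a doc has first-word positions
## {4, 40} and second-word positions {5, 90}. Shifting the second word's
## positions back by j=1 gives {4, 89}; intersecting with {4, 40} leaves {4},
## i.e. exactly one phrase occurrence, starting at position 4.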
## process individual words in the query that are not negated -- store their info in the two result dictionaries
def process_words(words_list, score_dict, pos_dict):
    for q_word in words_list: ## for each word in the query
        q_word = q_word.strip()
        if q_word in index: ## check if the query word is in the index
            for docid in index[q_word].keys():
                if docid not in score_dict: ## new docid entry
                    score_dict[docid] = len(index[q_word][docid]) ## score = no. of occurrences of the word in the current doc
                    pos_dict[docid] = set(index[q_word][docid]) ## positions of the word's occurrences in the current doc
                else: ## the docid already exists in the dict
                    score_dict[docid] = score_dict[docid] + len(index[q_word][docid]) ## add this word's occurrences to the doc's previous score
                    pos_dict[docid] = pos_dict[docid].union(set(index[q_word][docid])) ## merge this word's positions into the existing position set
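## e.g. (hypothetical postings): if the stemmed query word 'flow' occurs at
## positions [3, 17] of doc '7', this adds 2 to score_dict['7'] and merges
## {3, 17} into pos_dict['7'].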
## pre-processing: match the phrase and word patterns to extract the phrases/negated-phrases and words/negated-words from the query
def pre_process_query(query):
    global words_list
    global phrase_list
    global negphrase_list
    global negwords_list
    words_list = list()
    phrase_list = list()
    negphrase_list = list()
    negwords_list = list()
    negphrase_pat = re.compile(r'!\"(.*?)\"') ## negated phrases: !"..."
    negphrase_list = negphrase_pat.findall(query)
    query = re.sub(r'!\"(.*?)\"', '', query).strip()
    phrase_pat = re.compile(r'\"(.*?)\"') ## phrases: "..."
    phrase_list = phrase_pat.findall(query)
    query = re.sub(r'\"(.*?)\"', '', query).strip()
    negword_pat = re.compile(r'!(\w+)') ## negated words: !word
    negwords_m = negword_pat.findall(query)
    for negword in negwords_m:
        negwords_list.append(porter.stem(negword))
    query = re.sub(r'!(\w+)', '', query).strip()
    word_pat = re.compile(r'\w+') ## remaining plain words
    words_m = word_pat.findall(query)
    for word in words_m:
        words_list.append(porter.stem(word))
    query = re.sub(r'\w+', '', query).strip()
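## Example decomposition (hypothetical query) produced by the patterns above:
##   pre_process_query('"boundary layer" flow !"shock wave" !drag')
## yields phrase_list = ['boundary layer'], negphrase_list = ['shock wave'],
## words_list = [stem('flow')] and negwords_list = [stem('drag')]; note that
## phrase-words are stemmed later, inside process_phrase.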
## calculate statistics such as df, tf and freq for words/phrases
def get_statistics(query, score_dict, pos_dict):
    query_parts = query.strip().split()
    pre_process_query(' '.join(query_parts[1:]))
    if query_parts[0] == 'df': ## document frequency
        df = 0
        if re.search(r'\"(.*?)\"', query): ## the query is for a phrase
            process_phrase(phrase_list, score_dict, pos_dict) ## process the phrase
            process_words(words_list, score_dict, pos_dict) ## handles the case where the user enters a one-word phrase
            for docid in score_dict.keys():
                if score_dict[docid] > 0: ## docs containing the phrase have scores > 0
                    df += 1 ## count all such docids from the score dict
            print df
        else: ## the query is for a word
            word = porter.stem(query_parts[1])
            if word in index: ## check if the word is in the index
                print len(index[word].keys()) ## the index gives the number of docs in which the word appears
            else:
                print "0"
    if query_parts[0] == 'tf': ## term frequency
        if len(query_parts) != 3 or not re.match(r'[0-9]+$', query_parts[1]): ## error handling: expect a numeric docid and a term
            print "Please enter a valid query: tf docid term"
        else:
            docid = query_parts[1] ## extract the docid
            word = porter.stem(query_parts[2]) ## stem the word, as indexed words are stemmed
            if word in index:
                if docid in index[word]: ## check if the docid is in the index entry
                    print len(index[word][docid]) ## the index gives the number of occurrences of the word in the doc
                else:
                    print "Word not present in this document ID as per index."
            else:
                print "Word not indexed."
    if query_parts[0] == 'freq': ## total frequency of a phrase
        process_phrase(phrase_list, score_dict, pos_dict) ## process the phrase
        process_words(words_list, score_dict, pos_dict) ## handles the case where the user enters a one-word phrase
        freq = 0
        for docid in pos_dict.keys():
            freq = freq + len(pos_dict[docid]) ## sum the phrase occurrences over every doc in which the phrase appears
        print freq
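## Example statistics queries (hypothetical docids/terms) for the branches above:
##   df "boundary layer"    -> no. of docs containing the phrase
##   df flow                -> no. of docs containing the stemmed word
##   tf 42 flow             -> no. of occurrences of the word in doc 42
##   freq "boundary layer"  -> total phrase occurrences across the collection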
## get metadata -- in this case only the title and the document text
def get_metadata(query, score_dict, pos_dict):
    query_parts = query.strip().split()
    metadata = pickle.load(open("metadata", "rb")) ## unpickle the metadata dictionary
    docid = query_parts[1] ## retrieve the docid from the query
    path = "./Extracted_text_files"
    if query_parts[0] == 'title': ## print the title for the given docid
        print metadata[docid]
    if query_parts[0] == 'doc': ## print the text for the given docid
        with open(path + "/" + docid, "r") as fi:
            lines = fi.read()
            print lines
## find words similar to the query word by matching shared one-word contexts in the corpus text files
def get_similar_words(query):
    similar_set = set()
    query_parts = query.strip().split()
    word = query_parts[1].strip()
    if re.search(r'\w+', word): ## a valid word was entered
        path = "./Extracted_text_files"
        dirlist = os.listdir(path)
        w1_list = list()
        for input_file in dirlist: ## for each of the cranfield text files
            with open(path + "/" + input_file, "r") as fi: ## open it in read mode and parse
                lines = fi.read()
                if re.search('%s' % word, lines):
                    w1_list.append(re.findall(r'(\w+ %s \w+)' % word, lines)) ## collect "left word right" contexts around the query word
        for input_file in dirlist: ## scan the files again for other words in the same contexts
            with open(path + "/" + input_file, "r") as fi:
                lines = fi.read()
                for w1 in w1_list:
                    if len(w1) != 0:
                        w2 = w1[0].split(' ')
                        match = set(re.findall(r'{} (\w+) {}'.format(w2[0], w2[2]), lines)) ## words occurring between the same left and right context words
                        if len(match) != 0:
                            similar_set = similar_set.union(match)
        if len(similar_set) != 0:
            print similar_set
        else:
            print "No such word in the corpus."
    else: ## error handling
        print "Please enter a valid query: similar term"
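## The similarity heuristic above is distributional, not thesaurus-based: if the
## query word appears in a context such as 'laminar flow region' (hypothetical),
## every word X matching 'laminar X region' elsewhere in the corpus is reported
## as similar. Only the first context found per file (w1[0]) is reused.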
## retrieve the documents for the query and store the results in the two result dictionaries
def get_documents(query, score_dict, pos_dict):
    global words_list
    global phrase_list
    global negphrase_list
    global negwords_list
    common_docid_set = set()
    pre_process_query(query)
    all_negative_phrase_docs = defaultdict() ## docids that contain a negated phrase
    all_negative_word_docs = defaultdict() ## docids that contain a negated word
    ## process phrases in the query that are not negated -- store their information in the two result dictionaries
    process_phrase(phrase_list, score_dict, pos_dict)
    ## process phrases in the query that are negated -- store their info in the separate dict 'all_negative_phrase_docs'
    ## same logic as for a normal phrase, but the results go into a separate dict
    for phrase in negphrase_list:
        phrase_words = phrase.strip().split() ## split the phrase into phrase-words
        for i in range(0, len(phrase_words)): ## stem the phrase-words, as the index is stemmed
            phrase_words[i] = porter.stem(phrase_words[i])
        if len(phrase_words) == 1: ## a one-word negated phrase
            negwords_list.append(phrase_words[0]) ## treat it as a negated word (it is already stemmed)
        else:
            if phrase_words[0] in index: ## if the first phrase-word is in the index
                common_docid_set = set(index[phrase_words[0]].keys()) ## seed the common-docid set with its docids
            for i in range(1, len(phrase_words)):
                if phrase_words[i] in index: ## if the next phrase-word is in the index
                    common_docid_set = set(index[phrase_words[i]].keys()) & common_docid_set ## keep only the docids shared with the previous phrase-words
                else: ## a phrase-word missing from the index means the phrase occurs in no document
                    common_docid_set.clear() ## so empty the common-docid set and stop
                    break
            for docid in common_docid_set: ## for each doc containing all phrase-words, check whether they occur together as a phrase
                prev_pos_set = set(index[phrase_words[0]][docid]) ## candidate phrase start positions, seeded with the positions of the first phrase-word
                correct_pos_set = set()
                for j in range(1, len(phrase_words)):
                    curr_pos_set = set() ## reset for each phrase-word so positions do not leak between iterations
                    for pos in index[phrase_words[j]][docid]: ## for each position of the current phrase-word in the current doc
                        curr_pos_set.add(pos - j) ## shift back by j to align with the start position of the phrase
                    correct_pos_set = curr_pos_set & prev_pos_set ## keep only the start positions where the words appear consecutively
                    if len(correct_pos_set) == 0: ## no position in this doc where the words form the phrase
                        break ## move on to the next doc
                    else:
                        prev_pos_set = correct_pos_set
                if len(correct_pos_set) == 0: ## no phrase match in the current document
                    continue ## so continue to the next document
                else: ## record the match for the negated phrase
                    if docid not in all_negative_phrase_docs: ## new docid entry
                        score = len(correct_pos_set) * len(phrase_words) ## score = no. of occurrences * no. of phrase-words
                        all_negative_phrase_docs[docid] = score
                    else: ## existing docid: accumulate occurrences
                        score = all_negative_phrase_docs[docid] + (len(correct_pos_set) * len(phrase_words))
                        all_negative_phrase_docs[docid] = score ## scores kept only for consistency; they are not used for retrieval
    ## process individual words in the query that are not negated -- store their info in the two result dictionaries
    process_words(words_list, score_dict, pos_dict)
    ## process individual words in the query that are negated -- store their info in the separate dict 'all_negative_word_docs'
    ## same logic as for a normal word, but the results go into a separate dict
    for q_negword in negwords_list:
        q_negword = q_negword.strip()
        if q_negword in index:
            for docid in index[q_negword].keys():
                if docid not in all_negative_word_docs:
                    all_negative_word_docs[docid] = len(index[q_negword][docid])
                else:
                    all_negative_word_docs[docid] = all_negative_word_docs[docid] + len(index[q_negword][docid]) ## scores kept only for consistency; they are not used for retrieval
    ## build the set of documents that do NOT contain the negated phrases or negated words
    negative_phrase_set = set()
    negative_word_set = set()
    if len(all_negative_phrase_docs.keys()) != 0: ## a negated phrase was queried, so all_negative_phrase_docs holds the docids containing it
        negative_phrase_set = set(score_dict.keys()) - set(all_negative_phrase_docs.keys()) ## the complement over all documents gives the docs NOT containing that phrase
    if len(all_negative_word_docs.keys()) != 0: ## a negated word was queried, so all_negative_word_docs holds the docids containing it
        negative_word_set = set(score_dict.keys()) - set(all_negative_word_docs.keys()) ## the complement over all documents gives the docs NOT containing that word
    total_negative_set = negative_phrase_set | negative_word_set ## union: documents not containing the negated phrases/words
    ## the final document set is a fuzzy OR of the normal set (score_dict) and the negated set (total_negative_set):
    ## a docid in both keeps its positive score and shows the phrase/word instances from the normal set;
    ## a docid satisfying only the negation is marked with a negative score
    for docid in total_negative_set: ## for each docid in the negated set of documents
        if score_dict[docid] == 0:
            score_dict[docid] = -1
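    ## e.g. (hypothetical query 'flow !drag'): a doc containing 'flow' keeps its
    ## positive score whether or not it also contains 'drag'; a doc containing
    ## neither word had score 0 and is marked -1 here, so ranking can still
    ## surface it as satisfying the negation; a doc containing only 'drag'
    ## stays at 0.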
    ## call the ranking function to rank and display snippets
    ranking.ranking(query, score_dict, pos_dict)
## take the query as input and call the appropriate function to get statistics, metadata, or retrieve documents
def get_query():
    ## the two result dictionaries
    score_dict = defaultdict()
    pos_dict = defaultdict()
    while True:
        query = raw_input("enter query: ")
        print "retrieving results.."
        start = time.clock()
        for i in range(1, 1401): ## initialize the score dictionary with a 0 score for every docid
            score_dict[str(i)] = 0
        for i in range(1, 1401): ## initialize the position dictionary with an empty position set for every docid
            pos_dict[str(i)] = set()
        query_parts = query.strip().split() ## split the query to determine the request type
        if len(query_parts) == 0: ## ignore empty input
            continue
        if query_parts[0] in ('df', 'tf', 'freq'): ## statistics request: call get_statistics
            get_statistics(query, score_dict, pos_dict)
        elif query_parts[0] in ('title', 'doc'): ## text or title request: call get_metadata
            get_metadata(query, score_dict, pos_dict)
        elif query_parts[0] == 'similar': ## similar-words request
            get_similar_words(query)
        else: ## otherwise treat it as a document-retrieval request
            get_documents(query, score_dict, pos_dict)
        ## calculate the time elapsed in returning the results
        elapsed = (time.clock() - start)
        print elapsed
if __name__ == "__main__":
    if len(sys.argv) != 1: ## this script expects no command-line arguments
        sys.exit(2)
    get_query()
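## Usage sketch (assumes the "index" and "metadata" pickles plus the
## Extracted_text_files directory exist alongside this script, and that
## ranking.py provides ranking(query, score_dict, pos_dict)):
##   $ python querying.py
##   enter query: "boundary layer" flow !drag
##   retrieving results..
##   <ranked documents and snippets printed by ranking.ranking>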