# Binary_Inverted_Index.py
# Builds an in-memory inverted index (word -> list of document indices) over
# the movie-reviews or Reuters corpus, loaded through the project's Tokenizer
# module, and answers a single boolean AND/OR query typed by the user.
from __future__ import division
import time
import Tokenizer
from math import log
# Interactive boolean retrieval over an inverted index.
#
# Flow: pick a corpus -> build word -> [document indices] -> read one query
# and an operator -> print the matching documents' first 40 characters.
#
# NOTE(review): this script is written for Python 2. There, input() evaluates
# whatever the user types as a Python expression (which is why the prompts ask
# for quoted answers). That is unsafe for untrusted users; raw_input() would be
# the safe replacement, but it changes how answers must be typed, so it is
# flagged here rather than silently swapped.
#
# Every print below is a single parenthesised argument, which produces the
# same output as the Python 2 print statement.


def _intersect_sorted(left, right):
    """Return the values present in both ascending lists, in ascending order.

    Classic two-pointer merge: both inputs must already be sorted ascending
    and free of duplicates, which holds for the posting lists built below.
    """
    common = []
    index_l = 0
    index_r = 0
    while index_l < len(left) and index_r < len(right):
        if left[index_l] == right[index_r]:
            common.append(left[index_l])
            index_l += 1
            index_r += 1
        elif left[index_l] < right[index_r]:
            index_l += 1
        else:
            index_r += 1
    return common


i = input('Enter within quotes, m for movie reviews corpus,'
          'r for reuters corpus( default is reuters) : ')
# Anything other than m/M falls back to the Reuters corpus.
if i == 'm' or i == 'M':
    corpus = 'mr'
else:
    corpus = 'reuters'

start_time = time.time()
list_fileids = Tokenizer.get_list_fileids(corpus)

# 1) Inverted index: word -> list of indices (into list_fileids) of the
#    documents that contain it.
word_doc_dict = {}
# 2) Walk the corpus one file at a time.
for (file_index, file_name) in enumerate(list_fileids):
    # 3) Split the file into individual words.
    # TODO: possible improvement - apply stemming to the tokens.
    list_words = Tokenizer.get_list_tokens_nltk(corpus, file_name)
    # 4) Record the document once per distinct word. Because file_index only
    #    grows, every posting list ends up sorted ascending without
    #    duplicates, which _intersect_sorted relies on.
    for w in set(list_words):
        word_doc_dict.setdefault(w, []).append(file_index)

print("Inverted Index has been prepared and it took")
print("%s seconds" % (time.time() - start_time))

# 5) Read the query and the boolean operator from the user.
query = input("Enter your query string : ")
op = input("Enter the operator, (AND/OR) Default is AND: ")

start_time = time.time()
query_list = Tokenizer.get_list_tokens_string(query)

# One posting list per query term (None when the term is not in the index).
# Query terms are lower-cased before lookup.
postings = [word_doc_dict.get(q.lower()) for q in query_list]

if op == 'OR' or op == 'or':
    # Union of the posting lists. A set keeps a document that contains
    # several query terms from being reported (and counted) more than once.
    matched = set()
    for docs in postings:
        if docs is not None:
            matched.update(docs)
    result_file_indices = sorted(matched)
elif not postings or any(docs is None for docs in postings):
    # AND: an empty query, or any term missing from the index, matches
    # nothing (an empty query must not return the whole corpus).
    result_file_indices = []
else:
    # AND: intersect the posting lists term by term. Copy the first list so
    # the index itself is never aliased by the result.
    result_file_indices = list(postings[0])
    for docs in postings[1:]:
        result_file_indices = _intersect_sorted(result_file_indices, docs)
        if not result_file_indices:
            break  # already empty; further intersections cannot add matches

print("Time taken to search")
print("%s seconds" % (time.time() - start_time))

if not result_file_indices:
    print("Sorry No matches found")
else:
    # Two spaces after the colon reproduce the original
    # `print "... : " , n` output exactly.
    print("Number of search results :  %d" % len(result_file_indices))
    for index in result_file_indices:
        # Show only the first 40 characters of each matching document.
        print(Tokenizer.get_raw_text(corpus, list_fileids[index])[:40])
        print("\n")