/
searcher.py
85 lines (66 loc) · 2.1 KB
/
searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
from collections import Set
import json
import os
import re
import sys
from posting_utils import *
from stemmer import stem
class Searcher:
    """Query a positional inverted index loaded from ``indexes/data_index``.

    The index file is JSON with two top-level keys read by this class:

    * ``"index"``  -- maps a term to a mapping of document id -> positions
      of that term in the document.
    * ``"id_map"`` -- maps a document id (as a string) to a file name
      located under ``data/``.
    """

    def __init__(self):
        """Load the index file and expose its two sections as attributes."""
        with open('indexes/data_index', "r") as indexFile:
            self.index = json.load(indexFile)
        self.indexDict = self.index["index"]
        self.idMap = self.index["id_map"]

    def printResult(self, doc):
        """Print the file name and full contents of the document ``doc``.

        Args:
            doc: Document id; it is converted with ``str`` before being
                looked up in the id map.

        Raises:
            KeyError: If the id is not present in the id map.
            OSError: If the document file cannot be opened.
        """
        print("\n")
        filename = self.idMap[str(doc)]
        print(filename)
        # Use a separate name for the handle so the ``doc`` argument is not
        # rebound to a file object.
        with open("data/" + filename, 'r') as docFile:
            print(docFile.read())
        print("========")
        print("\n")

    def search(self, query, isPhrase, isOrMatch):
        """Run a query and print every matching document.

        Args:
            query: Whitespace-separated query text. ``None`` or a blank
                string is treated as a query with no terms and prints
                nothing.
            isPhrase: If true, the terms must occur consecutively and in
                order (``isOrMatch`` is then ignored).
            isOrMatch: For non-phrase queries, match documents containing
                any term instead of all terms.
        """
        # split() without an argument collapses runs of whitespace, so stray
        # spaces do not turn into empty-string terms that match nothing.
        stemmed = [stem(t) for t in (query or "").split()]
        if isPhrase:
            results = self.phraseSearch(stemmed)
        else:
            results = self.termSearch(stemmed, isOrMatch)
        for doc in self.removeNailPolish(results):
            self.printResult(doc)

    def termSearch(self, terms, isOrMatch=False):
        """Return the set of document ids matching ``terms``.

        Args:
            terms: Index terms to look up. A term missing from the index
                contributes an empty posting set.
            isOrMatch: If true, return the union of the posting sets;
                otherwise return their intersection.

        Returns:
            A set of document ids; empty when ``terms`` is empty.
        """
        if not terms:
            # set.union() / set.intersection() called unbound with no
            # arguments raise TypeError, so answer the empty query directly.
            return set()
        # Iterating the per-term mapping yields its document ids.
        postingsLists = [set(self.indexDict.get(t, {})) for t in terms]
        if isOrMatch:
            return set.union(*postingsLists)
        return set.intersection(*postingsLists)

    def phraseSearch(self, terms):
        """Return ids of documents in which ``terms`` occur consecutively.

        A document matches when some position ``p`` of the first term exists
        such that the i-th term occurs at position ``p + i`` for every i.

        Args:
            terms: Index terms, in phrase order.

        Returns:
            A list of matching document ids, each listed once; empty when
            ``terms`` is empty.
        """
        if not terms:
            return []
        matches = []
        # Only documents containing every term can contain the phrase.
        for docId in self.termSearch(terms, isOrMatch=False):
            starts = self.indexDict[terms[0]][docId]
            # Sets give constant-time membership tests for the follow-up terms.
            followers = [set(self.indexDict[t][docId]) for t in terms[1:]]
            if any(
                all(
                    start + offset in positions
                    for offset, positions in enumerate(followers, start=1)
                )
                for start in starts
            ):
                matches.append(docId)
        return matches

    def removeNailPolish(self, results):
        """Return ``results`` minus documents containing the phrase "nail polish".

        The two words are looked up in the index exactly as written here;
        they are not passed through the stemmer.

        Args:
            results: Iterable of document ids.

        Returns:
            A set of the remaining document ids.
        """
        npResults = self.phraseSearch("nail polish".split())
        return set(results) - set(npResults)
if __name__ == '__main__':
    # Command-line entry point: parse the query options, load the index and
    # print every matching document.
    parser = argparse.ArgumentParser(
        description="Search the document index and print matching documents.")
    # Without a query there is nothing to search for, so let argparse reject
    # the invocation with a usage message instead of passing None onwards.
    parser.add_argument('--query', type=str, required=True,
                        help="whitespace-separated search terms")
    parser.add_argument('--phrase', action='store_true',
                        help="match the terms as a consecutive phrase")
    parser.add_argument('--orMatch', action='store_true',
                        help="match documents containing any term "
                             "instead of all terms")
    args = parser.parse_args()
    s = Searcher()
    s.search(args.query, args.phrase, args.orMatch)