-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
99 lines (84 loc) · 3.42 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import webapp2
import logging
from templateHelper import renderHandler
from dbDocMapping import DocMapping
from crawler import crawl
from parser import parse
from const import CLASS_ACTIVE
from const import MYURL
class Enum(set):
    """Minimal string enumeration: a set of names where attribute access
    returns the name itself (e.g. ``MODE.SEARCH == 'SEARCH'``).

    Raises AttributeError (carrying the missing name, for easier
    debugging) when the attribute is not a member of the set.
    """
    def __getattr__(self, name):
        if name in self:
            return name
        # include the offending name so tracebacks say WHICH attribute failed
        raise AttributeError(name)
# POST dispatch modes for SearchHandler: run a query, crawl the site,
# or parse crawled documents into index structures.
MODE = Enum(['SEARCH', 'CRAWL', 'PARSE'])
class SearchHandler(renderHandler):
    """Request handler for the /search page.

    GET renders the search form. POST dispatches on which submit button
    was pressed (search / crawl / parse), as detected by getCurrentMode().
    """

    def get(self):
        """Render the search landing page."""
        str_title = 'Web search in GAE by Santiago Arias'
        str_meta_content = "Google App engine web search python Santiago Arias"
        self.render('search.html',
                    str_address = 'search',
                    str_title = str_title,
                    str_active = CLASS_ACTIVE,
                    str_meta_content = str_meta_content,
                    bsearch = True)

    def post(self):
        """Dispatch the POST request to the handler for the detected mode."""
        current_mode = self.getCurrentMode()
        if current_mode == MODE.SEARCH:
            self._do_search()
        elif current_mode == MODE.CRAWL:
            self._do_crawl()
        elif current_mode == MODE.PARSE:
            self._do_parse()

    def _do_search(self):
        """Render results for the submitted query.

        NOTE(review): lookup against the inverted index is not implemented
        yet, so the result list is always empty (matches prior behavior;
        the old code tokenized the query into an unused local).
        """
        user_search = self.request.get('inputSearch')
        results = []  # TODO: query the parsed inverted index with the search terms
        self.render('search.html',
                    str_address = 'search',
                    str_active = CLASS_ACTIVE,
                    query = user_search,
                    results = results[:10],
                    bsearch = True)

    def _do_crawl(self):
        """Crawl MYURL and replace the stored document mapping."""
        try:
            logging.info('fetching urls from %s', MYURL)
            crawled = crawl(MYURL)
        except Exception:
            # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # propagate, and logging.exception keeps the traceback
            logging.exception('An error occured when crawling from url: %s', MYURL)
            self.redirect('/search')
            return
        DocMapping.clean()
        DocMapping.add_list(crawled)
        self.redirect('/crawled')

    def _do_parse(self):
        """Parse every stored document and build the index structures."""
        docs = DocMapping.all()
        invertedIndex = {}
        termFrequency = {}
        termFrequencyByDoc = {}
        docFrequencyByTerm = {}
        logging.info('starting to parse all documents')
        for d in docs:
            parse(d, invertedIndex, termFrequency, termFrequencyByDoc, docFrequencyByTerm)
        logging.info('parsing done!')
        # TODO: persist these structures (blob / cloud storage) so _do_search
        # can actually use them, e.g.:
        #json_str = json.dumps(invertedIndex)
        #json_str = json.dumps(termFrequency)
        #json_str = json.dumps(termFrequencyByDoc)
        #json_str = json.dumps(docFrequencyByTerm)
        self.redirect('/search')

    def getCurrentMode(self):
        """Return the MODE selected by the submit button present in the POST.

        Returns the empty string when no known submit field is present.
        """
        logging.info('getCurrentMode: determine current post mode')
        # table-driven dispatch; order preserved from the original checks
        for field, mode in (('inputSubmitCrawl', MODE.CRAWL),
                            ('inputSubmitSearch', MODE.SEARCH),
                            ('inputSubmitParse', MODE.PARSE)):
            if self.request.get(field):
                logging.info('getCurrentMode: mode %s', mode)
                return mode
        logging.info('getCurrentMode: mode not found')
        return ''