-
Notifications
You must be signed in to change notification settings - Fork 0
/
tanya.py
380 lines (306 loc) · 12.1 KB
/
tanya.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
import os
from urllib import urlencode
import urlparse
import logging
import json
from collections import OrderedDict
import web
from web import form
import rfc3987
import redis
import rq
from alyosha import alyosha as al
from alyosha import reference as REF
# --- Default search criteria -------------------------------------------------
# Minimum word count for a search result to be eligible (also the default
# value of the 'minWC' form field).
MIN_WC = 400
# Minimum match score for a search result to be eligible, as a fraction; it is
# shown to the user as a percentage via int(MIN_MATCH * 100).
MIN_MATCH = 0.4
# Cut-off for a source site's quality weight to be included in the search by
# default (heavy sinks to bottom, [0, 100]).
MAX_QUALITY_WEIGHT = 50
# Number of leading stem_tops elements stored with the article and considered
# for the match score:
MATCH_SCORE_LEN = 20
# Max. wait (in seconds) in between calls to Google; passed as `delay` to the
# scoring job:
GOOGLE_DELAY = 2
logging.basicConfig(level=logging.DEBUG)
# Connect to Redis; the URL comes from the REDISTOGO_URL environment variable
# and falls back to a local instance.
redis_url = os.getenv('REDISTOGO_URL', 'redis://localhost:6379')
# Earlier, manual way of building the connection (kept for reference):
# urlparse.uses_netloc.append('redis')
# url = urlparse.urlparse(redis_url)
# redis_conn = redis.StrictRedis(host=url.hostname, port=url.port)
redis_conn = redis.from_url(redis_url)
# Expiry (in seconds) applied to the article data stored on Redis:
REDIS_EXPIRE = 1200
# Template renderer: templates live in 'templates/' and are wrapped in the
# 'layout' base template.
render = web.template.render('templates/', base='layout')
# Routing table handed to web.application(): a flat tuple of alternating
# (URL path, handler class name) entries. The class names are looked up in
# this module's globals().
urls = (
    '/',              'index',
    '/request',       'request',
    '/results',       'results',
    '/scorematches',  'score_matches',
    '/error',         'error',
    '/thify-what',    'thify_what',
    '/thify-why',     'thify_why',
    '/thify-how',     'thify_how',
    '/thify-thanks',  'thify_thanks',
    '/thify-contact', 'thify_contact',
)
# Landing-page form: one required text box ('URL') for the reference
# article's address.
urlForm = form.Form(
    form.Textbox('URL', form.notnull, description='', id='url-entry')
)
# Construct the source checkboxes: one form per source category, holding one
# checkbox per site of that category. A box starts out checked when the
# site's quality weight does not exceed MAX_QUALITY_WEIGHT.
srcSitesForms = []
for cat in REF.src_cats:
    cs = REF.cat_sources(cat)
    cbForms = [
        form.Checkbox(
            c.id,
            checked=(c.weight <= MAX_QUALITY_WEIGHT),
            value=c.site,
            description=c.label)
        for c in cs
    ]
    srcSitesForms.append(form.Form(*cbForms))
# Form with the tunable match criteria: desired content match (in percent),
# minimum word count, and maximum age of results in months.
#
# The numeric text boxes are validated with a fully anchored raw-string
# pattern. web.py's regexp validator uses re.match(), which only anchors at
# the start, so the previous '\d+' accepted input such as "12abc" that later
# fails in int(). The raw string also avoids the invalid '\d' string escape.
matchCriteriaForm = form.Form(
    form.Textbox(
        'matchScore',
        form.notnull,
        form.regexp(r'^\d+$', 'Must be an integer number'),
        size='5',
        maxlength='3',
        description='Desired content match:',
        post='%',
        value=int(MIN_MATCH * 100),
        id='match-score'),
    form.Textbox(
        'minWC',
        form.notnull,
        form.regexp(r'^\d+$', 'Must be an integer number'),
        size='5',
        maxlength='4',
        description='Minimum word count:',
        value=MIN_WC,
        id='min-word-count'),
    form.Dropdown(
        'backMonths',
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        value=0,
        description="No older than:",
        post='months (0 for no age restrictions)',
        id='back-months'))
# Registry of the forms shown on the request page, keyed by the name under
# which they are handed to the template. The 'src_by_cat' entry is a list of
# forms (one per source category); the other entry is a single form.
formDict = {}
formDict['src_by_cat'] = srcSitesForms
formDict['match_criteria'] = matchCriteriaForm
class index(object):
    """
    Landing page: shows the URL entry form and dispatches a submitted URL.
    """

    def GET(self):
        """Render the landing page with a fresh copy of the URL form."""
        # Calling a web.py form returns a copy. Rendering the shared
        # module-level instance instead would expose values and validation
        # notes left behind by another request.
        return render.index(urlForm())

    def POST(self):
        """
        Validate the submitted URL and redirect accordingly.

        A URL that matches the RFC 3987 'URI_reference' rule is normalised to
        `scheme://netloc/path` (scheme defaults to 'http'; only scheme, netloc
        and path are kept) and the client is redirected to '/request'. The
        presence of a 'checkspec' field is forwarded as 0/1. Anything else is
        redirected to '/error'. If form validation fails, the landing page is
        rendered again with the validation notes.
        """
        # Validate a per-request copy so the state does not leak into the
        # module-level form shared by all requests.
        url_form = urlForm()
        if not url_form.validates():
            return render.index(url_form)

        form_data = web.input()
        check_spec = 'checkspec' in form_data

        ref_url = None
        if rfc3987.match(form_data.URL, rule='URI_reference'):
            parsed = urlparse.urlparse(form_data['URL'], scheme='http')
            # Join netloc and path, then strip leading slashes so that a URL
            # entered without a scheme (empty netloc, everything in path)
            # still ends up as 'http://host/path'.
            location = '/'.join([parsed.netloc,
                                 parsed.path.lstrip('/')]).lstrip('/')
            ref_url = parsed.scheme + '://' + location

        if ref_url:
            params = {'url': ref_url,
                      'checkspec': int(check_spec)}
            # NOTE(review): the second positional argument of web.seeother
            # appears to be its `absolute` flag; '/' is kept as in the
            # original call - confirm this is intended.
            raise web.seeother('/request?' + urlencode(params), '/')

        params = {
            'msg': "Sorry, not implememted yet...",
            'back_link': '/'
        }
        raise web.seeother('/error?' + urlencode(params))
class request(object):
    """
    Search set-up page.

    GET extracts the article behind the given URL, stores its essentials on
    Redis and then either redirects straight to the results (default
    criteria) or renders the form that lets the user customise the search.
    POST validates that form and redirects to the results with the chosen
    criteria.
    """

    # Label of the radio button that selects the free-text search string.
    cust_search_prompt = 'Roll your own:'

    def GET(self):
        """
        Prepare a search for the article at query parameter `url`.

        Query parameters: `url` (article address), `checkspec` ('0' to skip
        the customisation form and search with defaults) and `back_link`
        (accepted but not used here). Article retrieval/extraction errors
        redirect to '/error' with the exception type and message.
        """
        i = web.input(url='', back_link='/', checkspec='0')
        try:
            wa = al.WebArticle(i.url, REF.stop_words, REF.late_kills)
            logging.debug("article '%s' (%d words) at url='%s'",
                          wa.title, wa.wcount, wa.url)
        except (al.NotAnArticleError, al.ArticleFormatError,
                al.ArticleExtractionError, al.InvalidUrlError,
                al.PageRetrievalError) as e:
            params = {
                'msg': "%s: %s" % (type(e), e.message),
                'back_link': '/'
            }
            raise web.seeother('/error?' + urlencode(params))

        # need to de-dupe search_strings in case no phrases are found
        search_str_opts = [
            {'use_phrases': False, 'force_phrases': False},
            {'use_phrases': True, 'force_phrases': False},
            {'use_phrases': True, 'force_phrases': True}
        ]
        search_str_choices = list(dedupe(
            [wa.search_string(num_terms=6, **sso)
             for sso in search_str_opts]))

        # store essentials on Redis and let them expire after REDIS_EXPIRE
        # seconds:
        wa_key = al.RedisWA.redis_store(
            redis_conn, wa.url, wa.title, wa.wcount,
            wa.stem_tops[:MATCH_SCORE_LEN], search_str_choices)
        redis_conn.expire(wa_key, REDIS_EXPIRE)

        if not int(i.checkspec):
            # No customisation requested: search right away with defaults.
            # use 'unforced phrases' for search string if available
            ssi = 1 if len(search_str_choices) > 1 else 0
            src_sel = [s.id for s in REF.source_sites
                       if s.weight <= MAX_QUALITY_WEIGHT]
            params = {
                'wa_key': wa_key,
                'search_str': search_str_choices[ssi],
                'sources': encode_src_sel(src_sel),
                'match_score': int(MIN_MATCH * 100),
                'min_wc': MIN_WC,
                'back_days': 0,
            }
            logging.debug("request parameter: %s", params)
            raise web.seeother('/results?' + urlencode(params))

        # set "no phrases" search string as default:
        radio_buttons = [form.Radio('SearchStr', [search_str_choices[0]],
                                    description='',
                                    value=search_str_choices[0],
                                    class_='search-string-choice')]
        # add remaining search string options (if any)
        radio_buttons += [form.Radio('SearchStr', [ssc], description='',
                                     class_='search-string-choice')
                          for ssc in search_str_choices[1:]]
        # add custom search string choice and text box
        radio_buttons.append(form.Radio('SearchStr',
                                        [request.cust_search_prompt],
                                        description='',
                                        id='cust-search-rb'))
        radio_buttons.append(form.Textbox('CustSearch', description='',
                                          size='40', id='cust-search-entry'))

        # NOTE(review): the two per-request forms below are stored in the
        # module-level formDict, which POST later validates. Concurrent
        # users therefore share this state - confirm this is acceptable.
        formDict['search_strings'] = form.Form(*radio_buttons)
        # put redis key string into hidden form field for retrieval by POST:
        formDict['wa_key'] = form.Form(form.Textbox(
            'waKey',
            form.notnull,
            description='',
            id='wa-key',
            value=wa_key,
            style="display: none;"))
        return render.request(formDict, REF.src_cats.keys(), wa.title, wa.url)

    def POST(self):
        """
        Validate the customisation form and redirect to '/results'.

        Every form registered in formDict is validated; entries may be a
        single form or a list of forms. The custom search string replaces the
        selected radio value when the 'Roll your own:' option was chosen, and
        the age limit is converted from months to days (30 days per month).
        """
        for entry in formDict.values():
            forms = entry if isinstance(entry, list) else [entry]
            for f in forms:
                if not f.validates():
                    # NOTE(review): GET passes four arguments to
                    # render.request, this path only one - confirm the
                    # template accepts that.
                    return render.request(formDict)

        form_data = web.input()
        search_str = form_data.SearchStr
        if search_str == request.cust_search_prompt:
            search_str = form_data.CustSearch
        params = {
            'wa_key': form_data.waKey,
            'search_str': search_str,
            # form_data supports `in`, so the checked source ids can be
            # encoded directly from it.
            'sources': encode_src_sel(form_data),
            'match_score': int(form_data.matchScore),
            'min_wc': int(form_data.minWC),
            'back_days': int(form_data.backMonths) * 30,
        }
        logging.debug("request parameter: %s", params)
        raise web.seeother('/results?' + urlencode(params))
class results(object):
    """Results page for a prepared search."""

    def GET(self):
        """
        Render the results page for the article stored under `wa_key`.

        Reads the search criteria from the query string, loads the article
        data from Redis (a RedisGetError is logged and re-raised) and passes
        everything on to the 'results' template.
        """
        query = web.input(wa_key=None, search_str=None, sources=0,
                          match_score=0, min_wc=0, back_days=None)
        logging.debug("results parameter: wa_key=%s, search_str='%s', "
                      "sources=%s, match_score=%s, min_wc=%s, back_days=%s",
                      query.wa_key, query.search_str, query.sources,
                      query.match_score, query.min_wc, query.back_days)

        # get WebArticle data from Redis:
        try:
            rwa = al.RedisWA(redis_conn, query.wa_key)
        except al.RedisGetError as err:
            logging.debug("%s: %s", type(err), err.message)
            raise

        # Map each source category to its (masked) list of source sites, in
        # category order; the template embeds this mapping as a JS variable.
        cat_sources = OrderedDict(
            (cat, REF.cat_sources(cat, sites_only=True, sort_by='weight',
                                  mask=decode_src_sel(int(query.sources))))
            for cat in REF.src_cats)

        return render.results(
            rwa, query.search_str, cat_sources, query.match_score,
            query.min_wc, query.back_days, '/request')
class score_matches(object):
    """
    JSON endpoint that starts match-scoring jobs and reports their status.
    """

    # Job queue shared by all requests, backed by the module's Redis
    # connection.
    q = rq.Queue(connection=redis_conn)

    def GET(self):
        """
        Start a scoring job or poll an existing one.

        With a `job_id` query parameter, returns a JSON object with `src`,
        `job_id`, `status` and `result` (first element of the job's result
        once finished, otherwise an empty string). Without it, enqueues
        `al.score_matches` for source `src` using the article stored under
        `wa_key` and returns a JSON object with `src` and the new `job_id`.
        """
        i = web.input(wa_key=None, search_str=None, src=None, job_id=None)
        logging.debug("score_matches called with '%s'", i)
        web.header('Content-Type', 'application/json')

        if i.get('job_id'):
            # they want us to check for results...
            job = score_matches.q.fetch_job(i.job_id)
            if job is None:
                # fetch_job returns None for an unknown or expired job id.
                # Report the job as failed instead of crashing on None.
                # NOTE(review): confirm the client treats 'failed' as final.
                status = rq.job.Status.FAILED
                result = ''
            else:
                status = job.get_status()
                result = (job.result[0]
                          if status == rq.job.Status.FINISHED and job.result
                          else '')
            s = json.dumps({'src': i.src, 'job_id': i.job_id,
                            'status': status, 'result': result})
            logging.debug("JSON string: %s", s)
            return s

        # get worker busy; first retrieve original article from Redis:
        rwa = al.RedisWA(r=redis_conn, key=i.wa_key)
        job = score_matches.q.enqueue(al.score_matches, rwa, [i.src],
                                      i.search_str, num_matches=3,
                                      delay=GOOGLE_DELAY, encoding='utf-8')
        s = json.dumps({'src': i.src, 'job_id': job.id})
        logging.debug("JSON string: %s", s)
        return s
class error(object):
    """Error page showing a message and a link back."""

    def GET(self):
        """Render the 'error' template with query parameters `msg` and
        `back_link` (defaults: empty message, '/')."""
        query = web.input(msg='', back_link='/')
        message, back_link = query.msg, query.back_link
        return render.error(message, back_link)
class thify_what(object):
    """Handler for '/thify-what'."""

    def GET(self):
        """Render the 'thify_what' template with an empty message."""
        page = render.thify_what(msg='')
        return page
class thify_why(object):
    """Handler for '/thify-why'."""

    def GET(self):
        """Render the 'thify_why' template with an empty message."""
        page = render.thify_why(msg='')
        return page
class thify_how(object):
    """Handler for '/thify-how'."""

    def GET(self):
        """Render the 'thify_how' template with an empty message."""
        page = render.thify_how(msg='')
        return page
class thify_thanks(object):
    """Handler for '/thify-thanks'."""

    def GET(self):
        """Render the 'thify_thanks' template with an empty message."""
        page = render.thify_thanks(msg='')
        return page
class thify_contact(object):
    """Handler for '/thify-contact'."""

    def GET(self):
        """Render the 'thify_contact' template with an empty message."""
        page = render.thify_contact(msg='')
        return page
def dedupe(items):
    """
    Yield the elements of `items` in their original order, skipping any
    element that has already been produced (source: Python Cookbook).

    Elements must be hashable, since seen values are tracked in a set.
    """
    already_seen = set()
    for candidate in items:
        if candidate in already_seen:
            continue
        already_seen.add(candidate)
        yield candidate
def encode_src_sel(src_sel):
    """
    Encode the selected sources as binary flags and return them as a single
    integer: bit `n` is set when the id of the n-th entry of
    REF.source_sites is contained in `src_sel`.

    `src_sel` can be any kind of object that supports `in` for membership
    checking (e.g. the form_data dict).
    """
    return sum(1 << position
               for position, site in enumerate(REF.source_sites)
               if site.id in src_sel)
def decode_src_sel(selection):
    """
    Return a list of booleans indicating which sources in REF.source_sites
    have been included in the selection.

    `selection` is a non-negative integer whose bit `n` flags the n-th entry
    of REF.source_sites (the inverse of `encode_src_sel`). The result always
    has exactly len(REF.source_sites) elements; bits beyond the last known
    source are ignored.

    Raises ValueError if `selection` is negative.
    """
    if selection < 0:
        raise ValueError(
            "selection must be a non-negative integer, got %r" % (selection,))
    # Test the bits directly instead of round-tripping through bin(): this
    # keeps the result aligned with REF.source_sites even when the value
    # (which may come from a query string) carries stray high bits.
    return [bool((selection >> position) & 1)
            for position in range(len(REF.source_sites))]
# WSGI callable for serving the application with any WSGI server.
wsgiapp = web.application(urls, globals()).wsgifunc()

if __name__ == '__main__':
    # Executed as a script: build the application and run it directly.
    app = web.application(urls, globals())
    app.run()