forked from mideind/GreynirServer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·698 lines (563 loc) · 24.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
#!/usr/bin/env python
"""
Reynir: Natural language processing for Icelandic
Main module, URL scraper and web server
Copyright (c) 2015 Vilhjalmur Thorsteinsson
All rights reserved
See the accompanying README.md file for further licensing and copyright information.
This module is written in Python 3 for Python 3.4
"""
import time
from contextlib import closing
from datetime import datetime
import re
from bs4 import NavigableString
from collections import OrderedDict, defaultdict
from flask import Flask
from flask import render_template, jsonify
from flask import request
from fastparser import Fast_Parser, ParseError, ParseForestPrinter, ParseForestDumper
from grammar import Nonterminal
from ptest import run_test, Test_DB
from reducer import Reducer
from scraper import Scraper
from scraperdb import Scraper_DB, Person
from settings import Settings, ConfigError
from tokenizer import tokenize, TOK
# Initialize Flask framework
app = Flask(__name__)

from flask import current_app

def debug():
    """ Trigger the interactive Flask debugger on purpose.

        Call this from within a request handler while the server runs
        with debug=True: the assertion fails and Flask drops into its
        in-browser traceback debugger at the point of the call.
    """
    # Call this to trigger the Flask debugger on purpose
    assert current_app.debug == False, "Don't panic! You're here by request of debug()"
# Current default URL for testing; shown as the default text on the index page
DEFAULT_URL = 'http://kjarninn.is/2015/04/mar-gudmundsson-segir-margskonar-misskilnings-gaeta-hja-hannesi-holmsteini/'
# 'http://www.ruv.is//frett/flottamennirnir-matarlausir-i-einni-kos'

# HTML tags that we explicitly don't want to look at
# (their text content is never extracted)
exclude_tags = frozenset(["script", "audio", "video", "style"])

# HTML tags that typically denote blocks (DIV-like), not inline constructs (SPAN-like);
# these produce begin/end block markers in the extracted text
block_tags = frozenset(["p", "h1", "h2", "h3", "h4", "div",
    "main", "article", "header", "section",
    "table", "thead", "tbody", "tr", "td", "ul", "li",
    "form", "option", "input", "label",
    "figure", "figcaption", "footer"])

# HTML tags that are treated as plain whitespace in the extracted text
whitespace_tags = frozenset(["br", "img"])
class TextList:
    """ Accumulates raw text blocks and eliminates unnecessary nesting indicators """

    def __init__(self):
        # Collected text fragments; joined into one string by result()
        self._parts = []
        # Count of block-begin markers not yet emitted. They are flushed
        # lazily, only when actual text arrives, so that empty blocks
        # leave no " [[ " / " ]] " trace in the output.
        self._pending = 0

    def append(self, w):
        """ Add a text fragment, emitting any deferred block-begin markers first """
        if self._pending:
            self._parts.append(" [[ " * self._pending)
            self._pending = 0
        self._parts.append(w)

    def append_whitespace(self):
        """ Add a single space, unless we are just inside a block begin """
        if not self._pending:
            # No need to append whitespace if we're just inside a begin-block
            self._parts.append(" ")

    def begin(self):
        """ Note the start of a nested block; its marker is emitted lazily """
        self._pending += 1

    def end(self):
        """ Note the end of a block, cancelling an unemitted begin if possible """
        if self._pending:
            # The matching begin was never emitted: just cancel it
            self._pending -= 1
        else:
            self._parts.append(" ]] ")

    def result(self):
        """ Return the accumulated text as a single string """
        return "".join(self._parts)
def extract_text(soup, result):
    """ Append the human-readable text found in an HTML soup to the result TextList.

        soup: a BeautifulSoup node (or None/falsy, in which case nothing happens)
        result: a TextList instance that accumulates the extracted text
    """
    if soup:
        for t in soup.children:
            if type(t) == NavigableString:
                # Plain text content node.
                # NOTE: the exact type check (not isinstance) is deliberate:
                # bs4 subclasses of NavigableString such as Comment and CData
                # must NOT be treated as visible text - they fall through to
                # the isinstance() branch below and are ignored.
                result.append(t)
            elif isinstance(t, NavigableString):
                # Comment, CDATA or other text data: ignore
                pass
            elif t.name in whitespace_tags:
                # Tags that we interpret as whitespace, such as <br> and <img>
                result.append_whitespace()
            elif t.name in block_tags:
                # Nested block tag: bracket its content with block markers
                result.begin() # Begin block
                extract_text(t, result)
                result.end() # End block
            elif t.name not in exclude_tags:
                # Non-block, non-excluded (inline) tag: recurse without markers
                extract_text(t, result)
def process_url(url):
    """ Open a URL and process the returned response.

        Returns a (metadata, token generator) tuple, where metadata is a
        dict (with heading/author/timestamp/authority keys) and the second
        element is the lazy token stream produced by tokenize().
    """
    metadata = None
    body = None
    # Fetch the URL, returning a (metadata, content) tuple or None if error
    info = Scraper.fetch_url(url)
    if info:
        metadata, body = info
    if metadata is None:
        # No metadata available: synthesize an empty record
        if Settings.DEBUG:
            print("No metadata")
        metadata = dict(heading = "",
            author = "",
            timestamp = datetime.utcnow(),
            authority = 0.0)
    else:
        if Settings.DEBUG:
            print("Metadata: heading '{0}'".format(metadata.heading))
            print("Metadata: author '{0}'".format(metadata.author))
            print("Metadata: timestamp {0}".format(metadata.timestamp))
            print("Metadata: authority {0:.2f}".format(metadata.authority))
        metadata = vars(metadata) # Convert namedtuple to dict
    # Extract the text content of the HTML into a list
    tlist = TextList()
    extract_text(body, tlist)
    text = tlist.result()
    # Eliminate soft hyphen and zero-width space characters
    text = re.sub('\u00AD|\u200B', '', text)
    # Eliminate consecutive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Tokenize the resulting text, returning a generator
    # noinspection PyRedundantParentheses
    return (metadata, tokenize(text))
def profile(func, *args, **kwargs):
    """ Run func(*args, **kwargs) under the profiler and return its result.

        Profiling statistics are dumped to the file 'Reynir.profile',
        readable with the pstats module.
    """
    import cProfile
    stats_file = 'Reynir.profile'
    profiler = cProfile.Profile()
    outcome = profiler.runcall(func, *args, **kwargs)
    profiler.dump_stats(stats_file)
    return outcome
def parse(toklist, single, use_reducer, dump_forest = False, keep_trees = False):
    """ Parse the given token list and return a (result dict, trees dict) tuple.

        toklist: list of tokens from the tokenizer; MODIFIED IN PLACE -
            each sentence-begin token is replaced with one annotated with
            the number of parses and the index of the offending token
        single: True when parsing a single user-entered sentence
        use_reducer: True to reduce each ambiguous forest to one best tree
        dump_forest: True to dump the forest to parse.txt (single mode only)
        keep_trees: True to keep a text dump of the best tree per sentence

        The result dict contains parser version, the (annotated) token
        list, and sentence/ambiguity statistics; trees maps sentence
        number to a text dump of its best parse tree.
    """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()
    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages
        version = bp.version
        rdc = Reducer(bp.grammar)
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                # Start accumulating a new sentence
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    score = 0 # Reducer score of the best parse tree
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if single and dump_forest:
                                # Dump the parse tree to parse.txt
                                with open("parse.txt", mode = "w", encoding= "utf-8") as f:
                                    print("Reynir parse tree for sentence '{0}'".format(" ".join(sent)), file = f)
                                    print("{0} combinations\n".format(num), file = f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file = f)
                                    else:
                                        print("Too many combinations to dump", file = f)
                        if use_reducer and num > 1:
                            # Reduce the resulting forest to a single best tree
                            forest, score = rdc.go_with_score(forest)
                            assert Fast_Parser.num_combinations(forest) == 1
                            if Settings.DEBUG:
                                print(ParseForestDumper.dump_forest(forest))
                            num = 1
                    except ParseError as e:
                        forest = None
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                            .format(slen, num, score,
                                "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor':
                        # the geometric per-token branching of this sentence
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing down the line:
                            # reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it to text format
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                # Paragraph begin: no-op here
                pass
            elif t[0] == TOK.P_END:
                # Paragraph end: no-op here
                pass
            else:
                # Ordinary token: accumulate it into the current sentence
                sent.append(t)
    result = dict(
        version = version,
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        # Weighted (by sentence length) average of per-sentence ambiguity
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # noinspection PyRedundantParentheses
    return (result, trees)
def create_name_register(result):
    """ Assemble a register of person names and titles from the token list.

        Scans result["tokens"] for PERSON tokens, looks each name up in the
        scraper database and picks the best-attested title for it. The
        mapping {name: title} is stored back into result["register"].
    """
    tokens = result["tokens"]
    register = { }
    db = Scraper_DB()
    with closing(db.session) as session:
        for t in tokens:
            if t.kind == TOK.PERSON:
                # t.val presumably holds a list of possible person-name
                # entries, each with a .name attribute - TODO confirm
                gn = t.val
                for pn in gn:
                    # Attempt to look up the name pn.name
                    q = session.query(Person).filter_by(name = pn.name).all()
                    titles = defaultdict(int)
                    for p in q:
                        # Collect and count the titles
                        titles[p.title] += 1
                    if sum(cnt >= 4 for cnt in titles.values()) >= 2:
                        # More than one title with four or more instances:
                        # reduce the choices to just those and decide based on length
                        # (counts are zeroed so the sort below falls back to length)
                        titles = { key: 0 for key, val in titles.items() if val >= 4 }
                    if titles:
                        # Pick the most popular title, or the longer one if two are equally popular
                        # (sort key is (count, length, title); last element wins)
                        title = sorted([(cnt, len(t), t) for t, cnt in titles.items()])[-1][2]
                        # Add it to the register
                        register[pn.name] = title
        session.commit()
    result["register"] = register
    if Settings.DEBUG:
        print("Register is: {0}".format(register))
@app.route("/analyze", methods=['POST'])
def analyze():
    """ Analyze text from a given URL (or raw text) and return a JSON result.

        Form parameters:
            url: either an http(s) URL to scrape, or literal text to parse
            noreduce: if present, skip forest reduction
            dump: if present, dump the forest to parse.txt (single mode)
    """
    url = request.form.get("url", "").strip()
    use_reducer = not ("noreduce" in request.form)
    dump_forest = "dump" in request.form
    metadata = None
    # Single sentence (True) or contiguous text from URL (False)?
    single = False
    keep_trees = False
    t0 = time.time()
    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        metadata, generator = process_url(url)
        toklist = list(generator)
        # If this is an already scraped URL, keep the parse trees and update
        # the database with the new parse
        keep_trees = Scraper.is_known_url(url)
    else:
        # Tokenize the text entered as-is and return the token list
        # In this case, there's no metadata
        toklist = list(tokenize(url))
        single = True
    tok_time = time.time() - t0
    t0 = time.time()
    # result = profile(parse, toklist, single, use_reducer, dump_forest)
    result, trees = parse(toklist, single, use_reducer, dump_forest, keep_trees)
    # Add a name register to the result
    create_name_register(result)
    parse_time = time.time() - t0
    if keep_trees:
        # Save a new parse result for the known URL
        if Settings.DEBUG:
            print("Storing a new parse tree for url {0}".format(url))
        Scraper.store_parse(url, result, trees)
    result["metadata"] = metadata
    result["tok_time"] = tok_time
    result["parse_time"] = parse_time
    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)
def make_grid(w):
    """ Make a 2d grid from a flattened parse schema.

        w: a parse forest node (or None). Returns (cols, num_tokens)
        where cols is a list of columns, each a dict mapping an option
        coordinate tuple to a list of (start, end, info) cell tuples -
        or None if w is None.
    """
    def make_schema(w):
        """ Create a flattened parse schema from the forest w """
        def _part(w, level, suffix):
            """ Return a tuple (colheading + options, start_token, end_token, partlist, info)
                where the partlist is again a list of the component schemas - or a terminal
                matching a single token - or None if empty """
            if w is None:
                # Epsilon node: return empty list
                return None
            if w.is_token:
                # Terminal node: spans a single token
                return ([ level ] + suffix, w.start, w.end, None, (w.terminal, w.token.text))
            # Interior nodes are not returned
            # and do not increment the indentation level
            if not w.is_interior:
                level += 1
            # Accumulate the resulting parts
            plist = [ ]
            ambig = w.is_ambiguous
            add_suffix = [ ]
            for ix, pc in enumerate(w.enum_children()):
                prod, f = pc
                if ambig:
                    # Uniquely identify the available parse options with a coordinate
                    add_suffix = [ ix ]
                def add_part(p):
                    """ Add a subtuple p to the part list plist """
                    if p:
                        if p[0] is None:
                            # p describes an interior node: splice its children in
                            plist.extend(p[3])
                        elif p[2] > p[1]:
                            # Only include subtrees that actually contain terminals
                            plist.append(p)
                if isinstance(f, tuple):
                    # Binary (packed) node: traverse both halves
                    add_part(_part(f[0], level, suffix + add_suffix))
                    add_part(_part(f[1], level, suffix + add_suffix))
                else:
                    add_part(_part(f, level, suffix + add_suffix))
            if w.is_interior:
                # Interior node: relay plist up the tree
                return (None, 0, 0, plist, None)
            # Completed nonterminal
            assert w.is_completed
            assert w.nonterminal is not None
            return ([level - 1] + suffix, w.start, w.end, plist, w.nonterminal)
        # Start of make_schema
        if w is None:
            return None
        return _part(w, 0, [ ])
    # Start of make_grid
    if w is None:
        return None
    schema = make_schema(w)
    # The schema root must start at token 0
    assert schema[1] == 0
    cols = [] # The columns to be populated
    NULL_TUPLE = tuple()
    def _traverse(p):
        """ Traverse a schema subtree and insert the nodes into their
            respective grid columns """
        # p[0] is the coordinate of this subtree (level + suffix)
        # p[1] is the start column of this subtree
        # p[2] is the end column of this subtree
        # p[3] is the subpart list
        # p[4] is the nonterminal or terminal/token at the head of this subtree
        col, option = p[0][0], p[0][1:] # Level of this subtree and option
        if not option:
            # No option: use a 'clean key' of NULL_TUPLE
            option = NULL_TUPLE
        else:
            # Convert list to a frozen (hashable) tuple
            option = tuple(option)
        while len(cols) <= col:
            # Add empty columns as required to reach this level
            cols.append(dict())
        # Add a tuple describing the rows spanned and the node info
        assert isinstance(p[4], Nonterminal) or isinstance(p[4], tuple)
        if option not in cols[col]:
            # Put in a dictionary entry for this option
            cols[col][option] = []
        cols[col][option].append((p[1], p[2], p[4]))
        # Navigate into subparts, if any
        if p[3]:
            for subpart in p[3]:
                _traverse(subpart)
    _traverse(schema)
    # Return a tuple with the grid and the number of tokens
    return (cols, schema[2])
@app.route("/parsegrid", methods=['POST'])
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence.

        Form parameters:
            txt: the sentence to parse
            option: an underscore-separated option path selecting which
                ambiguous parse alternative to display
            noreduce: if present, skip forest reduction

        Renders parsegrid.html with a row-major table of grid cells.
    """
    MAX_LEVEL = 32 # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    use_reducer = not ("noreduce" in request.form)
    # Tokenize the text
    tokens = list(tokenize(txt))
    # Parse the text
    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None # e.info
            forest = None
    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(forest)
    score = 0
    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode = "w", encoding= "utf-8") as f:
            if forest is not None:
                print("Reynir parse tree for sentence '{0}'".format(txt), file = f)
                print("{0} combinations\n".format(combinations), file = f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file = f)
                else:
                    print("Too many combinations to dump", file = f)
            else:
                print("No parse available for sentence '{0}'".format(txt), file = f)
    if forest is not None and use_reducer:
        # Reduce the parse forest to the single highest-scoring tree
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            print(ParseForestDumper.dump_forest(forest))
    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [ [] for _ in range(nrows) ]
    # Info about previous row spans
    rs = [ [] for _ in range(nrows) ]
    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path
        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0
        p = [ toint(s) for s in parse_path.split("_") ]
        path = [tuple(p[0 : i + 1]) for i in range(len(p))]
    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()
    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want do display
        # according to chosen path
        # BUG FIX: copy the default content instead of aliasing the list
        # stored inside gcol - the extend() and sort() below would
        # otherwise mutate the grid structure in place
        cols = list(gcol[NULL_TUPLE]) if NULL_TUPLE in gcol else [] # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key = lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                # Insert a filler cell for the gap, adjusted for cells
                # spanning down into this row from above
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                # Terminal/token cell: spans all remaining rows
                cls = { "terminal" }
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = { "nonterminal" }
                # Get the 'pure' name of the nonterminal in question
                assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                # Single-token-wide cell: render its text vertically
                cls |= { "vertical" }
            tbl[gix].append((endcol-startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            # Pad the row out to the full width
            tbl[gix].append((ncols_adj - col, 1, "", ""))
    # Calculate the unique path choices available for this parse grid
    choices -= { NULL_TUPLE } # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= { c[0:i] for i in range(1, len(c)) }
    # Create a nice string representation of the unique path choices
    uc_list = [ "_".join(str(c) for c in choice) for choice in unique_choices ]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)
    #debug()
    return render_template("parsegrid.html", txt = txt, err = err, tbl = tbl,
        combinations = combinations, score = score,
        choice_list = uc_list, parse_path = parse_path)
@app.route("/addsentence", methods=['POST'])
def add_sentence():
    """ Add a sentence to the test database """
    sentence = request.form.get('sentence', "")
    # A sentence is either expected to parse, ideally to a single result
    # tree (target 1), or expected to be rejected (target 0)
    should_parse = request.form.get('shouldparse', 'true') == 'true'
    if not sentence:
        # Nothing to add
        return jsonify(result = False)
    try:
        with closing(Test_DB.open_db()) as db:
            ok = db.add_sentence(sentence, target = 1 if should_parse else 0)
    except Exception as e:
        return jsonify(result = False, err = str(e))
    return jsonify(result = ok)
@app.route("/")
def main():
    """ Handler for the main (index) page """
    # A dummy parser instance gives us access to grammar information;
    # this does not cause repeated parsing of the grammar since it is
    # cached in memory
    bp = Fast_Parser(verbose = False)
    # Fall back to the default test URL when no text is supplied
    txt = request.args.get("txt", None) or DEFAULT_URL
    return render_template("main.html", default_text = txt, grammar = bp.grammar)
@app.route("/test")
def test():
    """ Handler for a page of sentences for testing.

        Runs the test suite from the ptest module against a fresh
        parser and renders the outcome.
    """
    # Run test and show the result
    bp = Fast_Parser(verbose = False) # Don't emit diagnostic messages
    return render_template("test.html", result = run_test(bp))
# Flask handlers

# noinspection PyUnusedLocal
@app.errorhandler(404)
def page_not_found(e):
    """ Return a custom 404 error """
    # Response text is Icelandic: "This URL is not correct"
    return 'Þessi vefslóð er ekki rétt', 404
@app.errorhandler(500)
def server_error(e):
    """ Return a custom 500 error """
    # Response text is Icelandic: "The following error occurred: ..."
    return 'Eftirfarandi villa kom upp: {}'.format(e), 500
# Initialize the main module

try:
    # Read configuration file; aborts the process if it cannot be parsed
    Settings.read("Reynir.conf")
except ConfigError as e:
    print("Configuration error: {0}".format(e))
    quit()

if Settings.DEBUG:
    print("Running Reynir with debug={0}, host={1}, db_hostname={2}"
        .format(Settings.DEBUG, Settings.HOST, Settings.DB_HOSTNAME))
if __name__ == "__main__":
    # Run a default Flask web server for testing if invoked directly as a main program
    # Additional files that should cause a reload of the web server application
    # Note: Reynir.grammar is automatically reloaded if its timestamp changes
    extra_files = [ 'Reynir.conf', 'Verbs.conf', 'Main.conf' ]
    # Run the Flask web server application
    app.run(debug=Settings.DEBUG, host=Settings.HOST, use_reloader=True,
        extra_files = extra_files)
else:
    # Running as a server module (e.g. under a WSGI host):
    # force the grammar to be pre-loaded in to memory
    with Fast_Parser() as fp:
        pass