def test3():

    print("\n\n------ Test 3 ---------")
    p = BIN_Parser()
    g = p.grammar()
    print("Reynir.grammar has {0} nonterminals, {1} terminals, {2} productions"
        .format(g.num_nonterminals(), g.num_terminals(), g.num_productions()))

    def create_sentence_table():
        """ Only used to create a fresh test sentence table if one doesn't exist """
        with closing(Test_DB.open_db()) as db:
            try:
                db.create_sentence_table()

                TEXTS = [
                    "Páll fór út með stóran kött og Jón keypti heitan graut.",
                    "Unga fallega konan frá Garðabæ elti ljóta og feita karlinn rösklega og fumlaust í svörtu myrkrinu",
                    "Kötturinn sem strákurinn átti veiddi feitu músina",
                    "Gamla bláa kommóðan var máluð fjólublá með olíumálningu",
                    "Landsframleiðslan hefur aukist frá því í fyrra",
                    "Guðmundur og Guðrún kusu Framsóknarflokkinn",
                    "Þú skalt fara til Danmerkur.",
                    "Ég og þú fórum til Frakklands í utanlandsferð",
                    "Stóru bláu könnunni mun hafa verið fleygt í ruslið",
                    "Már Guðmundsson segir margskonar misskilnings gæta hjá Hannesi Hólmsteini",
                    "Már Guðmundsson seðlabankastjóri Íslands segir þetta við Morgunblaðið í dag.",
                    "Það er náttúrlega einungis í samfélögum sem eiga við býsna stór vandamál að stríða að ný stjórnmálaöfl geta snögglega sveiflast upp í þriðjungs fylgi.",
                    "Áætlaður kostnaður verkefnisins var tíu milljónir króna og áætluð verklok eru í byrjun september næstkomandi.",
                    "Pakkinn snerist um að ábyrgjast innlán og skuldabréfaútgáfu danskra fjármálafyrirtækja.",
                    "Kynningarfundurinn sem ég hélt í dag fjallaði um lausnina á þessum vanda.",
                    "Kynningarfundurinn sem haldinn var í dag fjallaði um lausnina á þessum vanda.",
                    "Það sakamál sé til meðferðar við Héraðsdóm Suðurlands."
                ]

                for t in TEXTS:
                    db.add_sentence(t)
                slist = db.sentences()
                for s in slist:
                    print("{0}".format(s))
            except Exception as e:
                print("{0}".format(e))

    for test in run_test(p):

        print("\n'{0}'\n{1} parse trees found in {2:.3f} seconds\n"
            .format(test["sentence"], test["numtrees"], test["parse_time"]))

        if test["numtrees"] > 0:
            Parser.print_parse_forest(test["forest"])
            # print("{0}".format(Parser.make_schema(test["forest"])))
        elif test["err"]:
            print("Error: {0}".format(test["err"]))
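
# For reference, the loop above implies that each item yielded by run_test()
# is a dict with the following keys. This is a sketch inferred from the keys
# consumed in test3(), not the authoritative definition of run_test();
# the values shown are illustrative only.
def _example_test_result():
    return {
        "sentence": "Páll fór út með stóran kött.", # the sentence that was parsed
        "numtrees": 4,        # number of parse trees found (0 on failure)
        "parse_time": 0.023,  # parsing time in seconds
        "forest": None,       # parse forest when numtrees > 0, else None
        "err": None           # error message when the parse failed, else None
    }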
def test():
    """ Handler for a page of sentences for testing """

    # Run the test and show the result
    bp = BIN_Parser()
    return render_template("test.html", result = run_test(bp))
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """

    MAX_LEVEL = 32 # Maximum level of option depth we can handle

    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")

    # Tokenize the text
    tokens = list(tokenize(txt))

    # Parse the text
    bp = BIN_Parser()
    err = dict()
    try:
        forest = bp.go(tokens)
    except ParseError as e:
        err["msg"] = str(e)
        # Relay information about the parser state at the time of the error
        err["info"] = e.info()
        forest = None

    # Find the number of parse combinations
    combinations = Parser.num_combinations(forest) if forest else 0
    # Make the parse grid with all options
    grid, ncols = Parser.make_grid(forest) if forest else ([], 0)

    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML.
    # There will be as many columns as there are tokens.
    nrows = len(grid)
    tbl = [ [] for _ in range(nrows) ]
    # Info about previous row spans
    rs = [ [] for _ in range(nrows) ]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [ toint(s) for s in parse_path.split("_") ]
        path = [tuple(p[0 : i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to the chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else [] # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key = lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = { "terminal" }
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = { "nonterminal" }
                # Get the 'pure' name of the nonterminal in question
                assert isinstance(info, Nonterminal)
                info = info.name()
            if endcol - startcol == 1:
                cls |= { "vertical" }
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))

    # Calculate the unique path choices available for this parse grid
    choices -= { NULL_TUPLE } # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= { c[0:i] for i in range(1, len(c)) }

    # Create a nice string representation of the unique path choices
    uc_list = [ "_".join(str(c) for c in choice) for choice in unique_choices ]

    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    #debug()
    return render_template("parsegrid.html", txt = txt, err = err, tbl = tbl,
        combinations = combinations, choice_list = uc_list,
        parse_path = parse_path)
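
# Each cell appended to tbl above is a (colspan, rowspan, info, cls) tuple.
# The real rendering happens in the parsegrid.html template, which is not
# part of this excerpt; the sketch below only illustrates how the row-major
# structure maps onto an HTML table. Note that terminal cells carry a tuple
# in info, which is simply str()-ed here.
def _grid_to_html(tbl):
    """ Illustrative only: translate the row-major grid into an HTML table """
    rows = []
    for row in tbl:
        cells = []
        for colspan, rowspan, info, cls in row:
            # cls is either a set of class names or "" for filler cells
            classes = " ".join(cls) if cls else ""
            cells.append('<td colspan="{0}" rowspan="{1}" class="{2}">{3}</td>'
                .format(colspan, rowspan, classes, info))
        rows.append("<tr>" + "".join(cells) + "</tr>")
    return "<table>\n" + "\n".join(rows) + "\n</table>"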
def analyze():
    """ Analyze text from a given URL """

    url = request.form.get("url", "").strip()
    t0 = time.time()
    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        toklist = list(process_url(url))
    else:
        # Tokenize the text entered as-is and return the token list
        toklist = list(tokenize(url))
    tok_time = time.time() - t0

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent_begin = 0
    bp = BIN_Parser()

    t0 = time.time()

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            # Parse the accumulated sentence
            err_index = None
            try:
                forest = bp.go(sent)
            except ParseError as e:
                forest = None
                # Obtain the index of the offending token
                err_index = e.token_index()
            num = 0 if forest is None else Parser.num_combinations(forest)
            print("Parsed sentence of length {0} with {1} combinations{2}"
                .format(slen, num,
                    "\n" + " ".join(s[1] for s in sent) if num >= 100 else ""))
            if num > 0:
                num_parsed_sent += 1
                # Calculate the 'ambiguity factor'
                ambig_factor = num ** (1 / slen)
                # Do a weighted average on sentence length
                total_ambig += ambig_factor * slen
                total_tokens += slen
            # Mark the sentence beginning with the number of parses
            # and the index of the offending token, if an error occurred
            toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)

    parse_time = time.time() - t0

    result = dict(
        tokens = toklist,
        tok_time = tok_time,
        tok_num = len(toklist),
        parse_time = parse_time,
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # Dump the tokens to a text file for inspection
    # dump_tokens_to_file("txt", toklist)

    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)
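
# The 'ambiguity factor' computed above is the per-token geometric mean of
# the combination count: a sentence of slen tokens with num combinations
# gets the factor num ** (1 / slen), and avg_ambig_factor weights each
# factor by sentence length. A small worked example, illustrative only:
def _ambiguity_example():
    # A 10-token sentence with 1024 combinations: 1024 ** 0.1 == 2.0,
    # i.e. roughly two parse choices per token; likewise 32 ** (1/5) == 2.0
    sentences = [(10, 1024), (5, 32)] # (token count, combination count)
    total_ambig = sum((num ** (1 / slen)) * slen for slen, num in sentences)
    total_tokens = sum(slen for slen, _ in sentences)
    return total_ambig / total_tokens # == 2.0 for this data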
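
# The handlers above are plainly Flask view functions (request.form,
# render_template, jsonify), but the route registrations are not part of
# this excerpt. The wiring below is an assumption: the app object and the
# endpoint paths are placeholders, not confirmed by the source.
def _register_routes(app):
    """ Sketch of the assumed route wiring; paths are placeholders """
    app.add_url_rule("/test", view_func = test)
    app.add_url_rule("/parsegrid", view_func = parse_grid, methods = ["GET", "POST"])
    app.add_url_rule("/analyze", view_func = analyze, methods = ["GET", "POST"])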