def annotate(token_stream, auto_uppercase):
    """ Look up word forms in the BIN word database. If auto_uppercase
        is True, change lower case words to uppercase if it looks likely
        that they should be uppercase. """
    at_sentence_start = False
    with BIN_Db.get_db() as db:
        # Consume the token stream (which may be a generator)
        for t in token_stream:
            if t.kind != TOK.WORD:
                # Not a word: relay the token unchanged
                yield t
                if t.kind == TOK.S_BEGIN or (t.kind == TOK.PUNCTUATION and t.txt == ':'):
                    at_sentence_start = True
                elif t.kind != TOK.PUNCTUATION and t.kind != TOK.ORDINAL:
                    at_sentence_start = False
                continue
            if t.val is None:
                # Look up the word in the BIN database
                w, m = db.lookup_word(t.txt, at_sentence_start, auto_uppercase)
                # Yield a word token with meanings
                yield TOK.Word(w, m)
            else:
                # Already have a meaning, which probably needs conversion
                # from a bare tuple to a BIN_Meaning
                yield TOK.Word(t.txt, list(map(BIN_Meaning._make, t.val)))
            # No longer at sentence start
            at_sentence_start = False
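# A minimal usage sketch of the annotation stage above, assuming the module
# context (a tokenize() generator as used elsewhere in this file, and an
# available BIN database). The sample sentence is illustrative only.
def annotated_words(text):
    """ Tokenize text and enrich word tokens with their BIN meanings """
    return annotate(tokenize(text), auto_uppercase=False)

for tok in annotated_words("Hún keypti bókina."):
    if tok.kind == TOK.WORD:
        # tok.val holds a list of BIN_Meaning tuples after annotation
        print(tok.txt, [m.ordfl for m in (tok.val or [])])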
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)
    rdc = Reducer(fast_p.grammar)
    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0  # Number of tree combinations in forest
                score = 0  # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index
                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(
                    num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                # print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'"
                #     .format(mim_tags[tag_ix][1], t[1]))
                # Attempt to re-sync by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and \
                        mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    # print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1
    return dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
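# Worked illustration of the 'ambiguity factor' bookkeeping in parse_tokens():
# each parsed sentence contributes num ** (1 / slen), weighted by its length.
# The numbers below are made up for the example.
sentences = [(12, 6), (1, 3)]  # (tree combinations, sentence length) pairs
total_ambig = sum(num ** (1 / slen) * slen for num, slen in sentences)
total_tokens = sum(slen for _, slen in sentences)
# Average per-token ambiguity; 1.0 would mean fully unambiguous parses
print(total_ambig / total_tokens)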
def disambiguate_phrases(token_stream):
    """ Parse a stream of tokens looking for common ambiguous multiword phrases
        (i.e. phrases that have a well known very likely interpretation but
        other extremely uncommon ones are also grammatically correct).
        The algorithm implements N-token lookahead where N is the length
        of the longest phrase. """
    tq = []  # Token queue
    state = defaultdict(list)  # Phrases we're considering
    pdict = AmbigPhrases.DICT  # The phrase dictionary
    try:
        while True:
            token = next(token_stream)
            if token.kind != TOK.WORD:
                # Not a word: no match; yield the token queue
                if tq:
                    yield from tq
                    tq = []
                # Discard the previous state, if any
                if state:
                    state = defaultdict(list)
                # ...and yield the non-matching token
                yield token
                continue
            # Look for matches in the current state and build a new state
            newstate = defaultdict(list)
            w = token.txt.lower()

            def add_to_state(slist, index):
                """ Add the list of subsequent words to the new parser state """
                wrd = slist[0]
                rest = slist[1:]
                newstate[wrd].append((rest, index))

            if w in state:
                # This matches an expected token:
                # go through potential continuations
                tq.append(token)  # Add to lookahead token queue
                token = None
                for sl, ix in state[w]:
                    if not sl:
                        # No subsequent word: this is a complete match.
                        # Discard meanings of words in the token queue that are
                        # not compatible with the category list specified
                        cats = AmbigPhrases.get_cats(ix)
                        for t, cat in zip(tq, cats):
                            # Yield a new token with fewer meanings for each
                            # original token in the queue
                            if cat == "fs":
                                # Handle prepositions specially, since we may
                                # have additional preps defined in Main.conf
                                # that don't have fs meanings in BÍN
                                w = t.txt.lower()
                                yield TOK.Word(t.txt,
                                    [BIN_Meaning(w, 0, "fs", "alm", w, "-")])
                            else:
                                yield TOK.Word(t.txt,
                                    [m for m in t.val if m.ordfl == cat])
                        # Discard the state and start afresh
                        if newstate:
                            newstate = defaultdict(list)
                        w = ""
                        tq = []
                        # Note that it is possible to match even longer phrases
                        # by including a starting phrase in its entirety in
                        # the static phrase dictionary
                        break
                    add_to_state(sl, ix)
            elif tq:
                # This does not continue a started phrase:
                # yield the accumulated token queue
                yield from tq
                tq = []
            if w in pdict:
                # This word potentially starts a new phrase
                for sl, ix in pdict[w]:
                    # assert sl
                    add_to_state(sl, ix)
                if token:
                    tq.append(token)  # Start a lookahead queue with this token
            elif token:
                # Not starting a new phrase: pass the token through
                yield token
            # Transition to the new state
            state = newstate
    except StopIteration:
        # Token stream is exhausted
        pass
    # Yield any tokens remaining in queue
    yield from tq
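# Sketch of the phrase dictionary shape that the state machine above consumes:
# the first word of each phrase maps to (remaining-words, phrase-index) pairs,
# mirroring what add_to_state() stores as (rest, index). The sample phrases
# are for illustration only.
from collections import defaultdict

phrases = ["að minnsta kosti", "að sjálfsögðu"]
pdict = defaultdict(list)
for ix, ph in enumerate(phrases):
    words = ph.split()
    pdict[words[0]].append((words[1:], ix))
# pdict["að"] == [(["minnsta", "kosti"], 0), (["sjálfsögðu"], 1)]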
def parse_static_phrases(token_stream, auto_uppercase):
    """ Parse a stream of tokens looking for static multiword phrases
        (i.e. phrases that are not affected by inflection).
        The algorithm implements N-token lookahead where N is the length
        of the longest phrase. """
    tq = []  # Token queue
    state = defaultdict(list)  # Phrases we're considering
    pdict = StaticPhrases.DICT  # The phrase dictionary
    try:
        while True:
            token = next(token_stream)
            if token.txt is None:  # token.kind != TOK.WORD:
                # Not a word: no match; discard state
                if tq:
                    yield from tq
                    tq = []
                if state:
                    state = defaultdict(list)
                yield token
                continue
            # Look for matches in the current state and build a new state
            newstate = defaultdict(list)
            wo = token.txt  # Original word
            w = wo.lower()  # Lower case
            if wo == w:
                # Make wo and w refer to the same object when they are equal,
                # so that the 'is'/'is not' identity checks below suffice
                wo = w

            def add_to_state(slist, index):
                """ Add the list of subsequent words to the new parser state """
                wrd = slist[0]
                rest = slist[1:]
                newstate[wrd].append((rest, index))

            # First check for the original (uppercase) word in the state, if any;
            # if that doesn't match, check the lower case version
            wm = None
            if wo is not w and wo in state:
                wm = wo
            elif w in state:
                wm = w
            if wm:
                # This matches an expected token:
                # go through potential continuations
                tq.append(token)  # Add to lookahead token queue
                token = None
                for sl, ix in state[wm]:
                    if not sl:
                        # No subsequent word: this is a complete match.
                        # Reconstruct original text behind phrase
                        plen = StaticPhrases.get_length(ix)
                        while len(tq) > plen:
                            # We have extra queued tokens in the token queue
                            # that belong to a previously seen partial phrase
                            # that was not completed: yield them first
                            yield tq.pop(0)
                        w = " ".join([t.txt for t in tq])
                        # Yield the entire phrase as one 'word' token
                        yield TOK.Word(w,
                            map(BIN_Meaning._make, StaticPhrases.get_meaning(ix)))
                        # Discard the state and start afresh
                        newstate = defaultdict(list)
                        w = wo = ""
                        tq = []
                        # Note that it is possible to match even longer phrases
                        # by including a starting phrase in its entirety in
                        # the static phrase dictionary
                        break
                    add_to_state(sl, ix)
            elif tq:
                yield from tq
                tq = []
            wm = None
            if auto_uppercase and len(wo) == 1 and w is wo:
                # If we are auto-uppercasing, leave single-letter lowercase
                # phrases alone, i.e. 'g' for 'gram' and 'm' for 'meter'
                pass
            elif wo is not w and wo in pdict:
                wm = wo
            elif w in pdict:
                wm = w
            # Add all possible new states for phrases that could be starting
            if wm:
                # This word potentially starts a phrase
                for sl, ix in pdict[wm]:
                    if not sl:
                        # Simple replacement of a single word
                        if tq:
                            yield from tq
                            tq = []
                        # Yield the replacement token
                        yield TOK.Word(token.txt,
                            map(BIN_Meaning._make, StaticPhrases.get_meaning(ix)))
                        newstate = defaultdict(list)
                        token = None
                        break
                    add_to_state(sl, ix)
                if token:
                    tq.append(token)
            elif token:
                yield token
            # Transition to the new state
            state = newstate
    except StopIteration:
        # Token stream is exhausted
        pass
    # Yield any tokens remaining in queue
    yield from tq
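# The wo/w identity trick above, in isolation: after the normalization,
# the 'wo is not w' tests are True only when the original word contained
# upper case characters. Sample words are illustrative only.
wo = "Við"
w = wo.lower()
if wo == w:
    wo = w          # identical object when already lower case
assert wo is not w  # "Við" != "við": distinct objects, uppercase preserved

wo2 = "við"
w2 = wo2.lower()
if wo2 == w2:
    wo2 = w2
assert wo2 is w2    # same object: the lower-case-only fast path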
def parse_phrases_2(token_stream):
    """ Parse a stream of tokens looking for phrases and making substitutions.
        Second pass """
    token = None
    try:
        # Maintain a one-token lookahead
        token = next(token_stream)
        # Maintain a set of full person names encountered
        names = set()
        at_sentence_start = False
        while True:
            next_token = next(token_stream)
            # Make the lookahead checks we're interested in

            # Check for [number] [currency] and convert to [amount]
            if token.kind == TOK.NUMBER and \
                    (next_token.kind == TOK.WORD or next_token.kind == TOK.CURRENCY):
                # Preserve the case of the number, if available
                # (milljónir, milljóna, milljónum)
                cases = token.val[1]
                genders = token.val[2]
                cur = None
                if next_token.kind == TOK.WORD:
                    # Try to find a currency name
                    cur = match_stem_list(next_token, CURRENCIES)
                    if cur is None and next_token.txt.isupper():
                        # Might be an ISO abbrev (which is not in BÍN)
                        cur = CURRENCIES.get(next_token.txt)
                        if not cases:
                            cases = list(ALL_CASES)
                        if not genders:
                            # Try to find a correct gender for the ISO abbrev,
                            # or use neutral as a default
                            genders = [CURRENCY_GENDERS.get(next_token.txt, "hk")]
                    if cur is not None:
                        # Use the case and gender information from the currency name
                        if not cases:
                            cases = all_cases(next_token)
                        if not genders:
                            genders = all_genders(next_token)
                elif next_token.kind == TOK.CURRENCY:
                    # Already have an ISO identifier for a currency
                    cur = next_token.val[0]
                    # Use the case and gender information from the currency name
                    # if no such information was given with the number itself
                    cases = cases or next_token.val[1]
                    genders = genders or next_token.val[2]
                if cur is not None:
                    # Create an amount; use the case and gender information
                    # from the number, if any
                    token = TOK.Amount(token.txt + " " + next_token.txt,
                        cur, token.val[0], cases, genders)
                    # Eat the currency token
                    next_token = next(token_stream)

            # Logic for human names

            def stems(tok, categories, given_name=False):
                """ If the token denotes a given name, return its possible
                    interpretations, as a list of PersonName tuples
                    (name, gender, case). If given_name is True, we omit from
                    the list all name forms that occur in the disallowed_names
                    section in the configuration file. """
                if tok.kind != TOK.WORD or not tok.val:
                    return None
                if at_sentence_start and tok.txt in NOT_NAME_AT_SENTENCE_START:
                    # Disallow certain person names at the start of sentences,
                    # such as 'Annar'
                    return None
                # Set up the names we're not going to allow
                dstems = DisallowedNames.STEMS if given_name else {}
                # Look through the token meanings
                result = []
                for m in tok.val:
                    if m.fl in categories and "ET" in m.beyging:
                        # If this is a given name, we cut out name forms that
                        # are frequently ambiguous and wrong, i.e. "Frá" as
                        # accusative of the name "Frár", and "Sigurð" in
                        # the nominative.
                        c = case(m.beyging)
                        if m.stofn not in dstems or c not in dstems[m.stofn]:
                            # Note the stem ('stofn') and the gender from
                            # the word type ('ordfl')
                            result.append(
                                PersonName(name=m.stofn, gender=m.ordfl, case=c))
                return result if result else None

            def has_category(tok, categories):
                """ Return True if the token matches a meaning
                    with any of the given categories """
                if tok.kind != TOK.WORD or not tok.val:
                    return False
                return any(m.fl in categories for m in tok.val)

            def has_other_meaning(tok, category):
                """ Return True if the token can denote something
                    besides a given name """
                if tok.kind != TOK.WORD or not tok.val:
                    return True
                # Return True if there is a different meaning, not a given name
                return any(m.fl != category for m in tok.val)

            # Check for person names
            def given_names(tok):
                """ Check for Icelandic person name (category 'ism') """
                if tok.kind != TOK.WORD or not tok.txt[0].isupper():
                    # Must be a word starting with an uppercase character
                    return None
                return stems(tok, {"ism"}, given_name=True)

            # Check for surnames
            def surnames(tok):
                """ Check for Icelandic patronym (category 'föð')
                    or matronym (category 'móð') """
                if tok.kind != TOK.WORD or not tok.txt[0].isupper():
                    # Must be a word starting with an uppercase character
                    return None
                return stems(tok, {"föð", "móð"})

            # Check for unknown surnames
            def unknown_surname(tok):
                """ Check for unknown (non-Icelandic) surnames """
                # Accept (most) upper case words as surnames
                if tok.kind != TOK.WORD:
                    return False
                if not tok.txt[0].isupper():
                    # Must start with a capital letter
                    return False
                if has_category(tok, {"föð", "móð"}):
                    # This is a known surname, not an unknown one
                    return False
                # Allow single-letter abbreviations, but not multi-letter
                # all-caps words (those are probably acronyms)
                return len(tok.txt) == 1 or not tok.txt.isupper()

            def given_names_or_middle_abbrev(tok):
                """ Check for given name or middle abbreviation """
                gnames = given_names(tok)
                if gnames is not None:
                    return gnames
                if tok.kind != TOK.WORD:
                    return None
                wrd = tok.txt
                if wrd.startswith('['):
                    # Abbreviation: cut off the brackets & trailing period, if present
                    if wrd.endswith('.]'):
                        wrd = wrd[1:-2]
                    else:
                        # This is probably a C. which had its period cut off
                        # as a sentence ending...
                        wrd = wrd[1:-1]
                if len(wrd) > 2 or not wrd[0].isupper():
                    if wrd not in {"van", "de", "den", "der", "el", "al"}:  # "of" was here
                        # Accept "Thomas de Broglie", "Ruud van Nistelroy"
                        return None
                # One or two letters, capitalized: accept as middle name abbrev,
                # all genders and cases possible
                return [PersonName(name=wrd, gender=None, case=None)]

            def compatible(pn, npn):
                """ Return True if the next PersonName (npn) is compatible
                    with the one we have (pn) """
                if npn.gender and (npn.gender != pn.gender):
                    return False
                if npn.case and (npn.case != pn.case):
                    return False
                return True

            if token.kind == TOK.WORD and token.val and token.val[0].fl == "nafn":
                # Convert a WORD with fl="nafn" to a PERSON with the correct
                # gender, in all cases
                gender = token.val[0].ordfl
                token = TOK.Person(token.txt,
                    [PersonName(token.txt, gender, case) for case in ALL_CASES])
                gn = None
            else:
def parse_phrases_1(token_stream):
    """ Parse numbers and amounts """
    with BIN_Db.get_db() as db:
        token = None
        try:
            # Maintain a one-token lookahead
            token = next(token_stream)
            while True:
                next_token = next(token_stream)

                # Logic for numbers that are partially or entirely
                # written out in words

                def number(tok):
                    """ If the token denotes a number, return that number - or None """
                    if tok.txt.lower() == "áttu":
                        # Do not accept 'áttu' (stem='átta', no kvk) as a number
                        return None
                    return match_stem_list(tok, MULTIPLIERS,
                        filter_func=lambda m: m.ordfl in NUMBER_CATEGORIES)

                # Check whether we have an initial number word
                multiplier = number(token) if token.kind == TOK.WORD else None

                # Check for [number] 'hundred|thousand|million|billion'
                while (token.kind == TOK.NUMBER or multiplier is not None) \
                        and next_token.kind == TOK.WORD:

                    multiplier_next = number(next_token)

                    def convert_to_num(token):
                        if multiplier is not None:
                            token = TOK.Number(token.txt, multiplier,
                                all_cases(token), all_genders(token))
                        return token

                    if multiplier_next is not None:
                        # Retain the case of the last multiplier, except
                        # if it is possessive (eignarfall) and the previous
                        # token had a case ('hundruðum milljarða' is dative,
                        # not possessive)
                        next_case = all_cases(next_token)
                        next_gender = all_genders(next_token)
                        if "ef" in next_case:
                            # We may have something like 'hundruðum milljarða':
                            # use the case and gender of 'hundruðum', not 'milljarða'
                            next_case = all_cases(token) or next_case
                            next_gender = all_genders(token) or next_gender
                        token = convert_to_num(token)
                        token = TOK.Number(token.txt + " " + next_token.txt,
                            token.val[0] * multiplier_next,
                            next_case, next_gender)
                        # Eat the multiplier token
                        next_token = next(token_stream)
                    elif next_token.txt in AMOUNT_ABBREV:
                        # Abbreviations for ISK amounts
                        # For abbreviations, we do not know the case, but we
                        # try to retain the previous case information, if any
                        token = convert_to_num(token)
                        token = TOK.Amount(token.txt + " " + next_token.txt, "ISK",
                            token.val[0] * AMOUNT_ABBREV[next_token.txt],  # Number
                            token.val[1], token.val[2])  # Cases and gender
                        next_token = next(token_stream)
                    else:
                        # Check for [number] 'percent'
                        percentage = match_stem_list(next_token, PERCENTAGES)
                        if percentage is not None:
                            token = convert_to_num(token)
                            token = TOK.Percent(token.txt + " " + next_token.txt,
                                token.val[0],
                                all_cases(next_token), all_genders(next_token))
                            # Eat the percentage token
                            next_token = next(token_stream)
                        else:
                            break

                    multiplier = None

                # Check for currency name doublets, for example
                # 'danish krona' or 'british pound'
                if token.kind == TOK.WORD and next_token.kind == TOK.WORD:
                    nat = match_stem_list(token, NATIONALITIES)
                    if nat is not None:
                        cur = match_stem_list(next_token, CURRENCIES)
                        if cur is not None:
                            if (nat, cur) in ISO_CURRENCIES:
                                # Match: accumulate the possible cases
                                iso_code = ISO_CURRENCIES[(nat, cur)]
                                # Filter the possible cases by considering
                                # adjectives having a strong declension
                                # (indefinite form) only
                                token = TOK.Currency(
                                    token.txt + " " + next_token.txt,
                                    iso_code,
                                    all_common_cases(token, next_token,
                                        lambda m: (m.ordfl == "lo" and "SB" in m.beyging)),
                                    [CURRENCY_GENDERS[cur]])
                                next_token = next(token_stream)

                # Check for composites:
                # 'stjórnskipunar- og eftirlitsnefnd'
                # 'viðskipta- og iðnaðarráðherra'
                # 'marg-ítrekaðri'
                if token.kind == TOK.WORD and \
                        next_token.kind == TOK.PUNCTUATION and \
                        next_token.txt == COMPOSITE_HYPHEN:
                    og_token = next(token_stream)
                    if og_token.kind != TOK.WORD or \
                            (og_token.txt != "og" and og_token.txt != "eða"):
                        # Incorrect prediction: make amends and continue
                        handled = False
                        if og_token.kind == TOK.WORD:
                            composite = token.txt + "-" + og_token.txt
                            if token.txt.lower() in ADJECTIVE_PREFIXES:
                                # hálf-opinberri, marg-ítrekaðri
                                token = TOK.Word(composite,
                                    [m for m in og_token.val
                                        if m.ordfl == "lo" or m.ordfl == "ao"])
                                next_token = next(token_stream)
                                handled = True
                            else:
                                # Check for Vestur-Þýskaland, Suður-Múlasýsla
                                # (which are in BÍN in their entirety)
                                m = db.meanings(composite)
                                if m:
                                    # Found the composite in BÍN:
                                    # return it as a single token
                                    token = TOK.Word(composite, m)
                                    next_token = next(token_stream)
                                    handled = True
                        if not handled:
                            yield token
                            # Put a normal hyphen instead of the composite one
                            token = TOK.Punctuation(HYPHEN)
                            next_token = og_token
                    else:
                        # We have 'viðskipta- og'
                        final_token = next(token_stream)
                        if final_token.kind != TOK.WORD:
                            # Incorrect: unwind
                            yield token
                            yield TOK.Punctuation(HYPHEN)  # Normal hyphen
                            token = og_token
                            next_token = final_token
                        else:
                            # We have 'viðskipta- og iðnaðarráðherra'.
                            # Return a single token with the meanings of
                            # the last word, but an amalgamated token text.
                            # Note: there is no meaning check for the first
                            # part of the composition, so it can be an unknown word.
                            txt = token.txt + "- " + og_token.txt + \
                                " " + final_token.txt
                            token = TOK.Word(txt, final_token.val)
                            next_token = next(token_stream)

                # Yield the current token and advance to the lookahead
                yield token
                token = next_token

        except StopIteration:
            pass

        # Final token (previous lookahead)
        if token:
            yield token
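# A sketch of how these generator passes can be chained into one pipeline.
# The composition below is an assumption inferred from how the stages hand
# tokens to each other (static phrases assign meanings that annotate() then
# passes through); the actual order in the original module may differ.
def phrase_pipeline(text, auto_uppercase=False):
    """ Compose the tokenizer passes into a single token generator """
    stream = tokenize(text)
    stream = parse_static_phrases(stream, auto_uppercase)
    stream = annotate(stream, auto_uppercase)
    stream = parse_phrases_1(stream)
    stream = parse_phrases_2(stream)
    return disambiguate_phrases(stream)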
                            found_name = True
                            break
                # If this is not a "strong" name, backtrack from recognizing it.
                # A "weak" name is (1) at the start of a sentence; (2) only one
                # word; (3) that word has a meaning that is not a name;
                # (4) the name has not been seen in a full form before;
                # (5) not on a 'well known name' list.
                weak = at_sentence_start and (' ' not in w) and not patronym and \
                    not found_name and (has_other_meaning(token, "ism") and
                        w not in NamePreferences.SET)
                if not weak:
                    # Return a person token with the accumulated name
                    # and the intersected set of possible cases
                    token = TOK.Person(w, gn)
            # Yield the current token and advance to the lookahead
            yield token
            if token.kind == TOK.S_BEGIN or \
                    (token.kind == TOK.PUNCTUATION and token.txt == ':'):
                at_sentence_start = True
            elif token.kind != TOK.PUNCTUATION and token.kind != TOK.ORDINAL:
                at_sentence_start = False
            token = next_token
    except StopIteration:
        pass
    # Final token (previous lookahead)
    if token:
        yield token
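# The compatible() check used when accumulating person names, in isolation:
# PersonName fields with a None gender or case act as wildcards (e.g. for
# middle-name abbreviations) and never veto a match. This is a self-contained
# re-statement for illustration only.
from collections import namedtuple

PN = namedtuple("PN", ["name", "gender", "case"])

def pn_compatible(pn, npn):
    """ True if npn does not contradict pn in gender or case """
    if npn.gender and npn.gender != pn.gender:
        return False
    if npn.case and npn.case != pn.case:
        return False
    return True

assert pn_compatible(PN("Jón", "kk", "nf"), PN("J", None, None))       # wildcards match
assert not pn_compatible(PN("Jón", "kk", "nf"), PN("Anna", "kvk", "nf"))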
def analyze():
    """ Find word categories in the submitted text """
    txt = request.form.get("txt", "").strip()
    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        rdc = Reducer(bp.grammar)
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                            # Mark the token list with the identified word categories
                            mark_categories(forest, toklist, sent_begin + 1)
                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}"
                        .format(slen, num,
                            "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(
                        num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)
    result = dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
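# A hedged example of exercising this handler from a client. The /analyze
# route path and port are assumptions; adjust them to the actual Flask routing.
import requests

resp = requests.post("http://localhost:5000/analyze",
    data={"txt": "Hún keypti bókina."})
result = resp.json()["result"]
print(result["num_sent"], result["num_parsed_sent"], result["avg_ambig_factor"])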
def analyze():
    """ Analyze text from a given URL """
    url = request.form.get("url", "").strip()
    t0 = time.time()
    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        toklist = list(process_url(url))
    else:
        # Tokenize the text entered as-is and return the token list
        toklist = list(tokenize(url))
    tok_time = time.time() - t0
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    bp = BIN_Parser()
    t0 = time.time()
    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            # Parse the accumulated sentence
            err_index = None
            try:
                forest = bp.go(sent)
            except ParseError as e:
                forest = None
                # Obtain the index of the offending token
                err_index = e.token_index
            num = 0 if forest is None else Parser.num_combinations(forest)
            print("Parsed sentence of length {0} with {1} combinations{2}"
                .format(slen, num,
                    "\n" + " ".join(s[1] for s in sent) if num >= 100 else ""))
            if num > 0:
                num_parsed_sent += 1
                # Calculate the 'ambiguity factor'
                ambig_factor = num ** (1 / slen)
                # Do a weighted average on sentence length
                total_ambig += ambig_factor * slen
                total_tokens += slen
            # Mark the sentence beginning with the number of parses
            # and the index of the offending token, if an error occurred
            toklist[sent_begin] = TOK.Begin_Sentence(
                num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
    parse_time = time.time() - t0
    result = dict(
        tokens=toklist,
        tok_time=tok_time,
        tok_num=len(toklist),
        parse_time=parse_time,
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # Dump the tokens to a text file for inspection
    # dump_tokens_to_file("txt", toklist)
    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)