def convert_to_num(token):
    """Return ``token`` as a ``TOK.Number`` if a multiplier is in effect.

    When ``multiplier`` is ``None`` the token is returned unchanged.
    Otherwise a new Number token is built from the token's text, the
    multiplier value and the token's cases and genders.

    NOTE(review): ``multiplier`` is neither a parameter nor a local of this
    function; it has to be provided by the surrounding scope - confirm
    where it is defined before calling this helper on its own.
    """
    if multiplier is None:
        # Nothing to convert: hand the token back as it came in
        return token
    return TOK.Number(
        token.txt, multiplier, all_cases(token), all_genders(token)
    )
def parse_phrases_1(token_stream):
    """Parse numbers, amounts, currency names and composite words.

    Generator that walks ``token_stream`` with a one-token lookahead and
    yields tokens, merging the following patterns into single tokens:

    * a number (or number word) followed by number words found in
      ``MULTIPLIERS``, e.g. [number] 'hundred|thousand|million|billion'
      -> ``TOK.Number``
    * a number followed by a key of ``AMOUNT_ABBREV`` -> ``TOK.Amount``
      with the currency code "ISK"
    * a number followed by a word found in ``PERCENTAGES``
      -> ``TOK.Percent``
    * a nationality word followed by a currency word, when the pair is
      listed in ``ISO_CURRENCIES`` -> ``TOK.Currency``
    * composites such as 'viðskipta- og iðnaðarráðherra' and
      'marg-ítrekaðri' -> ``TOK.Word``

    Tokens that match none of these patterns are yielded unchanged.

    If the stream ends in the middle of a composite prediction (right
    after the composite hyphen, or right after the 'og'/'eða' that follows
    it), the tokens that were already consumed are emitted individually,
    with a normal hyphen, instead of being dropped.

    Args:
        token_stream: an iterator of tokens; it is advanced with ``next()``.

    Yields:
        Tokens in stream order, with matched sequences merged.
    """

    # The two helpers below do not depend on the loop state, so they are
    # defined once instead of on every iteration.

    def number(tok):
        """ If the token denotes a number, return that number - or None """
        if tok.txt.lower() == "áttu":
            # Do not accept 'áttu' (stem='átta', no kvk) as a number
            return None
        return match_stem_list(
            tok, MULTIPLIERS,
            filter_func=lambda m: m.ordfl in NUMBER_CATEGORIES
        )

    def convert_to_num(tok, multiplier):
        """ Turn a number word into a Number token; pass other tokens through """
        if multiplier is None:
            return tok
        return TOK.Number(tok.txt, multiplier, all_cases(tok), all_genders(tok))

    with BIN_Db.get_db() as db:

        token = None
        try:
            # Maintain a one-token lookahead
            token = next(token_stream)
            while True:
                next_token = next(token_stream)

                # Logic for numbers that are partially or entirely
                # written out in words

                # Check whether we have an initial number word
                multiplier = number(token) if token.kind == TOK.WORD else None

                # Check for [number] 'hundred|thousand|million|billion'
                while (
                    (token.kind == TOK.NUMBER or multiplier is not None)
                    and next_token.kind == TOK.WORD
                ):
                    multiplier_next = number(next_token)

                    if multiplier_next is not None:
                        # Retain the case of the last multiplier, except
                        # if it is possessive (eignarfall) and the previous
                        # token had a case ('hundruðum milljarða' is dative,
                        # not possessive)
                        next_case = all_cases(next_token)
                        next_gender = all_genders(next_token)
                        if "ef" in next_case:
                            # We may have something like 'hundruðum milljarða':
                            # use the case and gender of 'hundruðum', not 'milljarða'
                            next_case = all_cases(token) or next_case
                            next_gender = all_genders(token) or next_gender
                        token = convert_to_num(token, multiplier)
                        token = TOK.Number(
                            token.txt + " " + next_token.txt,
                            token.val[0] * multiplier_next,
                            next_case, next_gender
                        )
                        # Eat the multiplier token
                        next_token = next(token_stream)
                    elif next_token.txt in AMOUNT_ABBREV:
                        # Abbreviations for ISK amounts
                        # For abbreviations, we do not know the case,
                        # but we try to retain the previous case information if any
                        token = convert_to_num(token, multiplier)
                        token = TOK.Amount(
                            token.txt + " " + next_token.txt, "ISK",
                            token.val[0] * AMOUNT_ABBREV[next_token.txt],  # Number
                            token.val[1], token.val[2]  # Cases and gender
                        )
                        next_token = next(token_stream)
                    else:
                        # Check for [number] 'percent'
                        percentage = match_stem_list(next_token, PERCENTAGES)
                        if percentage is None:
                            break
                        token = convert_to_num(token, multiplier)
                        token = TOK.Percent(
                            token.txt + " " + next_token.txt, token.val[0],
                            all_cases(next_token), all_genders(next_token)
                        )
                        # Eat the percentage token
                        next_token = next(token_stream)

                    # The initial number word (if any) has been folded into
                    # the token by now
                    multiplier = None

                # Check for currency name doublets, for example
                # 'danish krona' or 'british pound'
                if token.kind == TOK.WORD and next_token.kind == TOK.WORD:
                    nat = match_stem_list(token, NATIONALITIES)
                    if nat is not None:
                        cur = match_stem_list(next_token, CURRENCIES)
                        if cur is not None:
                            if (nat, cur) in ISO_CURRENCIES:
                                # Match: accumulate the possible cases
                                iso_code = ISO_CURRENCIES[(nat, cur)]
                                # Filter the possible cases by considering adjectives
                                # having a strong declination (indefinite form) only
                                token = TOK.Currency(
                                    token.txt + " " + next_token.txt,
                                    iso_code,
                                    all_common_cases(
                                        token, next_token,
                                        lambda m: (m.ordfl == "lo" and "SB" in m.beyging)
                                    ),
                                    [CURRENCY_GENDERS[cur]]
                                )
                                next_token = next(token_stream)

                # Check for composites:
                # 'stjórnskipunar- og eftirlitsnefnd'
                # 'viðskipta- og iðnaðarráðherra'
                # 'marg-ítrekaðri'
                if (
                    token.kind == TOK.WORD
                    and next_token.kind == TOK.PUNCTUATION
                    and next_token.txt == COMPOSITE_HYPHEN
                ):
                    # Use a default here: a bare next() would raise
                    # StopIteration and the hyphen held in next_token
                    # would never be emitted
                    og_token = next(token_stream, None)
                    if og_token is None:
                        # The stream ended right after the hyphen: emit the
                        # word and leave a normal hyphen as the final token
                        yield token
                        token = TOK.Punctuation(HYPHEN)
                        break

                    if og_token.kind != TOK.WORD or og_token.txt not in ("og", "eða"):
                        # Incorrect prediction: make amends and continue
                        handled = False
                        if og_token.kind == TOK.WORD:
                            composite = token.txt + "-" + og_token.txt
                            if token.txt.lower() in ADJECTIVE_PREFIXES:
                                # hálf-opinberri, marg-ítrekaðri
                                token = TOK.Word(
                                    composite,
                                    [m for m in og_token.val if m.ordfl in ("lo", "ao")]
                                )
                                next_token = next(token_stream)
                                handled = True
                            else:
                                # Check for Vestur-Þýskaland, Suður-Múlasýsla
                                # (which are in BÍN in their entirety)
                                m = db.meanings(composite)
                                if m:
                                    # Found composite in BÍN: return it as a single token
                                    token = TOK.Word(composite, m)
                                    next_token = next(token_stream)
                                    handled = True
                        if not handled:
                            yield token
                            # Put a normal hyphen instead of the composite one
                            token = TOK.Punctuation(HYPHEN)
                            next_token = og_token
                    else:
                        # We have 'viðskipta- og'
                        final_token = next(token_stream, None)
                        if final_token is None:
                            # The stream ended after the conjunction: unwind,
                            # emitting the word and a normal hyphen, and leave
                            # the conjunction as the final token
                            yield token
                            yield TOK.Punctuation(HYPHEN)
                            token = og_token
                            break
                        if final_token.kind != TOK.WORD:
                            # Incorrect: unwind
                            yield token
                            yield TOK.Punctuation(HYPHEN)  # Normal hyphen
                            token = og_token
                            next_token = final_token
                        else:
                            # We have 'viðskipta- og iðnaðarráðherra'
                            # Return a single token with the meanings of
                            # the last word, but an amalgamated token text.
                            # Note: there is no meaning check for the first
                            # part of the composition, so it can be an unknown word.
                            txt = token.txt + "- " + og_token.txt + " " + final_token.txt
                            token = TOK.Word(txt, final_token.val)
                            next_token = next(token_stream)

                # Yield the current token and advance to the lookahead
                yield token
                token = next_token

        except StopIteration:
            # The stream is exhausted; whatever is held in 'token' is
            # emitted below
            pass

        # Final token (previous lookahead)
        if token is not None:
            yield token