예제 #1
0
 def convert_to_num(token):
     """ Return `token` as a TOK.Number if a multiplier is pending.

     Reads `multiplier` from the enclosing scope. When it is None the
     token is handed back untouched; otherwise a new TOK.Number is built
     from the token text, the multiplier value and the cases and genders
     reported for the token by all_cases() and all_genders().
     """
     if multiplier is None:
         # Nothing to convert: pass the token through as-is
         return token
     return TOK.Number(
         token.txt, multiplier, all_cases(token), all_genders(token))
예제 #2
0
def parse_phrases_1(token_stream):
    """ Parse numbers and amounts

    Generator that reads tokens from `token_stream` while keeping a
    one-token lookahead, and merges adjacent tokens into a single token
    where they form one of the following phrases:

    * [number] followed by multiplier words
      ('hundred|thousand|million|billion') -> TOK.Number
    * [number] followed by an abbreviation found in AMOUNT_ABBREV
      -> TOK.Amount denominated in "ISK"
    * [number] followed by a word matching PERCENTAGES -> TOK.Percent
    * a nationality word followed by a currency word, where the pair
      is listed in ISO_CURRENCIES -> TOK.Currency
    * composites joined by COMPOSITE_HYPHEN, such as
      'viðskipta- og iðnaðarráðherra' or 'marg-ítrekaðri' -> TOK.Word

    Tokens that are not part of such a phrase are passed through as they
    are. If the stream ends in the middle of a tentative composite, the
    tokens already consumed are emitted (with a normal hyphen in place of
    the composite one) instead of being dropped.

    Args:
        token_stream: iterator of tokens exposing `kind`, `txt` and
            `val` attributes.

    Yields:
        The resulting tokens, in stream order.
    """

    # The two helpers below are defined once, not once per loop iteration

    def number(tok):
        """ If the token denotes a number, return that number - or None """
        if tok.txt.lower() == "áttu":
            # Do not accept 'áttu' (stem='átta', no kvk) as a number
            return None
        return match_stem_list(
            tok,
            MULTIPLIERS,
            filter_func=lambda m: m.ordfl in NUMBER_CATEGORIES)

    def convert_to_num(tok):
        """ Convert the token to a TOK.Number if a multiplier is pending.

        `multiplier` is read from the enclosing generator at call time,
        so this always sees the value computed for the current token. """
        if multiplier is not None:
            tok = TOK.Number(tok.txt, multiplier,
                             all_cases(tok),
                             all_genders(tok))
        return tok

    with BIN_Db.get_db() as db:

        token = None
        try:

            # Maintain a one-token lookahead
            token = next(token_stream)
            while True:
                next_token = next(token_stream)

                # Logic for numbers that are partially or entirely
                # written out in words

                # Check whether we have an initial number word
                multiplier = number(token) if token.kind == TOK.WORD else None

                # Check for [number] 'hundred|thousand|million|billion'
                while (token.kind == TOK.NUMBER or multiplier is not None) \
                    and next_token.kind == TOK.WORD:

                    multiplier_next = number(next_token)

                    if multiplier_next is not None:
                        # Retain the case of the last multiplier, except
                        # if it is possessive (eignarfall) and the previous
                        # token had a case ('hundruðum milljarða' is dative,
                        # not possessive)
                        next_case = all_cases(next_token)
                        next_gender = all_genders(next_token)
                        if "ef" in next_case:
                            # We may have something like 'hundruðum milljarða':
                            # use the case and gender of 'hundruðum', not 'milljarða'
                            next_case = all_cases(token) or next_case
                            next_gender = all_genders(token) or next_gender
                        token = convert_to_num(token)
                        token = TOK.Number(token.txt + " " + next_token.txt,
                                           token.val[0] * multiplier_next,
                                           next_case, next_gender)
                        # Eat the multiplier token
                        next_token = next(token_stream)
                    elif next_token.txt in AMOUNT_ABBREV:
                        # Abbreviations for ISK amounts
                        # For abbreviations, we do not know the case,
                        # but we try to retain the previous case information if any
                        token = convert_to_num(token)
                        token = TOK.Amount(
                            token.txt + " " + next_token.txt,
                            "ISK",
                            token.val[0] *
                            AMOUNT_ABBREV[next_token.txt],  # Number
                            token.val[1],
                            token.val[2])  # Cases and gender
                        # Eat the abbreviation token
                        next_token = next(token_stream)
                    else:
                        # Check for [number] 'percent'
                        percentage = match_stem_list(next_token, PERCENTAGES)
                        if percentage is not None:
                            token = convert_to_num(token)
                            token = TOK.Percent(
                                token.txt + " " + next_token.txt, token.val[0],
                                all_cases(next_token), all_genders(next_token))
                            # Eat the percentage token
                            next_token = next(token_stream)
                        else:
                            break

                    # The pending multiplier has been folded into the token
                    multiplier = None

                # Check for currency name doublets, for example
                # 'danish krona' or 'british pound'
                if token.kind == TOK.WORD and next_token.kind == TOK.WORD:
                    nat = match_stem_list(token, NATIONALITIES)
                    if nat is not None:
                        cur = match_stem_list(next_token, CURRENCIES)
                        if cur is not None:
                            if (nat, cur) in ISO_CURRENCIES:
                                # Match: accumulate the possible cases
                                iso_code = ISO_CURRENCIES[(nat, cur)]
                                # Filter the possible cases by considering adjectives
                                # having a strong declination (indefinite form) only
                                token = TOK.Currency(
                                    token.txt + " " + next_token.txt, iso_code,
                                    all_common_cases(
                                        token, next_token, lambda m:
                                        (m.ordfl == "lo" and "SB" in m.beyging
                                         )), [CURRENCY_GENDERS[cur]])
                                next_token = next(token_stream)

                # Check for composites:
                # 'stjórnskipunar- og eftirlitsnefnd'
                # 'viðskipta- og iðnaðarráðherra'
                # 'marg-ítrekaðri'
                if token.kind == TOK.WORD and \
                    next_token.kind == TOK.PUNCTUATION and next_token.txt == COMPOSITE_HYPHEN:

                    og_token = next(token_stream, None)
                    if og_token is None:
                        # The stream ended right after the hyphen: emit the
                        # word and keep a normal hyphen as the final token,
                        # so that the hyphen is not silently lost
                        yield token
                        token = TOK.Punctuation(HYPHEN)
                        break
                    if og_token.kind != TOK.WORD or (og_token.txt != "og" and
                                                     og_token.txt != "eða"):
                        # Incorrect prediction: make amends and continue
                        handled = False
                        if og_token.kind == TOK.WORD:
                            composite = token.txt + "-" + og_token.txt
                            if token.txt.lower() in ADJECTIVE_PREFIXES:
                                # hálf-opinberri, marg-ítrekaðri
                                token = TOK.Word(composite, [
                                    m for m in og_token.val
                                    if m.ordfl == "lo" or m.ordfl == "ao"
                                ])
                                next_token = next(token_stream)
                                handled = True
                            else:
                                # Check for Vestur-Þýskaland, Suður-Múlasýsla (which are in BÍN in their entirety)
                                m = db.meanings(composite)
                                if m:
                                    # Found composite in BÍN: return it as a single token
                                    token = TOK.Word(composite, m)
                                    next_token = next(token_stream)
                                    handled = True
                        if not handled:
                            yield token
                            # Put a normal hyphen instead of the composite one
                            token = TOK.Punctuation(HYPHEN)
                            next_token = og_token
                    else:
                        # We have 'viðskipta- og'
                        final_token = next(token_stream, None)
                        if final_token is None:
                            # The stream ended after 'viðskipta- og': unwind
                            # so that the hyphen and the conjunction are
                            # emitted instead of being dropped
                            yield token
                            yield TOK.Punctuation(HYPHEN)  # Normal hyphen
                            token = og_token
                            break
                        if final_token.kind != TOK.WORD:
                            # Incorrect: unwind
                            yield token
                            yield TOK.Punctuation(HYPHEN)  # Normal hyphen
                            token = og_token
                            next_token = final_token
                        else:
                            # We have 'viðskipta- og iðnaðarráðherra'
                            # Return a single token with the meanings of
                            # the last word, but an amalgamated token text.
                            # Note: there is no meaning check for the first
                            # part of the composition, so it can be an unknown word.
                            txt = token.txt + "- " + og_token.txt + \
                                " " + final_token.txt
                            token = TOK.Word(txt, final_token.val)
                            next_token = next(token_stream)

                # Yield the current token and advance to the lookahead
                yield token
                token = next_token

        except StopIteration:
            # The token stream is exhausted; the token held in `token`
            # (if any) is emitted below
            pass

        # Final token (previous lookahead)
        if token is not None:
            yield token