Пример #1
0
from typing import Generator

from lexnlp.extract.en.amounts import get_amounts, NUM_PTN

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2017, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.1.4"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

# Verbose-mode regex source for ratio expressions written as
# "<number> to <number>" or "<number>:<number>".
#
# Capture groups: the whole ratio text, the first operand and the second
# operand.
#
# Both operands are built from NUM_PTN with the optional "no/100" or
# "NN/100" fragment removed; the first operand also has the ``(?:\W|$)``
# fragment removed so that the separator can follow the number directly.
#
# The closing negative lookahead discards candidates that are followed by a
# clock suffix ("am", "pm", "a.m", "p.m"), i.e. times such as "10:30 a.m.".
# The period is escaped on purpose: a bare ``.`` matches any character
# (including a newline under re.DOTALL), which would also discard ratios
# followed by unrelated words such as "aim" or "arm".
RATIO_PTN = r"""
(({num_ptn_1})\s*
(?:to|\:)\s*
({num_ptn_2}))(?!\s*[ap]\.?m(?:\W|$))
""".format(num_ptn_1=NUM_PTN.replace('(?:(?:no|\\d{1,2})/100)?',
                                     '').replace('(?:\\W|$)', ''),
           num_ptn_2=NUM_PTN.replace('(?:(?:no|\\d{1,2})/100)?', ''))

# Compiled form of RATIO_PTN. re.VERBOSE is required because the template
# above is laid out over several lines.
RATIO_PTN_RE = re.compile(
    RATIO_PTN, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)


def get_ratios(text, return_sources=False, float_digits=4) -> Generator:
    for source_text, ratio_1_text, ratio_2_text in RATIO_PTN_RE.findall(
            text.lower()):
        amount_1 = list(get_amounts(ratio_1_text, float_digits=float_digits))
        amount_2 = list(get_amounts(ratio_2_text, float_digits=float_digits))
        if len(amount_1) != 1 or len(amount_2) != 1:
            continue
        amount_1 = amount_1[0]
        amount_2 = amount_2[0]
        if amount_1 == 0 or amount_2 == 0:
Пример #2
0
# Unit abbreviations accepted after a number, mapped to the canonical unit
# name.
DISTANCE_SYMBOL_MAP = dict(
    km="kilometer",
    mi="mile",
)

# Spelled-out unit words (plural and singular), mapped to the canonical unit
# name.
DISTANCE_TOKEN_MAP = dict(
    kilometers="kilometer",
    kilometer="kilometer",
    miles="mile",
    mile="mile",
)

# Verbose-mode regex source: a number, optional whitespace, then one of the
# unit words or abbreviations above, followed by a non-word character or the
# end of the input.  The number part is NUM_PTN with its ``(?:\W|$)`` and
# ``(?<=\W|^)`` fragments removed.
DISTANCE_PTN = r"""
(({num_ptn})\s*
({distance_tokens}|{distance_symbols}))(?:\W|$)
""".format(
    distance_tokens='|'.join(DISTANCE_TOKEN_MAP.keys()),
    distance_symbols='|'.join(DISTANCE_SYMBOL_MAP.keys()),
    num_ptn=NUM_PTN.replace('(?:\\W|$)', '').replace('(?<=\\W|^)', ''),
)

# Compiled form of DISTANCE_PTN; re.VERBOSE is needed for the multi-line
# layout of the template.
DISTANCE_PTN_RE = re.compile(
    DISTANCE_PTN,
    re.VERBOSE | re.DOTALL | re.MULTILINE | re.IGNORECASE,
)


def get_distances(
    text: str,
    return_sources: bool = False,
    float_digits: int = 4
) -> Generator[Union[Tuple[Decimal, str], Tuple[Decimal, str, str]], None,
               None]:
    for ant in get_distance_annotations(text, float_digits):
        if return_sources:
            yield ant.amount, ant.distance_type, ant.text
Пример #3
0
# Spelled-out currency names mapped to three-letter currency codes.  The
# OrderedDict keeps the listed order, in which each longer phrase precedes
# its shorter form (e.g. "chinese yuans" before "chinese yuan").
CURRENCY_TOKEN_MAP = OrderedDict([
    ('chinese yuans', 'CNY'),
    ('chinese yuan', 'CNY'),
    ('dollars', 'USD'),
    ('dollar', 'USD'),
    ('euros', 'EUR'),
    ('euro', 'EUR'),
    ('pounds', 'GBP'),
    ('pound', 'GBP'),
    ('renminbi', 'CNY'),
    ('yens', 'JPY'),
    ('yen', 'JPY'),
    ('yuans', 'CNY'),
    ('yuan', 'CNY'),
])

# Every value found in the symbol, token and prefix maps.  Despite the
# name this is a set, so duplicates collapse and iteration order is not
# defined.
CURRENCY_ABBR_LIST = {
    *CURRENCY_SYMBOL_MAP.values(),
    *CURRENCY_TOKEN_MAP.values(),
    *CURRENCY_PREFIX_MAP.values(),
}

# Keys of the prefix map together with the values of the symbol map.
CURRENCY_PREFIXES = {
    *CURRENCY_PREFIX_MAP.keys(),
    *CURRENCY_SYMBOL_MAP.values(),
}

# NUM_PTN with its ``(?<=\W|^)`` fragment removed.
CURR_NUM_PTN = NUM_PTN.replace('(?<=\\W|^)', '')

CURRENCY_PTN = r"""
(?P<text>
(?P<prefix>{currency_prefixes}|[{currency_symbols}])\s*
(?P<amount>{num_ptn_1})
|
(?P<amount>{num_ptn_2})\s*
(?P<postfix>{currency_tokens}|{currency_abbreviations})(?:\W|$))
""".format(
    num_ptn_1=CURR_NUM_PTN,
    num_ptn_2=CURR_NUM_PTN,
    currency_prefixes='|'.join(CURRENCY_PREFIXES),
    currency_symbols=''.join([re.escape(i) for i in CURRENCY_SYMBOL_MAP]),
    currency_tokens='|'.join(
        [i.replace(' ', '\\s+') for i in CURRENCY_TOKEN_MAP]),