def en_parsers_speed(self):
    """Run every English extractor over one long text via ``check_time``.

    Reads ``long_parsed_text.txt`` from ``lexnlp_test_path``, loads the
    geo-entity dictionary from the test CSV files, then hands each
    extractor (wrapped so that its result is materialized with ``list``)
    to ``self.check_time`` together with a label and the shared ``times``
    dict. Finally asserts that ``'get_amounts'`` is a key of ``times``.
    """
    file_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
    with codecs.open(file_path, 'r', encoding='utf-8') as fr:
        text = fr.read()

    ge_path = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
    entities_fn = ge_path + 'geoentities.csv'
    aliases_fn = ge_path + 'geoaliases.csv'
    geo_config = list(DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))

    def materialized(extractor):
        # Force full evaluation of the extractor's result inside the timed call.
        return lambda s: list(extractor(s))

    # (label, extractor) pairs, in the order they are handed to check_time.
    timed_extractors = (
        ('get_amounts', get_amounts),
        ('get_acts', get_acts),
        ('get_citations', get_citations),
        ('get_conditions', get_conditions),
        ('get_constraints', get_constraints),
        ('get_copyright', get_copyright),
        ('get_courts', _get_courts),
        ('get_cusip', get_cusip),
        ('get_dates', get_dates),
        ('get_definitions', get_definitions),
        ('get_distances', get_distances),
        ('get_durations', get_durations),
        # The geo-entity extractor additionally needs the loaded dictionary.
        ('get_geoentities', lambda s: get_geoentities(s, geo_config)),
        ('get_money', get_money),
        ('get_percents', get_percents),
        ('get_pii', get_pii),
        ('get_ratios', get_ratios),
        ('get_regulations', get_regulations),
        ('get_trademarks', get_trademarks),
        ('get_urls', get_urls),
    )

    times = {}  # type: Dict[str, float]
    for label, extractor in timed_extractors:
        self.check_time(text, materialized(extractor), label, times)

    self.assertTrue('get_amounts' in times)
def get_money_annotations(text: str, float_digits=4) \
        -> Generator[MoneyAnnotation, None, None]:
    """Yield a ``MoneyAnnotation`` for each usable ``CURRENCY_PTN_RE`` match.

    A match is skipped when it captured no prefix, no postfix and no
    trigger word, or when ``get_amounts`` does not return exactly one
    value for the captured amount text.

    :param text: text to scan
    :param float_digits: forwarded to ``get_amounts``
    :return: generator of ``MoneyAnnotation`` objects
    """
    # Characters trimmed from the matched text; '$' is deliberately kept.
    strip_chars = string.punctuation.replace('$', '') + string.whitespace

    for match in CURRENCY_PTN_RE.finditer(text):
        groups = match.capturesdict()
        if not (groups['prefix'] or groups['postfix'] or groups['trigger_word']):
            continue

        prefixes = groups['prefix']
        postfixes = groups['postfix']

        parsed = list(
            get_amounts(groups['amount'][0], float_digits=float_digits))
        if len(parsed) != 1:
            continue

        currency_type = None
        if prefixes:
            # Symbol map first, then prefix map, else the upper-cased prefix.
            symbol = prefixes[0].lower()
            currency_type = (CURRENCY_SYMBOL_MAP.get(symbol)
                             or CURRENCY_PREFIX_MAP.get(symbol)
                             or symbol.upper())
        elif postfixes:
            # Unknown tokens fall back to the captured postfix, upper-cased.
            token = postfixes[0].lower()
            currency_type = CURRENCY_TOKEN_MAP.get(token) or postfixes[0].upper()

        # Matches without a resolvable currency get the default one.
        currency_type = currency_type or DEFAULT_CURRENCY

        yield MoneyAnnotation(coords=match.span(),
                              amount=parsed[0],
                              text=groups['text'][0].strip(strip_chars),
                              currency=currency_type)
def get_money(text, return_sources=False, float_digits=4) -> Generator:
    """Yield ``(amount, currency)`` tuples for ``CURRENCY_PTN_RE`` matches.

    Matches that captured neither a prefix nor a postfix are skipped, as
    are matches whose amount text does not yield exactly one value from
    ``get_amounts``.

    :param text: text to scan
    :param return_sources: when true, append the trimmed matched text as a
        third tuple element
    :param float_digits: forwarded to ``get_amounts``
    :return: generator of tuples
    """
    for match in CURRENCY_PTN_RE.finditer(text):
        groups = match.capturesdict()
        prefixes, postfixes = groups['prefix'], groups['postfix']
        if not prefixes and not postfixes:
            continue

        parsed = list(
            get_amounts(groups['amount'][0], float_digits=float_digits))
        if len(parsed) != 1:
            continue

        if prefixes:
            # Symbol map first, then prefix map, else the upper-cased prefix.
            symbol = prefixes[0].lower()
            currency_type = (CURRENCY_SYMBOL_MAP.get(symbol)
                             or CURRENCY_PREFIX_MAP.get(symbol)
                             or symbol.upper())
        else:
            # Unknown tokens fall back to the captured postfix, upper-cased.
            token = postfixes[0].lower()
            currency_type = CURRENCY_TOKEN_MAP.get(token) or postfixes[0].upper()

        if return_sources:
            # Trim punctuation (except '$') and whitespace from the source text.
            source = groups['text'][0].strip(
                string.punctuation.replace('$', '') + string.whitespace)
            yield (parsed[0], currency_type, source)
        else:
            yield (parsed[0], currency_type)
def _extract_variants_from_text(self, field, text: str, **kwargs):
    """Return the numeric amounts found in ``text``, or ``None`` if there are none.

    Only ``float``/``int`` values returned by ``get_amounts`` are kept; a
    value equal to its integer truncation is returned as an ``int``.
    """
    found = get_amounts(text, return_sources=False)
    if not found:
        return None

    variants = []
    for value in found:
        if not isinstance(value, (float, int)):
            continue
        whole = int(value)
        # Collapse values such as 3.0 to the plain integer 3.
        variants.append(whole if whole == value else value)

    return variants or None
def get_all_annotations(cls, text: str, float_digits=4) \
        -> List[DurationAnnotation]:
    """Collect a ``DurationAnnotation`` for each ``DURATION_PTN_RE`` match.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for its number part.

    :param text: text to scan
    :param float_digits: forwarded to ``get_amounts``; when truthy the
        amount is also rounded to that many digits
    :return: list of annotations in match order
    """
    annotations = []
    for match in cls.DURATION_PTN_RE.finditer(text.lower()):
        source_text, number_text, unit = match.groups()

        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue

        quantity = parsed[0]
        if float_digits:
            quantity = round(quantity, float_digits)

        # The map is keyed by the unit exactly as it was matched.
        days = cls.DURATION_MAP[unit] * quantity

        # Report the singular form for the one plural unit handled here.
        label = 'anniversary' if unit == 'anniversaries' else unit

        annotations.append(
            DurationAnnotation(coords=match.span(),
                               amount=quantity,
                               duration_type=label,
                               duration_days=days,
                               text=source_text.strip()))
    return annotations
def get_all_annotations(
        cls,
        text: str,
        float_digits: int = 4,
) -> List[DurationAnnotation]:
    """Collect a ``DurationAnnotation`` for each ``DURATION_PTN_RE`` match.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for its number part. The
    length in days is the amount multiplied by the ``Fraction`` stored in
    ``DURATION_MAP`` for the matched unit, wrapped in ``Decimal``.

    :param text: text to scan
    :param float_digits: forwarded to ``get_amounts``; when truthy the day
        count is passed through ``quantize_by_float_digit``
    :return: list of annotations in match order
    """
    annotations: List[DurationAnnotation] = []
    for match in cls.DURATION_PTN_RE.finditer(text.lower()):
        source_text, number_text, unit = match.groups()

        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue
        quantity = parsed[0]

        days_per_unit: Fraction = cls.DURATION_MAP[unit]
        # Multiply by the numerator first, then divide by the denominator.
        days: Decimal = Decimal(
            days_per_unit.numerator * quantity / days_per_unit.denominator)
        if float_digits:
            days = quantize_by_float_digit(
                amount=days, float_digits=float_digits)

        # Report the singular form for the one plural unit handled here.
        label = 'anniversary' if unit == 'anniversaries' else unit

        annotations.append(
            DurationAnnotation(coords=match.span(),
                               amount=quantity,
                               duration_type=label,
                               duration_days=days,
                               text=source_text.strip()))
    return annotations
def get_ratios(text, return_sources=False, float_digits=4) -> Generator:
    """Yield ``(left, right, left / right)`` tuples for ``RATIO_PTN_RE`` matches.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for each side, and also when
    either side is zero.

    :param text: text to scan
    :param return_sources: when true, append the stripped matched text as a
        fourth tuple element
    :param float_digits: forwarded to ``get_amounts``; when truthy both
        sides are also rounded to that many digits
    :return: generator of tuples
    """
    for source_text, ratio_1_text, ratio_2_text in RATIO_PTN_RE.findall(
            text.lower()):
        amount_1 = list(get_amounts(ratio_1_text, float_digits=float_digits))
        amount_2 = list(get_amounts(ratio_2_text, float_digits=float_digits))
        if len(amount_1) != 1 or len(amount_2) != 1:
            continue
        amount_1 = amount_1[0]
        amount_2 = amount_2[0]

        if float_digits:
            amount_1 = round(amount_1, float_digits)
            amount_2 = round(amount_2, float_digits)

        # Check for zero only AFTER rounding: a tiny non-zero value can round
        # to 0 and would otherwise reach the division below and raise
        # ZeroDivisionError.
        if amount_1 == 0 or amount_2 == 0:
            continue

        total = float(amount_1) / amount_2
        item = (amount_1, amount_2, total)
        if return_sources:
            item += (source_text.strip(), )
        yield item
def extraction_function(self, field, possible_value, text):
    """Turn ``possible_value`` (or, failing that, ``text``) into a number.

    :param field: object whose ``item_number`` is passed to
        ``ValueExtractionHint.get_value`` when amounts have to be parsed
    :param possible_value: candidate value; tried with ``float()`` first
    :param text: fallback text, used when ``possible_value`` is falsy
    :return: ``None`` when there is nothing to parse or no amounts are
        found; the ``float`` conversion of ``possible_value`` when that
        works; otherwise the result of ``ValueExtractionHint.get_value``
        over the amounts returned by ``get_amounts``
    """
    if possible_value is None and not text:
        return None

    try:
        return float(possible_value)
    except (TypeError, ValueError, OverflowError):
        # Not directly convertible: fall through to text-based extraction.
        # Only conversion errors are caught here, so KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        pass

    source = str(possible_value) if possible_value else text
    floats = list(get_amounts(source, return_sources=False))
    if not floats:
        return None
    return ValueExtractionHint.get_value(floats, field.item_number)
def get_ratio_annotations(text: str, float_digits=4) \
        -> Generator[RatioAnnotation, None, None]:
    """Yield a ``RatioAnnotation`` for each usable ``RATIO_PTN_RE`` match.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for each side, and also when
    either side is zero.

    :param text: text to scan
    :param float_digits: forwarded to ``get_amounts``; when truthy both
        sides are also rounded to that many digits
    :return: generator of ``RatioAnnotation`` objects
    """
    for match in RATIO_PTN_RE.finditer(text.lower()):
        source_text, ratio_1_text, ratio_2_text = match.groups()
        amount_1 = list(get_amounts(ratio_1_text, float_digits=float_digits))
        amount_2 = list(get_amounts(ratio_2_text, float_digits=float_digits))
        if len(amount_1) != 1 or len(amount_2) != 1:
            continue
        amount_1 = amount_1[0]
        amount_2 = amount_2[0]

        if float_digits:
            amount_1 = round(amount_1, float_digits)
            amount_2 = round(amount_2, float_digits)

        # Check for zero only AFTER rounding: a tiny non-zero value can round
        # to 0 and would otherwise reach the division below and raise
        # ZeroDivisionError.
        if amount_1 == 0 or amount_2 == 0:
            continue

        total = float(amount_1) / amount_2
        yield RatioAnnotation(coords=match.span(),
                              text=source_text.strip(),
                              left=amount_1,
                              right=amount_2,
                              ratio=total)
def get_distances(text, return_sources=False, float_digits=4) -> Generator:
    """Yield ``(amount, distance_type)`` tuples for ``DISTANCE_PTN_RE`` matches.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for its number part.

    :param text: text to scan
    :param return_sources: when true, append the stripped matched text as a
        third tuple element
    :param float_digits: forwarded to ``get_amounts``; when truthy the
        amount is also rounded to that many digits
    :return: generator of tuples
    """
    for source_text, number_text, unit in DISTANCE_PTN_RE.findall(text.lower()):
        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue

        # Symbol map takes precedence; the token map is the fallback.
        distance_type = DISTANCE_SYMBOL_MAP.get(unit) or DISTANCE_TOKEN_MAP.get(unit)

        quantity = parsed[0]
        if float_digits:
            quantity = round(quantity, float_digits)

        if return_sources:
            yield (quantity, distance_type, source_text.strip())
        else:
            yield (quantity, distance_type)
def get_distance_annotations(
        text: str,
        float_digits: int = 4) -> Generator[DistanceAnnotation, None, None]:
    """Yield a ``DistanceAnnotation`` for each usable ``DISTANCE_PTN_RE`` match.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for its number part.

    :param text: text to scan
    :param float_digits: forwarded to ``get_amounts``
    :return: generator of ``DistanceAnnotation`` objects
    """
    for match in DISTANCE_PTN_RE.finditer(text.lower()):
        source_text, number_text, unit = match.groups()

        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue

        # Symbol map takes precedence; the token map is the fallback.
        by_symbol = DISTANCE_SYMBOL_MAP.get(unit)
        distance_type = by_symbol or DISTANCE_TOKEN_MAP.get(unit)

        yield DistanceAnnotation(coords=match.span(),
                                 amount=parsed[0],
                                 distance_type=distance_type,
                                 text=source_text.strip())
def test_amounts(self):
    """Check the values ``get_amounts`` extracts from a loan-amendment paragraph.

    The extracted values are converted with ``str`` and joined with
    ``', '`` so the whole sequence is compared in a single assertion.
    """
    text = """ 2. Amendment to Interest Rate. Beginning on February 1, 1998, and continuing until July 18, 2002, which is the fifth anniversary of the Loan conversion date, interest shall be fixed at an annual rate of 7.38%, which rate is equal to 200 basis points above the Bank's five-year ""Treasury Constant Rate"" in effect on January 23, 1998. In accordance with the Agreement, the interest rate shall be adjusted again on July 18, 2002. """
    expected = '2.0, 1.0, 1998.0, 18.0, 2002.0, 5, 7.38, 200.0, 5, 23.0, 1998.0, 18.0, 2002.0'

    extracted = list(get_amounts(text))
    actual = ', '.join(str(value) for value in extracted)

    self.assertEqual(expected, actual)
def get_durations(text, return_sources=False, float_digits=4) -> Generator:
    """Yield ``(duration_type, amount, duration_days)`` for ``DURATION_PTN_RE`` matches.

    The text is lower-cased before matching. A match is skipped unless
    ``get_amounts`` returns exactly one value for its number part.

    :param text: text to scan
    :param return_sources: when true, append the stripped matched text as a
        fourth tuple element
    :param float_digits: forwarded to ``get_amounts``; when truthy the
        amount is also rounded to that many digits
    :return: generator of tuples
    """
    for source_text, number_text, unit in DURATION_PTN_RE.findall(text.lower()):
        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue

        quantity = parsed[0]
        if float_digits:
            quantity = round(quantity, float_digits)

        # The map is keyed by the unit exactly as it was matched.
        days = DURATION_MAP[unit] * quantity

        # Report the singular form for the one plural unit handled here.
        label = 'anniversary' if unit == 'anniversaries' else unit

        if return_sources:
            yield (label, quantity, days, source_text.strip())
        else:
            yield (label, quantity, days)
def parse(self, text, text_unit_id, _text_unit_lang, **kwargs) -> ParseResults:
    """Build one ``AmountUsage`` per distinct amount found in ``text``.

    :param text: text passed to ``amounts.get_amounts``
    :param text_unit_id: stored on every produced ``AmountUsage``
    :param _text_unit_lang: not used by this method
    :return: ``ParseResults`` mapping ``AmountUsage`` to the produced list,
        or ``None`` when no amounts are found
    """
    from collections import Counter

    found = list(
        amounts.get_amounts(text, return_sources=True, extended_sources=False))
    if not found:
        return None

    # Count every distinct item in a single pass instead of calling
    # found.count() once per unique item. A Counter also iterates in
    # first-seen order, so the output order is stable, unlike set iteration.
    occurrences = Counter(found)

    return ParseResults({
        AmountUsage: [
            AmountUsage(text_unit_id=text_unit_id,
                        amount=item[0],
                        # The stored source string is capped at 300 characters.
                        amount_str=item[1][:300] if item[1] else None,
                        count=count)
            for item, count in occurrences.items()
        ]
    })
def _extract_variants_from_text(self, field, text: str, **kwargs):
    """Return the amounts found in ``text`` as a list, or ``None`` if there are none.

    :param field: not used by this method
    :param text: text passed to ``get_amounts``
    :return: non-empty list of amounts, or ``None``
    """
    amounts = get_amounts(text, return_sources=False)
    if not amounts:
        return None

    # NOTE(review): if get_amounts returns a generator, the generator object
    # is always truthy, so the check above cannot detect "no amounts". The
    # emptiness test therefore has to happen after materializing the result.
    found = list(amounts)
    return found or None
def _extract_variants_from_text(self, field, text: str):
    """Return the whole-number amounts found in ``text``, or ``None`` if there are none.

    A value is kept only when its ``is_integer()`` method returns true.
    """
    found = get_amounts(text, return_sources=False)
    if not found:
        return None

    whole_numbers = []
    for value in found:
        if value.is_integer():
            whole_numbers.append(value)

    return whole_numbers or None