def get_all_annotations(
        cls,
        text: str,
        float_digits: int = 4,
) -> List[DurationAnnotation]:
    """
    Build a DurationAnnotation for every duration phrase matched in the text.

    Matching runs over the lower-cased text, so the annotation text is taken
    from that lower-cased copy. A match is skipped unless its numeric part
    parses to exactly one amount. The amount is multiplied by the Fraction
    stored in ``cls.DURATION_MAP`` for the matched unit and the result is
    reported as ``duration_days``.

    :param text: text to scan
    :param float_digits: forwarded to the amount parser and used to quantize
        ``duration_days``; a falsy value leaves ``duration_days`` unquantized
    :return: annotations in match order
    """
    lowered = text.lower()
    found: List[DurationAnnotation] = []

    for hit in cls.DURATION_PTN_RE.finditer(lowered):
        phrase, number_part, unit = hit.groups()

        parsed = list(get_amounts(number_part, float_digits=float_digits))
        if len(parsed) != 1:
            # Zero or several numbers in the numeric part: skip the match.
            continue
        quantity = parsed[0]

        # The lookup uses the unit exactly as matched, i.e. before the
        # 'anniversaries' normalisation applied further down.
        ratio: Fraction = cls.DURATION_MAP[unit]
        days: Decimal = Decimal(
            (ratio.numerator * quantity) / ratio.denominator)
        if float_digits:
            days = quantize_by_float_digit(
                amount=days, float_digits=float_digits)

        # Only this plural form is rewritten to its singular.
        label = 'anniversary' if unit == 'anniversaries' else unit

        found.append(DurationAnnotation(
            coords=hit.span(),
            amount=quantity,
            duration_type=label,
            duration_days=days,
            text=phrase.strip()))

    return found
def parse_annotations(self,
                      text: str,
                      float_digits: int = 4,
                      return_sources: bool = True
                      ) -> Generator[AmountAnnotation, None, None]:
    """
    Yield an AmountAnnotation for each numeric expression found in the text.

    Matches rejected by ``WRONG_FULLMATCH_RE``, matches for which
    ``text2num`` raises (the exception is printed) and matches for which it
    returns None are skipped.

    :param text: text to scan
    :param float_digits: quantize the parsed value to this many digits;
        a falsy value leaves the value as parsed
    :param return_sources: when true, try to extend the annotation text with
        a phrase from ``get_np`` that the following text starts with or,
        if none is found, with a directly preceding token listed in
        ``allowed_prev_units``
    :return: generator of annotations, in match order
    """
    # NOTE(review): this method was recovered from whitespace-collapsed
    # source; confirm that the preceding-unit lookup is meant to run even
    # when nothing follows the number, and that ``ant.text`` is meant to be
    # assigned when ``return_sources`` is false.
    for hit in self.NUM_PTN_RE.finditer(text):
        source = hit.group()
        if self.WRONG_FULLMATCH_RE.fullmatch(source):
            continue

        try:
            value = self.text2num(source)
        except Exception as e:
            print(e)
            continue
        if value is None:
            continue

        if float_digits:
            value = quantize_by_float_digit(
                amount=value, float_digits=float_digits)

        span = hit.span()
        start, end = span
        ant = AmountAnnotation(coords=span,
                               value=value,
                               locale=self.language)

        if return_sources:
            unit = ''
            tail = text[end:]
            if tail:
                # No early exit: the last phrase that prefixes the tail wins.
                for phrase in get_np(tail):
                    if tail.startswith(phrase):
                        unit = phrase

            if unit:
                source = ' '.join([source.strip(), unit])
            else:
                head_tokens = nltk.word_tokenize(text[:start])
                if head_tokens and \
                        head_tokens[-1].lower() in allowed_prev_units:
                    # Re-insert a space only if one directly precedes
                    # the number in the text.
                    sep = ' ' if text[start - 1] == ' ' else ''
                    source = sep.join([head_tokens[-1], source.rstrip()])

        ant.text = source.strip()
        yield ant
def get_all_annotations(cls,
                        text: str,
                        float_digits: int = 4) -> List[DurationAnnotation]:
    """
    Build a DurationAnnotation for every duration phrase matched in the text.

    Named captures ('num_text', 'unit_name', 'unit_prefix', 'text') are read
    through ``capturesdict()`` -- presumably the pattern comes from the
    third-party ``regex`` module; confirm against the class definition.
    When the numeric part does not parse to exactly one amount, the amount
    defaults to ``Decimal('1.0')``. Matches whose unit name is not recognised
    by ``cls.DURATION_MAP_RE`` are skipped.

    :param text: text to scan
    :param float_digits: forwarded to the amount parser and used to quantize
        ``duration_days``; a falsy value leaves ``duration_days`` unquantized
    :return: annotations in match order
    :raises KeyError: if the translated unit name is missing from
        ``cls.DURATION_MAP`` (including a None translation)
    """
    results: List[DurationAnnotation] = []

    for hit in cls.DURATION_PTN_RE.finditer(text):
        groups = hit.capturesdict()

        number_text = ''.join(groups.get('num_text', ''))
        parsed = list(get_amounts(number_text, float_digits=float_digits))
        # Anything other than exactly one parsed number counts as "one unit".
        quantity = parsed[0] if len(parsed) == 1 else Decimal('1.0')

        raw_unit = ''.join(groups.get('unit_name', '')).lower()
        prefix = ''.join(groups.get('unit_prefix', '')).lower()

        known_units = cls.DURATION_MAP_RE.findall(raw_unit)
        if not known_units:
            continue
        local_unit = known_units[0]

        # .get() may return None; the lookup below then raises KeyError.
        english_unit = cls.DURATION_TRANSLATION_MAP.get(local_unit)
        ratio: Fraction = cls.DURATION_MAP[english_unit]

        days: Decimal = Decimal(
            (ratio.numerator * quantity) / ratio.denominator)
        if float_digits:
            days = quantize_by_float_digit(
                amount=days, float_digits=float_digits)

        results.append(DurationAnnotation(
            coords=hit.span(),
            text=''.join(groups.get('text', '')),
            amount=quantity,
            duration_days=days,
            duration_type_en=english_unit,
            duration_type=local_unit,
            prefix=prefix,
            locale=cls.LOCALE))

    return results