예제 #1
0
 def get_all_annotations(
     cls,
     text: str,
     float_digits: int = 4,
 ) -> List[DurationAnnotation]:
     """
     Build a duration annotation for every ``DURATION_PTN_RE`` match.

     The pattern is applied to a lower-cased copy of ``text``, so the
     reported ``coords`` are spans within that lower-cased copy. A match is
     skipped unless ``get_amounts`` yields exactly one value for its number
     part.

     :param text: text to scan
     :param float_digits: forwarded to ``get_amounts``; when truthy the
         day count is also passed through ``quantize_by_float_digit``
     :return: annotations in match order
     """
     lowered = text.lower()
     found: List[DurationAnnotation] = []
     for hit in cls.DURATION_PTN_RE.finditer(lowered):
         matched_text, number_part, unit = hit.groups()

         parsed = list(get_amounts(number_part, float_digits=float_digits))
         if len(parsed) != 1:
             continue
         quantity = parsed[0]

         # DURATION_MAP holds, per unit, the Fraction used to scale to days.
         ratio: Fraction = cls.DURATION_MAP[unit]
         scaled = ratio.numerator * quantity
         days: Decimal = Decimal(scaled / ratio.denominator)
         if float_digits:
             days = quantize_by_float_digit(
                 amount=days, float_digits=float_digits)

         # The plural form is reported under its singular name.
         label = 'anniversary' if unit == 'anniversaries' else unit

         found.append(DurationAnnotation(
             coords=hit.span(),
             amount=quantity,
             duration_type=label,
             duration_days=days,
             text=matched_text.strip()))
     return found
예제 #2
0
    def parse_annotations(self,
                          text: str,
                          float_digits: int = 4,
                          return_sources: bool = True
                          ) -> Generator[AmountAnnotation, None, None]:
        """
        Yield an annotation for every amount recognised in the text.

        Matches of ``NUM_PTN_RE`` are skipped when they fully match
        ``WRONG_FULLMATCH_RE``, when ``text2num`` raises (the error is
        printed) or when it returns ``None``.

        :param text: text to scan
        :param float_digits: when truthy, each amount is passed through
            ``quantize_by_float_digit``; a falsy value (``None`` or 0)
            leaves the parsed amount as is
        :param return_sources: when true, the annotation's ``text`` is set
            to the matched string, extended by a unit found right after it
            or, failing that, by an allowed token right before it
        :return: generator of amount annotations, in match order
        """
        for num_match in self.NUM_PTN_RE.finditer(text):
            source = num_match.group()
            if self.WRONG_FULLMATCH_RE.fullmatch(source):
                continue

            try:
                value = self.text2num(source)
            except Exception as exc:
                # Best effort: report the failure and go on to the next match.
                print(exc)
                continue
            if value is None:
                continue
            if float_digits:
                value = quantize_by_float_digit(
                    amount=value, float_digits=float_digits)

            start, end = num_match.span()
            annotation = AmountAnnotation(coords=num_match.span(),
                                          value=value,
                                          locale=self.language)
            if not return_sources:
                yield annotation
                continue

            # Unit after the number: the last phrase from get_np() that the
            # remaining text starts with wins.
            unit = ''
            following = text[end:]
            if following:
                leading_phrases = [phrase for phrase in get_np(following)
                                   if following.startswith(phrase)]
                if leading_phrases:
                    unit = leading_phrases[-1]

            if unit:
                source = ' '.join((source.strip(), unit))
            else:
                # No trailing unit: fall back to the token just before the
                # number, provided it is one of the allowed leading units.
                tokens = nltk.word_tokenize(text[:start])
                if tokens and tokens[-1].lower() in allowed_prev_units:
                    glue = ' ' if text[start - 1] == ' ' else ''
                    source = glue.join((tokens[-1], source.rstrip()))

            annotation.text = source.strip()
            yield annotation
예제 #3
0
    def get_all_annotations(cls,
                            text: str,
                            float_digits: int = 4) -> List[DurationAnnotation]:
        """
        Build a duration annotation for every ``DURATION_PTN_RE`` match.

        Each match is read through ``capturesdict()`` using the captures
        ``num_text``, ``unit_name``, ``unit_prefix`` and ``text``.
        NOTE(review): ``capturesdict()`` is not part of the stdlib ``re``
        module, so the pattern is presumably compiled with the third-party
        ``regex`` package - confirm.

        :param text: text to scan
        :param float_digits: forwarded to ``get_amounts``; when truthy the
            day count is also passed through ``quantize_by_float_digit``
        :return: annotations in match order; matches whose unit cannot be
            resolved to an entry of ``DURATION_MAP`` are skipped
        """
        all_annotations: List[DurationAnnotation] = []
        for match in cls.DURATION_PTN_RE.finditer(text):
            capture = match.capturesdict()

            amount_text = ''.join(capture.get('num_text', ''))
            amounts = list(get_amounts(amount_text, float_digits=float_digits))
            # Anything other than exactly one parsed number counts as 1.
            amount = amounts[0] if len(amounts) == 1 else Decimal('1.0')

            unit_prefix = ''.join(capture.get('unit_prefix', '')).lower()
            captured_unit = ''.join(capture.get('unit_name', '')).lower()
            known_units = cls.DURATION_MAP_RE.findall(captured_unit)
            if not known_units:
                continue
            unit_name_local = known_units[0]

            # .get() may return None; indexing DURATION_MAP with it would
            # abort the whole scan with KeyError. Skip the match instead,
            # as is done above for an unrecognised unit.
            unit_name_en = cls.DURATION_TRANSLATION_MAP.get(unit_name_local)
            if unit_name_en is None or unit_name_en not in cls.DURATION_MAP:
                continue

            _duration_fraction: Fraction = cls.DURATION_MAP[unit_name_en]
            amount_days: Decimal = Decimal(
                (_duration_fraction.numerator * amount) /
                _duration_fraction.denominator)
            if float_digits:
                amount_days = quantize_by_float_digit(
                    amount=amount_days, float_digits=float_digits)

            ant: DurationAnnotation = DurationAnnotation(
                coords=match.span(),
                text=''.join(capture.get('text', '')),
                amount=amount,
                duration_days=amount_days,
                duration_type_en=unit_name_en,
                duration_type=unit_name_local,
                prefix=unit_prefix,
                locale=cls.LOCALE)
            all_annotations.append(ant)
        return all_annotations