Exemplo n.º 1
0
    def test_date_annotation(self):
        """Check the locale, repr and citation string of a DateAnnotation."""
        annotation = DateAnnotation(
            coords=(2, 12),
            date=date(2018, 1, 13),
            score=0.5,
            locale='pg',
        )
        self.assertEqual('pg', annotation.locale)

        # The textual representation must not be empty.
        representation = annotation.__repr__()
        self.assertGreater(len(representation), 0)

        # The citation is built from the locale and the ISO-formatted date.
        citation = annotation.get_cite()
        self.assertEqual('/pg/date/2018-01-13', citation)
Exemplo n.º 2
0
def get_date_annotations(text: str, strict=False, base_date=None, threshold=0.50) \
        -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations whose classifier score reaches the threshold.
    :param text: raw text to search
    :param strict: forwarded to the raw date search (complete or strict matches only)
    :param base_date: base date forwarded to the raw date search
    :param threshold: minimum MODEL_DATE score a candidate needs to be yielded
    :return: generator of DateAnnotation objects
    """
    # Collect raw candidates together with their source spans.
    candidates = get_raw_date_list(text,
                                   strict=strict,
                                   base_date=base_date,
                                   return_source=True)

    for candidate in candidates:
        # candidate[1] holds the span; its first two items are passed to the feature extractor.
        span = candidate[1]
        features = get_date_features(text, span[0], span[1])
        feature_frame = DateFeaturesDataframeBuilder.build_feature_df(features)

        # Restrict the frame to the columns listed on the model before scoring.
        model_input = feature_frame.loc[:, MODEL_DATE.columns]
        scores = MODEL_DATE.predict_proba(model_input)
        probability = scores[0, 1]

        if probability >= threshold:
            yield DateAnnotation(coords=span,
                                 date=candidate[0],
                                 score=probability)
Exemplo n.º 3
0
    def get_date_annotations(self,
                             text: str = None,
                             language: str = None) -> \
            Generator[DateAnnotation, None, None]:
        """
        Find dates in the text and yield them as annotations.

        Newlines in ``text`` are replaced with spaces before searching, so the
        yielded coordinates refer to that normalized text.
        :param text: text to search; when omitted or empty, the text already
            stored on the instance is used
        :param language: language code; when omitted or empty, the language
            already stored on the instance is used
        :return: generator of DateAnnotation objects
        :raises RuntimeError: if neither the arguments nor the instance
            provide both a text and a language
        """
        # ``text`` defaults to None, so normalize it only when it was given;
        # otherwise fall back to the text already stored on the instance.
        self.TEXT = (text.replace('\n', ' ') if text else None) or self.TEXT
        self.LANGUAGE = language or self.LANGUAGE

        if not self.TEXT or not self.LANGUAGE:
            raise RuntimeError('Define text and language.')

        # First try dateparser searcher
        try:
            self.DATES = self.get_dateparser_dates() or []
        except Exception as e:
            # Best effort: a failing searcher must not abort the custom search below.
            # TODO: add logging
            print(str(e))

        # Next try custom search logic
        self.get_extra_dates()

        positions = []
        # Longest date strings first, so shorter candidates that lie inside an
        # already recorded span are skipped by the overlap check below.
        for date_str, date in sorted(self.DATES, key=lambda i: -len(i[0])):

            # if possible date has weird format or unwanted symbols
            if not self.passed_general_check(date_str, date):
                continue

            for match in re.finditer(re.escape(date_str), self.TEXT):
                location_start, location_end = match.span()

                # skip overlapping entities
                if any(location_start >= i and location_end <= j
                       for i, j in positions):
                    continue
                # The span is recorded before the classifier check, so a span
                # rejected by the classifier still blocks matches nested in it.
                positions.append(match.span())

                # filter out possible dates using classifier
                if self.ENABLE_CLASSIFIER_CHECK and \
                        not self.passed_classifier_check(location_start, location_end):
                    continue

                ant = DateAnnotation(
                    coords=(location_start, location_end),
                    date=date,
                    text=self.TEXT[location_start:location_end],
                    locale=language or self.LANGUAGE)
                yield ant
Exemplo n.º 4
0
    def get_date_annotations(self,
                             text: str = None,
                             locale: Locale = None,
                             strict: Optional[bool] = None) -> \
            Generator[DateAnnotation, None, None]:
        """
        Find dates in the text and yield them as annotations.

        Newlines in ``text`` are replaced with spaces before searching, so the
        yielded coordinates refer to that normalized text.
        :param text: text to search; when omitted or empty, the text already
            stored on the instance is used
        :param locale: locale whose ``language`` overrides the instance
            language; when omitted, the instance language is kept
        :param strict: forwarded to the date searchers; None is treated as True
        :return: generator of DateAnnotation objects
        :raises RuntimeError: if neither the arguments nor the instance
            provide both a text and a language
        """
        strict = strict if strict is not None else True
        # ``text`` defaults to None, so normalize it only when it was given;
        # otherwise fall back to the text already stored on the instance.
        self.text = (text.replace('\n', ' ') if text else None) or self.text
        self.locale.language = (locale.language if locale else "") or self.locale.language

        if not self.text or not self.locale.language:
            raise RuntimeError('Define text and language.')

        # First try dateparser searcher. It gets the normalized text so that the
        # strings it returns can be located in self.text below, and so that the
        # stored text is used when ``text`` was omitted.
        try:
            self.dates = self.get_dateparser_dates(self.text, strict)
        except Exception as e:
            # Best effort: a failing searcher must not abort the custom search below.
            # TODO: add logging
            print(str(e))

        # Next try custom search logic
        self.get_extra_dates(strict)

        positions = []
        # Longest date strings first, so shorter candidates that lie inside an
        # already recorded span are skipped by the overlap check below.
        for date_str, date in sorted(self.dates, key=lambda i: -len(i[0])):

            # if possible date has weird format or unwanted symbols
            if not self.passed_general_check(date_str, date):
                continue

            for match in re.finditer(re.escape(date_str), self.text):
                location_start, location_end = match.span()

                # skip overlapping entities
                if any(location_start >= i and location_end <= j
                       for i, j in positions):
                    continue
                # The span is recorded before the classifier check, so a span
                # rejected by the classifier still blocks matches nested in it.
                positions.append(match.span())

                # filter out possible dates using classifier
                if self.enable_classifier_check and \
                        not self.passed_classifier_check(location_start, location_end):
                    continue

                ant = DateAnnotation(coords=(location_start, location_end),
                                     date=date,
                                     text=self.text[location_start:location_end],
                                     locale=self.locale.language)
                yield ant
Exemplo n.º 5
0
def get_date_annotations(text: str,
                         strict: Optional[bool] = None,
                         locale: Optional[str] = '',
                         base_date: Optional[datetime.datetime] = None,
                         threshold: float = 0.50) \
        -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations whose classifier score reaches the threshold.
    :param text: raw text to search
    :param strict: forwarded to the raw date search; None is treated as False
    :param locale: locale string, wrapped in a Locale for the raw date search
    :param base_date: base date forwarded to the raw date search
    :param threshold: minimum MODEL_DATE score a candidate needs to be yielded
    :return: generator of DateAnnotation objects
    """
    if strict is None:
        strict = False

    # Collect raw candidates together with their source spans.
    candidates = get_raw_date_list(text,
                                   strict=strict,
                                   base_date=base_date,
                                   return_source=True,
                                   locale=Locale(locale))

    for candidate in candidates:
        # candidate[1] holds the span; its first two items are passed to the feature extractor.
        span = candidate[1]
        features = get_date_features(text,
                                     span[0],
                                     span[1],
                                     characters=DATE_MODEL_CHARS)
        feature_frame = DateFeaturesDataframeBuilder.build_feature_df(features)

        # Restrict the frame to the columns listed on the model before scoring.
        model_input = feature_frame.loc[:, MODEL_DATE.columns]
        scores = MODEL_DATE.predict_proba(model_input)
        probability = scores[0, 1]

        if probability >= threshold:
            yield DateAnnotation(coords=span,
                                 date=candidate[0],
                                 score=probability)