def test_date_annotation(self):
    """
    Build a DateAnnotation and check its locale, repr and citation string.
    """
    annotation = DateAnnotation(coords=(2, 12),
                                date=date(2018, 1, 13),
                                score=0.5,
                                locale='pg')
    self.assertEqual('pg', annotation.locale)

    # The textual representation must not be empty.
    representation = annotation.__repr__()
    self.assertGreater(len(representation), 0)

    # The citation is expected to combine locale, record type and ISO date.
    citation = annotation.get_cite()
    self.assertEqual('/pg/date/2018-01-13', citation)
def get_date_annotations(text: str,
                         strict=False,
                         base_date=None,
                         threshold=0.50) \
        -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations that pass the false-positive classifier.

    Candidates come from ``get_raw_date_list`` (called with
    ``return_source=True``); each one is scored with ``MODEL_DATE`` and
    yielded only when the score reaches ``threshold``.

    :param text: raw text to search
    :param strict: passed through to ``get_raw_date_list`` as ``strict``
    :param base_date: passed through to ``get_raw_date_list`` as ``base_date``
    :param threshold: minimum value of ``predict_proba(...)[0, 1]`` a
        candidate needs in order to be yielded
    :return: generator of DateAnnotation objects
    """
    candidates = get_raw_date_list(text,
                                   strict=strict,
                                   base_date=base_date,
                                   return_source=True)

    for candidate in candidates:
        # Each candidate is indexed as (date value, (start, end)).
        found_date = candidate[0]
        span = candidate[1]

        # Build the single-row feature frame for the text around the match.
        features = get_date_features(text, span[0], span[1])
        feature_row = DateFeaturesDataframeBuilder.build_feature_df(features)

        probabilities = MODEL_DATE.predict_proba(
            feature_row.loc[:, MODEL_DATE.columns])
        probability = probabilities[0, 1]
        if probability < threshold:
            continue

        yield DateAnnotation(coords=span,
                             date=found_date,
                             score=probability)
def get_date_annotations(self,
                         text: str = None,
                         language: str = None) -> \
        Generator[DateAnnotation, None, None]:
    """
    Yield a DateAnnotation for every accepted date occurrence in the text.

    Newlines in ``text`` are replaced with spaces before it is stored on
    ``self.TEXT``. When ``text`` or ``language`` is omitted (or empty) the
    value already stored on the instance is used instead.

    Because this is a generator, nothing (including the RuntimeError below)
    happens until the first item is requested.

    :param text: text to search; falls back to ``self.TEXT``
    :param language: language code; falls back to ``self.LANGUAGE``
    :raises RuntimeError: if no text or no language is available
    :return: generator of DateAnnotation objects
    """
    # `text` defaults to None, so guard before calling str.replace on it;
    # otherwise the documented fallback to self.TEXT could never be reached.
    normalized_text = text.replace('\n', ' ') if text else ''
    self.TEXT = normalized_text or self.TEXT
    self.LANGUAGE = language or self.LANGUAGE
    if not self.TEXT or not self.LANGUAGE:
        raise RuntimeError('Define text and language.')

    # First try dateparser searcher
    try:
        self.DATES = self.get_dateparser_dates() or []
    except Exception as e:
        # Best effort: on failure self.DATES keeps whatever value it had.
        # TODO: add logging
        print(str(e))

    # Next try custom search logic
    self.get_extra_dates()

    # Spans already claimed by an earlier (longer) date string.
    positions = []

    # Longest date strings first, so shorter strings nested inside them
    # are rejected by the overlap check below.
    for date_str, date in sorted(self.DATES, key=lambda i: -len(i[0])):
        # if possible date has weird format or unwanted symbols
        if not self.passed_general_check(date_str, date):
            continue

        for match in re.finditer(re.escape(date_str), self.TEXT):
            location_start, location_end = match.span()

            # skip entities fully contained in an already recorded span
            if any(location_start >= i and location_end <= j
                   for i, j in positions):
                continue
            positions.append(match.span())

            # filter out possible dates using classifier
            if self.ENABLE_CLASSIFIER_CHECK and \
                    not self.passed_classifier_check(location_start, location_end):
                continue

            ant = DateAnnotation(
                coords=(location_start, location_end),
                date=date,
                text=self.TEXT[location_start:location_end],
                locale=language or self.LANGUAGE)
            yield ant
def get_date_annotations(self,
                         text: str = None,
                         locale: Locale = None,
                         strict: Optional[bool] = None) -> \
        Generator[DateAnnotation, None, None]:
    """
    Yield a DateAnnotation for every accepted date occurrence in the text.

    Newlines in ``text`` are replaced with spaces before it is stored on
    ``self.text``. When ``text`` is omitted (or empty) the text already
    stored on the instance is used; likewise ``self.locale.language`` is
    kept when ``locale`` carries no language.

    Because this is a generator, nothing (including the RuntimeError below)
    happens until the first item is requested.

    :param text: text to search; falls back to ``self.text``
    :param locale: object whose ``language`` attribute overrides
        ``self.locale.language`` when set
    :param strict: forwarded to ``get_dateparser_dates`` and
        ``get_extra_dates``; ``None`` is treated as ``True``
    :raises RuntimeError: if no text or no language is available
    :return: generator of DateAnnotation objects
    """
    strict = strict if strict is not None else True

    # `text` defaults to None, so guard before calling str.replace on it;
    # otherwise the fallback to self.text could never be reached.
    normalized_text = text.replace('\n', ' ') if text else ''
    self.text = normalized_text or self.text
    self.locale.language = (locale.language if locale else "") or self.locale.language
    if not self.text or not self.locale.language:
        raise RuntimeError('Define text and language.')

    # A caller-supplied text is forwarded unchanged (as before); only when
    # none was given do we hand the stored text to the dateparser search.
    search_text = text if text is not None else self.text

    # First try dateparser searcher
    try:
        self.dates = self.get_dateparser_dates(search_text, strict)
    except Exception as e:
        # Best effort: on failure self.dates keeps whatever value it had.
        # TODO: add logging
        print(str(e))

    # Next try custom search logic
    self.get_extra_dates(strict)

    # Spans already claimed by an earlier (longer) date string.
    positions = []

    # Longest date strings first, so shorter strings nested inside them
    # are rejected by the overlap check below.
    for date_str, date in sorted(self.dates, key=lambda i: -len(i[0])):
        # if possible date has weird format or unwanted symbols
        if not self.passed_general_check(date_str, date):
            continue

        for match in re.finditer(re.escape(date_str), self.text):
            location_start, location_end = match.span()

            # skip entities fully contained in an already recorded span
            if any(location_start >= i and location_end <= j
                   for i, j in positions):
                continue
            positions.append(match.span())

            # filter out possible dates using classifier
            if self.enable_classifier_check and \
                    not self.passed_classifier_check(location_start, location_end):
                continue

            ant = DateAnnotation(coords=(location_start, location_end),
                                 date=date,
                                 text=self.text[location_start:location_end],
                                 locale=self.locale.language)
            yield ant
def get_date_annotations(text: str,
                         strict: Optional[bool] = None,
                         locale: Optional[str] = '',
                         base_date: Optional[datetime.datetime] = None,
                         threshold: float = 0.50) \
        -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations that pass the false-positive classifier.

    Candidates come from ``get_raw_date_list`` (called with
    ``return_source=True``); each one is scored with ``MODEL_DATE`` and
    yielded only when the score reaches ``threshold``.

    :param text: raw text to search
    :param strict: passed through to ``get_raw_date_list``; ``None`` is
        treated as ``False``
    :param locale: locale string, wrapped in ``Locale`` before being passed on
    :param base_date: passed through to ``get_raw_date_list`` as ``base_date``
    :param threshold: minimum value of ``predict_proba(...)[0, 1]`` a
        candidate needs in order to be yielded
    :return: generator of DateAnnotation objects
    """
    if strict is None:
        strict = False

    candidates = get_raw_date_list(text,
                                   strict=strict,
                                   base_date=base_date,
                                   return_source=True,
                                   locale=Locale(locale))

    for candidate in candidates:
        # Each candidate is indexed as (date value, (start, end)).
        found_date = candidate[0]
        span = candidate[1]

        # Build the single-row feature frame for the text around the match.
        features = get_date_features(text, span[0], span[1],
                                     characters=DATE_MODEL_CHARS)
        feature_row = DateFeaturesDataframeBuilder.build_feature_df(features)

        probabilities = MODEL_DATE.predict_proba(
            feature_row.loc[:, MODEL_DATE.columns])
        probability = probabilities[0, 1]
        if probability < threshold:
            continue

        yield DateAnnotation(coords=span,
                             date=found_date,
                             score=probability)