示例#1
0
    def parse_date_string(self, date_string: str, captures: Dict[str, List],
                          locale: Locale):
        """
        Turn a matched date string into a datetime, falling back through
        progressively more lenient strategies.

        Strategy order:
          1. ``dateparser`` on the raw string (restricted to ``locale`` when
             one is given; cross-checked against ``dateutil`` otherwise).
          2. If that raised ``ValueError`` and a locale is available,
             ``dateparser`` again using only the locale's language.
          3. If it still failed, clean the string with
             ``self._find_and_replace`` and hand it to ``dateutil``.

        :param date_string: candidate date text
        :param captures: capture groups forwarded to ``_find_and_replace``
        :param locale: locale to parse with; may be falsy for "no locale"
        :return: the parsed value, or None when nothing could be parsed or the
            cleaned string is shorter than 3 characters
        """
        # For well formatted string, we can already let dateparser parse them
        # otherwise self._find_and_replace method might corrupt them
        was_raised_error = False
        as_dt = None

        if not locale:
            try:
                as_dt = dateparser.parse(
                    date_string, settings={'RELATIVE_BASE': self.base_date})
                # Dateparser has issues with time when parsing something like `29MAY19 1350`
                as_dateutil = parser.parse(date_string, default=self.base_date)
                if as_dt != as_dateutil:
                    as_dt = as_dateutil
            except ValueError:
                was_raised_error = True
        else:
            try:
                as_dt = dateparser.parse(
                    date_string,
                    settings={'RELATIVE_BASE': self.base_date},
                    locales=[locale.get_locale()])
            except ValueError:
                was_raised_error = True

        # Try to parse date using only language. This needs a locale object:
        # without the guard a missing locale would raise AttributeError here
        # instead of falling through to the dateutil fallback below.
        if was_raised_error and locale:
            try:
                as_dt = dateparser.parse(
                    date_string,
                    settings={'RELATIVE_BASE': self.base_date},
                    languages=[locale.language])
                was_raised_error = False
            except ValueError:
                pass

        if was_raised_error:
            # replace tokens that are problematic for dateutil
            date_string, tz_string = self._find_and_replace(
                date_string, captures)

            # One last sweep after removing
            date_string = date_string.strip(self.STRIP_CHARS)
            # Match strings must be at least 3 characters long
            # < 3 tends to be garbage
            if len(date_string) < 3:
                return None

            try:
                logger.debug('Parsing %s with dateutil', date_string)
                as_dt = parser.parse(date_string, default=self.base_date)
            except Exception as e:  # pylint: disable=broad-except
                # Best effort: an unparseable candidate is simply not a date.
                logger.debug(e)
                as_dt = None
            if tz_string:
                as_dt = self._add_tzinfo(as_dt, tz_string)
        return as_dt
示例#2
0
 def test_locales_convert(self):
     """
     Locale must report language 'en' and the expected upper-case locale
     code for several spellings of the input string.
     """
     expected_language = 'en'
     cases = [
         {'input': 'en', 'output_locale_code': 'EN'},
         {'input': 'en-US', 'output_locale_code': 'US'},
         {'input': 'en/Gb', 'output_locale_code': 'GB'},
         {'input': 'En_us', 'output_locale_code': 'US'},
     ]
     for case in cases:
         parsed = Locale(case['input'])
         self.assertEqual(parsed.language, expected_language)
         self.assertEqual(parsed.locale_code, case['output_locale_code'])
示例#3
0
    def test_dates(self):
        """
        Five dates are expected in the German sample text; check their
        positions, parsed values and source strings in order of appearance.
        """
        text = """
        Ausfertigungsdatum: 23.05.1975 Vollzitat: \
        "Gesetz über vermögenswirksame Leistungen für Beamte, Richter, Berufssoldaten und \
        Soldaten auf Zeit in der Fassung der Bekanntmachung vom 16. Mai 2002 (BGBl. I S. 1778), \
        das zuletzt durch Artikel 39 des Gesetzes vom 29. März 2017 (BGBl. I S. 626) geändert worden ist" \
        Stand:        Neugefasst durch Bek. v. 16.5.2002 I 1778; \
        zuletzt geändert durch Art. 39 G v. 29.3.2017 I 626""".strip()

        found = get_date_list(text=text, locale=Locale('de'))
        self.assertEqual(5, len(found))
        found.sort(key=lambda item: item['location_start'])

        # Character spans of each date inside ``text``.
        expected_spans = [
            (20, 30),
            (196, 208),
            (282, 295),
            (381, 390),
            (443, 452),
        ]
        for span, item in zip(expected_spans, found):
            self.assertEqual(span,
                             (item['location_start'], item['location_end']))

        # Parsed datetime values.
        expected_values = [
            datetime.datetime(1975, 5, 23, 0, 0),
            datetime.datetime(2002, 5, 16, 0, 0),
            datetime.datetime(2017, 3, 29, 0, 0),
            datetime.datetime(2002, 5, 16, 0, 0),
            datetime.datetime(2017, 3, 29, 0, 0),
        ]
        for value, item in zip(expected_values, found):
            self.assertEqual(value, item['value'])

        # Exact source substrings the dates were read from.
        expected_sources = [
            '23.05.1975',
            '16. Mai 2002',
            '29. März 2017',
            '16.5.2002',
            '29.3.2017',
        ]
        for source, item in zip(expected_sources, found):
            self.assertEqual(source, item['source'])
示例#4
0
def get_copyright_annotations(
        locale: str,
        text: str,
        return_sources: bool = False) -> \
        Generator[CopyrightAnnotation, None, None]:
    """
    Yield copyright annotations using the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language part selects the routine
    :param text: text to search
    :param return_sources: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Unregistered languages fall back to the default language's routine.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extractor = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extractor(text, return_sources)
示例#5
0
def get_definition_annotations(
        locale: str,
        text: str,
        **kwargs) \
        -> Generator[DefinitionAnnotation, None, None]:
    """
    Yield definition annotations using the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language part selects the routine
    :param text: text to search
    :param kwargs: extra keyword arguments passed through to the routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Unregistered languages fall back to the default language's routine.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extractor = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extractor(text, **kwargs)
示例#6
0
def get_duration_annotations(
        locale: str,
        text: str,
        float_digits: int = 4) \
        -> Generator[DurationAnnotation, None, None]:
    """
    Yield duration annotations using the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language part selects the routine
    :param text: text to search
    :param float_digits: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Unregistered languages fall back to the default language's routine.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extractor = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extractor(text, float_digits)
示例#7
0
def get_date_annotations(
        text: str,
        strict: Optional[bool] = None,
        locale: Optional[str] = '',
        _base_date: Optional[datetime.datetime] = None,
        _threshold: float = 0.50) -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations from the module-level ``parser``.

    :param text: text to search
    :param strict: strictness flag; None is treated as False
    :param locale: locale string, wrapped in ``Locale`` before use
    :param _base_date: accepted for signature compatibility; not used here
    :param _threshold: accepted for signature compatibility; not used here
    """
    if strict is None:
        strict = False
    yield from parser.get_date_annotations(text, Locale(locale), strict)
示例#8
0
def get_date_annotations(
        locale: str,
        text: str,
        strict: Optional[bool] = None,
        base_date: Optional[datetime] = None,
        threshold: float = 0.50) -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations using the routine registered for the language of
    ``locale``.

    :param locale: locale string; its language part selects the routine and
        the full string is also forwarded to that routine
    :param text: text to search
    :param strict: forwarded unchanged to the selected routine
    :param base_date: forwarded unchanged to the selected routine
    :param threshold: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Unregistered languages fall back to the default language's routine.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extractor = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extractor(text, strict, locale, base_date, threshold)
示例#9
0
def get_amount_annotations(
    locale: str,
    text: str,
    extended_sources: bool = True,
    float_digits: int = 4,
) -> Generator[AmountAnnotation, None, None]:
    """
    Yield amount annotations using the routine registered for the language
    of ``locale``.

    :param locale: locale string; only its language part selects the routine
    :param text: text to search
    :param extended_sources: forwarded unchanged to the selected routine
    :param float_digits: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Unregistered languages fall back to the default language's routine.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extractor = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extractor(text, extended_sources, float_digits)
示例#10
0
 def __init__(self,
              text: Optional[str] = None,
              locale: Locale = Locale('en-US'),
              dateparser_settings: Optional[Dict[str, Any]] = None,
              enable_classifier_check: bool = False,
              classifier_model: Optional[Any] = None,
              classifier_threshold: float = 0.5):
     """
     Forward every argument to the parent constructor, supplying
     DATE_MODEL_CHARS as its first positional argument.
     """
     super().__init__(
         DATE_MODEL_CHARS,
         text,
         locale,
         dateparser_settings,
         enable_classifier_check,
         classifier_model,
         classifier_threshold,
     )
示例#11
0
def get_court_annotations(
    locale: str,
    text: str,
    court_config_list: List[DictionaryEntry],
    priority: bool = False,
    text_locales: List[str] = (),
    simplified_normalization: bool = False
) -> Generator[CourtAnnotation, None, None]:
    """
    Yield a CourtAnnotation for every dictionary match found in ``text``.

    :param locale: locale string; its language is the default search
        language and the fallback annotation locale
    :param text: text to search
    :param court_config_list: dictionary entries to match against
    :param priority: when true, conflicts are resolved with
        ``conflicts_take_first_by_id``; otherwise no resolver is passed
    :param text_locales: locale strings whose languages are passed as the
        text languages
    :param simplified_normalization: forwarded to ``find_dict_entities``
    """
    base_locale = Locale(locale)
    default_language = base_locale.language
    resolver = conflicts_take_first_by_id if priority else None
    languages_in_text = [Locale(code).language for code in text_locales]

    matches = find_dict_entities(
        text,
        court_config_list,
        default_language=default_language,
        conflict_resolving_func=resolver,
        text_languages=languages_in_text,
        simplified_normalization=simplified_normalization)

    for match in matches:
        annotation = CourtAnnotation(coords=match.coords)

        # First element of the match: the dictionary entry itself.
        entry = match.entity[0]
        if entry:
            annotation.entity_id = entry.id
            annotation.entity_category = entry.category
            annotation.entity_priority = entry.priority
            annotation.name_en = entry.entity_name
            annotation.name = entry.name
            extras = entry.extra_columns
            if extras:
                # Copy each extra column onto the annotation as an attribute.
                for column in extras:
                    setattr(annotation, column, extras[column])

        # Second element of the match: the alias that matched.
        matched_alias = match.entity[1]
        if matched_alias:
            annotation.alias = matched_alias.alias
            annotation.locale = matched_alias.language

        if not annotation.locale:
            annotation.locale = base_locale.language
        yield annotation
示例#12
0
def get_geoentity_annotations(
    locale: str,
    text: str,
    geo_config_list: List[DictionaryEntry],
    conflict_resolving_field: str = 'none',
    priority_direction: str = 'asc',
    text_languages: List[str] = None,
    min_alias_len: Optional[int] = None,
    prepared_alias_ban_list: Optional[Dict[str, Tuple[List[str],
                                                      List[str]]]] = None,
    simplified_normalization: bool = False
) -> Generator[GeoAnnotation, None, None]:
    """
    Yield geo-entity annotations using the routine registered for the
    language of ``locale``.

    Every argument after ``locale`` is forwarded positionally, in signature
    order, to the selected routine.

    :param locale: locale string; only its language part selects the routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Unregistered languages fall back to the default language's routine.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extractor = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extractor(
        text,
        geo_config_list,
        conflict_resolving_field,
        priority_direction,
        text_languages,
        min_alias_len,
        prepared_alias_ban_list,
        simplified_normalization)
示例#13
0
def get_date_annotations(text: str,
                         strict: Optional[bool] = None,
                         locale: Optional[str] = '',
                         base_date: Optional[datetime.datetime] = None,
                         threshold: float = 0.50) \
        -> Generator[DateAnnotation, None, None]:
    """
    Yield the raw date matches that the classifier scores at or above
    ``threshold``.

    :param text: raw text to search
    :param strict: whether to return only complete or strict matches;
        None is treated as False
    :param locale: locale string
    :param base_date: base date to use for implied or partial matches
    :param threshold: probability threshold to use for false positive classifier
    :return: generator of DateAnnotation carrying coords, date and score
    """
    if strict is None:
        strict = False

    candidates = get_raw_date_list(text,
                                   strict=strict,
                                   base_date=base_date,
                                   return_source=True,
                                   locale=Locale(locale))

    for candidate in candidates:
        date_value = candidate[0]
        span = candidate[1]

        # Build the single-row feature frame for this candidate's span.
        features = get_date_features(text,
                                     span[0],
                                     span[1],
                                     characters=DATE_MODEL_CHARS)
        feature_row = DateFeaturesDataframeBuilder.build_feature_df(features)

        # Score the candidate on the model's columns only.
        model_input = feature_row.loc[:, MODEL_DATE.columns]
        probability = MODEL_DATE.predict_proba(model_input)[0, 1]

        if probability >= threshold:
            yield DateAnnotation(coords=span,
                                 date=date_value,
                                 score=probability)
示例#14
0
 def __init__(self,
              characters: List[str],
              text: Optional[str] = None,
              locale: Locale = Locale('en-US'),
              dateparser_settings: Optional[Dict[str, Any]] = None,
              enable_classifier_check: bool = True,
              classifier_model: Optional[Any] = None,
              classifier_threshold: float = 0.5):
     """
     Store the parser configuration and start with an empty date list.

     :param characters: stored as ``self.characters``
     :param text: stored as ``self.text``
     :param locale: locale object with language code and locale code
     :param dateparser_settings: dict - settings for dateparser; a falsy
         value selects ``self.DEFAULT_DATEPARSER_SETTINGS``
     :param enable_classifier_check: bool - enable date check using classifier model
     :param classifier_model: obj - classifier itself
     :param classifier_threshold: float 0<x<1 - min value to predict date
     """
     # Input and search configuration
     self.text = text
     self.locale = locale
     self.characters = characters
     self.dateparser_settings = (dateparser_settings
                                 or self.DEFAULT_DATEPARSER_SETTINGS)

     # Classifier configuration
     self.classifier_model = classifier_model
     self.classifier_threshold = classifier_threshold
     self.enable_classifier_check = enable_classifier_check

     # Results container, empty until dates are collected
     self.dates = []
示例#15
0
                            ("{0} through {1}".format(d.isoformat(),
                                                      d2.isoformat()), [d,
                                                                        d2]))
                        examples.append(
                            ("{0} through {1}".format(d.strftime("%b d, %Y"),
                                                      d2.strftime("%b d, %Y")),
                             [d, d2]))
                    except ValueError:
                        continue

    # Output
    output_path = 'test_date_model.pickle'
    if save:
        output_path = os.path.join(MODULE_PATH, 'date_model.pickle')

    build_date_model(examples,
                     output_path,
                     lambda date_str: get_raw_date_list(
                         date_str, strict=False, return_source=True),
                     characters=DATE_MODEL_CHARS)
    if not save:
        os.unlink("test_date_model.pickle")


# Module-level parser instance: en-US locale, classifier check enabled and
# backed by MODEL_DATE.
parser = DateParser(DATE_MODEL_CHARS,
                    enable_classifier_check=True,
                    locale=Locale('en-US'),
                    classifier_model=MODEL_DATE)
# Private aliases for the bound methods of the instance above.
_get_dates = parser.get_dates
_get_date_list = parser.get_date_list
示例#16
0
        dates = list(dateparser_dates_dict.values())

        for w_date_re, w_date_norm in self.WEIRD_DATES_NORM:
            w_dates = w_date_re.findall(self.text)
            for w_date_str in w_dates:
                date_str = w_date_norm(w_date_str)
                date_res = self.get_dateparser_dates(date_str, strict)
                if date_res:
                    dates.append((w_date_str, date_res[0][1]))

        self.dates = dates


# Module-level parser instance: es-ES locale, classifier check disabled.
# The dateparser settings select day-month-year order, non-strict parsing and
# 'first' as the preferred day of month.
parser = ESDateParser(enable_classifier_check=False,
                      locale=Locale('es-ES'),
                      dateparser_settings={
                          'PREFER_DAY_OF_MONTH': 'first',
                          'STRICT_PARSING': False,
                          'DATE_ORDER': 'DMY'
                      })


def get_date_annotations(
        text: str,
        strict: Optional[bool] = None,
        locale: Optional[str] = '',
        _base_date: Optional[datetime.datetime] = None,
        _threshold: float = 0.50) -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations from the module-level ``parser``.

    :param text: text to search
    :param strict: strictness flag; None is treated as False
    :param locale: locale string, wrapped in ``Locale`` before use
    :param _base_date: accepted for signature compatibility; not used here
    :param _threshold: accepted for signature compatibility; not used here
    """
    if strict is None:
        strict = False
    yield from parser.get_date_annotations(text, Locale(locale), strict)
示例#17
0
def get_court_citation_annotations(locale: str, text: str, language: str = None) -> \
        Generator[CourtCitationAnnotation, None, None]:
    """
    Yield court citation annotations using the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language part selects the routine
    :param text: text to search
    :param language: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    requested_language = Locale(locale).language
    # Unregistered languages fall back to the routine stored under LANG_DE.
    fallback = ROUTINE_BY_LOCALE[LANG_DE.code]
    extractor = ROUTINE_BY_LOCALE.get(requested_language, fallback)
    yield from extractor(text, language)
示例#18
0
import joblib

from lexnlp.extract.all_locales.languages import Locale
from lexnlp.extract.common.annotations.date_annotation import DateAnnotation
from lexnlp.extract.common.dates import DateParser
from lexnlp.extract.de.date_model import DATE_MODEL_CHARS

# Setup path: directory containing this module, used to locate the model file
MODULE_PATH = os.path.dirname(os.path.abspath(__file__))

# Load model: the classifier stored next to this module, read at import time.
# NOTE(review): joblib.load unpickles the file; presumably safe because the
# pickle ships with the package - confirm it is never user-supplied.
MODEL_DATE = joblib.load(os.path.join(MODULE_PATH, "./date_model.pickle"))

# Module-level parser instance: de-DE locale, classifier check enabled and
# backed by MODEL_DATE. The dateparser settings select day-month-year order,
# non-strict parsing and 'first' as the preferred day of month.
parser = DateParser(DATE_MODEL_CHARS,
                    enable_classifier_check=True,
                    locale=Locale('de-DE'),
                    dateparser_settings={
                        'PREFER_DAY_OF_MONTH': 'first',
                        'STRICT_PARSING': False,
                        'DATE_ORDER': 'DMY'
                    },
                    classifier_model=MODEL_DATE)


def get_date_annotations(
        text: str,
        strict: Optional[bool] = None,
        locale: Optional[str] = '',
        _base_date: Optional[datetime] = None,
        _threshold: float = 0.50) -> Generator[DateAnnotation, None, None]:
    strict = strict if strict is not None else False