def test_parse_date_string_find_replace(date_string, expected_parse_arg, expected_captures, expected_date):
    """
    Check that ``DateFinder.parse_date_string`` hands the expected string to
    ``parser.parse`` and returns the expected datetime.

    :param date_string: raw date string passed to ``parse_date_string``
    :param expected_parse_arg: argument ``parser.parse`` must be called with
    :param expected_captures: captures dict passed to ``parse_date_string``
    :param expected_date: value ``parse_date_string`` must return
    :return:
    """
    finder = datefinder.DateFinder()
    log_template = "acutal={} expected={}"
    # wraps= keeps the real parser behaviour while recording the call
    with mock.patch.object(parser, 'parse', wraps=parser.parse) as parse_spy:
        parsed = finder.parse_date_string(date_string, expected_captures)
        parse_spy.assert_called_with(expected_parse_arg)
        logger.debug(log_template.format(parsed, expected_date))
        assert parsed == expected_date
def test_extract_date_strings(date_string, expected_match_date_string):
    """
    Every match produced by ``DateFinder.extract_date_strings`` must equal the
    expected string and carry at least one captured timezone.

    :param date_string: text handed to ``extract_date_strings``
    :param expected_match_date_string: string each match must equal
    :return:
    """
    finder = datefinder.DateFinder()
    matches = finder.extract_date_strings(date_string)
    for matched_text, _indexes, captures in matches:
        message = "actual={} expected={}".format(
            matched_text, expected_match_date_string)
        logger.debug(message)
        assert matched_text == expected_match_date_string
        timezones = captures.get('timezones', [])
        assert len(timezones) > 0
def test_find_and_replace(date_string, expected_replaced_string, captures, expected_tz_string):
    """
    Check the two values returned by ``DateFinder._find_and_replace`` and that
    the call leaves ``DateFinder.REPLACEMENTS`` unchanged.

    :param date_string: string passed to ``_find_and_replace``
    :param expected_replaced_string: expected first return value
    :param captures: captures passed to ``_find_and_replace``
    :param expected_tz_string: expected second return value
    :return:
    """
    finder = datefinder.DateFinder()
    # snapshot taken before the call so a mutation can be detected afterwards
    replacements_before = copy.copy(finder.REPLACEMENTS)

    result = finder._find_and_replace(date_string, captures)
    replaced_string, tz_string = result

    # the call must not have mutated finder.REPLACEMENTS
    assert finder.REPLACEMENTS == replacements_before

    # the returned pair must match the expectations
    assert replaced_string == expected_replaced_string
    assert tz_string == expected_tz_string
def test_extract_date_strings_with_strict_option(date_string, expected_match_date_string):
    """
    With ``strict=True``, every match yielded by ``extract_date_strings`` must
    equal the expected string.

    :param date_string: text handed to ``extract_date_strings``
    :param expected_match_date_string: string each strict match must equal
    :return:
    """
    finder = datefinder.DateFinder()
    strict_matches = finder.extract_date_strings(date_string, strict=True)
    for matched_text, _indexes, _captures in strict_matches:
        message = "actual={} expected={}".format(matched_text, expected_match_date_string)
        logger.debug(message)
        assert matched_text == expected_match_date_string
def identifyDatetime(text, typeName="DATETIME_TYPE"):
    """
    Replace date strings found in ``text`` with a formatted type name.

    Strict matches from ``datefinder.DateFinder.extract_date_strings`` are
    parsed with ``parse_date_string``; each match that parses to a non-None
    value is replaced in ``text`` by ``formatTypeName(typeName)``.

    This is best-effort: if anything raises, the exception is logged at debug
    level and ``text`` is returned with whatever replacements were already
    applied.

    :param text: text to scan
    :param typeName: name passed to ``formatTypeName`` to build the replacement
    :return: ``text`` with the parseable date strings replaced
    """
    import logging

    try:
        finder = datefinder.DateFinder()
        # The matches are produced from the original string; rebinding
        # ``text`` below does not affect what is being scanned.
        for date_string, _indices, captures in finder.extract_date_strings(text, strict=True):
            if finder.parse_date_string(date_string, captures) is not None:
                text = text.replace(date_string, formatTypeName(typeName))
    except Exception:
        # Keep the best-effort contract, but leave a trace instead of
        # silently discarding the failure.
        logging.getLogger(__name__).debug(
            "identifyDatetime: date detection failed", exc_info=True)
    return text
def extract_datereference(self, file_obj, date):
    """
    Collect the dates found in ``file_obj`` into ``self.date_referred_list``.

    ``self.date_referred_list`` is reset to an empty list, then each line of
    ``file_obj`` is scanned with a ``DateFinder`` whose base date is
    ``self.str_to_dtfmt(date)``. Every found date is passed to
    ``self.valid_date``; when the returned flag is true the returned value is
    appended to the list.

    :param file_obj: iterable of text lines
    :param date: value converted to the base date by ``self.str_to_dtfmt``
    :return: None; results are stored on ``self.date_referred_list``
    """
    self.date_referred_list = []
    base_date = self.str_to_dtfmt(date)
    date_finder = DF.DateFinder(base_date=base_date)
    for line in file_obj:
        # iter()/next() instead of the Python 2-only ``.next()`` method, so
        # the loop works on both Python 2 and Python 3.
        matches = iter(date_finder.find_dates(line))
        while True:
            try:
                candidate = next(matches)
                one, flag = self.valid_date(candidate)
                if flag:
                    self.date_referred_list.append(one)
            except ValueError:
                # a ValueError from the finder or from valid_date skips this
                # candidate and moves on to the next one
                continue
            except StopIteration:
                break
def test_parse_date_string_find_replace_nonexistent_tzinfo(date_string, expected_parse_arg, expected_captures, expected_date):
    '''
    Simulate ``dateutil.tz.gettz`` failing to resolve a tzinfo string.

    ``tz.gettz`` is patched to return None, because some operating systems
    might otherwise resolve 'CST' and 'IRST'.

    :param date_string: string passed to ``parse_date_string``
    :param expected_parse_arg: not used by this test body
    :param expected_captures: captures passed to ``parse_date_string``
    :param expected_date: value ``parse_date_string`` must return
    :return:
    '''
    finder = datefinder.DateFinder()
    first_timezone = expected_captures['timezones'][0]
    with mock.patch.object(tz, 'gettz', wraps=tz.gettz) as gettz_mock:
        # force the "timezone not found" outcome
        gettz_mock.return_value = None
        parsed = finder.parse_date_string(date_string, expected_captures)
        gettz_mock.assert_called_with(first_timezone)
        logger.debug("acutal={} expected={}".format(parsed, expected_date))
        assert parsed == expected_date
def test_parse_date_string_find_replace_nonexistent_tzinfo(
        date_string, expected_parse_arg, expected_captures, expected_date):
    '''
    Parsing a date whose tzinfo string dateutil cannot resolve must emit
    ``parser.UnknownTimezoneWarning`` and still return the expected datetime.

    Some operating systems might resolve 'CST' and 'IRST', hence the
    dedicated test.

    :param date_string: string passed to ``parse_date_string``
    :param expected_parse_arg: not used by this test body
    :param expected_captures: captures passed to ``parse_date_string``
    :param expected_date: value ``parse_date_string`` must return
    :return:
    '''
    expected_warning = parser.UnknownTimezoneWarning
    with pytest.warns(expected_warning):
        # 'always' is set so the warning is not filtered out
        warnings.simplefilter('always')
        finder = datefinder.DateFinder()
        parsed = finder.parse_date_string(date_string, expected_captures)
        message = "actual={} expected={}".format(parsed, expected_date)
        logger.debug(message)
        assert parsed == expected_date
def test_tz_gettz_for_all_patterns():
    """
    Determine which timezone strings from the DateFinder patterns
    dateutil.tz.gettz will not handle.

    The timezone alternatives are taken from ``NA_TIMEZONES_PATTERN`` and
    ``TIMEZONES_PATTERN``, mapped through ``TIMEZONE_REPLACEMENTS`` where an
    entry exists, and passed to ``tz.gettz``. Strings for which it returns
    None are logged as bad, the others as good. The test makes no assertions.

    :warning: currently tz.gettz only matches 14 of regex timezones of our ~400
    [ GOOD MATCHES ]: ['PST', 'EST', 'MST', 'CET', 'EET', 'EST', 'GMT', 'HST',
    'MET', 'MST', 'PDT', 'PST', 'UTC', 'WET']
    """
    bad_tz_strings = []
    good_tz_strings = []
    finder = datefinder.DateFinder()
    # Raw strings: a backslash followed by "s" is not a valid escape in a
    # normal literal. The split separator and the replaced text are unchanged.
    test_tz_strings = (
        finder.NA_TIMEZONES_PATTERN.split('|')
        + finder.TIMEZONES_PATTERN.split(r'|\s')
    )
    for tz_string in test_tz_strings:
        if tz_string in finder.TIMEZONE_REPLACEMENTS:
            tz_string = finder.TIMEZONE_REPLACEMENTS[tz_string]
        # drop the literal regex whitespace token before the lookup
        tz_object = tz.gettz(tz_string.replace(r'\s', ''))
        if tz_object is None:
            bad_tz_strings.append(tz_string)
        else:
            good_tz_strings.append(tz_string)
    logger.debug("[ BAD TZINFO ]: {}".format(bad_tz_strings))
    logger.debug("[ GOOD TZINFO ]: {}".format(good_tz_strings))
def get_raw_dates(text, strict=False, base_date=None, return_source=False) -> Generator:
    """
    Find "raw" or potential date matches prior to false positive classification.

    Candidates come from ``datefinder.DateFinder.extract_date_strings``. Each
    candidate goes through a series of heuristic filters, is stripped of its
    extra tokens and is then parsed; when parsing fails, tokens are trimmed
    from either end of the string and parsing is retried.

    :param text: raw text to search
    :param strict: passed through to ``extract_date_strings``
    :param base_date: base date given to ``DateFinder``; when falsy, January 1st
        (midnight) of the current year is used
    :param return_source: when true, yield ``(date, index)`` tuples where
        ``index`` is the second element of the ``extract_date_strings`` match;
        otherwise yield only the date
    :return: generator of parsed dates; a ``datetime.datetime`` whose hour and
        minute are both 0 is yielded as a ``datetime.date``
    """
    # Setup base date
    if not base_date:
        base_date = datetime.datetime.now().replace(
            day=1, month=1, hour=0, minute=0, second=0, microsecond=0)

    # Find potential dates
    date_finder = datefinder.DateFinder(base_date=base_date)

    # Map every extra token except 't' to a single space in REPLACEMENTS
    for extra_token in date_finder.EXTRA_TOKENS_PATTERN.split('|'):
        if extra_token != 't':
            date_finder.REPLACEMENTS[extra_token] = ' '

    # Materialise the matches: a candidate may need to look at the previous one
    possible_dates = list(date_finder.extract_date_strings(text, strict=strict))

    # possible_matched[k] records whether candidate k was accepted; exactly
    # one entry is appended per loop iteration so indexes stay aligned.
    possible_matched = []
    for i, (date_string, index, date_props) in enumerate(possible_dates):
        # Cleanup "day of" strings: when the previous candidate was rejected
        # and holds exactly one digits modifier, move that modifier (minus
        # its ordinal suffix) in front of this candidate. The i > 0 test
        # comes first so index -1 is never read for the first candidate.
        extra_tokens = date_props["extra_tokens"]
        if i > 0 and ("of" in extra_tokens or "OF" in extra_tokens):
            prev_modifiers = possible_dates[i - 1][2]["digits_modifier"]
            if not possible_matched[i - 1] and len(prev_modifiers) == 1:
                date_props["digits_modifier"].extend(prev_modifiers)
                ordinal = prev_modifiers.pop()
                for suffix in ("st", "nd", "rd", "th"):
                    ordinal = ordinal.replace(suffix, "")
                date_string = ordinal + date_string

        # Component counts used by the filters below
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_hyphen = date_props["delimiters"].count("-")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        if any(len(digit) == 3 or digit.startswith("00")
               for digit in date_props["digits"]):
            possible_matched.append(False)
            continue

        # Skip " may " alone
        if num_dig == 0 and num_days == 0 \
                and "".join(date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Cleanup: remove extra tokens, longest first, keeping "to" and "t"
        for token in sorted(date_props["extra_tokens"], key=len, reverse=True):
            if token.lower() in ["to", "t"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only: no month and no delimiter other than
        # comma / space / newline / tab
        match_delims = set("".join(date_props["delimiters"]))
        bad_delims = {",", " ", "\n", "\t"}
        len_diff_set = len(match_delims - bad_delims)
        if len_diff_set == 0 and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones. For cutter = 0 the string is parsed as is;
        # for cutter > 0, `cutter` tokens are dropped from the end
        # (direction 0) or from the start (direction 1) before retrying.
        date = None
        try:
            date_string_tokens = date_string.split()
            for cutter in range(len(date_string_tokens)):
                for direction in (0, 1):
                    if cutter > 0:
                        if direction:
                            _date_string_tokens = date_string_tokens[cutter:]
                        else:
                            _date_string_tokens = date_string_tokens[:-cutter]
                        date_string = ' '.join(_date_string_tokens)
                    try:
                        date = date_finder.parse_date_string(
                            date_string, date_props)
                    except Exception:
                        # Any parser failure just means "no date for this
                        # variant". Exception (not a bare except) lets
                        # KeyboardInterrupt / SystemExit propagate.
                        date = None
                    if date:
                        break
                else:
                    continue  # executed if the loop ended normally (no break)
                break  # executed if 'continue' was skipped (break)
        except TypeError:
            possible_matched.append(False)
            continue

        if not date:
            possible_matched.append(False)
            continue

        # for case when datetime.datetime(2001, 1, 22, 20, 1, tzinfo=tzoffset(None, -104400)):
        # a value whose isoformat() raises ValueError is rejected
        if hasattr(date, 'tzinfo'):
            try:
                _ = date.isoformat()
            except ValueError:
                possible_matched.append(False)
                continue
        possible_matched.append(True)

        # Collapse a datetime with hour == 0 and minute == 0 to a plain date
        if isinstance(date, datetime.datetime) \
                and date.hour == 0 and date.minute == 0:
            date = date.date()

        # Append
        if return_source:
            yield (date, index)
        else:
            yield date
def test_add_tzinfo(naive_datetime_obj, timezone_string):
    """
    ``DateFinder._add_tzinfo`` must return the naive datetime with its tzinfo
    replaced by ``tz.gettz(timezone_string)``.

    :param naive_datetime_obj: datetime passed to ``_add_tzinfo``
    :param timezone_string: timezone name passed to ``_add_tzinfo``
    :return:
    """
    tzinfo = tz.gettz(timezone_string)
    expected = naive_datetime_obj.replace(tzinfo=tzinfo)
    date_finder = datefinder.DateFinder()
    actual = date_finder._add_tzinfo(naive_datetime_obj, timezone_string)
    assert actual == expected
def get_raw_dates(text, strict=False, base_date=None, return_source=False) -> Generator:
    """
    Find "raw" or potential date matches prior to false positive classification.

    Candidates come from ``datefinder.DateFinder.extract_date_strings``. Each
    candidate goes through a series of heuristic filters, is stripped of its
    extra tokens, and is then parsed with ``DateFinder.parse_date_string``.

    :param text: raw text to search
    :param strict: passed through to ``extract_date_strings``
    :param base_date: base date given to ``DateFinder``; when falsy, January 1st
        of the current year (a ``datetime.date``) is used
    :param return_source: when true, yield ``(date, index)`` tuples where
        ``index`` is the second element of the ``extract_date_strings`` match;
        otherwise yield only the date
    :return: generator of the truthy values returned by ``parse_date_string``
    """
    # Setup base date
    if not base_date:
        base_date = datetime.date(datetime.date.today().year, 1, 1)

    # Find potential dates
    date_finder = datefinder.DateFinder(base_date=base_date)

    # Materialise the matches: a candidate may need to look at the previous one
    possible_dates = [(date_string, index, date_props) for date_string, index, date_props in
                      date_finder.extract_date_strings(text, strict=strict)]

    # possible_matched[k] records whether candidate k was accepted; every path
    # through the loop body appends exactly one entry, so indexes stay aligned.
    possible_matched = []
    for i, possible_date in enumerate(possible_dates):
        # Unpack the match
        date_string = possible_date[0]
        index = possible_date[1]
        date_props = possible_date[2]

        # Cleanup "day of" strings: when the previous candidate was rejected
        # and holds exactly one digits modifier, move that modifier (minus
        # its ordinal suffix) in front of this candidate.
        if "of" in date_props["extra_tokens"] or "OF" in date_props[
                "extra_tokens"]:
            # NOTE(review): evaluated before the i > 0 guard, so for i == 0
            # this reads the last candidate; the value is only used when i > 0.
            num_dig_mod = len(possible_dates[i - 1][2]["digits_modifier"])
            if i > 0 and not possible_matched[i - 1] and num_dig_mod == 1:
                date_props["digits_modifier"].extend(
                    possible_dates[i - 1][2]["digits_modifier"])
                date_string = possible_dates[i - 1][2]["digits_modifier"].pop().replace("st", "").replace("nd", "") \
                    .replace("rd", "").replace("th", "") + date_string

        # Component counts used by the filters below
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_hyphen = date_props["delimiters"].count("-")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings: digit modifiers without any digits
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        found_triple = False
        found_dz = False
        for digit in date_props["digits"]:
            if len(digit) == 3:
                found_triple = True
            if digit.startswith("00"):
                found_dz = True
        if found_triple or found_dz:
            possible_matched.append(False)
            continue

        # Skip " may " alone
        if num_dig == 0 and num_days == 0 and "".join(
                date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Cleanup: remove every extra token except "to" from the string, then
        # clear the token list before the props are handed to the parser
        for token in date_props["extra_tokens"]:
            if token.lower() in ["to"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only: no month and no delimiter other than
        # comma / space / newline / tab
        match_delims = set("".join(date_props["delimiters"]))
        bad_delims = {",", " ", "\n", "\t"}
        len_diff_set = len(match_delims - bad_delims)
        if len_diff_set == 0 and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones; a TypeError from the parser rejects the candidate
        try:
            date = date_finder.parse_date_string(date_string, date_props)
        except TypeError:
            possible_matched.append(False)
            continue

        if not date:
            possible_matched.append(False)
            continue
        else:
            possible_matched.append(True)

        # Yield the result, with the match index when requested
        if return_source:
            yield (date, index)
        else:
            yield date
from pymarc import MARCReader import datefinder import re import hashlib dt = datefinder.DateFinder() def dateparser(s): if s is not None: m1 = re.search(r'c?\s*(\d{4})', s) m2 = re.search(r'\[(\d{4})\]', s) if m1: return m1.group(1) elif m2: return m2.group(1) else: return None cache = {} def authorparser(s): if s is not None: m = re.match( r'^(?P<last>\w+),\s*(?P<first>\w+)(,\s*(?P<born>\d{4})-(?P<died>\d{4}))?', s) if m: a = '' b = '' if m.group('born'):