def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to
    return a single ISO 8601 date (YYYY-MM-DD) from the input string.

    dateparser.to_iso8601() is tried first; its result is returned as-is
    unless it is None or out_of_range(d) reports the input as out of range.
    In those two cases we fall back to dateutil and then to timelib. Any
    date other than the first occurring in the string is ignored.

    We use timelib for its ability to make at least some sense of invalid
    dates, e.g. 2012/02/31 is rolled forward into March.

    We rely only on dateutil.parser for picking out dates from nearly
    arbitrary strings (fuzzy=True), but at the cost of being forgiving of
    invalid dates in those kinds of strings.

    Returns None if it fails.
    """
    isodate = dateparser.to_iso8601(d)
    if isodate is not None and not out_of_range(d):
        return isodate

    # Keep the fallback result separate from the ISO string above: the
    # string has no isoformat() method, so reusing one variable for both
    # crashed whenever every fallback failed on an out-of-range input.
    parsed = None
    try:
        parsed = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
        # A result carrying the default's year is treated as "no date
        # found" (presumably the year was filled in from DEFAULT_DATETIME).
        if parsed.year == DEFAULT_DATETIME.year:
            parsed = None
    except Exception:
        try:
            parsed = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
        except ValueError:
            # timelib could not parse it either; give up quietly.
            pass
        except Exception as e:
            logger.error(e)

    if not parsed:
        return None

    # Keep only the date part of "YYYY-MM-DDTHH:MM:SS".
    stamp = parsed.isoformat()
    return stamp[:stamp.index('T')]
def smart_parse_date(date):
    '''
    Accepts a string or unicode date to be parsed and returns the result of
    iso8601.parse_date() (a datetime.datetime), or None when the date cannot
    be parsed.

    The list below is deliberately restrictive; some date formats not listed
    here may also work:

      * W3C dates, documented here: http://www.w3.org/TR/NOTE-datetime
      * A subset of undelimited ISO-8601 dates (as prevalent in LC MODS):
            YYYYDDMM
            YYYYDDMMhhmmss
        NOTE(review): these are reproduced as originally written; ISO 8601
        basic format is YYYYMMDD -- confirm which order is really accepted.

    In general the dates have to be internationally unambiguous and
    Y2K-safe. One exception is support for US convention, Y2K-safe year
    dates:
            MM/DD/YYYY
    '''
    date = date.strip()
    # FIXME: Yes, layers on layers. Streamline it.
    try:
        return iso8601.parse_date(dateparser.to_iso8601(date))
    except Exception:
        # Deliberate best effort: any parsing failure means "no date".
        # KeyboardInterrupt and SystemExit do not derive from Exception, so
        # they still propagate without an explicit re-raise clause.
        return None
def test_time_strings():
    """
    Check to_iso8601() against every entry in TEST_DATES.

    TEST_DATES is read line by line. Blank lines and lines starting with
    "#" are skipped. A line starting with "ISO:" introduces a new reference
    date, which must itself round-trip through to_iso8601() unchanged.
    Every other line is a date spelling that must convert to the most
    recent reference date.
    """
    reference_date = None
    for line in TEST_DATES.splitlines():
        if not line or line.startswith("#"):
            # Ignore blank lines and comments
            continue

        if line.startswith("ISO:"):
            # This is a new date reference.
            # Check its value and save it for future comparisons.
            _, reference_date = line.split()

            # Test for round-trip parsing
            iso_date = to_iso8601(reference_date)
            # could not be parsed
            assert iso_date is not None, reference_date
            # did not match
            assert iso_date == reference_date, (iso_date, reference_date)

            # Single-argument print() emits the same text on Python 2 and 3.
            print("New date: %s" % reference_date)
            continue

        print("Test %s" % line)
        date = to_iso8601(line)
        assert date == reference_date, (line, date, reference_date)
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to
    return a single ISO 8601 date (YYYY-MM-DD) from the input string.

    Strings matching edtf_date_and_time are handed straight to dateutil.
    Otherwise dateparser.to_iso8601() is tried; if it returns None, or
    out_of_range(d) reports the input as out of range, we fall back to
    dateutil (fuzzy) and then to timelib. Any date other than the first
    occurring in the string is ignored.

    We use timelib for its ability to make at least some sense of invalid
    dates, e.g. 2012/02/31 is rolled forward into March.

    We rely only on dateutil.parser for picking out dates from nearly
    arbitrary strings (fuzzy=True), but at the cost of being forgiving of
    invalid dates in those kinds of strings.

    Returns the dateparser.to_iso8601() result (possibly None) when the
    fallbacks produce nothing.
    """
    def _format_date(dt):
        # Format by hand, since datetime.datetime.strftime() only works
        # with years >= 1900.
        return "%d-%02d-%02d" % (dt.year, dt.month, dt.day)

    # Bound up front so that "if dateinfo" below can never hit an unbound
    # local when every fallback parser fails.
    dateinfo = None

    # Check for EDTF timestamp first, because it is simple.
    if edtf_date_and_time.match(d):
        try:
            return _format_date(dateutil_parse(d))
        except TypeError:
            # not parseable by dateutil_parse()
            pass

    isodate = dateparser.to_iso8601(d)
    if isodate is None or out_of_range(d):
        try:
            dateinfo = dateutil_parse(d, fuzzy=True,
                                      default=DEFAULT_DATETIME)
            # A result carrying the default's year is treated as "no date
            # found" (presumably the year came from DEFAULT_DATETIME).
            if dateinfo.year == DEFAULT_DATETIME.year:
                dateinfo = None
        except Exception:
            try:
                dateinfo = timelib.strtodatetime(d,
                                                 now=DEFAULT_DATETIME_SECS)
            except ValueError:
                dateinfo = None
            except Exception as e:
                dateinfo = None
                logger.error("Exception %s in %s" % (e, __name__))

        if dateinfo:
            return _format_date(dateinfo)

    return isodate
def parse_date_or_range(d):
    """
    Split a date string into a (begin, end) pair.

    A string that denotes a single date yields the same value for both
    ends. Ranges are recognised when splitting on "-" (preferred) or "/"
    gives an even number of pieces; open-ended input such as "1970-" or
    "-1970" yields None for the missing end.

    Any string containing an era marker (BC, AD, AH, with or without dots,
    matched case-insensitively anywhere in the string) is not handled and
    yields (None, None).

    Returns a tuple (begin, end). Either element may be None; the elements
    are either substrings taken from the input or whatever
    robust_date_parser() returns.
    """
    # TODO: Handle dates with BC, AD, AH
    #       Handle ranges like 1920s - 1930s
    #       Handle ranges like 11th - 12th century
    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        return None, None

    match = year_range.match(d)
    if match:
        a, b = sorted((match.group("year1"), match.group("year2")))
        return a, b

    # An even number of pieces means the delimiter can separate two halves.
    hyphen_parts = d.split("-")
    slash_parts = d.split("/")
    if len(hyphen_parts) % 2 == 0:
        delim, parts = "-", hyphen_parts
    elif len(slash_parts) % 2 == 0:
        delim, parts = "/", slash_parts
    else:
        delim, parts = None, None

    if delim is None:
        # Not a delimited range.
        match = decade_date_s.match(d)
        if match:
            year = match.group("year")
            return year, year[:3] + "9"

        match = between_date.match(d)
        if match:
            year1 = int(match.group("year1"))
            year2 = int(match.group("year2"))
            return str(min(year1, year2)), str(max(year1, year2))

        parsed = robust_date_parser(d)
        return parsed, parsed

    # Handle ranges
    match = day_range.match(d)
    if match:
        # ie 1970-08-01/02
        year, month = match.group("year"), match.group("month")
        return ("%s-%s-%s" % (year, month, match.group("day_begin")),
                "%s-%s-%s" % (year, month, match.group("day_end")))

    match = decade_date.match(d)
    if match:
        return match.group("year") + "0", match.group("year") + "9"

    if len(parts) == 2 and any(0 < len(s) < 4 for s in parts):
        # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
        match = circa_range.match(d)
        if match:
            century = match.group("century")
            year_begin = century + match.group("year_begin")
            year_end = century + match.group("year_end")
            if int(year_begin) < int(year_end):
                # ie 1970-90
                return (robust_date_parser(year_begin),
                        robust_date_parser(year_end))

            # ie 1970-9
            y, m = parts
            # If the second number is a month, format it to two digits
            # and use "-" as the delim for consistency in the
            # dateparser.to_iso8601 result
            if 1 <= int(m) <= 12:
                d = "%s-%02d" % (y, int(m))
            else:
                # ie 1970-13
                # Just use the year
                d = y
            parsed = robust_date_parser(d)
            return parsed, parsed

        match = month_year.match(d)
        if match:
            d = "%s-%02d" % (match.group("year"), int(match.group("month")))
            parsed = robust_date_parser(d)
            return parsed, parsed

        return None, None

    if "" in parts:
        # ie 1970- or -1970
        if parts[0]:
            return parts[0], None
        return None, parts[1]

    # ie 1970-01-01-1971-01-01, 1970Fall/August, 1970April/May, or
    # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
    # Floor division keeps the slice index an int even under true division.
    middle = len(parts) // 2
    begin = delim.join(parts[:middle])
    end = delim.join(parts[middle:])

    # Check if month in begin or end
    m1 = re.sub(r"[-\d/]", "", begin)
    m2 = re.sub(r"[-\d/]", "", end)
    if m1 or m2:
        # ie 2004July/August, 2004Fall/Winter, or wordy date
        begin, end = None, None

        # Extract year: the first piece that is four characters long once
        # its letters are removed.
        for piece in parts:
            y = re.sub(r"(?i)[a-z]", "", piece)
            if len(y) == 4:
                begin = y + m1.capitalize()
                end = y + m2.capitalize()
                if not dateparser.to_iso8601(begin) or \
                        not dateparser.to_iso8601(end):
                    # Fall back to the bare year for both ends.
                    begin, end = y, y
                break

    if begin:
        return robust_date_parser(begin), robust_date_parser(end)
    return None, None
def parse_date_or_range(d):
    """
    Split a date string into a (begin, end) pair.

    A string that denotes a single date yields the same value for both
    ends. Ranges are recognised when splitting on "-", "/" or ".." (tried
    in that order) gives an even number of pieces, unless the string
    matches edtf_date_and_time; such timestamps contain hyphens and are
    parsed as single dates instead. Ranges of timestamps are not handled.

    Open-ended input such as "1970-" or "-1970" yields None for the missing
    end; a four-character year containing "u" (ie 19uu) is not accepted
    there.

    Returns a tuple (begin, end). Either element may be None; the elements
    are either substrings taken from the input or whatever
    robust_date_parser() returns.
    """
    # TODO: Handle dates with BC, AD, AH
    #       Handle ranges like 1920s - 1930s
    #       Handle ranges like 11th - 12th century

    # NOTE(review): this era check has no effect in this revision -- the
    # branch only passes and the code below runs regardless, so BC/AD/AH
    # strings are parsed like any other. It is kept as-is to preserve
    # behaviour; confirm whether such dates were meant to be skipped.
    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        pass

    is_edtf_timestamp = edtf_date_and_time.match(d)
    hyphen_split = d.split("-")
    slash_split = d.split("/")
    ellipse_split = d.split("..")

    match = year_range.match(d)
    if match:
        a, b = sorted((match.group("year1"), match.group("year2")))
        return a, b

    # An even number of pieces means the delimiter can separate two halves.
    if len(hyphen_split) % 2 == 0:
        delim, split_result = "-", hyphen_split
    elif len(slash_split) % 2 == 0:
        delim, split_result = "/", slash_split
    elif len(ellipse_split) % 2 == 0:
        delim, split_result = "..", ellipse_split
    else:
        delim, split_result = None, None

    # We pass over EDTF timestamps because they contain hyphens and we
    # can handle them below. Note that we don't deal with ranges of
    # timestamps.
    if delim is None or is_edtf_timestamp:
        match = decade_date_s.match(d)
        if match:
            year = match.group("year")
            return year, year[:3] + "9"

        match = between_date.match(d)
        if match:
            year1 = int(match.group("year1"))
            year2 = int(match.group("year2"))
            return str(min(year1, year2)), str(max(year1, year2))

        # This picks up a variety of things, in addition to timestamps.
        parsed = robust_date_parser(d)
        return parsed, parsed

    # Handle ranges
    match = day_range.match(d)
    if match:
        # ie 1970-08-01/02
        year, month = match.group("year"), match.group("month")
        return ("%s-%s-%s" % (year, month, match.group("day_begin")),
                "%s-%s-%s" % (year, month, match.group("day_end")))

    match = decade_date.match(d)
    if match:
        return match.group("year") + "0", match.group("year") + "9"

    if len(split_result) == 2 and any(0 < len(s) < 4 for s in split_result):
        # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
        match = circa_range.match(d)
        if match:
            century = match.group("century")
            year_begin = century + match.group("year_begin")
            year_end = century + match.group("year_end")
            if int(year_begin) < int(year_end):
                # ie 1970-90
                return (robust_date_parser(year_begin),
                        robust_date_parser(year_end))

            # ie 1970-9
            y, m = split_result
            # If the second number is a month, format it to two digits
            # and use "-" as the delim for consistency in the
            # dateparser.to_iso8601 result
            if 1 <= int(m) <= 12:
                d = "%s-%02d" % (y, int(m))
            else:
                # ie 1970-13
                # Just use the year
                d = y
            parsed = robust_date_parser(d)
            return parsed, parsed

        match = month_year.match(d)
        if match:
            d = "%s-%02d" % (match.group("year"), int(match.group("month")))
            parsed = robust_date_parser(d)
            return parsed, parsed

        return None, None

    if "" in split_result:
        # ie 1970- or -1970 (but not 19uu- nor -19uu)
        first, second = split_result[0], split_result[1]
        if len(first) == 4 and "u" not in first:
            return first, None
        if len(second) == 4 and "u" not in second:
            return None, second
        return None, None

    # ie 1970-01-01-1971-01-01, 1970 Fall/August, 1970 April/May, or
    # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
    pieces = d.replace(" ", "").split(delim)
    # Floor division keeps the slice index an int even under true division.
    middle = len(pieces) // 2
    begin = delim.join(pieces[:middle])
    end = delim.join(pieces[middle:])

    # Check if month in begin or end
    m1 = re.sub(r"[-\d/]", "", begin)
    m2 = re.sub(r"[-\d/]", "", end)
    if m1 or m2:
        # ie 2004July/August, 2004Fall/Winter, or wordy date
        begin, end = None, None

        # Extract year: the first piece that is four characters long once
        # its letters are removed.
        for piece in pieces:
            y = re.sub(r"(?i)[a-z]", "", piece)
            if len(y) == 4:
                begin = y + m1.capitalize()
                end = y + m2.capitalize()
                if not dateparser.to_iso8601(begin) or \
                        not dateparser.to_iso8601(end):
                    # Fall back to the bare year for both ends.
                    begin, end = y, y
                break

    if begin:
        return robust_date_parser(begin), robust_date_parser(end)
    return None, None
def parse_date_or_range(d):
    """
    Split a date string into a (begin, end) pair.

    A string that denotes a single date yields the same value for both
    ends. Ranges are recognised when splitting on "-" (preferred) or "/"
    gives an even number of pieces; open-ended input such as "1970-" or
    "-1970" yields None for the missing end.

    Any string containing an era marker (BC, AD, AH, with or without dots,
    matched case-insensitively anywhere in the string) is not handled and
    yields (None, None).

    Returns a tuple (begin, end). Either element may be None; the elements
    are either substrings taken from the input or whatever
    robust_date_parser() returns.
    """
    # TODO: Handle dates with BC, AD, AH
    #       Handle ranges like 1920s - 1930s
    #       Handle ranges like 11th - 12th century
    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        return None, None

    match = year_range.match(d)
    if match:
        a, b = sorted((match.group("year1"), match.group("year2")))
        return a, b

    # An even number of pieces means the delimiter can separate two halves.
    hyphen_parts = d.split("-")
    slash_parts = d.split("/")
    if len(hyphen_parts) % 2 == 0:
        delim, parts = "-", hyphen_parts
    elif len(slash_parts) % 2 == 0:
        delim, parts = "/", slash_parts
    else:
        delim, parts = None, None

    if delim is None:
        # Not a delimited range.
        match = decade_date_s.match(d)
        if match:
            year = match.group("year")
            return year, year[:3] + "9"

        match = between_date.match(d)
        if match:
            year1 = int(match.group("year1"))
            year2 = int(match.group("year2"))
            return str(min(year1, year2)), str(max(year1, year2))

        parsed = robust_date_parser(d)
        return parsed, parsed

    # Handle ranges
    match = day_range.match(d)
    if match:
        # ie 1970-08-01/02
        year, month = match.group("year"), match.group("month")
        return ("%s-%s-%s" % (year, month, match.group("day_begin")),
                "%s-%s-%s" % (year, month, match.group("day_end")))

    match = decade_date.match(d)
    if match:
        return match.group("year") + "0", match.group("year") + "9"

    if len(parts) == 2 and any(0 < len(s) < 4 for s in parts):
        # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
        match = circa_range.match(d)
        if match:
            century = match.group("century")
            year_begin = century + match.group("year_begin")
            year_end = century + match.group("year_end")
            if int(year_begin) < int(year_end):
                # ie 1970-90
                return (robust_date_parser(year_begin),
                        robust_date_parser(year_end))

            # ie 1970-9
            y, m = parts
            # If the second number is a month, format it to two digits
            # and use "-" as the delim for consistency in the
            # dateparser.to_iso8601 result
            if 1 <= int(m) <= 12:
                d = "%s-%02d" % (y, int(m))
            else:
                # ie 1970-13
                # Just use the year
                d = y
            parsed = robust_date_parser(d)
            return parsed, parsed

        match = month_year.match(d)
        if match:
            d = "%s-%02d" % (match.group("year"), int(match.group("month")))
            parsed = robust_date_parser(d)
            return parsed, parsed

        return None, None

    if "" in parts:
        # ie 1970- or -1970
        if parts[0]:
            return parts[0], None
        return None, parts[1]

    # ie 1970-01-01-1971-01-01, 1970Fall/August, 1970April/May, or
    # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
    # Floor division keeps the slice index an int even under true division.
    middle = len(parts) // 2
    begin = delim.join(parts[:middle])
    end = delim.join(parts[middle:])

    # Check if month in begin or end
    m1 = re.sub(r"[-\d/]", "", begin)
    m2 = re.sub(r"[-\d/]", "", end)
    if m1 or m2:
        # ie 2004July/August, 2004Fall/Winter, or wordy date
        begin, end = None, None

        # Extract year: the first piece that is four characters long once
        # its letters are removed.
        for piece in parts:
            y = re.sub(r"(?i)[a-z]", "", piece)
            if len(y) == 4:
                begin = y + m1.capitalize()
                end = y + m2.capitalize()
                if not dateparser.to_iso8601(begin) or \
                        not dateparser.to_iso8601(end):
                    # Fall back to the bare year for both ends.
                    begin, end = y, y
                break

    if begin:
        return robust_date_parser(begin), robust_date_parser(end)
    return None, None
def parse_date_or_range(d):
    """
    Split a date string into a (begin, end) pair.

    A string that denotes a single date yields the same value for both
    ends. Ranges are recognised when splitting on "-", "/" or ".." (tried
    in that order) gives an even number of pieces, unless the string
    matches edtf_date_and_time; such timestamps contain hyphens and are
    parsed as single dates instead. Ranges of timestamps are not handled.

    Open-ended input such as "1970-" or "-1970" yields None for the missing
    end; a four-character year containing "u" (ie 19uu) is not accepted
    there.

    Returns a tuple (begin, end). Either element may be None; the elements
    are either substrings taken from the input or whatever
    robust_date_parser() returns.
    """
    # TODO: Handle dates with BC, AD, AH
    #       Handle ranges like 1920s - 1930s
    #       Handle ranges like 11th - 12th century

    # NOTE(review): this era check has no effect in this revision -- the
    # branch only passes and the code below runs regardless, so BC/AD/AH
    # strings are parsed like any other. It is kept as-is to preserve
    # behaviour; confirm whether such dates were meant to be skipped.
    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        pass

    is_edtf_timestamp = edtf_date_and_time.match(d)
    hyphen_split = d.split("-")
    slash_split = d.split("/")
    ellipse_split = d.split("..")

    match = year_range.match(d)
    if match:
        a, b = sorted((match.group("year1"), match.group("year2")))
        return a, b

    # An even number of pieces means the delimiter can separate two halves.
    if len(hyphen_split) % 2 == 0:
        delim, split_result = "-", hyphen_split
    elif len(slash_split) % 2 == 0:
        delim, split_result = "/", slash_split
    elif len(ellipse_split) % 2 == 0:
        delim, split_result = "..", ellipse_split
    else:
        delim, split_result = None, None

    # We pass over EDTF timestamps because they contain hyphens and we
    # can handle them below. Note that we don't deal with ranges of
    # timestamps.
    if delim is None or is_edtf_timestamp:
        match = decade_date_s.match(d)
        if match:
            year = match.group("year")
            return year, year[:3] + "9"

        match = between_date.match(d)
        if match:
            year1 = int(match.group("year1"))
            year2 = int(match.group("year2"))
            return str(min(year1, year2)), str(max(year1, year2))

        # This picks up a variety of things, in addition to timestamps.
        parsed = robust_date_parser(d)
        return parsed, parsed

    # Handle ranges
    match = day_range.match(d)
    if match:
        # ie 1970-08-01/02
        year, month = match.group("year"), match.group("month")
        return ("%s-%s-%s" % (year, month, match.group("day_begin")),
                "%s-%s-%s" % (year, month, match.group("day_end")))

    match = decade_date.match(d)
    if match:
        return match.group("year") + "0", match.group("year") + "9"

    if len(split_result) == 2 and any(0 < len(s) < 4 for s in split_result):
        # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
        match = circa_range.match(d)
        if match:
            century = match.group("century")
            year_begin = century + match.group("year_begin")
            year_end = century + match.group("year_end")
            if int(year_begin) < int(year_end):
                # ie 1970-90
                return (robust_date_parser(year_begin),
                        robust_date_parser(year_end))

            # ie 1970-9
            y, m = split_result
            # If the second number is a month, format it to two digits
            # and use "-" as the delim for consistency in the
            # dateparser.to_iso8601 result
            if 1 <= int(m) <= 12:
                d = "%s-%02d" % (y, int(m))
            else:
                # ie 1970-13
                # Just use the year
                d = y
            parsed = robust_date_parser(d)
            return parsed, parsed

        match = month_year.match(d)
        if match:
            d = "%s-%02d" % (match.group("year"), int(match.group("month")))
            parsed = robust_date_parser(d)
            return parsed, parsed

        return None, None

    if "" in split_result:
        # ie 1970- or -1970 (but not 19uu- nor -19uu)
        first, second = split_result[0], split_result[1]
        if len(first) == 4 and "u" not in first:
            return first, None
        if len(second) == 4 and "u" not in second:
            return None, second
        return None, None

    # ie 1970-01-01-1971-01-01, 1970 Fall/August, 1970 April/May, or
    # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
    pieces = d.replace(" ", "").split(delim)
    # Floor division keeps the slice index an int even under true division.
    middle = len(pieces) // 2
    begin = delim.join(pieces[:middle])
    end = delim.join(pieces[middle:])

    # Check if month in begin or end
    m1 = re.sub(r"[-\d/]", "", begin)
    m2 = re.sub(r"[-\d/]", "", end)
    if m1 or m2:
        # ie 2004July/August, 2004Fall/Winter, or wordy date
        begin, end = None, None

        # Extract year: the first piece that is four characters long once
        # its letters are removed.
        for piece in pieces:
            y = re.sub(r"(?i)[a-z]", "", piece)
            if len(y) == 4:
                begin = y + m1.capitalize()
                end = y + m2.capitalize()
                if not dateparser.to_iso8601(begin) or \
                        not dateparser.to_iso8601(end):
                    # Fall back to the bare year for both ends.
                    begin, end = y, y
                break

    if begin:
        return robust_date_parser(begin), robust_date_parser(end)
    return None, None