def extract_dates(text, sorting=None):
    """Return every complete date found in ``text`` as ``datetime`` objects.

    The module-level ``patterns['date']`` regex (compiled with the
    module-level ``flags``) is run over the text.  For each match, the named
    groups are cleaned with ``num`` and the suffix after the first underscore
    in the group name is dropped (e.g. ``month_myd`` -> ``month``).  Only
    matches that supply a day, a month and a year are kept; matches missing
    any of the three are ignored.

    Args:
        text: Text to scan.  Byte strings are decoded as UTF-8 first.
        sorting: Any truthy value requests ordering by how often a date
            occurs (most frequent first), ties broken by most recent date,
            with the result passed through ``remove_duplicates``.  When
            falsy, dates are returned in order of appearance, duplicates
            included.

    Returns:
        A list of naive ``datetime`` objects (no tzinfo is attached here).
        Matches that do not form a valid calendar date are skipped.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this keeps the
    # byte-string decoding there while remaining valid on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    completes = []
    for found in re.finditer(re.compile(patterns['date'], flags), text):
        # Drop empty groups and strip the pattern-variant suffix from names.
        fields = {}
        for key, raw in found.groupdict().items():
            value = num(raw)
            if value:
                fields[key.split("_")[0]] = value

        if not all(part in fields for part in ("day", "month", "year")):
            continue

        try:
            completes.append(
                datetime(
                    normalize_year(fields['year']),
                    int(fields['month']),
                    int(fields['day']),
                )
            )
        except (TypeError, ValueError, OverflowError):
            # The regex matched something date-shaped that is not a real
            # calendar date (e.g. day 31 in a 30-day month); skip it rather
            # than abort the whole extraction.
            continue

    if sorting:
        counter = Counter(completes)
        completes = remove_duplicates(
            sorted(
                completes,
                key=lambda d: (counter[d], d.toordinal()),
                reverse=True,
            )
        )

    return completes
def date_from_dict(match):
    """Build a ``datetime`` from the parsed fields of a date match.

    Args:
        match: A mapping with a ``'month'`` entry and, optionally, ``'day'``
            and ``'year'`` entries.  A regex match object with those named
            groups is also accepted; it is read through ``groupdict()``.
            The month may be an int, a string of digits, or a month name
            that is looked up (title-cased) in the module-level
            ``month_to_number`` table.

    Returns:
        A ``datetime`` carrying the module-level ``tzinfo``, or ``None``
        (after printing the error) when the year is missing or the fields
        do not form a valid date.  A missing or unparsable day defaults
        to 1.

    Raises:
        KeyError: If there is no ``'month'`` entry, or the month name is not
            present in ``month_to_number``.
    """
    # Read every field through one mapping.  Previously the month was read
    # with ``match['month']`` but day/year with ``match.group(...)``, which
    # does not exist on a dict, so dict input always produced None.
    fields = match.groupdict() if hasattr(match, 'groupdict') else match

    month = fields['month']
    if not isinstance(month, int):
        if month.isdigit():
            month = int(month)
        else:
            month = month_to_number[month.title()]

    try:
        day = int(fields['day'])
    except (KeyError, TypeError, ValueError):
        # No usable day supplied: fall back to the first of the month.
        day = 1

    try:
        return datetime(int(fields['year']), month, day, tzinfo=tzinfo)
    except (KeyError, TypeError, ValueError, OverflowError) as e:
        print(e)
        return None
def getFirstDateFromText(text):
    """Return the first complete, valid date found in ``text``.

    The module-level ``patterns['date']`` regex (compiled with the
    module-level ``flags``) is run over the text.  Matches whose full text
    is rejected by ``isDefinitelyNotDate`` are skipped, as are matches that
    lack a day, month or year after cleaning the named groups with ``num``.

    Args:
        text: Text to scan.  Byte strings are decoded as UTF-8 first.

    Returns:
        A ``datetime`` carrying the module-level ``tzinfo`` for the first
        acceptable match, or ``None`` when no match yields a valid date.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this keeps the
    # byte-string decoding there while remaining valid on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    for found in re.finditer(re.compile(patterns['date'], flags), text):
        if isDefinitelyNotDate(found.group(0)):
            continue

        # Drop empty groups and strip the pattern-variant suffix from names
        # (e.g. ``month_myd`` -> ``month``).
        fields = {}
        for key, raw in found.groupdict().items():
            value = num(raw)
            if value:
                fields[key.split("_")[0]] = value

        if not all(part in fields for part in ("day", "month", "year")):
            continue

        try:
            return datetime(
                normalize_year(fields['year']),
                int(fields['month']),
                int(fields['day']),
                tzinfo=tzinfo,
            )
        except (TypeError, ValueError, OverflowError):
            # Date-shaped text that is not a real calendar date: keep
            # looking instead of failing on the first bad candidate.
            continue

    return None