def date_form_2(text, date_form="raw", languages=['en'], year_prefix="20"):
    """
    Extract dates of the form dd-mm or dd/mm from a string.

    It also extracts these dates when there is only one day or month digit.

    Parameters
    ----------
    text : str
        The string we wish to parse for dates.
    date_form : str ("raw" or "list")
        Whether the dates should be returned in the raw form found in the
        string, or as a list ([dd, mm, yyyy]). This form carries no year,
        so yyyy is always 0 in "list" mode. The default is "raw".
    languages : list of str
        What languages we wish to consider. Possible languages are
        English = 'en', French = 'fr' and German = 'de'.
        The default is just English (['en']).
    year_prefix : str
        A string of length 2 giving the first two digits of the year, i.e.
        for dates known to be between 2010 and 2020 the prefix would be
        '20'. It is validated but otherwise unused here, because this date
        form contains no year. The default is '20'.

    Output
    ------
    date_list : list
        One entry per date found: the matched substring in "raw" mode, or
        [day, month, 0] in "list" mode. A day or month that cannot be
        re-extracted from the match is reported as 0.
    locations_list : list
        A list of [start, end] index pairs giving the location of each
        date within the (lowercased) text.
    text : str
        The original string in lowercase, with every matched date replaced
        by asterisks of the same length.

    Author
    ------
    Ashley Dennis-Henderson
    [email protected]
    """

    ## TEST INPUTS
    # Validation is delegated to the `errors` helpers; how a failure is
    # reported is defined there.
    errors.text_error(text)  # Check text is a string
    errors.date_form_error(date_form)  # Check date_form is "raw" or "list"
    errors.year_prefix_error(year_prefix)  # Check year_prefix is a string of length 2

    text = text.lower()  # Put text in lowercase

    date_list = []  # Dates found so far
    locations_list = []  # [start, end] of each date found

    ## Create Regular Expressions ----

    # Regular expression building blocks for the required languages
    (date_re, date_re2, month_num_re, date_minimal_re, month_re, year_re,
     dow_re, joins_re, breaks_re) = reg_exp(languages)

    date_RE = re.compile(date_re + '(-|/)')  # Day followed by its separator
    month_RE = re.compile('(-|/)' + month_num_re)  # Separator followed by month
    complete_date_RE = re.compile(date_re + '(-|/)' + month_num_re)
    date_only_RE = re.compile(date_minimal_re)  # Just the day number

    ## Create Date List ----

    # finditer keeps scanning the string object it was given, so rebinding
    # `text` to the masked copy below does not disturb the iteration, and
    # because the mask has the same length all offsets stay valid.
    for match in complete_date_RE.finditer(text):

        raw_date = match.group()
        start, end = match.span()

        if date_form == "list":

            # Reset per match so a value can never leak in from the
            # previous date; 0 means "unknown".
            day = 0
            month = 0

            # If several pieces match, the last one wins.
            for day_part in date_RE.finditer(raw_date):
                for day_digits in date_only_RE.finditer(day_part.group()):
                    day = int(day_digits.group())

            for month_part in month_RE.finditer(raw_date):
                # Drop the leading separator (- or /) before converting
                month = int(month_part.group()[1:])

            date_list.append([day, month, 0])  # This form never has a year

        else:

            date_list.append(raw_date)

        locations_list.append([start, end])

        # Mask the date so that it is not extracted multiple times
        text = text[:start] + '*' * (end - start) + text[end:]

    return (date_list, locations_list, text)
def date_form_14(text, date_form='raw', languages=['en'], year_prefix='20'):
    """
    Extract dates of the form: Y DOW M D.

    The pattern is an optional year (optionally preceded by an apostrophe),
    an optional day of the week, a month (``month_re``) and a day, each
    separated by ``breaks_re``. All building blocks come from ``reg_exp``.

    Parameters
    ----------
    text : str
        The string we wish to parse for dates.
    date_form : str
        "list" returns each date as [day, month, year]; any other value
        returns the raw matched substring. The default is "raw".
    languages : list of str
        Passed to ``reg_exp`` to select the languages to consider.
        The default is ['en'].
    year_prefix : str
        Prepended to a year that was written with only two characters.
        The default is '20'.

    Output
    ------
    date_list : list
        One entry per date found: the matched substring, or
        [day, month, year] in "list" mode. The year is 0 when the date is
        judged to contain no year; a day or month that cannot be
        re-extracted from the match is reported as 0.
    locations_list : list
        A list of [start, end] index pairs giving the location of each
        date within the (lowercased) text.
    text : str
        The original string in lowercase, with every matched date replaced
        by asterisks of the same length.
    """

    text = text.lower()  # Convert text to lowercase

    date_list = []  # Dates found so far
    locations_list = []  # [start, end] of each date found

    # Regular expression building blocks for the required languages
    (date_re, date_re2, month_num_re, date_minimal_re, month_re, year_re,
     dow_re, joins_re, breaks_re) = reg_exp(languages)

    # RE to extract the entire date
    complete_RE = re.compile(
        "(" + "(')?" + year_re + breaks_re + ")?"
        + "(" + dow_re + breaks_re + ")?"
        + month_re + breaks_re + date_re
    )
    date_RE = re.compile(date_minimal_re)  # RE to match just the day
    month_RE = re.compile(month_re)  # RE to match just the month
    year_RE = re.compile(year_re)  # RE to match just the year

    # finditer keeps scanning the string object it was given, so rebinding
    # `text` to the masked copy below does not disturb the iteration, and
    # because the mask has the same length all offsets stay valid.
    for match in complete_RE.finditer(text):

        raw_date = match.group()
        start, end = match.span()

        if date_form == "list":

            # Reset per match so a value can never leak in from the
            # previous date; 0 means "unknown".
            day = 0
            day_length = 0
            month = 0
            year = 0

            # The day comes last in this form, so the last match wins.
            for day_match in date_RE.finditer(raw_date):
                day_text = day_match.group()
                day_length = len(day_text)
                day = int(day_text)

            for month_match in month_RE.finditer(raw_date):
                month = convert_month(month_match.group())

            # One scan gives both the number of year-like pieces and the
            # first of them.
            year_candidates = [m.group() for m in year_RE.finditer(raw_date)]

            # A single year-like piece together with a two-character day is
            # taken to be the day itself, i.e. the date has no year.
            # NOTE(review): this relies on year_re also matching a
            # two-character day - confirm against reg_exp.
            only_candidate_is_day = (
                len(year_candidates) == 1 and day_length == 2
            )

            if year_candidates and not only_candidate_is_day:
                raw_year = year_candidates[0]  # The year comes first
                if len(raw_year) == 2:
                    raw_year = year_prefix + raw_year  # Add the year prefix
                year = int(raw_year)

            date_list.append([day, month, year])

        else:

            date_list.append(raw_date)

        locations_list.append([start, end])

        # Mask the date so that it is not extracted multiple times
        text = text[:start] + '*' * (end - start) + text[end:]

    return (date_list, locations_list, text)
def date_form_15(text, date_form='raw', languages=['en'], year_prefix='20'):
    """
    Extract dates of the form: DOW D.

    The pattern is a day of the week (``dow_re``), a break (``breaks_re``)
    and a day (``date_re``). All building blocks come from ``reg_exp``.

    Parameters
    ----------
    text : str
        The string we wish to parse for dates.
    date_form : str
        "list" returns each date as [day, 0, 0] (this form has neither a
        month nor a year); any other value returns the raw matched
        substring. The default is "raw".
    languages : list of str
        Passed to ``reg_exp`` to select the languages to consider.
        The default is ['en'].
    year_prefix : str
        Not used by this date form, which contains no year.
        The default is '20'.

    Output
    ------
    date_list : list
        One entry per date found: the matched substring, or [day, 0, 0] in
        "list" mode. A day that cannot be re-extracted from the match is
        reported as 0.
    locations_list : list
        A list of [start, end] index pairs giving the location of each
        date within the (lowercased) text.
    text : str
        The original string in lowercase, with every matched date replaced
        by asterisks of the same length.
    """

    text = text.lower()  # Convert text to lowercase

    date_list = []  # Dates found so far
    locations_list = []  # [start, end] of each date found

    # Regular expression building blocks for the required languages
    (date_re, date_re2, month_num_re, date_minimal_re, month_re, year_re,
     dow_re, joins_re, breaks_re) = reg_exp(languages)

    complete_RE = re.compile(dow_re + breaks_re + date_re)  # Entire date
    date_RE = re.compile(date_minimal_re)  # Just the day number

    # finditer keeps scanning the string object it was given, so rebinding
    # `text` to the masked copy below does not disturb the iteration, and
    # because the mask has the same length all offsets stay valid.
    for match in complete_RE.finditer(text):

        raw_date = match.group()
        start, end = match.span()

        if date_form == "list":

            # Only the first piece matching the day RE is used.
            first_day = date_RE.search(raw_date)
            day = int(first_day.group()) if first_day is not None else 0

            date_list.append([day, 0, 0])  # No month or year in this form

        else:

            date_list.append(raw_date)

        locations_list.append([start, end])

        # Mask the date so that it is not extracted multiple times
        text = text[:start] + '*' * (end - start) + text[end:]

    return (date_list, locations_list, text)