Exemplo n.º 1
0
def date_form_2(text, date_form="raw", languages=['en'], year_prefix="20"):
    """
    Find day/month dates written as ``dd-mm`` or ``dd/mm`` in a string.

    The search pattern is assembled from the day and month-number building
    blocks returned by ``reg_exp(languages)``, joined by ``-`` or ``/``.
    According to the original notes, single-digit days and months are picked
    up as well; that depends on the patterns ``reg_exp`` supplies.

    Parameters
    ----------
    text : str
        The string to scan for dates.
    date_form : str
        ``"list"`` returns every date as ``[day, month, 0]``; any other
        value that passes ``errors.date_form_error`` returns the matched
        substring unchanged. The default is ``"raw"``.
    languages : list of str
        Language codes forwarded to ``reg_exp`` (the original notes list
        'en', 'fr' and 'de'). The default is ``['en']``.
    year_prefix : str
        Passed to ``errors.year_prefix_error`` and not used otherwise: this
        date form carries no year, so the year slot is always 0 in list form.
        The default is ``'20'``.

    Returns
    -------
    date_list : list
        One entry per match, shaped according to ``date_form``.
    locations_list : list
        ``[start, end]`` offsets of each match within the lowercased text.
    text : str
        The lowercased input with every matched span overwritten by the same
        number of asterisks.

    Author
    ------
    Ashley Dennis-Henderson
    [email protected]"""

    # Argument checks are delegated to the project's ``errors`` helpers.
    errors.text_error(text)
    errors.date_form_error(date_form)
    errors.year_prefix_error(year_prefix)

    text = text.lower()
    date_list = []
    locations_list = []

    # reg_exp returns nine pattern fragments; only the day, month-number and
    # bare-day fragments are needed for this date form.
    (date_re, _, month_num_re, date_minimal_re, _, _, _, _, _) = reg_exp(languages)

    separator = '(-|/)'
    day_with_sep_RE = re.compile(date_re + separator)
    sep_with_month_RE = re.compile(separator + month_num_re)
    complete_date_RE = re.compile(date_re + separator + month_num_re)
    day_digits_RE = re.compile(date_minimal_re)

    as_list = date_form == "list"

    # finditer keeps scanning the string object it was handed, so rebinding
    # ``text`` inside the loop does not disturb the iteration, and the mask
    # has the same length as the match so all offsets stay valid.
    for found in complete_date_RE.finditer(text):
        raw_date = found.group()
        start, end = found.span()

        if as_list:
            # The last day-like token inside the last "day + separator"
            # piece wins.
            for day_chunk in day_with_sep_RE.finditer(raw_date):
                for digits in day_digits_RE.finditer(day_chunk.group()):
                    date = int(digits.group())

            # The last "separator + month" piece wins; drop the separator
            # character before converting.
            for month_chunk in sep_with_month_RE.finditer(raw_date):
                month = int(month_chunk.group()[1:])

            date_list.append([date, month, 0])
        else:
            date_list.append(raw_date)

        locations_list.append([start, end])
        text = text[:start] + '*' * (end - start) + text[end:]

    return date_list, locations_list, text
def date_form_14(text, date_form='raw', languages=['en'], year_prefix='20'):
    """
    Find dates written as: [year] [day-of-week] month day  (Y DOW M D).

    The year (optionally preceded by an apostrophe) and the day of the week
    are both optional; the month and the day are required. All pattern
    fragments come from ``reg_exp(languages)``.

    Parameters
    ----------
    text : str
        The string to scan for dates.
    date_form : str
        ``"list"`` returns every date as ``[day, month, year]``; any other
        value returns the matched substring unchanged. The default is
        ``'raw'``.
    languages : list of str
        Language codes forwarded to ``reg_exp``. The default is ``['en']``.
    year_prefix : str
        Prepended to a year that is written with only two characters.
        The default is ``'20'``.

    Returns
    -------
    date_list : list
        One entry per match, shaped according to ``date_form``. In list form
        the month is whatever ``convert_month`` returns and the year is 0
        when no year is found.
    locations_list : list
        ``[start, end]`` offsets of each match within the lowercased text.
    text : str
        The lowercased input with every matched span overwritten by the same
        number of asterisks, so the span is not extracted again.
    """
    text = text.lower()
    date_list = []
    locations_list = []

    (date_re, _date_re2, _month_num_re, date_minimal_re, month_re, year_re,
     dow_re, _joins_re, breaks_re) = reg_exp(languages)

    optional_year = "(" + "(')?" + year_re + breaks_re + ")?"
    optional_dow = "(" + dow_re + breaks_re + ")?"
    complete_RE = re.compile(optional_year + optional_dow + month_re +
                             breaks_re + date_re)
    date_RE = re.compile(date_minimal_re)
    month_RE = re.compile(month_re)
    year_RE = re.compile(year_re)

    as_list = date_form == "list"

    # finditer keeps scanning the string object it was handed, so rebinding
    # ``text`` below is safe; the mask preserves length and thus offsets.
    for found in complete_RE.finditer(text):
        raw_date = found.group()
        start, end = found.span()

        if as_list:
            # The last day-like token in the match is the day.
            for day_match in date_RE.finditer(raw_date):
                day_text = day_match.group()
                day_len = len(day_text)
                date = int(day_text)

            # The last month token in the match is the month.
            for month_match in month_RE.finditer(raw_date):
                month = convert_month(month_match.group())

            year = 0  # Stays 0 when the match carries no year
            year_tokens = [m.group() for m in year_RE.finditer(raw_date)]

            # A single year-pattern hit together with a two-character day is
            # treated as "no year" (presumably the day itself is what the
            # year pattern matched).
            lone_hit_is_day = len(year_tokens) == 1 and day_len == 2

            if year_tokens and not lone_hit_is_day:
                raw_year = year_tokens[0]  # Only the first hit is the year
                if len(raw_year) == 2:
                    raw_year = year_prefix + raw_year
                year = int(raw_year)

            date_list.append([date, month, year])
        else:
            date_list.append(raw_date)

        locations_list.append([start, end])
        text = text[:start] + '*' * (end - start) + text[end:]

    return date_list, locations_list, text
Exemplo n.º 3
0
def date_form_15(text, date_form='raw', languages=['en'], year_prefix='20'):
    """
    Find dates written as: day-of-week day  (DOW D).

    The pattern is the day-of-week fragment, a break and the day fragment,
    all taken from ``reg_exp(languages)``.

    Parameters
    ----------
    text : str
        The string to scan for dates.
    date_form : str
        ``"list"`` returns every date as ``[day, 0, 0]`` (this form carries
        neither month nor year); any other value returns the matched
        substring unchanged. The default is ``'raw'``.
    languages : list of str
        Language codes forwarded to ``reg_exp``. The default is ``['en']``.
    year_prefix : str
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    date_list : list
        One entry per match, shaped according to ``date_form``.
    locations_list : list
        ``[start, end]`` offsets of each match within the lowercased text.
    text : str
        The lowercased input with every matched span overwritten by the same
        number of asterisks, so the span is not extracted again.
    """
    text = text.lower()
    date_list = []
    locations_list = []

    (date_re, _date_re2, _month_num_re, date_minimal_re, _month_re, _year_re,
     dow_re, _joins_re, breaks_re) = reg_exp(languages)

    complete_RE = re.compile(dow_re + breaks_re + date_re)
    date_RE = re.compile(date_minimal_re)

    as_list = date_form == "list"

    # finditer keeps scanning the string object it was handed, so rebinding
    # ``text`` below is safe; the mask preserves length and thus offsets.
    for found in complete_RE.finditer(text):
        raw_date = found.group()
        start, end = found.span()

        if as_list:
            # Only the first day-like token in the match is used.
            first_day = date_RE.search(raw_date)
            if first_day is not None:
                date = int(first_day.group())

            date_list.append([date, 0, 0])
        else:
            date_list.append(raw_date)

        locations_list.append([start, end])
        text = text[:start] + '*' * (end - start) + text[end:]

    return date_list, locations_list, text