Example #1
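# Assumed imports for these snippets: NumPy, scikit-learn's preprocessing
# module, and the project's data-loading helper (imported here as
# `read_data`/`rd`; that module name is an assumption based on usage below).
import numpy as np
import sklearn.preprocessing
import read_data as rd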
def prepare_data(df=None):
    '''
    Preps the data to be used in the model. Right now, the code itself must
    be modified to tweak which columns are included in what way.

    Parameters
    ----------
    df : Dataframe to use. If not specified, the dataframe is loaded automatically.

    Returns
    -------
    predictors : NxM DataFrame of the predictors for the classification problem.
    meta_info  : Nx6 DataFrame containing the columns 'Escherichia.coli' and
                 'Full_date', to be used, e.g., for leave-one-year-out cross
                 validation and creating the true class labels (elevated vs.
                 not elevated E. coli levels). The columns 'Client.ID', 'BEACH',
                 'Drek_Prediction', and 'Weekday' are also returned.
    '''
    # Load the data if no frame was passed in, as promised by the docstring
    # (the other examples below do the same).
    if df is None:
        df = rd.read_data()

    # Meta columns are not used as predictors
    meta_columns = [
        'Client.ID', 'BEACH', 'Full_date', 'Escherichia.coli',
        'Drek_Prediction', 'Weekday'
    ]

    # Deterministic columns are known ahead of time, so their actual values can be used.
    deterministic_columns = [
        'Client.ID',  # subsumed by the geographic flags
        'group_prior_mean',
        'previous_reading',
        'accum_rain',  #added to try to capture storm events
        'Collection_Time',  # mostly missing values but may still be of some use
        '12hrPressureChange',  # overnight pressure change

        #'precipIntensity',
        #'precipIntensityMax',
        #'temperatureMin',
        #'temperatureMax',
        #'humidity',
        #'windSpeed',
        #'cloudCover',

        #'flag_geographically_a_north_beach',
        'categorical_beach_grouping'
        #'12th_previous',
        #'Montrose_previous',
        #'Rainbow_previous',
        #'63rd_previous',
        #'Osterman_previous'
    ]

    # Hourly deterministic columns are likewise known ahead of time. Each
    # entry carries an additional list defining which hours of data
    # should be used. For example, an entry
    #   'temperature':[-16,-13,-12,-11,-9,-3,0]
    # would indicate that the hourly temperature at offsets of
    # [-16,-13,-12,-11,-9,-3,0] from MIDNIGHT the day of should be included as
    # variables in the model.
    deterministic_hourly_columns = {
        'temperature': np.linspace(-19, 4, num=6, dtype=np.int64),  # range(-19,5)
        'windVectorX': np.linspace(-19, 4, num=6, dtype=np.int64),  # range(-19,5), [-4,-2,0,2,4]
        'windVectorY': np.linspace(-19, 4, num=6, dtype=np.int64),
        #'windSpeed': [-2,0,2,4],
        #'windBearing': [-2,0,2,4],
        'pressure': [0],
        'cloudCover': [-15],  # range(-19,5)
        'humidity': [4],
        #'precipIntensity': [4]  # np.linspace(-10,4,num=4,dtype=np.int64)
    }
    for var, hours in deterministic_hourly_columns.items():
        for hr in hours:
            deterministic_columns.append(var + '_hour_' + str(hr))
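    # This generates predictor names such as 'temperature_hour_-19',
    # 'temperature_hour_-14', ..., 'temperature_hour_4' (np.linspace over
    # [-19, 4] with num=6, truncated to int64, gives -19, -14, -9, -5, 0, 4).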

    # Historical columns have their previous days' values added to the predictors,
    # but not the current day's value(s) unless the historical column also exists
    # in the deterministic columns list.
    # Similar to the hourly columns, you need to specify which previous days
    # to include as variables. For example, below we have an entry
    #   'temperatureMax': range(1,4)
    # which indicates that the max temperature from 1, 2, and 3 days previous
    # should be included.
    historical_columns = {
        #'temperatureMin': range(2,3),
        'temperatureMax': range(2, 5),
        # 'humidity': range(1,3),
        #'windSpeed': range(1,3),
        'pressure': range(1, 3),
        'dewPoint': range(1, 3),
        #'cloudCover': range(1,3),
        'windVectorX': range(2, 3),
        'windVectorY': range(2, 3),
        'Escherichia.coli': range(2, 8)
    }
    historical_columns_list = list(historical_columns.keys())
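    # rd.add_column_prior_data (called below) materializes these as columns
    # named '<n>_day_prior_<var>', e.g. '2_day_prior_temperatureMax'.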

    ######################################################
    #### Get relevant columns, add historical data
    ######################################################

    all_columns = meta_columns + deterministic_columns + historical_columns_list  #+ derived_columns
    all_columns = list(set(all_columns))

    df = df[all_columns]

    for var in historical_columns:
        df = rd.add_column_prior_data(df,
                                      var,
                                      historical_columns[var],
                                      beach_col_name='Client.ID',
                                      timestamp_col_name='Full_date')

    df.drop((set(historical_columns_list) - set(deterministic_columns)) - set(meta_columns),
            axis=1, inplace=True)

    ######################################################
    #### Average the historical columns, fill in NaNs
    ######################################################

    # Create a "trailing_average_daily_" column for each historical variable,
    # which is simply the mean of that variable's previous-day columns.
    # NaN values in any previous-day column are then filled with that mean.
    for var in historical_columns:
        cname = 'trailing_average_daily_' + var
        rnge = historical_columns[var]
        if len(rnge) == 1:  # no need for a trailing average of a single number
            continue
        df[cname] = df[[str(n) + '_day_prior_' + var for n in rnge]].mean(axis=1)
        for n in rnge:
            col = str(n) + '_day_prior_' + var
            # assign instead of fillna(inplace=True), which is unreliable on
            # a column selection under pandas' copy semantics
            df[col] = df[col].fillna(df[cname])

    # Do a similar process for the hourly data.
    for var in deterministic_hourly_columns:
        cname = 'trailing_average_hourly_' + var
        rnge = deterministic_hourly_columns[var]
        if len(rnge) == 1:  # no need for a trailing average of a single number
            continue
        df[cname] = df[[var + '_hour_' + str(n) for n in rnge]].mean(axis=1)
        for n in rnge:
            col = var + '_hour_' + str(n)
            df[col] = df[col].fillna(df[cname])

    ######################################################
    #### Process non-numeric columns
    ######################################################

    # process all of the nonnumeric columns
    # This method just assigns a numeric value to each possible value
    # of the non-numeric column. Note that this will not work well
    # for regression-style models, where instead dummy columns should
    # be created.
    def nonnumericCols(data, verbose=True):
        for f in data.columns:
            if data[f].dtype == 'object':
                if verbose:
                    print('Column ' + str(f) + ' being treated as non-numeric')
                lbl = sklearn.preprocessing.LabelEncoder()
                lbl.fit(list(data[f].values))
                data.loc[:, f] = lbl.transform(list(data[f].values))
        return data

    # Do this at the end so meta_data has Beach names and Weekdays
    #df = nonnumericCols(df)

    # As a last NaN filling measure, we fill the NaNs of all columns
    # that are NOT the E. coli column with the median value of the column,
    # the median taken over all data not from the same year as the
    # year of the row we are filling. For example, if there is a NaN
    # in the temperatureMax column in some row from 2010, then we will
    # fill that value with the median temperatureMax value from all years
    # that are NOT 2010.
    cols = df.columns.tolist()
    cols.remove('Escherichia.coli')
    years = df['Full_date'].map(lambda x: x.year)
    for yr in years.unique():
        not_yr = np.array(years != yr)
        is_yr = np.array(years == yr)
        # .loc replaces the long-deprecated .ix indexer
        df.loc[is_yr, cols] = df.loc[is_yr, cols].fillna(df.loc[not_yr, cols].median())

    ######################################################
    #### Drop any rows that still have NA, set up outputs
    ######################################################

    # The following lines will print the % of rows that:
    #  (a) have a NaN value in some column other than Escherichia.coli, AND
    #  (b) the column Escherichia.coli is NOT NaN.
    # Since we are now filling NaNs with column averages above, this should
    # always report 0%. I'm leaving the check in here just to be sure, though.
    total_rows_predictors = df.dropna(subset=['Escherichia.coli'], axis=0).shape[0]
    nonnan_rows_predictors = df.dropna(axis=0).shape[0]
    print('Dropping {0:.4f}% of rows because predictors contain NANs'.format(
        100.0 - 100.0 * nonnan_rows_predictors / total_rows_predictors))

    # Any rows that still have NaNs are NaN b/c there is no E. coli reading
    # We should drop these rows b/c there is nothing for us to predict.
    df.dropna(axis=0, inplace=True)
    #df.dropna(axis=0, how='any', subset=['Full_date','Escherichia.coli'], inplace=True)

    predictors = df.drop(set(meta_columns) - set(['Client.ID']), axis=1)
    meta_info = df[meta_columns]

    predictors = nonnumericCols(predictors)

    return predictors, meta_info
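
# Minimal usage sketch (assumes rd.read_data() returns the merged
# beach/weather DataFrame with the columns referenced above; the 235
# CFU/100 mL cutoff for "elevated" is an assumption, not taken from this code):
#
#     predictors, meta_info = prepare_data()
#     elevated = meta_info['Escherichia.coli'] >= 235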
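Example #3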
def prepare_data(df=None):
    '''
    Preps the data to be used in the model. Right now, the code itself must
    be modified to tweak which columns are included in what way.

    Parameters
    ----------
    df : Dataframe to use. If not specified, the dataframe is loaded automatically.

    Returns
    -------
    predictors : NxM DataFrame of the predictors for the classification problem.
    meta_info  : Nx2 DataFrame containing the columns 'Escherichia.coli' and
                 'Full_date', to be used, e.g., for leave-one-year-out cross
                 validation and creating the true class labels (elevated vs.
                 not elevated E. coli levels).
    '''
    if df is None:
        df = rd.read_data()

    # Leaving 2015 as the final validation set
    df = df[df['Full_date'] < '1-1-2015']


    ######################################################
    #### Add derived columns here
    ######################################################

    df['DayOfYear'] = df['Full_date'].map(lambda x: x.dayofyear)


    ######################################################
    #### List all columns you will use
    ######################################################

    # Meta columns are not used as predictors
    meta_columns = ['Full_date', 'Escherichia.coli']

    # Deterministic columns are known ahead of time, their actual values are used
    # with no previous days being used.
    deterministic_columns = [
        'Client.ID', 'Weekday', 'sunriseTime', 'DayOfYear'
    ]
    deterministic_hourly_columns = [
        'precipIntensity', 'temperature', 'windSpeed',
        'windBearing', 'pressure', 'cloudCover'
    ]
    for var in deterministic_hourly_columns:
        for hr in [-12, -8, -4, 0, 4]:
            deterministic_columns.append(var + '_hour_' + str(hr))
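    # This generates names such as 'precipIntensity_hour_-12',
    # 'precipIntensity_hour_-8', ..., 'precipIntensity_hour_4'
    # for each hourly variable.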

    # Historical columns have their previous days' values added to the predictors,
    # but not the current day's value(s). The value NUM_LOOKBACK_DAYS set below
    # controls the number of previous days added. Nothing is currently done to
    # fill NA values here, so if you wish to use columns with a high rate of data
    # loss, then you should add logic to fill the NA values.
    historical_columns = [
        'precipIntensity', 'precipIntensityMax',
        'temperatureMin', 'temperatureMax',
        'humidity', 'windSpeed', 'cloudCover'
    ]

    # Each historical column will have the data from 1 day back, 2 days back,
    # ..., NUM_LOOKBACK_DAYS days back added to the predictors.
    NUM_LOOKBACK_DAYS = 3


    ######################################################
    #### Get relevant columns, add historical data
    ######################################################

    all_columns = list(set(meta_columns + deterministic_columns + historical_columns))

    df = df[all_columns]

    df = rd.add_column_prior_data(
        df, historical_columns, range(1, NUM_LOOKBACK_DAYS + 1),
        beach_col_name='Client.ID', timestamp_col_name='Full_date'
    )
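    # This presumably adds columns named '1_day_prior_<var>' through
    # '3_day_prior_<var>'; the naming follows how add_column_prior_data's
    # output is referenced in the other examples.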

    df.drop(set(historical_columns) - set(deterministic_columns), axis=1, inplace=True)


    ######################################################
    #### Process non-numeric columns
    ######################################################

    # process all of the nonnumeric columns
    # This method just assigns a numeric value to each possible value
    # of the non-numeric column. Note that this will not work well
    # for regression-style models, where instead dummy columns should
    # be created.
    def nonnumericCols(data, verbose=True):
        for f in data.columns:
            if data[f].dtype == 'object':
                if verbose:
                    print('Column ' + str(f) + ' being treated as non-numeric')
                lbl = sklearn.preprocessing.LabelEncoder()
                lbl.fit(list(data[f].values))
                data[f] = lbl.transform(list(data[f].values))
        return data

    df = nonnumericCols(df)


    ######################################################
    #### Drop any rows that still have NA, set up outputs
    ######################################################

    total_rows_predictors = df.dropna(subset=['Escherichia.coli'], axis=0).shape[0]
    nonnan_rows_predictors = df.dropna(axis=0).shape[0]
    print('Dropping {0:.4f}% of rows because predictors contain NANs'.format(
        100.0 - 100.0 * nonnan_rows_predictors / total_rows_predictors
    ))

    df.dropna(axis=0, inplace=True)

    predictors = df.drop(['Escherichia.coli', 'Full_date'], axis=1)
    meta_info = df[['Escherichia.coli', 'Full_date']]

    return predictors, meta_info
Example #4
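# Example #4 assumes the same imports as Example #1: numpy as np,
# sklearn.preprocessing, and the project's read_data helper as rd.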
def prepare_data(df=None):
    '''
    Preps the data to be used in the model. Right now, the code itself must
    be modified to tweak which columns are included in what way.

    Parameters
    ----------
    df : Dataframe to use. If not specified, the dataframe is loaded automatically.

    Returns
    -------
    predictors : NxM DataFrame of the predictors for the classification problem.
    meta_info  : Nx3 DataFrame containing the columns 'Escherichia.coli' and
                 'Full_date', to be used, e.g., for leave-one-year-out cross
                 validation and creating the true class labels (elevated vs.
                 not elevated E. coli levels). The column 'Client.ID' is also
                 returned here, but is currently only used internally in this function.
    '''
    if df is None:
        df = rd.read_data()

    # Leaving 2015 as the final validation set
    df = df[df['Full_date'] < '1-1-2015']

    ######################################################
    #### Add derived columns here
    ######################################################

    df['DayOfYear'] = df['Full_date'].map(lambda x: x.dayofyear)
    derived_columns = ['DayOfYear']

    ######################################################
    #### List all columns you will use
    ######################################################

    # Meta columns are not used as predictors
    meta_columns = ['Client.ID', 'Full_date', 'Escherichia.coli']

    # Deterministic columns are known ahead of time, so their actual values can be used.
    deterministic_columns = [
        # 'Client.ID',  # subsumed by the geographic flags
        'precipIntensity',
        'precipIntensityMax',
        'temperatureMin',
        'temperatureMax',
        'humidity',
        'windSpeed',
        'cloudCover',

        # 'sunriseTime',  # commenting for now since it is in absolute UNIX time

        # 'Days.Since.Last.Holiday',
        'flag_geographically_a_north_beach',
        'categorical_beach_grouping'
    ]

    # Hourly deterministic columns are likewise known ahead of time. Each
    # entry carries an additional list defining which hours of data
    # should be used. For example, an entry
    #   'temperature':[-16,-13,-12,-11,-9,-3,0]
    # would indicate that the hourly temperature at offsets of
    # [-16,-13,-12,-11,-9,-3,0] from MIDNIGHT the day of should be included as
    # variables in the model.
    deterministic_hourly_columns = {
        'temperature': range(-19, 5),
        'windSpeed': [1, 2, 3, 4],
        'windBearing': [4],
        'pressure': [0],
        'cloudCover': [4],
        'humidity': [4],
        'precipIntensity': [0, 4]
    }
    for var, hours in deterministic_hourly_columns.items():
        for hr in hours:
            deterministic_columns.append(var + '_hour_' + str(hr))

    # Historical columns have their previous days' values added to the predictors,
    # but not the current day's value(s) unless the historical column also exists
    # in the deterministic columns list.
    # Similar to the hourly columns, you need to specify which previous days
    # to include as variables. For example, below we have an entry
    #   'temperatureMax': range(1,4)
    # which indicates that the max temperature from 1, 2, and 3 days previous
    # should be included.
    historical_columns = {
        'temperatureMin': range(1, 3),
        'temperatureMax': range(1, 4),
        # 'humidity': range(1,3),
        # 'windSpeed': range(1,8),
        # 'cloudCover': range(1,8),
        # 'precipIntensity': [1],
        'Escherichia.coli': range(1, 8)
    }
    historical_columns_list = list(historical_columns.keys())

    # Specific geo group average columns will have their means calculated
    # for each of the 6 geographic groups, and these values will be used as
    # predictors everywhere.
    specific_geo_group_average_columns = [
        '1_day_prior_Escherichia.coli',
        # 'trailing_average_daily_Escherichia.coli',
        # '1_day_prior_precipIntensity'
    ]

    # Binary geo group average columns will have their means calculated
    # for the beaches North and South of Navy Pier separately, and these
    # values will be used as predictors everywhere.
    binary_geo_group_average_columns = [
        '1_day_prior_Escherichia.coli',
        # 'trailing_average_daily_Escherichia.coli',
        # '1_day_prior_precipIntensity'
    ]

    ######################################################
    #### Get relevant columns, add historical data
    ######################################################

    all_columns = meta_columns + deterministic_columns + historical_columns_list + derived_columns
    all_columns = list(set(all_columns))

    df = df[all_columns]

    for var in historical_columns:
        df = rd.add_column_prior_data(df,
                                      var,
                                      historical_columns[var],
                                      beach_col_name='Client.ID',
                                      timestamp_col_name='Full_date')

    df.drop((set(historical_columns_list) - set(deterministic_columns)) - set(meta_columns),
            axis=1, inplace=True)

    ######################################################
    #### Average the historical columns, fill in NaNs
    ######################################################

    # Create a "trailing_average_daily_" column for each historical variable,
    # which is simply the mean of that variable's previous-day columns.
    # NaN values in any previous-day column are then filled with that mean.
    for var in historical_columns:
        cname = 'trailing_average_daily_' + var
        rnge = historical_columns[var]
        if len(rnge) == 1:  # no need for a trailing average of a single number
            continue
        df[cname] = df[[str(n) + '_day_prior_' + var for n in rnge]].mean(axis=1)
        for n in rnge:
            col = str(n) + '_day_prior_' + var
            df[col] = df[col].fillna(df[cname])

    # Do a similar process for the hourly data.
    for var in deterministic_hourly_columns:
        cname = 'trailing_average_hourly_' + var
        rnge = deterministic_hourly_columns[var]
        if len(rnge) == 1:  # no need for a trailing average of a single number
            continue
        df[cname] = df[[var + '_hour_' + str(n) for n in rnge]].mean(axis=1)
        for n in rnge:
            col = var + '_hour_' + str(n)
            df[col] = df[col].fillna(df[cname])

    ######################################################
    #### Group Average Variables
    ######################################################

    for var in specific_geo_group_average_columns:
        df2 = df[['Full_date', 'categorical_beach_grouping', var]]
        grp_df = df2.groupby(['Full_date', 'categorical_beach_grouping']).mean().unstack()

        # flatten the hierarchical column index
        grp_df.columns = ['_'.join(col) for col in grp_df.columns.values]

        df = df.merge(grp_df, how='left', left_on='Full_date', right_index=True)

    for var in binary_geo_group_average_columns:
        df2 = df[['Full_date', 'flag_geographically_a_north_beach', var]]
        grp_df = df2.groupby(['Full_date', 'flag_geographically_a_north_beach']).mean().unstack()

        # flatten the hierarchical column index
        grp_df.columns = ['_'.join(str(x) for x in col) for col in grp_df.columns.values]

        df = df.merge(grp_df, how='left', left_on='Full_date', right_index=True)
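    # After unstack() and flattening, each group's daily mean becomes its own
    # column, e.g. '1_day_prior_Escherichia.coli_<group>'; the exact suffixes
    # depend on the values found in the grouping columns.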

    ######################################################
    #### Process non-numeric columns
    ######################################################

    # process all of the nonnumeric columns
    # This method just assigns a numeric value to each possible value
    # of the non-numeric column. Note that this will not work well
    # for regression-style models, where instead dummy columns should
    # be created.
    def nonnumericCols(data, verbose=True):
        for f in data.columns:
            if data[f].dtype == 'object':
                if verbose:
                    print('Column ' + str(f) + ' being treated as non-numeric')
                lbl = sklearn.preprocessing.LabelEncoder()
                lbl.fit(list(data[f].values))
                data[f] = lbl.transform(list(data[f].values))
        return data

    df = nonnumericCols(df)

    ######################################################
    #### More NaN filling
    ######################################################

    # As a last NaN filling measure, we fill the NaNs of all columns
    # that are NOT the E. coli column with the mean value of the column,
    # the mean value taken over all data not from the same year as the
    # year of the row we are filling. For example, if there is a NaN
    # in the temperatureMax column in some row from 2010, then we will
    # fill that value with the mean temperatureMax value from all years
    # that are NOT 2010.
    cols = df.columns.tolist()
    cols.remove('Escherichia.coli')
    years = df['Full_date'].map(lambda x: x.year)
    for yr in years.unique():
        not_yr = np.array(years != yr)
        is_yr = np.array(years == yr)
        # .loc replaces the long-deprecated .ix indexer
        df.loc[is_yr, cols] = df.loc[is_yr, cols].fillna(df.loc[not_yr, cols].mean())

    ######################################################
    #### Drop any rows that still have NA, set up outputs
    ######################################################

    # The following lines will print the % of rows that:
    #  (a) have a NaN value in some column other than Escherichia.coli, AND
    #  (b) the column Escherichia.coli is NOT NaN.
    # Since we are now filling NaNs with column averages above, this should
    # always report 0%. I'm leaving the check in here just to be sure, though.
    total_rows_predictors = df.dropna(subset=['Escherichia.coli'], axis=0).shape[0]
    nonnan_rows_predictors = df.dropna(axis=0).shape[0]
    print('Dropping {0:.4f}% of rows because predictors contain NANs'.format(
        100.0 - 100.0 * nonnan_rows_predictors / total_rows_predictors))

    # Any rows that still have NaNs are NaN b/c there is no E. coli reading
    # We should drop these rows b/c there is nothing for us to predict.
    df.dropna(axis=0, inplace=True)

    predictors = df.drop(meta_columns, axis=1)
    meta_info = df[meta_columns]

    return predictors, meta_info