def get_df(cols, load_from_temp, temp_path):
    """
    Build (or reload) the train Dataframe and load the weather Dataframe.

    Parameters
    ==========
    cols: list of str, features to create with FeatureFactory. The list itself is left
        untouched; 'DATE', 'ASS_ASSIGNMENT' and 'CSPL_RECEIVED_CALLS' are added to the
        selected columns when they are absent.
    load_from_temp: bool, if true the train Dataframe is read back from `temp_path`
        instead of being rebuilt.
    temp_path: str or None, csv file used as a cache. Required when `load_from_temp`
        is true; when rebuilding, the result is written there unless it is None.

    Returns
    =======
    (train_df, weather_df), the weather Dataframe having its index reset.
    """
    if load_from_temp:
        # Checked first so that a bad call fails before any file is loaded.
        assert temp_path is not None, 'temp_path is required when load_from_temp is set'

    logger.info('Loading weather Dataframe...')
    weather_df = load_weather_df(CONFIG.preprocessed_meteo_path_complete)

    if not load_from_temp:
        logger.info('Loading train Dataframe...')
        train_df = load_train_df(CONFIG.preprocessed_train_path_means)

        logger.info('Creating features...')
        ff = FeatureFactory(train_df, weather_df)
        for col in cols:
            logger.info('Creating %s feature...', col)
            ff(col)

        # Work on a copy: the caller's list must not grow as a side effect of this call.
        selected = list(cols)
        if 'ASS_ASSIGNMENT' not in selected:
            selected.insert(0, 'ASS_ASSIGNMENT')
        if 'DATE' not in selected:
            selected.insert(0, 'DATE')
        if 'CSPL_RECEIVED_CALLS' not in selected:
            selected.append('CSPL_RECEIVED_CALLS')

        logger.info('Selecting features...')
        ff.select_features(selected)
        train_df = ff.X
        if temp_path is not None:
            train_df.to_csv(temp_path)
    else:
        logger.info('Loading train Dataframe...')
        train_df = pd.read_csv(temp_path, encoding='latin-1', index_col=0, parse_dates=['DATE'])

    weather_df.reset_index(inplace=True)
    return train_df, weather_df
def get_holidays():
    """
    Plot the mean number of received calls per week number, one curve per assignment.

    The train Dataframe is loaded from CONFIG.preprocessed_train_path, the WEEK_NUMBER
    feature is created, and every assignment of CONFIG.submission_assignments is drawn on
    the same figure, saved as "holidays_absolute_values.jpg" in `visualization_path`.
    """
    df = load_train_df(CONFIG.preprocessed_train_path)
    ff = FeatureFactory(df, weather_df=None)
    ff('WEEK_NUMBER')
    df = ff.X
    for assignment in CONFIG.submission_assignments:
        df_assignment = df[df["ASS_ASSIGNMENT"] == assignment]
        weekly_mean = df_assignment.groupby('WEEK_NUMBER')['CSPL_RECEIVED_CALLS'].mean()
        # Week number on the x axis, mean calls on the y axis. Plotting the reset_index()
        # frame would also draw the WEEK_NUMBER column as a curve of its own.
        plt.plot(weekly_mean.index, weekly_mean.values)
    # Save once, when every curve is drawn, instead of rewriting the file per assignment.
    plt.savefig(os.path.join(visualization_path, "holidays_absolute_values.jpg"))
def complete_data_with_zeros(in_path, out_path=None):
    """
    Add the missing (DATE, ASS_ASSIGNMENT) rows to the train data, with zero calls.

    Every date found in the train Dataframe is combined with every assignment of
    CONFIG.submission_assignments. Combinations absent from the data get
    CSPL_RECEIVED_CALLS = 0; their other columns are left empty (NaN).

    Parameters
    ==========
    in_path: path given to load_train_df to load the train Dataframe.
    out_path: str or None, csv file in which the completed Dataframe is saved.
        None to skip saving.

    Returns
    =======
    The completed Dataframe.
    """
    logger.debug('Loading Dataframe...')
    train_df = load_train_df(in_path)

    logger.debug('Generating empty Dataframe...')
    dates = sorted(set(train_df['DATE']))
    # Both frames share a constant 'ZERO' key, so merging on it yields every
    # (date, assignment) pair.
    zero_date_df = pd.DataFrame({'DATE': dates})
    zero_date_df['ZERO'] = 0
    zero_ass_df = pd.DataFrame({'ASS_ASSIGNMENT': CONFIG.submission_assignments})
    zero_ass_df['ZERO'] = 0
    zero_df = zero_date_df.merge(zero_ass_df, how='left', on='ZERO')

    logger.debug('Completing Dataframe...')
    train_df = zero_df.merge(train_df, how='left', on=['DATE', 'ASS_ASSIGNMENT'])
    # Assign the result back: an inplace fillna on a selected column is a chained
    # assignment, which does not update the frame when pandas Copy-on-Write is enabled.
    train_df['CSPL_RECEIVED_CALLS'] = train_df['CSPL_RECEIVED_CALLS'].fillna(0)
    train_df = train_df.drop('ZERO', axis=1)

    if out_path is not None:
        logger.debug('Saving Dataframe...')
        train_df.to_csv(out_path)
    return train_df
def run(train_or_meteo=None, train_cols=None, meteo_cols=None, verbose=0):
    """
    Run the preprocessing of the train data, of the meteo data, or of both.

    Parameters
    ==========
    train_or_meteo: 'train', 'meteo' or None. None runs both parts; any other value
        runs neither.
    train_cols: forwarded to parse_train_as_df as `useful_cols`.
    meteo_cols: not used by this function.
    verbose: forwarded to parse_train_as_df.

    Returns
    =======
    The concatenated meteo Dataframe when the meteo part has run, None otherwise.
    """
    run_both = train_or_meteo is None

    if run_both or train_or_meteo == 'train':
        parse_train_as_df(
            CONFIG.raw_train_path,
            CONFIG.preprocessed_train_path,
            useful_cols=train_cols,
            verbose=verbose,
        )
        logger.info('Saved train in csv file.')

        complete_data_with_zeros(CONFIG.preprocessed_train_path, CONFIG.preprocessed_train_path_zeros)
        logger.info('Saved completed train in csv file.')

        zeros_df = load_train_df(CONFIG.preprocessed_train_path_zeros)
        complete_with_means(zeros_df, CONFIG.means_path)
        logger.info('Saved train with means in csv file.')

    if not (run_both or train_or_meteo == 'meteo'):
        return None

    print('Reading meteo file 1...')
    first_part = parse_meteo_as_df(CONFIG.raw_meteo_path1)
    print('Reading meteo file 2...')
    second_part = parse_meteo_as_df(CONFIG.raw_meteo_path2)
    print('Concatenating meteo files...')
    meteo_df = pd.concat([first_part, second_part])
    print('Meteo files concatenated. Running preprocessing...')

    steps = (
        # Number of departments where it has rained and where it has frozen
        ('Meteo1...', preprocess_meteo1),
        # Average amount of rain and average lowest temperatures in each department.
        ('Meteo2...', preprocess_meteo2),
        # Booleans for each department where it has rained and where it has frozen
        ('Meteo3...', preprocess_meteo3),
        # Booleans for each department where average amount of rain is above 1mm and where it has frozen.
        ('Meteo4...', preprocess_meteo4),
    )
    for banner, step in steps:
        print(banner)
        step(meteo_df)

    return meteo_df
def compare_calls(scale, out_path, assignments=None, datetime=None):
    """
    Plot the number of calls to compare them.

    Parameters
    ==========
    scale: 'DATETIME', 'DAY', 'WEEK' or 'YEAR'.
        'DATETIME': calls received on the week day and time of day of `datetime`, one curve
            per assignment, drawn together with the NUMB_FROZEN_DEPT and NUMB_WET_DEPT
            weather columns of the same week day. Saved as "DATETIME.jpg".
        'DAY': calls summed per (week number, week day, time); one curve per (week number,
            week day) plotted over the time of day, one figure per assignment.
        'WEEK': calls averaged per (week number, week day); one curve per week number
            plotted over the week days, one figure per assignment.
        'YEAR': calls averaged per week number; one curve per assignment plotted over the
            week numbers, all on a single figure.
    out_path: str, folder in which figures should be saved.
    assignments: str or list of str, assignments to take into account.
        None to take all the assignments of CONFIG.submission_assignments into account.
    datetime: required if scale is 'DATETIME', the datetime to filter on (an object
        providing isoweekday(), hour and minute).

    Example
    =======
    Week comparison: For each day of the week, take the average number of calls, then compare for each week of the year.
    """
    assert scale in ['DATETIME', 'DAY', 'WEEK', 'YEAR'], 'unknown scale: %r' % (scale,)
    if scale == 'DATETIME':
        # Checked up front so that a bad call fails before any data is loaded.
        assert datetime is not None, "datetime is required when scale is 'DATETIME'"

    if assignments is not None:
        if isinstance(assignments, str):
            assignments = [assignments]
        assert not set(assignments).difference(CONFIG.submission_assignments)
    else:
        assignments = CONFIG.submission_assignments

    df = load_train_df(CONFIG.preprocessed_train_path)
    df = df[df["ASS_ASSIGNMENT"].isin(assignments)]
    ff = FeatureFactory(df)
    for column in ["WEEK_NUMBER", "WEEK_DAY", "TIME"]:
        ff(column)
    df = ff.X

    if scale == 'DATETIME':
        week_day = datetime.isoweekday()
        # Filter on the columns created above rather than recomputing the feature.
        df = df[df['WEEK_DAY'] == week_day]
        df = df[df['TIME'] == datetime.hour + float(datetime.minute)/60]
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment].reset_index()
            plt.plot(df_assignment['CSPL_RECEIVED_CALLS'])
        weather_df = load_weather_df(CONFIG.preprocessed_meteo_path)
        good_days = [d for d in weather_df.index if d.isoweekday() == week_day]
        weather_df = weather_df.loc[good_days, :].reset_index()
        plt.plot(weather_df['NUMB_FROZEN_DEPT'])
        plt.plot(weather_df['NUMB_WET_DEPT'])
        plt.savefig(os.path.join(out_path, scale+".jpg"))
        # Clear the figure like the other scales do, so later plots start from scratch.
        plt.clf()

    elif scale == 'DAY':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY", "TIME"])
        df = grouped["CSPL_RECEIVED_CALLS"].sum().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # Iterate over the (week, day) pairs actually present instead of assuming a
            # numbering for weeks and week days: no day is skipped whatever the encoding.
            for _, df_day in df_assignment.groupby(["WEEK_NUMBER", "WEEK_DAY"]):
                plt.plot(df_day['TIME'], df_day["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale+"_"+assignment+".jpg"))
            plt.clf()

    elif scale == 'WEEK':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # Same idea: one curve per week number found in the data.
            for _, df_week in df_assignment.groupby("WEEK_NUMBER"):
                plt.plot(df_week['WEEK_DAY'], df_week["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale+"_"+assignment+".jpg"))
            plt.clf()

    elif scale == 'YEAR':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # WEEK_DAY is gone after the grouping above; the x axis is the week number.
            plt.plot(df_assignment['WEEK_NUMBER'], df_assignment["CSPL_RECEIVED_CALLS"])
        plt.savefig(os.path.join(out_path, scale+"_absolute_values.jpg"))
        plt.clf()
Пример #6
0
import preprocessing
import learning
from utils import load_submission, load_train_df
from configuration import CONFIG


if __name__ == "__main__":
    # Step 1: create the preprocessed train and meteo files.
    preprocessing.run()

    # Step 2: define the model and its evaluation settings.
    from sklearn.neighbors import KNeighborsRegressor

    # Loaded here but not passed to any of the calls below.
    train_frame = load_train_df(CONFIG.preprocessed_train_path)
    submission_frame = load_submission()

    knn = KNeighborsRegressor(n_neighbors=4, weights='distance')
    feature_names = ["YEAR", "WEEK_NUMBER", "WEEK_DAY", "TIME"]
    # One weight per feature name, in the same order.
    feature_weights = [1, 1, 1, 0.1]
    metric = 'mean_squared_error'
    folds = 3
    workers = 3
    verbosity = 0
    extra_fit_params = None

    # Step 3: test the model with cross-validation and show the result.
    cv_result = learning.cross_val_score(
        knn, feature_names, folds, feature_weights, metric, workers, verbosity, extra_fit_params,
        chunksize=100000,
    )
    print(cv_result)

    # Step 4: create the corresponding submission file.
    learning.create_submission_file(knn, feature_names, weights=feature_weights)