def get_df(cols, load_from_temp, temp_path):
    """
    Build the train Dataframe with the requested features, or reload it from
    a cached csv file, and return it together with the weather Dataframe.

    Parameters
    ==========
    cols: list of str, names of the features to create with FeatureFactory.
        The caller's list is never modified.
    load_from_temp: bool, if true the train Dataframe is read back from
        `temp_path` instead of being rebuilt.
    temp_path: str or None, csv file used as a cache. When the Dataframe is
        rebuilt it is written there (skipped if None); when
        `load_from_temp` is true it must not be None.

    Returns
    =======
    (train_df, weather_df). The index of weather_df has been reset.

    Raises
    ======
    AssertionError if `load_from_temp` is true and `temp_path` is None.
    """
    weather_df = load_weather_df(CONFIG.preprocessed_meteo_path_complete)

    if load_from_temp:
        assert temp_path is not None, \
            'temp_path is required when load_from_temp is set'
        logger.info('Loading train Dataframe...')
        train_df = pd.read_csv(temp_path, encoding='latin-1', index_col=0,
                               parse_dates=['DATE'])
    else:
        logger.info('Loading train Dataframe...')
        train_df = load_train_df(CONFIG.preprocessed_train_path_means)
        # NOTE: message kept for log compatibility; the weather Dataframe
        # itself is loaded at the top of the function.
        logger.info('Loading weather Dataframe...')

        logger.info('Creating features...')
        requested = list(cols)
        ff = FeatureFactory(train_df, weather_df)
        for col in requested:
            logger.info('Creating %s feature...', col)
            ff(col)

        # Work on a copy: the previous `cols += [...]` extended the caller's
        # list in place whenever the two key columns were already present.
        selected = list(requested)
        if 'ASS_ASSIGNMENT' not in selected:
            selected.insert(0, 'ASS_ASSIGNMENT')
        if 'DATE' not in selected:
            selected.insert(0, 'DATE')
        if 'CSPL_RECEIVED_CALLS' not in selected:
            selected.append('CSPL_RECEIVED_CALLS')

        logger.info('Selecting features...')
        ff.select_features(selected)
        train_df = ff.X
        if temp_path is not None:
            train_df.to_csv(temp_path)

    weather_df.reset_index(inplace=True)
    return train_df, weather_df
def get_holidays():
    """
    Plot, for every submission assignment, the mean number of received calls
    per week number, all on one figure.

    The figure is saved as "holidays_absolute_values.jpg" in the module-level
    `visualization_path` folder. Nothing is returned.
    """
    df = load_train_df(CONFIG.preprocessed_train_path)
    ff = FeatureFactory(df, weather_df=None)
    ff('WEEK_NUMBER')
    df = ff.X

    for assignment in CONFIG.submission_assignments:
        weekly_mean = (df[df["ASS_ASSIGNMENT"] == assignment]
                       .groupby('WEEK_NUMBER')['CSPL_RECEIVED_CALLS']
                       .mean())
        # Plot the mean against the week number. Handing plt.plot the
        # two-column frame produced by reset_index() drew WEEK_NUMBER itself
        # as an extra curve and used the row position as x axis.
        plt.plot(weekly_mean.index, weekly_mean.values)

    # Saved once, after every assignment has been drawn on the shared figure.
    plt.savefig(os.path.join(visualization_path,
                             "holidays_absolute_values.jpg"))
def complete_data_with_zeros(in_path, out_path=None):
    """
    Complete the train Dataframe so that every (DATE, ASS_ASSIGNMENT) pair is
    present, with 0 received calls for the pairs that were missing.

    Parameters
    ==========
    in_path: path given to load_train_df to read the train Dataframe.
    out_path: str or None, csv file in which the completed Dataframe is
        saved. None to skip saving.

    Returns
    =======
    The completed Dataframe: one row per date found in the input and per
    assignment of CONFIG.submission_assignments. Only CSPL_RECEIVED_CALLS is
    filled with 0; any other column coming from the input stays missing on
    the added rows.
    """
    logger.debug('Loading Dataframe...')
    train_df = load_train_df(in_path)

    logger.debug('Generating empty Dataframe...')
    dates = sorted(set(train_df['DATE']))
    zero_date_df = pd.DataFrame({'DATE': dates})
    zero_date_df['ZERO'] = 0
    zero_ass_df = pd.DataFrame(
        {'ASS_ASSIGNMENT': CONFIG.submission_assignments})
    zero_ass_df['ZERO'] = 0
    # Merging on the constant ZERO key pairs every date with every assignment.
    zero_df = zero_date_df.merge(zero_ass_df, how='left', on='ZERO')

    logger.debug('Completing Dataframe...')
    train_df = zero_df.merge(train_df, how='left',
                             on=['DATE', 'ASS_ASSIGNMENT'])
    # Assign the result back: fillna(inplace=True) on a selected column is a
    # chained assignment and does not update the frame under Copy-on-Write.
    train_df['CSPL_RECEIVED_CALLS'] = train_df['CSPL_RECEIVED_CALLS'].fillna(0)
    train_df = train_df.drop('ZERO', axis=1)

    if out_path is not None:
        logger.debug('Saving Dataframe...')
        train_df.to_csv(out_path)
    return train_df
def run(train_or_meteo=None, train_cols=None, meteo_cols=None, verbose=0):
    """
    Run the preprocessing of the train data, of the meteo data, or of both.

    Parameters
    ==========
    train_or_meteo: 'train', 'meteo' or None. None runs both parts.
    train_cols: passed as `useful_cols` to parse_train_as_df.
    meteo_cols: not used by this function.
    verbose: passed to parse_train_as_df.

    Returns
    =======
    The concatenated meteo Dataframe when the meteo part has run, None
    otherwise.
    """
    wants_train = train_or_meteo in (None, 'train')
    wants_meteo = train_or_meteo in (None, 'meteo')

    if wants_train:
        # Raw file -> csv, then zero-completed csv, then csv with means.
        parse_train_as_df(CONFIG.raw_train_path,
                          CONFIG.preprocessed_train_path,
                          useful_cols=train_cols,
                          verbose=verbose)
        logger.info('Saved train in csv file.')

        complete_data_with_zeros(CONFIG.preprocessed_train_path,
                                 CONFIG.preprocessed_train_path_zeros)
        logger.info('Saved completed train in csv file.')

        zeros_df = load_train_df(CONFIG.preprocessed_train_path_zeros)
        complete_with_means(zeros_df, CONFIG.means_path)
        logger.info('Saved train with means in csv file.')

    if not wants_meteo:
        return None

    print('Reading meteo file 1...')
    first_part = parse_meteo_as_df(CONFIG.raw_meteo_path1)
    print('Reading meteo file 2...')
    second_part = parse_meteo_as_df(CONFIG.raw_meteo_path2)
    print('Concatenating meteo files...')
    meteo_df = pd.concat([first_part, second_part])
    print('Meteo files concatenated. Running preprocessing...')

    meteo_steps = (
        # Number of departments where it has rained and where it has frozen
        ('Meteo1...', preprocess_meteo1),
        # Average amount of rain and average lowest temperatures in each
        # department.
        ('Meteo2...', preprocess_meteo2),
        # Booleans for each department where it has rained and where it has
        # frozen
        ('Meteo3...', preprocess_meteo3),
        # Booleans for each department where average amount of rain is above
        # 1mm and where it has frozen.
        ('Meteo4...', preprocess_meteo4),
    )
    for message, preprocess in meteo_steps:
        print(message)
        preprocess(meteo_df)

    return meteo_df
def compare_calls(scale, out_path, assignments=None, datetime=None):
    """
    Plot the number of received calls so that periods can be compared.

    Parameters
    ==========
    scale: 'DATETIME', 'DAY', 'WEEK' or 'YEAR'.
        'DATETIME': one curve per assignment with the calls of the rows whose
            week day and time match `datetime`, plus the NUMB_FROZEN_DEPT and
            NUMB_WET_DEPT weather columns for the same week day. Saved as
            "DATETIME.jpg".
        'DAY': calls summed per (week number, week day, time), one curve per
            day plotted against TIME. Saved as "DAY_<assignment>.jpg".
        'WEEK': calls averaged per (week number, week day), one curve per
            week plotted against WEEK_DAY. Saved as "WEEK_<assignment>.jpg".
        'YEAR': calls averaged per week number, one curve per assignment
            plotted against WEEK_NUMBER. Saved as "YEAR_absolute_values.jpg".
    out_path: str, folder in which figures should be saved.
    assignments: str or list of str, assignments to take into account.
        None to use all of CONFIG.submission_assignments.
    datetime: object exposing isoweekday(), hour and minute. Required when
        scale is 'DATETIME', ignored otherwise.

    Raises
    ======
    AssertionError if `scale` is unknown, if an assignment is not one of
    CONFIG.submission_assignments, or if scale is 'DATETIME' and `datetime`
    is None.

    Example
    =======
    Week comparison:
        For each day of the week, take the average number of calls,
        then compare for each week of the year.
    """
    assert scale in ['DATETIME', 'DAY', 'WEEK', 'YEAR']
    if assignments is not None:
        if isinstance(assignments, str):
            assignments = [assignments]
        assert not set(assignments).difference(CONFIG.submission_assignments)
    else:
        assignments = CONFIG.submission_assignments

    df = load_train_df(CONFIG.preprocessed_train_path)
    df = df[df["ASS_ASSIGNMENT"].isin(assignments)]
    ff = FeatureFactory(df)
    for column in ["WEEK_NUMBER", "WEEK_DAY", "TIME"]:
        ff(column)
    df = ff.X

    if scale == 'DATETIME':
        assert datetime is not None
        target_week_day = datetime.isoweekday()
        df = df[ff("WEEK_DAY") == target_week_day]
        # TIME is compared with the hour of the day as a decimal number.
        df = df[df['TIME'] == datetime.hour + float(datetime.minute) / 60]
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment].reset_index()
            plt.plot(df_assignment['CSPL_RECEIVED_CALLS'])
        weather_df = load_weather_df(CONFIG.preprocessed_meteo_path)
        good_days = [d for d in weather_df.index
                     if d.isoweekday() == target_week_day]
        weather_df = weather_df.loc[good_days, :].reset_index()
        plt.plot(weather_df['NUMB_FROZEN_DEPT'])
        plt.plot(weather_df['NUMB_WET_DEPT'])
        plt.savefig(os.path.join(out_path, scale + ".jpg"))
        # Clear the figure like the other scales do, so that curves do not
        # leak into the next plot.
        plt.clf()

    elif scale == 'DAY':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY", "TIME"])
        df = grouped["CSPL_RECEIVED_CALLS"].sum().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            for day in range(366):
                # NOTE(review): this assumes WEEK_NUMBER starts at 1 and
                # WEEK_DAY runs over 0..6, while the 'DATETIME' scale compares
                # WEEK_DAY with isoweekday() (1..7) - confirm against
                # FeatureFactory.
                week_index, week_day = divmod(day, 7)
                df_day = df_assignment[df_assignment['WEEK_NUMBER'] == week_index + 1]
                df_day = df_day[df_day['WEEK_DAY'] == week_day]
                plt.plot(df_day['TIME'], df_day["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale + "_" + assignment + ".jpg"))
            plt.clf()

    elif scale == 'WEEK':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # NOTE(review): week numbers 0..52 are scanned - confirm the
            # numbering used by FeatureFactory.
            for week_number in range(53):
                df_week = df_assignment[df_assignment['WEEK_NUMBER'] == week_number]
                plt.plot(df_week['WEEK_DAY'], df_week["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale + "_" + assignment + ".jpg"))
            plt.clf()

    elif scale == 'YEAR':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # WEEK_DAY is gone after grouping by week number, so the curve is
            # drawn against WEEK_NUMBER (indexing WEEK_DAY raised KeyError).
            plt.plot(df_assignment['WEEK_NUMBER'],
                     df_assignment["CSPL_RECEIVED_CALLS"])
        plt.savefig(os.path.join(out_path, scale + "_absolute_values.jpg"))
        plt.clf()
import preprocessing
import learning
from utils import load_submission, load_train_df
from configuration import CONFIG


def _main():
    """
    Preprocess the data, cross-validate a k-nearest-neighbours regressor and
    write the corresponding submission file.
    """
    # Create train and meteo preprocessed files
    preprocessing.run()

    # Imported here, after the preprocessing step, as in the original script.
    from sklearn.neighbors import KNeighborsRegressor

    # The two loaded frames are not used below; the loads are kept because
    # they were part of the script. Presumably a check that both files are
    # readable - TODO confirm.
    _df = load_train_df(CONFIG.preprocessed_train_path)
    _submission_df = load_submission()

    # Define a model (LogisticRegression was a previously tried alternative)
    model = KNeighborsRegressor(n_neighbors=4, weights='distance')
    feature_cols = ["YEAR", "WEEK_NUMBER", "WEEK_DAY", "TIME"]
    feature_weights = [1, 1, 1, 0.1]

    # Cross-validation settings
    scoring = 'mean_squared_error'
    k_fold = 3
    n_jobs = 3
    verbose = 0
    fit_params = None

    # Test the model
    score = learning.cross_val_score(model, feature_cols, k_fold,
                                     feature_weights, scoring, n_jobs,
                                     verbose, fit_params, chunksize=100000)
    print(score)

    # Create the corresponding submission file
    learning.create_submission_file(model, feature_cols,
                                    weights=feature_weights)


if __name__ == "__main__":
    _main()