def lgbm_insight_er():
    """Cross-validate a quantile LightGBM regressor and summarise its scores.

    Scales and imputes the module-level ``X_raw_er`` frame, then runs
    ``cross_validate`` (default splitter) against ``y_er`` and averages
    the per-fold scores.

    Returns:
        A one-row DataFrame indexed by ``'LGBM'`` with the columns
        ``train_r2``, ``train_rmse``, ``train_mae``, ``test_r2``,
        ``test_rmse`` and ``test_mae``.
    """
    preprocess = Pipeline([
        ('scaler', t.MyScaler(dont_scale='for_profit')),
        ('knn', t.KNNKeepDf()),
    ])
    preprocess.fit(X_raw_er)
    features = preprocess.transform(X_raw_er)

    # Run once to get ideal parameters
    # params = {
    #     'max_bin': [10, 20, 50, 100, 255],
    #     'num_leaves': [5, 10, 31, 50],
    #     'min_data_in_leaf': [10, 20, 30],
    #     'bagging_fraction': [.1, .3, .5, .7, 1]
    # }
    # lgb_q = LGBMRegressor(objective='quantile')
    # gs = RandomizedSearchCV(lgb_q, params,
    #                         scoring=['r2', 'neg_mean_squared_error',
    #                                  'neg_mean_absolute_error'],
    #                         refit='neg_mean_squared_error')
    # gs.fit(X, y_er)

    regressor = LGBMRegressor(objective='quantile',
                              num_leaves=50,
                              max_bin=100,
                              bagging_fraction=0.1)

    scorers = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
    cv_results = cross_validate(regressor,
                                features.to_numpy(),
                                y_er,
                                scoring=scorers,
                                return_train_score=True)

    summary = {}
    for split in ('train', 'test'):
        neg_mse = cv_results[split + '_neg_mean_squared_error']
        neg_mae = cv_results[split + '_neg_mean_absolute_error']
        summary[split + '_r2'] = [cv_results[split + '_r2'].mean()]
        # RMSE is taken per fold first and averaged afterwards.
        summary[split + '_rmse'] = [np.sqrt(np.abs(neg_mse)).mean()]
        summary[split + '_mae'] = [abs(neg_mae.mean())]

    return pd.DataFrame(summary, index=['LGBM'])
def r_out(df, x_vars, y_var, csv_label):
    """Export a scaled and imputed modelling table as CSV for use in R, etc.

    Args:
        df: Source frame handed to ``get_train_test``.
        x_vars: Predictor selection handed to ``get_train_test``.
        y_var: Target selection handed to ``get_train_test``.
        csv_label: File stem; the table is written to
            ``data/processed/<csv_label>.csv``.

    Returns:
        None. The only effect is the CSV written to disk.
    """
    predictors_raw, target = get_train_test(df, x_vars, y_var)

    preprocess = Pipeline([
        ('scaler', t.MyScaler(dont_scale=['for_profit'])),
        ('knn', t.KNNKeepDf()),
    ])
    preprocess.fit(predictors_raw)
    predictors = preprocess.transform(predictors_raw)

    # Target goes first, followed by the processed predictors.
    merged = pd.concat([target, predictors], axis=1)
    destination = 'data/processed/' + csv_label + '.csv'
    merged.to_csv(destination, index=False)
def lr_insight_wr():
    """Cross-validate a linear regression and summarise its scores.

    Scales and imputes the module-level ``X_raw`` frame, then runs
    ``cross_validate`` (default splitter) against ``y`` and averages the
    per-fold scores.

    Returns:
        A one-row DataFrame indexed by ``'LR'`` with the columns
        ``train_r2``, ``train_rmse``, ``train_mae``, ``test_r2``,
        ``test_rmse`` and ``test_mae``.
    """
    steps = [('scaler', t.MyScaler(dont_scale='for_profit')),
             ('knn', t.KNNKeepDf())]
    pipe = Pipeline(steps)
    pipe.fit(X_raw)
    X = pipe.transform(X_raw)

    # No up-front ``lr.fit(X, y)``: cross_validate fits a fresh clone of the
    # estimator on every split, so a full-data fit here would be discarded.
    lr = LinearRegression()
    cv_results = cross_validate(
        lr,
        X,
        y,
        scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
        return_train_score=True)

    def mean_rmse(neg_mse_scores):
        # Scores arrive as negated MSE; take RMSE per fold, then average.
        return np.mean([np.sqrt(abs(score)) for score in neg_mse_scores])

    output = pd.DataFrame(
        {
            'train_r2': [cv_results['train_r2'].mean()],
            'train_rmse':
            [mean_rmse(cv_results['train_neg_mean_squared_error'])],
            'train_mae':
            [abs(cv_results['train_neg_mean_absolute_error'].mean())],
            'test_r2': [cv_results['test_r2'].mean()],
            'test_rmse':
            [mean_rmse(cv_results['test_neg_mean_squared_error'])],
            'test_mae':
            [abs(cv_results['test_neg_mean_absolute_error'].mean())],
        },
        index=['LR'])
    return output
def resample_kfold(X, y, sparse_df, model, folds=5, prefix='name_'):
    """Score ``model`` over repeated random splits with simulated missingness.

    For every split the held-out rows are passed through ``sim_miss`` to
    simulate missing values, the scale/impute pipeline is fitted on the
    training rows only, and a copy of ``model`` is fitted and scored on
    both parts.

    Note that splits come from ``ShuffleSplit``: each one is an independent
    random draw with a ``1 / folds`` test share, so test sets may overlap
    between splits, and results vary between calls because no random state
    is fixed.

    Args:
        X: Predictor DataFrame; rows are selected positionally.
        y: Target aligned row-for-row with ``X`` (pandas Series or array).
        sparse_df: Second argument handed to ``sim_miss``.
        model: Unfitted estimator exposing ``fit`` and ``predict``; a
            shallow copy is fitted per split, the original is left alone.
        folds: Number of splits; also sets the test share to ``1 / folds``.
        prefix: String prepended to every key of the result.

    Returns:
        Dict mapping ``prefix + '<train|test>_<r2|mse|mae>'`` to a list
        holding one score per split.
    """
    splitter = ShuffleSplit(n_splits=folds, test_size=(1 / folds))

    # The splitter yields positions. A pandas Series indexed with ``[]``
    # would treat them as labels, so go through ``.iloc`` when available to
    # stay aligned with the positional ``X.iloc`` selection below.
    y_rows = y.iloc if hasattr(y, 'iloc') else y

    train_r2 = []
    train_mse = []
    train_mae = []
    test_r2 = []
    test_mse = []
    test_mae = []

    for train_index, test_index in splitter.split(X, y):
        xtrain, xtest = X.iloc[train_index, :], X.iloc[test_index, :]
        ytrain, ytest = y_rows[train_index], y_rows[test_index]

        # Simulate missingness on the test part only
        xtest = pd.DataFrame(xtest, columns=X.columns)
        xtest = sim_miss(xtest, sparse_df)

        # Fit scaling/imputation on the training part only ...
        steps = [('scaler', t.MyScaler(defs.dummy_vars)),
                 ('knn', t.KNNKeepDf())]
        pipe = Pipeline(steps)
        pipe.fit(xtrain)
        xtrain = pipe.transform(xtrain)
        # ... so the simulated gaps in the test part are imputed from
        # training data.
        xtest = pipe.transform(xtest)

        loop_model = copy.copy(model)
        loop_model.fit(xtrain, ytrain)

        # Predict once per part and reuse for all three metrics.
        train_pred = loop_model.predict(xtrain)
        test_pred = loop_model.predict(xtest)

        train_r2.append(r2_score(ytrain, train_pred))
        train_mse.append(mean_squared_error(ytrain, train_pred))
        train_mae.append(mean_absolute_error(ytrain, train_pred))
        test_r2.append(r2_score(ytest, test_pred))
        test_mse.append(mean_squared_error(ytest, test_pred))
        test_mae.append(mean_absolute_error(ytest, test_pred))

    performance = {
        prefix + 'train_r2': train_r2,
        prefix + 'train_mse': train_mse,
        prefix + 'train_mae': train_mae,
        prefix + 'test_r2': test_r2,
        prefix + 'test_mse': test_mse,
        prefix + 'test_mae': test_mae
    }
    return performance
# Load the interim modelling frame.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
complete_df = pd.read_pickle('data/interim/complete_df.pickle')

# Two targets over the same predictor selection: 'would_recommend' and
# 'RATING_EST'.
X_raw, y = get_train_test(complete_df,
                          defs.sparse_vars,
                          'would_recommend',
                          return_full=True)
X_raw_er, y_er = get_train_test(complete_df,
                                defs.sparse_vars,
                                'RATING_EST',
                                return_full=True)

X_raw.rename(columns=renames, inplace=True)
X_raw_er.rename(columns=renames, inplace=True)

steps = [('scaler', t.MyScaler(dont_scale='for_profit')),
         ('knn', t.KNNKeepDf())]
pipe = Pipeline(steps)
# Scaling/imputation is fitted on the predictors alone. The second
# positional argument of Pipeline.fit is the target ``y``, so a list of
# column names does not belong there.
pipe.fit(X_raw)
X = pipe.transform(X_raw)

#%%
# RandomSearch best XGBRegressor parameters
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.5, 1],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

rs = RandomizedSearchCV(XGBRegressor(), params)