def test_early_stopping(self):
    np.random.seed(3187)
    # Create a MERF model with a high early stopping threshold
    m = MERF(max_iterations=10, gll_early_stop_threshold=0.1)
    # Fit
    m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
    # The number of iterations should be less than max_iterations
    self.assertTrue(len(m.gll_history) < 10)
def test_fit_and_predict_numpy(self):
    m = MERF(max_iterations=5)
    # Train
    m.fit(np.array(self.X_train), np.array(self.Z_train), self.clusters_train, self.y_train)
    # Predict Known Clusters
    yhat_known = m.predict(np.array(self.X_known), np.array(self.Z_known), self.clusters_known)
    self.assertEqual(len(yhat_known), 5)
    # Predict New Clusters
    yhat_new = m.predict(np.array(self.X_new), np.array(self.Z_new), self.clusters_new)
    self.assertEqual(len(yhat_new), 2)
def test_fit_and_predict_pandas(self):
    m = MERF(max_iterations=10)
    # Train
    m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
    self.assertEqual(len(m.gll_history), 10)
    # Predict Known Clusters
    yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
    self.assertEqual(len(yhat_known), 5)
    # Predict New Clusters
    yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
    self.assertEqual(len(yhat_new), 2)
def test_user_defined_fe_model(self):
    lgbm = LGBMRegressor()
    m = MERF(fixed_effects_model=lgbm, max_iterations=5)
    # Train
    m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
    self.assertEqual(len(m.gll_history), 5)
    # Predict Known Clusters
    yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
    self.assertEqual(len(yhat_known), 5)
    # Predict New Clusters
    yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
    self.assertEqual(len(yhat_new), 2)
def test_viz(self):
    lgbm = LGBMRegressor()
    m = MERF(fixed_effects_model=lgbm, max_iterations=5)
    # Train
    m.fit(
        self.X_train,
        self.Z_train,
        self.clusters_train,
        self.y_train,
        self.X_known,
        self.Z_known,
        self.clusters_known,
        self.y_known,
    )
    plot_merf_training_stats(m)
def test_validation_numpy(self):
    m = MERF(max_iterations=3)
    # Train
    m.fit(
        np.array(self.X_train),
        np.array(self.Z_train),
        self.clusters_train,
        self.y_train,
        np.array(self.X_new),
        np.array(self.Z_new),
        self.clusters_new,
        self.y_new,
    )
    self.assertEqual(len(m.val_loss_history), 3)
    # Predict Known Clusters
    yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
    self.assertEqual(len(yhat_known), 5)
    # Predict New Clusters
    yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
    self.assertEqual(len(yhat_new), 2)
def test_pickle(self):
    m = MERF(max_iterations=5)
    # Train
    m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
    # Write to pickle file
    with open("model.pkl", "wb") as fout:
        pickle.dump(m, fout)
    # Read back from pickle file
    with open("model.pkl", "rb") as fin:
        m_pkl = pickle.load(fin)
    # Check that m is not the same object as m_pkl
    self.assertIsNot(m_pkl, m)
    # Predict Known Clusters
    yhat_known_pkl = m_pkl.predict(self.X_known, self.Z_known, self.clusters_known)
    yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
    assert_almost_equal(yhat_known_pkl, yhat_known)
    # Predict New Clusters
    yhat_new_pkl = m_pkl.predict(self.X_new, self.Z_new, self.clusters_new)
    yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
    assert_almost_equal(yhat_new_pkl, yhat_new)
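
# The tests above reference fixtures such as self.X_train, self.Z_train and
# self.clusters_known without showing where they come from. The following is a
# minimal, purely illustrative sketch of a setUp that would satisfy them (the
# merf test suite builds its fixtures differently); the row counts follow from
# the assertions: 5 test rows from known clusters, 2 from new clusters.
def setUp(self):
    np.random.seed(3187)
    n_train = 30
    # Fixed-effect covariates, a random-intercept column of ones, cluster ids
    self.X_train = pd.DataFrame(np.random.normal(size=(n_train, 3)))
    self.Z_train = pd.DataFrame(np.ones((n_train, 1)))
    self.clusters_train = pd.Series(np.repeat([0, 1, 2], 10))
    self.y_train = pd.Series(np.random.normal(size=n_train))
    # 5 rows from clusters seen during training
    self.X_known = pd.DataFrame(np.random.normal(size=(5, 3)))
    self.Z_known = pd.DataFrame(np.ones((5, 1)))
    self.clusters_known = pd.Series([0, 1, 2, 0, 1])
    self.y_known = pd.Series(np.random.normal(size=5))
    # 2 rows from clusters never seen during training
    self.X_new = pd.DataFrame(np.random.normal(size=(2, 3)))
    self.Z_new = pd.DataFrame(np.ones((2, 1)))
    self.clusters_new = pd.Series([7, 8])
    self.y_new = pd.Series(np.random.normal(size=2))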
results.loc["Boosting_Ign","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2)) # 4. Gradient tree-boosting including the grouping variable as a categorical variable ('Boosting_Cat') X_train_cat = np.column_stack((group_train,X_train)) X_test_cat = np.column_stack((group_test,X_test)) data_train_cat = gpb.Dataset(X_train_cat, y_train, categorical_feature=[0]) cvbst = gpb.cv(params=params, train_set=data_train_cat, num_boost_round=1000, early_stopping_rounds=5, nfold=4, verbose_eval=True, show_stdv=False, seed=1) best_iter = np.argmin(cvbst['l2-mean']) print("Best number of iterations: " + str(best_iter)) # Best number of iterations: 49 start_time = time.time() # measure time bst = gpb.train(params=params, train_set=data_train_cat, num_boost_round=best_iter) results.loc["Boosting_Cat","Time"] = time.time() - start_time y_pred = bst.predict(data=X_test_cat) results.loc["Boosting_Cat","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2)) # 5. Mixed-effects random forest ('MERF') from merf import MERF rf_params={'max_depth': 6, 'n_estimators': 300} merf_model = MERF(max_iterations=100, rf_params=rf_params) print("Warning: the following takes a lot of time") start_time = time.time() # measure time merf_model.fit(pd.DataFrame(X_train), np.ones(shape=(ntrain,1)), pd.Series(group_train), y_train) results.loc["MERF","Time"] = time.time() - start_time y_pred = merf_model.predict(pd.DataFrame(X_test), np.ones(shape=(ntrain,1)), pd.Series(group_test)) results.loc["MERF","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2)) print(results.apply(pd.to_numeric).round(3))
def mae(pred, true):
    return np.mean(np.abs(pred - true))


# Split data into train and test sets
indices = np.arange(len(y_samples))
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_samples, y_samples, indices, test_size=0.2, random_state=301)


from merf import MERF

merf = MERF()

# Map sample indices back to their cluster ids
clusters_train = pd.Series([ids[e] for e in idx_train])
clusters_test = pd.Series([ids[e] for e in idx_test])

Z_train = np.ones(shape=(X_train.shape[0], 1))
merf.fit(X_train, Z_train, clusters_train, y_train)

train_preds = merf.predict(X_train, Z_train, clusters_train)
train_preds = np.maximum(train_preds, 0)  # Don't predict negative cases
print('Train MAE:', mae(train_preds, y_train))

Z_test = np.ones(shape=(X_test.shape[0], 1))
test_preds = merf.predict(X_test, Z_test, clusters_test)
test_preds = np.maximum(test_preds, 0)  # Don't predict negative cases
print('Test MAE:', mae(test_preds, y_test))
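
# For context, a naive baseline (an addition, not part of the original
# notebook): predict the training mean for every test sample and compare MAEs.
baseline_preds = np.repeat(np.mean(y_train), len(y_test))
print('Baseline (train-mean) Test MAE:', mae(baseline_preds, y_test))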
train = filter_train_by_visit(
    visit_type, data['train'][data['train'].TRR_ID.isin(train_ids)])
if eq_train_ratio:
    train = equalize_num_case_control(train, data['eq_cases_train_cols'])
if visit_type == "all":
    if i > 0:
        # Re-tune max_depth on this split using the test AUROC
        val_aurocs2 = []
        for max_depth in [5, 10, 15]:
            model = MERF(n_estimators=100, gll_early_stop_threshold=0.001,
                         max_iterations=2, max_depth=max_depth)
            model.fit(train[cols],
                      pandas.DataFrame(np.ones((train.shape[0], 1))),
                      train.TRR_ID, train.diab_in_1_year)
            test_y_hat = model.predict(
                test[cols], pandas.DataFrame(np.ones((test.shape[0], 1))),
                test.TRR_ID)
            test_auroc = roc_auc_score(test.diab_in_1_year, test_y_hat)
            val_aurocs2.append(test_auroc)
        max_depth = [5, 10, 15][np.argmax(val_aurocs2)]
    else:
        max_depth = max_depths[np.argmax(val_aurocs)]
# Refit with the selected max_depth
model = MERF(n_estimators=100, gll_early_stop_threshold=0.001,
             max_iterations=2, max_depth=max_depth)
model.fit(train[cols], pandas.DataFrame(np.ones((train.shape[0], 1))),
          train.TRR_ID, train.diab_in_1_year)
def merf(normalise=False):
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['l1', 'rmse'],
        'learning_rate': 0.001,
        'feature_fraction': 0.8,
        'max_depth': 6,
        'max_bin': 512,
        'num_leaves': 40,
        'num_iterations': 100000,
        'n_estimators': 300,
        'verbose': -1,
        # 'bagging_fraction': 0.7,
        # 'bagging_freq': 10,
        # 'num_leaves': 12,
    }
    gbm = lgb.LGBMRegressor(**hyper_params)

    ap2 = ap.fillna(method="pad")  # forward-fill missing values
    ap2.isna().sum().sum()
    X_train, Y_train, X_test, Y_test = preprocessing(ap2, hour2int=True, onehotencode=False)

    Z_train = np.ones((len(X_train), 1))
    clusters_train = X_train['hours']
    clusters_test = X_test['hours']
    X_train1 = X_train.drop(["hours"], axis=1)
    X_test1 = X_test.drop(["hours"], axis=1)

    # Normalising is commonly unnecessary for boosting, but for the mixed-effects
    # model we may want to normalise. We only normalise X (the predictors),
    # not Y (the response).
    if normalise:
        X_train1 = (X_train1 - X_train1.mean()) / X_train1.std()
        X_test1 = (X_test1 - X_test1.mean()) / X_test1.std()
    # my_imputer = SimpleImputer()
    # X_train1 = my_imputer.fit_transform(X_train1)  # fill missing values
    # X_test1 = my_imputer.fit_transform(X_test1)

    # Check for missing values
    print(Y_train.isnull().any().any(), X_train1.isnull().any().any(), X_test1.isnull().any().any())

    merf = MERF(gbm, max_iterations=4)
    merf.fit(X_train1, Z_train, clusters_train, Y_train)

    Z_test = np.ones((len(X_test1), 1))
    y_pred = merf.predict(X_test1, Z_test, clusters_test)
    # If Y were also normalised, the prediction would need back-transforming:
    # y_pred = y_pred * Y_train.std() + Y_train.mean()

    mae = abs(y_pred - Y_test).mean()
    rmse = math.sqrt(((y_pred - Y_test) * (y_pred - Y_test)).mean())
    rrmse = rmse / Y_test.median()
    r2 = get_r2_numpy_corrcoef(Y_test, y_pred)
    return (mae, rmse, rrmse, r2)
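
# Example usage (illustrative; assumes the surrounding script defines `ap`,
# `preprocessing`, and `get_r2_numpy_corrcoef`): compare raw vs. normalised
# predictors.
for norm in (False, True):
    mae_, rmse_, rrmse_, r2_ = merf(normalise=norm)
    print(f"normalise={norm}: MAE={mae_:.3f} RMSE={rmse_:.3f} "
          f"rRMSE={rrmse_:.3f} R2={r2_:.3f}")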