Example #1
 def test_early_stopping(self):
     np.random.seed(3187)
     # Create a MERF model with a high early stopping threshold
     m = MERF(max_iterations=10, gll_early_stop_threshold=0.1)
     # Fit
     m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
     # The number of iterations should be less than max_iterations
     self.assertTrue(len(m.gll_history) < 10)
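These unit tests rely on fixtures (self.X_train, self.Z_train, self.clusters_train, and so on) built in a setUp method that is not shown. A minimal sketch of what such a fixture could look like, assuming synthetic data with a single random-intercept column per cluster; all names, shapes, and values here are illustrative, not taken from the merf test suite:

import unittest
import numpy as np
import pandas as pd

class MERFTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixtures: 3 clusters of 100 rows, random intercept only
        rng = np.random.RandomState(3187)
        n = 300
        clusters = pd.Series(np.repeat(["a", "b", "c"], 100))
        X = pd.DataFrame(rng.normal(size=(n, 3)), columns=["x0", "x1", "x2"])
        Z = pd.DataFrame(np.ones((n, 1)))  # ones column = random intercept
        y = pd.Series(X["x0"] + rng.normal(size=n))
        self.X_train, self.Z_train = X, Z
        self.clusters_train, self.y_train = clusters, y
        # Held-out rows from clusters seen in training ("known") ...
        self.X_known, self.Z_known = X.iloc[:5], Z.iloc[:5]
        self.clusters_known, self.y_known = clusters.iloc[:5], y.iloc[:5]
        # ... and rows assigned to clusters never seen in training ("new")
        self.X_new, self.Z_new = X.iloc[:2], Z.iloc[:2]
        self.clusters_new, self.y_new = pd.Series(["d", "e"]), y.iloc[:2]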
Example #2
 def test_viz(self):
     lgbm = LGBMRegressor()
     m = MERF(fixed_effects_model=lgbm, max_iterations=5)
     # Train
     m.fit(
         self.X_train,
         self.Z_train,
         self.clusters_train,
         self.y_train,
         self.X_known,
         self.Z_known,
         self.clusters_known,
         self.y_known,
     )
     plot_merf_training_stats(m)
Example #3
 def test_type_error(self):
     m = MERF(max_iterations=5)
     self.assertRaises(
         TypeError,
         m.fit,
         np.array(self.X_train),
         np.array(self.Z_train),
         np.array(self.clusters_train),
         self.y_train,
     )
Example #4
 def test_fit_and_predict_numpy(self):
     m = MERF(max_iterations=5)
     # Train
     m.fit(np.array(self.X_train), np.array(self.Z_train), self.clusters_train, self.y_train)
     # Predict Known Clusters
     yhat_known = m.predict(np.array(self.X_known), np.array(self.Z_known), self.clusters_known)
     self.assertEqual(len(yhat_known), 5)
     # Predict New Clusters
     yhat_new = m.predict(np.array(self.X_new), np.array(self.Z_new), self.clusters_new)
     self.assertEqual(len(yhat_new), 2)
Example #5
 def test_fit_and_predict_pandas(self):
     m = MERF(max_iterations=10)
     # Train
     m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
     self.assertEqual(len(m.gll_history), 10)
     # Predict Known Clusters
     yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
     self.assertEqual(len(yhat_known), 5)
     # Predict New Clusters
     yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
     self.assertEqual(len(yhat_new), 2)
Example #6
 def test_user_defined_fe_model(self):
     lgbm = LGBMRegressor()
     m = MERF(fixed_effects_model=lgbm, max_iterations=5)
     # Train
     m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)
     self.assertEqual(len(m.gll_history), 5)
     # Predict Known Clusters
     yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
     self.assertEqual(len(yhat_known), 5)
     # Predict New Clusters
     yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
     self.assertEqual(len(yhat_new), 2)
Example #7
 def test_validation_numpy(self):
     m = MERF(max_iterations=3)
     # Train
     m.fit(
         np.array(self.X_train),
         np.array(self.Z_train),
         self.clusters_train,
         self.y_train,
         np.array(self.X_new),
         np.array(self.Z_new),
         self.clusters_new,
         self.y_new,
     )
     self.assertEqual(len(m.val_loss_history), 3)
     # Predict Known Clusters
     yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
     self.assertEqual(len(yhat_known), 5)
     # Predict New Clusters
     yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
     self.assertEqual(len(yhat_new), 2)
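Both histories referenced in these tests, m.gll_history from training and m.val_loss_history from the validation set, can be plotted to eyeball convergence. A quick sketch, assuming matplotlib is available and m is the fitted model from the test above:

import matplotlib.pyplot as plt

plt.plot(m.gll_history, label="training GLL")
plt.plot(m.val_loss_history, label="validation loss")
plt.xlabel("EM iteration")
plt.legend()
plt.show()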
Example #8
    def test_pickle(self):
        m = MERF(max_iterations=5)
        # Train
        m.fit(self.X_train, self.Z_train, self.clusters_train, self.y_train)

        # Write to pickle file
        with open("model.pkl", "wb") as fin:
            pickle.dump(m, fin)

        # Read back from pickle file
        with open("model.pkl", "rb") as fout:
            m_pkl = pickle.load(fout)

        # Check that m is not the same object as m_pkl
        self.assertIsNot(m_pkl, m)
        # Predict Known Clusters
        yhat_known_pkl = m_pkl.predict(self.X_known, self.Z_known, self.clusters_known)
        yhat_known = m.predict(self.X_known, self.Z_known, self.clusters_known)
        assert_almost_equal(yhat_known_pkl, yhat_known)
        # Predict New Clusters
        yhat_new_pkl = m_pkl.predict(self.X_new, self.Z_new, self.clusters_new)
        yhat_new = m.predict(self.X_new, self.Z_new, self.clusters_new)
        assert_almost_equal(yhat_new_pkl, yhat_new)
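Because a trained MERF holds plain Python state (the fitted fixed-effects model plus the estimated random effects), the pickle round-trip above preserves predictions exactly. joblib, commonly used for sklearn-style models, would work the same way:

import joblib

joblib.dump(m, "model.joblib")
m_loaded = joblib.load("model.joblib")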
results.loc["Boosting_Ign","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))

# 4. Gradient tree-boosting including the grouping variable as a categorical variable ('Boosting_Cat')
X_train_cat = np.column_stack((group_train,X_train))
X_test_cat = np.column_stack((group_test,X_test))
data_train_cat = gpb.Dataset(X_train_cat, y_train, categorical_feature=[0])
cvbst = gpb.cv(params=params, train_set=data_train_cat,
               num_boost_round=1000, early_stopping_rounds=5,
               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
# Best number of iterations: 49
start_time = time.time() # measure time
bst = gpb.train(params=params, train_set=data_train_cat, num_boost_round=best_iter)
results.loc["Boosting_Cat","Time"] = time.time() - start_time
y_pred = bst.predict(data=X_test_cat)
results.loc["Boosting_Cat","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))

# 5. Mixed-effects random forest ('MERF')
from merf import MERF
rf_params={'max_depth': 6, 'n_estimators': 300}
merf_model = MERF(max_iterations=100, rf_params=rf_params)
print("Warning: the following takes a lot of time")
start_time = time.time() # measure time
merf_model.fit(pd.DataFrame(X_train), np.ones(shape=(ntrain,1)), pd.Series(group_train), y_train)
results.loc["MERF","Time"] = time.time() - start_time
y_pred = merf_model.predict(pd.DataFrame(X_test), np.ones(shape=(X_test.shape[0], 1)), pd.Series(group_test))  # Z: one row per test observation
results.loc["MERF","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))

print(results.apply(pd.to_numeric).round(3))
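The script assigns into a results DataFrame created earlier in the (truncated) script. A minimal reconstruction, assuming only the rows and columns used in this excerpt:

import pandas as pd

# Hypothetical setup of the `results` table the snippet fills in
results = pd.DataFrame(index=["Boosting_Ign", "Boosting_Cat", "MERF"],
                       columns=["RMSE", "Time"])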
Example #10
# Helper function to compute the mean absolute error (MAE)
def mae(pred, true):
    return np.mean(np.abs(pred - true))


# Split data into train and test sets
indices = np.arange(len(y_samples))
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_samples, y_samples, indices, test_size=0.2, random_state=301)

from merf import MERF
merf = MERF()
clusters_train = [ids[e] for e in idx_train]
clusters_train = pd.Series(clusters_train)
clusters_test = [ids[e] for e in idx_test]
clusters_test = pd.Series(clusters_test)

Z_train = np.ones(shape=(X_train.shape[0], 1))
merf.fit(X_train, Z_train, clusters_train, y_train)
train_preds = merf.predict(X_train, Z_train, clusters_train)
train_preds = np.maximum(train_preds, 0)  # Don't predict negative cases
print('Train MAE:', mae(train_preds, y_train))

Z_test = np.ones(shape=(X_test.shape[0], 1))
test_preds = merf.predict(X_test, Z_test, clusters_test)
test_preds = np.maximum(test_preds, 0)  # Don't predict negative cases
print('Test MAE:', mae(test_preds, y_test))
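Since MERF's gain over a pooled model comes from the per-cluster random effects, it can be informative to break the test error down by cluster. A sketch reusing the variables above:

import numpy as np
import pandas as pd

# Per-cluster mean absolute error on the test set
per_cluster = pd.DataFrame({
    "cluster": clusters_test.values,
    "abs_err": np.abs(test_preds - np.asarray(y_test)),
})
print(per_cluster.groupby("cluster")["abs_err"].mean().sort_values(ascending=False).head())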
Example #11
        test = data['train'][data['train']['TRR_ID'].isin(val_ids)]
    else:
        train_ids = nontest_ids
        test = data['test']

    train = filter_train_by_visit(
        visit_type, data['train'][data['train'].TRR_ID.isin(train_ids)])
    if eq_train_ratio:
        train = equalize_num_case_control(train, data['eq_cases_train_cols'])

    if visit_type == "all":
        if i > 0:
            val_aurocs2 = []
            for max_depth in [5, 10, 15]:
                model = MERF(n_estimators=100,
                             gll_early_stop_threshold=0.001,
                             max_iterations=2,
                             max_depth=max_depth)
                model.fit(train[cols],
                          pandas.DataFrame(np.ones((train.shape[0], 1))),
                          train.TRR_ID, train.diab_in_1_year)
                test_y_hat = model.predict(
                    test[cols], pandas.DataFrame(np.ones((test.shape[0], 1))),
                    test.TRR_ID)
                test_auroc = roc_auc_score(test.diab_in_1_year, test_y_hat)
                val_aurocs2.append(test_auroc)
            max_depth = [5, 10, 15][np.argmax(val_aurocs2)]
        else:
            max_depth = max_depths[np.argmax(val_aurocs)]

        model = MERF(n_estimators=100,
                     gll_early_stop_threshold=0.001,
Example #12
 def test_not_fitted_error(self):
     m = MERF()
     with self.assertRaises(NotFittedError):
         m.predict(self.X_known, self.Z_known, self.clusters_known)
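The NotFittedError here is presumably sklearn's exception of the same name, raised by merf when predict is called on an untrained model. A minimal sketch of guarding against it; X, Z, and clusters are assumed to be defined as in the tests above:

from sklearn.exceptions import NotFittedError
from merf import MERF

m = MERF()
try:
    yhat = m.predict(X, Z, clusters)  # X, Z, clusters: assumed to exist
except NotFittedError:
    print("call fit() before predict()")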
Example #13
classifier = LogisticRegression(random_state=0)
print(classifier.fit(X_train, y_train))

y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  # use a new name so the imported function is not shadowed
print(cm)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    classifier.score(X_test, y_test)))
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Z_train = np.ones((X_train.shape[0], 1))  # one random-intercept column per training row
print(Z_train)
from merf import MERF
mrf = MERF(n_estimators=300, max_iterations=100)
mrf.fit(X_train, Z_train, clusters_train, y_train)

cat_vars = ['Speaker', 'Token', 'Item']
for var in cat_vars:
    dummies = pd.get_dummies(data[var], prefix=var)
    data = data.join(dummies)

data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]

data_final = data[to_keep]
data_final_vars = data_final.columns.values.tolist()
y = ['Subj case']
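The dummy-encoding loop above can be collapsed into a single pandas call that encodes the listed columns and drops the originals in one step, which makes the to_keep filter unnecessary. A sketch assuming data and cat_vars as defined above:

import pandas as pd

data_final = pd.get_dummies(data, columns=cat_vars)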
Example #14
import math
import numpy as np
import lightgbm as lgb
from merf import MERF

# Assumes `ap` (a DataFrame), preprocessing() and get_r2_numpy_corrcoef() are defined elsewhere.
def merf(normalise=False):
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['l1', 'rmse'],
        'learning_rate': 0.001,
        'feature_fraction': 0.8,
        'max_depth': 6,
        'max_bin': 512,
        'num_leaves': 40,
        'num_iterations': 100000,
        'n_estimators': 300,
        'verbose': -1,
    }
    # 'bagging_fraction': 0.7, 'bagging_freq': 10, 'num_leaves': 12

    gbm = lgb.LGBMRegressor(**hyper_params)
    
    ap2 = ap.fillna(method = "pad") 
    ap2.isna().sum().sum()
    X_train, Y_train, X_test, Y_test = preprocessing(ap2, hour2int = True, onehotencode = False)
      
    Z_train = np.ones((len(X_train), 1))
    
    clusters_train = X_train['hours']
    clusters_test = X_test['hours']

    X_train1 = X_train.drop(['hours'], axis=1)
    X_test1 = X_test.drop(['hours'], axis=1)
    
    # Normalising is commonly unnecessary for boosting, but can matter for the
    # mixed-effects part; only the predictors X are normalised, never the response Y.
    if normalise:
        X_train1 = (X_train1 - X_train1.mean()) / X_train1.std()
        X_test1 = (X_test1 - X_test1.mean()) / X_test1.std()

    # my_imputer = SimpleImputer()
    # X_train1 = my_imputer.fit_transform(X_train1)  # fill missing values
    # X_test1 = my_imputer.transform(X_test1)

    # Check for remaining missing values
    print(Y_train.isnull().any().any(), X_train1.isnull().any().any(), X_test1.isnull().any().any())

    merf = MERF(fixed_effects_model=gbm, max_iterations=4)
    merf.fit(X_train1, Z_train, clusters_train, Y_train)
    
    Z_test = np.ones((len(X_test1), 1))
    y_pred = merf.predict(X_test1, Z_test, clusters_test)
    # If Y had been normalised, predictions would need rescaling:
    # y_pred = y_pred * Y_train.std() + Y_train.mean()

    mae = abs(y_pred - Y_test).mean()
    rmse = math.sqrt(((y_pred - Y_test) ** 2).mean())
    rrmse = rmse / Y_test.median()
    r2 = get_r2_numpy_corrcoef(Y_test, y_pred)
    return (mae, rmse, rrmse, r2)
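A possible driver for the function above, assuming its surrounding context (ap, preprocessing, get_r2_numpy_corrcoef) is in scope; the loop compares raw against normalised predictors:

# Hypothetical call comparing raw vs. normalised predictors
for normalise in (False, True):
    mae_, rmse_, rrmse_, r2_ = merf(normalise=normalise)
    print(f"normalise={normalise}: MAE={mae_:.3f}, RMSE={rmse_:.3f}, "
          f"rRMSE={rrmse_:.3f}, R2={r2_:.3f}")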
Example #15
#r1  /16  # 0.56 0-16
#r1 /5: # 0.625 19-23
#catboost
 
 
# encode string class values as integers
import inspect
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from merf import MERF
X = X_train

print(inspect.signature(MERF))  # check the MERF constructor signature
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', booster='dart', learning_rate=0.007,
                           max_depth=6, n_estimators=300, gamma=5, alpha=2)

merf = MERF(fixed_effects_model=xgb_reg, max_iterations=20)
Z_train = np.ones((len(X_train), 1))

clusters_train = X_train['hours']
clusters_test = X_test['hours']

my_imputer = SimpleImputer()
X_train = my_imputer.fit_transform(X_train)  # fit on train and fill missing values
X_test = my_imputer.transform(X_test)        # reuse the fitted imputer on the test set
merf.fit(X_train, Z_train, clusters_train, Y_train)


    

Z_test = np.ones((len(X_test), 1))
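The fragment stops right after building Z_test; a hypothetical continuation, assuming Y_test is defined in the elided context:

# Hypothetical continuation of the fragment above; Y_test assumed to exist
y_pred = merf.predict(X_test, Z_test, clusters_test)
print("test MAE:", np.abs(y_pred - Y_test).mean())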