# Example #1 (score: 0)
# --- Example 1: logistic-regression baseline, then MERF, then dummy-coding --
# NOTE(review): X_train/X_test/y_train/y_test, clusters_train, `data`, `np`
# and `pd` are assumed to be defined earlier in the file — confirm upstream.
classifier = LogisticRegression(random_state=0)
print(classifier.fit(X_train, y_train))

y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
# Bug fix: the original rebound the name `confusion_matrix` to its own result,
# shadowing the imported function and breaking any later call to it.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    classifier.score(X_test, y_test)))
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Random-effects design matrix: a single intercept column, one row per
# training sample (generalized from the hard-coded 271 rows).
Z_train = np.ones((len(X_train), 1))
print(Z_train)
from merf import MERF
mrf = MERF(n_estimators=300, max_iterations=100)
mrf.fit(X_train, Z_train, clusters_train, y_train)

# One-hot encode each categorical predictor and append the dummy columns.
cat_vars = ['Speaker', 'Token', 'Item']
for var in cat_vars:
    # Fix: the original assigned a dead string ('var_' + var) to cat_list and
    # immediately overwrote it; the useless assignment is removed.
    dummies = pd.get_dummies(data[var], prefix=var)
    data = data.join(dummies)

# Drop the raw categorical columns, keeping only the dummies and numerics.
data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]

data_final = data[to_keep]
data_final_vars = data_final.columns.values.tolist()
y = ['Subj case']
# Example #2 (score: 0)
 def test_not_fitted_error(self):
     """predict() on a MERF that was never fitted must raise NotFittedError."""
     unfitted_model = MERF()
     with self.assertRaises(NotFittedError):
         unfitted_model.predict(self.X_known, self.Z_known, self.clusters_known)
# Example #3 (score: 0)

# In[78]:

# Hold out 20% of the samples for testing; carry the row indices through the
# split so cluster ids can be aligned with each partition afterwards.
indices = np.arange(len(y_samples))
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_samples, y_samples, indices, test_size=0.2, random_state=301)

# In[ ]:

from merf import MERF

import xgboost as xgb
model = xgb.XGBRegressor()
merf = MERF(model)

# Map the split indices back to their cluster labels.
clusters_train = pd.Series([ids[i] for i in idx_train])
clusters_test = pd.Series([ids[i] for i in idx_test])

# Random effects: a single intercept column, one row per training sample.
Z_train = np.ones((X_train.shape[0], 1))
merf.fit(X_train, Z_train, clusters_train, y_train)
train_preds = np.maximum(merf.predict(X_train, Z_train, clusters_train), 0)  # clamp negative predictions
print('Train MAE:', mae(train_preds, y_train))

Z_test = np.ones((X_test.shape[0], 1))
test_preds = np.maximum(merf.predict(X_test, Z_test, clusters_test), 0)  # clamp negative predictions
# Example #4 (score: 0)
               early_stopping_rounds=5,
               nfold=4,
               verbose_eval=True,
               show_stdv=False,
               seed=1)
# Pick the boosting-round count that minimised the CV l2 error above.
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
# Best number of iterations: 49
start_time = time.time()  # measure time
bst = gpb.train(params=params,
                train_set=data_train_cat,
                num_boost_round=best_iter)
results.loc["Boosting_Cat", "Time"] = time.time() - start_time
y_pred = bst.predict(data=X_test_cat)
results.loc["Boosting_Cat", "RMSE"] = np.sqrt(np.mean((y_test - y_pred)**2))

# 5. Mixed-effects random forest ('MERF')
from merf import MERF
rf_params = {'max_depth': 6, 'n_estimators': 300}
merf_model = MERF(max_iterations=100, rf_params=rf_params)
print("Warning: the following takes a lot of time")
start_time = time.time()  # measure time
merf_model.fit(pd.DataFrame(X_train), np.ones(shape=(ntrain, 1)),
               pd.Series(group_train), y_train)
results.loc["MERF", "Time"] = time.time() - start_time
# Bug fix: the test-set random-effects design matrix must have one row per
# TEST sample; the original used `ntrain`, which fails (or silently
# misaligns) whenever train and test set sizes differ.
y_pred = merf_model.predict(pd.DataFrame(X_test), np.ones(shape=(len(X_test), 1)),
                            pd.Series(group_test))
results.loc["MERF", "RMSE"] = np.sqrt(np.mean((y_test - y_pred)**2))

print(results.apply(pd.to_numeric).round(3))
# Example #5 (score: 0)
def merf(normalise = False):
    """Fit a MERF with an LGBM base learner, clustering on the 'hours' column.

    Parameters
    ----------
    normalise : bool
        If True, z-score the predictors. Only X is scaled — normalising the
        response would also require rescaling the predictions back.

    Returns
    -------
    tuple
        (mae, rmse, rrmse, r2) evaluated on the held-out test set.

    NOTE(review): relies on module-level names `ap`, `preprocessing`, `lgb`,
    `MERF`, `math`, `np` and `get_r2_numpy_corrcoef` defined elsewhere in
    the file — confirm they are in scope before calling.
    """
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['l1', 'rmse'],
        'learning_rate': 0.001,
        'feature_fraction': 0.8,
        "max_depth": 6,
        "max_bin": 512,
        "num_leaves": 40,
        "num_iterations": 100000,
        "n_estimators": 300,
        "verbose": -1,
    }

    gbm = lgb.LGBMRegressor(**hyper_params)

    # Forward-fill missing values before preprocessing.
    # (Removed dead statement: `ap2.isna().sum().sum()` whose result was
    # discarded.)
    ap2 = ap.fillna(method="pad")
    X_train, Y_train, X_test, Y_test = preprocessing(ap2, hour2int=True, onehotencode=False)

    # Random effects: one intercept column per training row.
    Z_train = np.ones((len(X_train), 1))

    # 'hours' is the grouping (cluster) variable; drop it from the predictors.
    clusters_train = X_train['hours']
    clusters_test = X_test['hours']
    X_train1 = X_train.drop(["hours"], axis=1)
    X_test1 = X_test.drop(["hours"], axis=1)

    # Normalising is usually unnecessary for boosting, but may help the
    # mixed-effects part; only the predictors are scaled, never Y.
    if normalise:
        X_train1 = (X_train1 - X_train1.mean()) / X_train1.std()
        X_test1 = (X_test1 - X_test1.mean()) / X_test1.std()

    # Sanity check for remaining missing values.
    # Bug fix: the original referenced an undefined `Y_train1` here and in
    # fit() below, raising NameError; the response is `Y_train`.
    print(Y_train.isnull().any().any(),
          X_train1.isnull().any().any(),
          X_test.isnull().any().any())

    model = MERF(gbm, max_iterations=4)
    model.fit(X_train1, Z_train, clusters_train, Y_train)

    Z_test = np.ones((len(X_test1), 1))
    # Bug fix: the prediction was stored as `y_pred_` but read as `y_pred`
    # in every metric below, raising NameError.
    y_pred = model.predict(X_test1, Z_test, clusters_test)

    mae = abs(y_pred - Y_test).mean()
    rmse = math.sqrt(((y_pred - Y_test) * (y_pred - Y_test)).mean())
    rrmse = rmse / Y_test.median()
    r2 = get_r2_numpy_corrcoef(Y_test, y_pred)
    return (mae, rmse, rrmse, r2)
# Example #6 (score: 0)
#r1  /16  # 0.56 0-16
#r1 /5: # 0.625 19-23
#catboost


# encode string class values as integers
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
X = X_train


inspect.signature(MERF)  # NOTE(review): result discarded — likely leftover exploration
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", booster="dart",
                           learning_rate=0.007, max_depth=6,
                           n_estimators=300, gamma=5, alpha=2)

merf = MERF(xgb_reg, max_iterations=20)
# Random effects: one intercept column per training row.
Z_train = np.ones((len(X_train), 1))

# 'hours' is the grouping (cluster) variable.
clusters_train = X_train['hours']
clusters_test = X_test['hours']
my_imputer = SimpleImputer()

X_train = my_imputer.fit_transform(X_train)  # fit imputation statistics on train only
# Bug fix: the original called fit_transform on the TEST set too, re-fitting
# the imputer on test data (leakage, and inconsistent fill values); the test
# set must only be transformed with the train-fitted imputer.
X_test = my_imputer.transform(X_test)
merf.fit(X_train, Z_train, clusters_train, Y_train)




# %% [code]
Z_test = np.ones((len(X_test), 1))