# Build binary label vectors for train/test.  The positive examples were
# stacked on top of the negatives when X_train / X_test were assembled, so the
# first X_*_pos.shape[0] rows of each split get label 1 — TODO confirm the
# stacking order against the code that builds X_train/X_test.
y_test = np.zeros((X_test.shape[0], 1))
y_train = np.zeros((X_train.shape[0], 1))
y_train[range(X_train_pos.shape[0])] = 1
y_test[range(X_test_pos.shape[0])] = 1

print("X size: ", X_train.shape[0], 'x', X_train.shape[1])
print("y size: ", y_train.shape[0], 'x', y_train.shape[1])
print("X-test size: ", X_test.shape[0], 'x', X_test.shape[1])
print("y-test size: ", y_test.shape[0], 'x', y_test.shape[1])

# train and test, performance output
#clf = tune_ebm(X_train, y_train)
clf = ExplainableBoostingClassifier(random_state=seed, interactions=100)
# Ravel the (n, 1) column vector to 1-D before fitting: sklearn-style
# estimators expect a 1-D target (passing a column vector raises a
# DataConversionWarning).  The other training chunks in this project also
# ravel y before fit.
clf.fit(X_train, y_train.ravel())
print("Finished training ...")

# Accuracy from hard predictions, then AUC-PR / AUC-ROC from the
# positive-class probability scores (column 1 of predict_proba).
curr_perf = []
y_pred = clf.predict(X_test)
curr_perf += [metrics.accuracy_score(y_test, y_pred)]
print(metrics.confusion_matrix(y_test, y_pred))
y_pred = clf.predict_proba(X_test)  # NOTE: y_pred now holds probabilities, not labels
curr_perf += [get_aucpr(y_test, y_pred[:, 1])]
curr_perf += [get_auc(y_test, y_pred[:, 1])]
print("Performance: ", curr_perf)

# predict on larger set, output predictions
print("Predicting on all test pairs now... ")
scores = (clf.predict_proba(X_neg_all))[:, 1]
neg_pps['score'] = scores
neg_pps.to_csv(outfile)

# save model
#save_model(clf,format("models/ebm_covonly_split%d_1to1_int.pkl" % split))
# Split by row index so the same index partition can be reused to slice other
# aligned arrays; stratify keeps the class ratio of y in both partitions.
iX_train, iX_test, y_train, y_test = \
    train_test_split(iX, y, test_size=0.25, stratify=y, random_state=0)
X_train, X_test = X[iX_train], X[iX_test]
# Separate held-out evaluation set prepared elsewhere in the file.
X_test_out = data_test_out
y_test_out = labels_test_out

#%%
# Fit an EBM on dataset 1 and predict labels for dataset 2
# (notebook-style cell; presumably a cross-set sanity check — TODO confirm).
from interpret.glassbox import ExplainableBoostingClassifier
ebm = ExplainableBoostingClassifier()
ebm.fit(data_pts_1, labels_pts_1)
labels_pt_2_pred = ebm.predict(data_pts_2)

#%%
# Try isolation forest for outlier detection
X = data_pts_1
from sklearn.ensemble import IsolationForest
clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)
# IsolationForest.predict marks outliers with -1.  Print: fraction flagged as
# outliers, fraction of rows with non-zero label, and the agreement between
# the two masks.
A = clf.predict(X)
print((A == -1).mean(), (labels != 0).mean(), ((A == -1) == (labels != 0)).mean())
#%%
show(lr_global) # %% Fit decision tree model tree = ClassificationTree() tree.fit(X_train, y_train) print("Training finished.") y_pred = tree.predict(X_test) print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}") print(f"Accuracy {accuracy_score(y_test, y_pred)}") # %% Explain local prediction tree_local = tree.explain_local(X_test[:100], y_test[:100], name='Tree') show(tree_local) # %% Fit Explainable Boosting Machine ebm = ExplainableBoostingClassifier(random_state=2021) ebm.fit(X_train, y_train) print("Training finished.") y_pred = ebm.predict(X_test) print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}") print(f"Accuracy {accuracy_score(y_test, y_pred)}") # %% Explain locally ebm_local = ebm.explain_local(X_test[:100], y_test[:100], name='EBM') show(ebm_local) # %% Explain globally ebm_global = ebm.explain_global(name='EBM') show(ebm_global) # %%
plt.show() # ### Explainable Boosting Machine # In[9]: from interpret.glassbox import ExplainableBoostingClassifier ebm = ExplainableBoostingClassifier() ebm.fit(train_X, train_y) # In[12]: # display confusion matrices for train and test data classificationSummary(train_y, ebm.predict(train_X)) classificationSummary(test_y, ebm.predict(test_X)) # In[10]: from interpret import show ebm_global = ebm.explain_global() show(ebm_global) # In[ ]: ebm_local = ebm.explain_local(test_X, test_y) show(ebm_local) # ### RandomForest Regression Model
# Collect the per-fold train/test index arrays up front so each fold can be
# revisited by position below.  (train_idxes is assumed to be initialised to
# [] immediately before this chunk — TODO confirm.)
test_idxes = []
for train_index, test_index in kf.split(X, y):
    train_idxes.append(train_index)
    test_idxes.append(test_index)

splitwise_perf = []
# Iterate over every collected fold rather than a hard-coded 5 so the loop
# stays correct if the CV splitter's fold count ever changes.
for split in range(len(train_idxes)):
    # X is a DataFrame (iloc); y is indexable by integer arrays.
    X_train, X_test = X.iloc[train_idxes[split], :], X.iloc[
        test_idxes[split], :]
    y_train, y_test = y[train_idxes[split]], y[test_idxes[split]]
    #X_train, X_test, X_cov = normalize_train_test_cov(X_train, X_test, X_cov)
    y_train = y_train.ravel()  # 1-D target expected by the classifier
    clf = ExplainableBoostingClassifier(
        random_state=seed)  #, interactions=100)
    clf.fit(X_train, y_train)

    # Held-out fold: accuracy from hard predictions, then AUC-PR / AUC-ROC
    # from the positive-class probability scores.
    y_pred = clf.predict(X_test)
    print(metrics.confusion_matrix(y_test, y_pred))
    curr_perf = []
    curr_perf += [metrics.accuracy_score(y_test, y_pred)]
    y_pred = clf.predict_proba(X_test)
    curr_perf += [get_aucpr(y_test, y_pred[:, 1])]
    curr_perf += [get_auc(y_test, y_pred[:, 1])]

    # Same AUC metrics on the separate X_cov / y_cov evaluation set
    # (prepared outside this chunk).
    y_pred_cov = clf.predict(X_cov)
    print(metrics.confusion_matrix(y_cov, y_pred_cov))
    y_pred_cov = clf.predict_proba(X_cov)
    curr_perf += [get_aucpr(y_cov, y_pred_cov[:, 1])]
    curr_perf += [get_auc(y_cov, y_pred_cov[:, 1])]
    print(curr_perf)
    splitwise_perf.append(curr_perf)
    # save model
    #save_model(clf,format("models/ebm_humanpartners_1to1_no3mer_nonorm_split%d.pkl" % split))
y_train_cov, y_test_cov = y_cov[train_idxes_cov[split]], y_cov[ test_idxes_cov[split]] #X_train_cov, y_train_cov = undersample_negatives(X_train_cov, y_train_cov, 50) y_train_cov = y_train_cov.ravel() #clf = tune_ebm(X_train_cov, y_train_cov) if interac == 0: clf = ExplainableBoostingClassifier() else: clf = ExplainableBoostingClassifier(interactions=interac) clf.fit(X_train_cov, y_train_cov) curr_perf = [] y_pred_cov = clf.predict(X_test_cov) #curr_perf += [metrics.accuracy_score(y_test_cov, y_pred_cov)] print(metrics.confusion_matrix(y_test_cov, y_pred_cov)) y_pred_cov = clf.predict_proba(X_test_cov) curr_perf += [get_aucpr_R(y_test_cov, y_pred_cov[:, 1])] curr_perf += [get_auc_R(y_test_cov, y_pred_cov[:, 1])] curr_perf += [get_fmax(y_test_cov, y_pred_cov[:, 1])] curr_perf += get_early_prec(y_test_cov, y_pred_cov[:, 1]) print(curr_perf) splitwise_perf.append(curr_perf) # save model #save_model(clf,format("models//ebm_covonly_split%d_1to10_int%d.pkl" % (split, interac))) save_model( clf, format("%s/split%d_1to%d_int%d_trial%d.pkl" % (out_dir, split, int(negfrac), interac, trial)))