def train_titanic_binary_classification(interactions, with_categorical=False):
    """Train an EBM binary classifier on the Titanic example data.

    Reads ``examples/titanic_train.csv``, drops every row containing a missing
    value, derives a boolean ``Old`` feature (``Age > 65``) and fits an
    ``ExplainableBoostingClassifier`` that predicts ``Survived``.

    The train/test split is made without a fixed ``random_state``, so the
    returned hold-out set differs from run to run.

    :param interactions: forwarded to ``ExplainableBoostingClassifier``.
    :param with_categorical: when truthy, also use ``Embarked`` as a
        categorical feature.
    :return: tuple ``(model, x_test, y_test)``.
    """
    df = pd.read_csv(os.path.join('examples', 'titanic_train.csv'))
    df = df.dropna()
    df['Old'] = df['Age'] > 65

    feature_types = ['continuous', 'continuous', 'continuous', 'continuous']
    feature_columns = ['Age', 'Fare', 'Pclass', 'Old']
    if with_categorical:
        feature_columns.append('Embarked')
        feature_types.append('categorical')

    label_column = "Survived"
    # Select the label as a 1-D Series: LabelEncoder expects 1-D input and
    # warns when handed a single-column DataFrame.
    y = df[label_column]
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)

    model = ExplainableBoostingClassifier(interactions=interactions,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)
    return model, x_test, y_test
def run_training_process():
    """Cross-validate, fit and persist an EBM classifier.

    Loads the data with ``load_and_clean_data()``, reports the ROC AUC of a
    3-fold stratified cross-validation, refits the classifier on all rows and
    then writes three artefacts to the working directory:

    * ``model_file`` - the pickled classifier,
    * ``features_file.csv`` - the cleaned data frame,
    * ``score.csv`` - the ``inn``, ``preds`` and ``target`` columns, where
      ``preds`` is the predicted probability of the positive class.
    """
    df = load_and_clean_data()
    X = df[train_cols].reset_index(drop=True)
    y = df["target"].to_numpy()

    clf = ExplainableBoostingClassifier()
    for tr, tst in StratifiedKFold(n_splits=3).split(X, y):
        print("Shape of train data: {:d}\nShape of test data: {:d}\n".format(
            len(tr), len(tst)))
        print(
            "Sum of labels in train: {:d}\nSum of labels in test: {:d}".format(
                y[tr].sum(), y[tst].sum()))
        # The splitter yields positional indices, so select rows by position.
        clf.fit(X.iloc[tr], y[tr])
        fold_scores = clf.predict_proba(X.iloc[tst])[:, 1]
        # ".4f" (precision) rather than "4f" (a minimum width that never
        # applies to a float printed with six decimals).
        print("ROC AUC Score: {:.4f}".format(roc_auc_score(y[tst], fold_scores)))

    # Final model is trained on every row.
    clf.fit(X, y)
    with open("model_file", "wb") as file:
        pickle.dump(clf, file)

    # The feature dump is written before the prediction column is added.
    df.to_csv("features_file.csv", index=False)
    df["preds"] = clf.predict_proba(X)[:, 1]
    df[["inn", "preds", "target"]].to_csv("score.csv", index=False)
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Fit an EBM classifier or regressor on ``X`` after imputing missing values.

    A classifier is used when ``self.num_classes >= 2`` (labels are encoded
    with a ``LabelEncoder`` fitted on ``self.labels``); otherwise a regressor
    is used. Missing values of each column are replaced, in ``X`` itself, by a
    value below the column minimum; the fill value per column is stored in
    ``self.min``. The fitted model is registered via
    ``self.set_model_properties``.

    ``sample_weight``, ``eval_set``, ``sample_weight_eval_set`` and ``kwargs``
    are accepted but not used by this method.
    """
    from interpret.glassbox import (
        ExplainableBoostingClassifier,
        ExplainableBoostingRegressor,
    )

    # HACK - EBM can't handle our custom logger with unknown level 9 (DATA)
    logging.root.level = 10

    feature_names = list(X.names)

    is_classification = self.num_classes >= 2
    if is_classification:
        encoder = LabelEncoder()
        encoder.fit(self.labels)
        y = encoder.transform(y)
        model = ExplainableBoostingClassifier(**self.params)
    else:
        model = ExplainableBoostingRegressor(**self.params)

    # Replace missing values with a value smaller than all observed values
    self.min = dict()
    for name in X.names:
        column = X[:, name]
        self.min[name] = column.min1()
        no_observed_min = self.min[name] is None or np.isnan(self.min[name])
        if no_observed_min:
            # Column has no usable minimum: fall back to a large negative fill.
            self.min[name] = -1e10
        else:
            self.min[name] -= 1
        column.replace(None, self.min[name])
        X[:, name] = column
        # After the fill no missing entries may remain in this column.
        assert X[dt.isna(dt.f[name]), name].nrows == 0

    X = X.to_numpy()
    model.fit(X, y)

    n_features = X.shape[1]
    self.set_model_properties(
        model=model,
        features=feature_names,
        importances=self.get_importances(model, n_features),
        iterations=self.params["n_estimators"],
    )
def train_bank_churners_multiclass_classification():
    """Train an EBM multiclass classifier on the BankChurners example data.

    Reads ``examples/BankChurners.csv``, drops every row containing a missing
    value and fits an ``ExplainableBoostingClassifier`` (no interaction terms)
    that predicts ``Income_Category`` from four customer attributes.

    The train/test split is made without a fixed ``random_state``, so the
    returned hold-out set differs from run to run.

    :return: tuple ``(model, x_test, y_test)``.
    """
    df = pd.read_csv(os.path.join('examples', 'BankChurners.csv'))
    df = df.dropna()

    feature_types = ['continuous', 'continuous', 'categorical', 'continuous']
    feature_columns = [
        'Customer_Age', 'Dependent_count', 'Education_Level', 'Credit_Limit'
    ]

    label_column = "Income_Category"
    # Select the label as a 1-D Series: LabelEncoder expects 1-D input and
    # warns when handed a single-column DataFrame.
    y = df[label_column]
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)

    model = ExplainableBoostingClassifier(interactions=0,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)
    return model, x_test, y_test
def tune_ebm(X_train, y_train):
    """Pick the number of EBM interaction terms by cross-validation and refit.

    Each candidate interaction count is scored with 3-fold cross-validation
    using average precision. The candidate with the highest mean score (the
    first one on ties) is used to fit a final classifier on all of the
    training data.

    :param X_train: training features.
    :param y_train: training labels.
    :return: the fitted ``ExplainableBoostingClassifier``.
    """
    candidates = [50, 100, 500]

    reslist = []  # (interaction count, mean cross-validated average precision)
    for interac in candidates:
        clf = ExplainableBoostingClassifier(random_state=seed, interactions=interac)
        cv_results = cross_validate(clf, X_train, y_train, cv=3,
                                    scoring='average_precision')
        reslist.append((interac, np.mean(cv_results['test_score'])))
    print(*reslist, sep='\n')

    # Select on the tuples directly: converting them to a NumPy array would
    # turn the interaction count into a float before it reaches the estimator.
    best_interac, _ = max(reslist, key=lambda result: result[1])

    clf = ExplainableBoostingClassifier(random_state=seed, interactions=best_interac)
    clf.fit(X_train, y_train)
    return clf
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Fit an EBM classifier or regressor on ``X`` after imputation.

    A classifier is used when ``self.num_classes >= 2`` (labels are encoded
    with a ``LabelEncoder`` fitted on ``self.labels``); otherwise a regressor
    is used. ``X`` is passed through ``self.basic_impute`` and converted to a
    NumPy array before fitting. The fitted model is registered via
    ``self.set_model_properties``.

    ``sample_weight``, ``eval_set``, ``sample_weight_eval_set`` and ``kwargs``
    are accepted but not used by this method.
    """
    from interpret.glassbox import (
        ExplainableBoostingClassifier,
        ExplainableBoostingRegressor,
    )

    # HACK - EBM can't handle our custom logger with unknown level 9 (DATA)
    logging.root.level = 10

    feature_names = list(X.names)

    is_classification = self.num_classes >= 2
    if is_classification:
        encoder = LabelEncoder()
        encoder.fit(self.labels)
        y = encoder.transform(y)
        model = ExplainableBoostingClassifier(**self.params)
    else:
        model = ExplainableBoostingRegressor(**self.params)

    # NOTE(review): basic_impute is defined elsewhere; presumably it fills
    # missing values - confirm against its implementation.
    X = self.basic_impute(X)
    X = X.to_numpy()
    model.fit(X, y)

    n_features = X.shape[1]
    self.set_model_properties(
        model=model,
        features=feature_names,
        importances=self.get_importances(model, n_features),
        iterations=self.params["n_estimators"],
    )
from sklearn.linear_model import LogisticRegression

# Baseline: plain scikit-learn logistic regression.
model = LogisticRegression(max_iter=1000).fit(X=X_train, y=y_train)

# Bare expressions below are kept for interactive (cell-by-cell) inspection.
model.predict(X_train).mean()
model.coef_
X_train.columns
model.intercept_
model.get_params()

# %% Explainable gbm
# NOTE: this import rebinds ``LogisticRegression`` to interpret's glassbox
# version; every use from here on refers to that class.
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression
from interpret import show

ebm = ExplainableBoostingClassifier()
ebm.fit(X=X_train, y=y_train)

ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# %%
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

log_global = log_model.explain_global(name='LogReg')
show(log_global)

# Show both global explanations in one view.
show([ebm_global, log_global], share_tables=True)

# %%
from interpret.data import ClassHistogram
X_train_neg, X_test_neg = train_test_split(X_neg, test_size=0.2)

# Positives are stacked first, negatives after them (np.vstack replaces the
# deprecated np.row_stack alias).
X_train = pd.DataFrame(np.vstack((X_train_pos, X_train_neg)), columns=feat_names)
X_test = pd.DataFrame(np.vstack((X_test_pos, X_test_neg)), columns=feat_names)

# Labels are column vectors: 1 for the leading positive rows, 0 for the rest.
y_test = np.zeros((X_test.shape[0], 1))
y_train = np.zeros((X_train.shape[0], 1))
y_train[range(X_train_pos.shape[0])] = 1
y_test[range(X_test_pos.shape[0])] = 1

print("X size: ", X_train.shape[0], 'x', X_train.shape[1])
print("y size: ", y_train.shape[0], 'x', y_train.shape[1])
print("X-test size: ", X_test.shape[0], 'x', X_test.shape[1])
print("y-test size: ", y_test.shape[0], 'x', y_test.shape[1])

# train and test, performance output
# (alternative: clf = tune_ebm(X_train, y_train))
clf = ExplainableBoostingClassifier(random_state=seed, interactions=100)
# Estimators expect 1-D labels, so flatten the (n, 1) column vector for fit.
clf.fit(X_train, y_train.ravel())
print("Finished training ...")

curr_perf = []
y_pred = clf.predict(X_test)
curr_perf += [metrics.accuracy_score(y_test, y_pred)]
print(metrics.confusion_matrix(y_test, y_pred))

# From here on y_pred holds class probabilities; column 1 is the positive class.
y_pred = clf.predict_proba(X_test)
curr_perf += [get_aucpr(y_test, y_pred[:, 1])]
curr_perf += [get_auc(y_test, y_pred[:, 1])]
print("Performance: ", curr_perf)

# predict on larger set, output predictions
print("Predicting on all test pairs now... ")
scores = (clf.predict_proba(X_neg_all))[:, 1]
neg_pps['score'] = scores
neg_pps.to_csv(outfile)
def fit_ga2m(configuration, res_dir, predicted_variable='Row', threshold=3):
    """
    Fits a ga2m model, using the data retrieved by the function get_data, and stores the fit object, the training
    data and the test set in pickle files.

    Always fits a two class prediction model: the binary target ``mutant`` is ``predicted_variable > threshold``.
    The predicted_variable is assumed to be ordinal. The default predicted_variable and threshold are set up for
    predicting the LFS and WT mutation of p53 for the individuals in the dataset.

    The test set is chosen by ``configuration['test']``:

    * ``['random']`` - the rows are shuffled and the last 10% become the test set;
    * a list of digit strings - every row whose ``predicted_variable`` equals one of these values goes to the
      test set, all other rows to the training set.

    :param configuration: a dictionary of list of str; the keys read here are 'subset_features', 'test' and
        'num_interaction'.
    :param res_dir: path prefix under which the fit model, test split and train split are stored. File names are
        appended by plain string concatenation, so include a trailing separator.
    :param predicted_variable: The column in the LFS data which will be predicted.
    :param threshold: threshold for the predicted_variable
    :return: dictionary with keys 'fit', 'train', 'test' with values corresponding to the paths to the respective
        files.
    :raises ValueError: if configuration['test'] is neither 'random' nor a sequence of digit strings, or if a
        requested test value does not occur in the predicted_variable column.
    """
    # These are all indicator/irrelevant variables we don't want to consider, which are removed from the train and
    # test sets.
    excluded_columns = ['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4', 'FocusScore5',
                        'Centroid_1', 'Centroid_2', 'Orientation']

    seed(7)
    dat = get_data()

    # Label "mutant" observations, comes from the original prediction task though mutant may not be an appropriate
    # label depending on the predicted_variable, but the mutant column will be the binary predicted classes for the
    # fit model.
    dat['mutant'] = dat[predicted_variable] > threshold

    # Apply given configuration
    if configuration['subset_features'][0] != 'None':
        dat = dat[configuration['subset_features'] + ['mutant']]

    test_spec = configuration['test']
    if test_spec[0] == 'random':
        # Drop labelling columns and shuffle data order. errors='ignore' because a feature subset may already have
        # removed some of them.
        dat = dat.drop(columns=excluded_columns, errors='ignore').sample(frac=1)
        # Select random train and test sets (90% / 10%).
        split_at = floor(len(dat) * 0.9)
        dat_train = dat.iloc[:split_at, :]
        dat_test = dat.iloc[split_at:, :]
    elif all(b.isdigit() for b in test_spec):
        # Assume the values in configuration['test'] refer to specific entries which will only be in the test set.
        test_rows = [int(r) for r in test_spec]

        # Compare against the column VALUES: `x in series` would test the index labels instead.
        observed_values = set(dat[predicted_variable].unique())
        if not all(row in observed_values for row in test_rows):
            raise ValueError('not all test values are rows in the data.')

        # Let the test set be a set of entries, for default predicted_variable this corresponds to individuals in
        # our data.
        in_test = dat[predicted_variable].isin(test_rows)
        dat_train = dat.loc[~in_test].drop(columns=excluded_columns, errors='ignore')
        dat_test = dat.loc[in_test].drop(columns=excluded_columns, errors='ignore')
    else:
        raise ValueError('test = x, where x must be random, or a comma separated seq of digits which are valid entries '
                         'in the predicted_variable in the data')

    # NOTE(review): nothing verifies that predicted_variable itself is absent from the features; it is only removed
    # when it happens to be one of the excluded columns above.
    ebm = ExplainableBoostingClassifier(interactions=int(configuration['num_interaction'][0]))
    ebm.fit(X=dat_train.drop(columns='mutant'), y=dat_train['mutant'])

    paths = {'fit': res_dir + 'ga2m_fit',
             'train': res_dir + 'dat_train',
             'test': res_dir + 'dat_test'}
    with open(paths['fit'], 'wb') as ga2m_file:
        pk.dump(ebm, ga2m_file)
    with open(paths['train'], 'wb') as train_file:
        pk.dump(dat_train, train_file)
    with open(paths['test'], 'wb') as test_file:
        pk.dump(dat_test, test_file)

    return paths
X = data_train
y = labels_train.ravel()

# Stratified 75/25 split of the index array; the feature rows are then
# selected through the resulting indices.
iX_train, iX_test, y_train, y_test = train_test_split(
    iX, y, test_size=0.25, stratify=y, random_state=0)
X_train = X[iX_train]
X_test = X[iX_test]

X_test_out = data_test_out
y_test_out = labels_test_out

#%%
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(data_pts_1, labels_pts_1)
labels_pt_2_pred = ebm.predict(data_pts_2)

#%%
# Try isolation forest for outlier detection
X = data_pts_1

from sklearn.ensemble import IsolationForest

clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25)
clf.fit(X)
A = clf.predict(X)

# Fraction flagged as outliers (-1), fraction with a non-zero label, and how
# often the two agree.
print(
    (A == -1).mean(),
    (labels != 0).mean(),
    ((A == -1) == (labels != 0)).mean(),
)
# Finish the ROC figure: axis limits, labels, legend, then save and display.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RFTree_ROC')
plt.show()


# ### Explainable Boosting Machine

# In[9]:

from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(train_X, train_y)


# In[12]:

# display confusion matrices for train and test data
train_predictions = ebm.predict(train_X)
classificationSummary(train_y, train_predictions)
test_predictions = ebm.predict(test_X)
classificationSummary(test_y, test_predictions)


# In[10]:

from interpret import show

ebm_global = ebm.explain_global()
show(ebm_global)
df_A['category'] = 0 df_B['category'] = 1 #define training df (first 500 elements of each cathegory) training_columns = ['x', 'y'] training_df = pd.concat([df_A.iloc[:500], df_B.iloc[:500]], ignore_index=True, sort=True) #define test df (second 500 elements of each cathegory) test_df = pd.concat([df_A.iloc[500:], df_B.iloc[500:]], ignore_index=True, sort=True) ebm_clf = ExplainableBoostingClassifier() ebm_clf.fit(training_df[training_columns], training_df['category']) probabilities = ebm_clf.predict_proba(test_df[training_columns]) ebm_global = ebm_clf.explain_global() show(ebm_global) for prob in range(2): test_df['prob_{0}'.format(prob)] = probabilities[:, prob] figcontur = plt.figure(figsize=(18, 7.5)) contourax = figcontur.add_subplot(111) xx, yy = make_meshgrid(test_df['x'], test_df['y']) plot_contours(contourax, ebm_clf, xx, yy, cmap='RdYlBu', alpha=0.8) contourax.scatter(test_df.x, test_df.y, c=test_df['category'],
train_idxes_cov[split], :], X_cov.iloc[ test_idxes_cov[split], :] y_train_cov, y_test_cov = y_cov[train_idxes_cov[split]], y_cov[ test_idxes_cov[split]] #X_train_cov, y_train_cov = undersample_negatives(X_train_cov, y_train_cov, 50) y_train_cov = y_train_cov.ravel() #clf = tune_ebm(X_train_cov, y_train_cov) if interac == 0: clf = ExplainableBoostingClassifier() else: clf = ExplainableBoostingClassifier(interactions=interac) clf.fit(X_train_cov, y_train_cov) curr_perf = [] y_pred_cov = clf.predict(X_test_cov) #curr_perf += [metrics.accuracy_score(y_test_cov, y_pred_cov)] print(metrics.confusion_matrix(y_test_cov, y_pred_cov)) y_pred_cov = clf.predict_proba(X_test_cov) curr_perf += [get_aucpr_R(y_test_cov, y_pred_cov[:, 1])] curr_perf += [get_auc_R(y_test_cov, y_pred_cov[:, 1])] curr_perf += [get_fmax(y_test_cov, y_pred_cov[:, 1])] curr_perf += get_early_prec(y_test_cov, y_pred_cov[:, 1]) print(curr_perf) splitwise_perf.append(curr_perf) # save model #save_model(clf,format("models//ebm_covonly_split%d_1to10_int%d.pkl" % (split, interac))) save_model( clf,
feat_names = X_pos.columns

# Positives are stacked first, negatives after them (np.vstack replaces the
# deprecated np.row_stack alias).
X_cov = pd.DataFrame(np.vstack((X_pos, X_neg)), columns=feat_names)

# Labels are a column vector: 1 for the leading npos rows, 0 for the rest.
y_cov = np.zeros((npos + nneg, 1))
y_cov[range(npos)] = 1
print("X size: ", X_cov.shape[0], 'x', X_cov.shape[1])
print("y size: ", y_cov.shape[0], 'x', y_cov.shape[1])
#del X_neg

# Placeholder for a former loop over interaction counts, e.g.
# [5, 10, 50, 100, 300, 500]; `interac` is set elsewhere.
if True:
    print("======================== ", interac, " ======================")
    if interac == 0:
        clf = ExplainableBoostingClassifier()
    else:
        clf = ExplainableBoostingClassifier(interactions=interac)
    # Estimators expect 1-D labels, so flatten the (n, 1) column vector.
    clf.fit(X_cov, y_cov.ravel())

    # test on everything
    #X_neg = pd.read_csv(negfile, header=0)
    X_cov = pd.DataFrame(np.vstack((X_pos, X_neg_all)), columns=feat_names)
    print('Predicting on #examples:', X_cov.shape[0])

    # Keep only the probability of the positive class.
    y_pred = clf.predict_proba(X_cov)
    y_pred = y_pred[:, 1]

    np.save("%s/int%d_trial%d_preds.npy" % (out_dir, interac, trial), y_pred)
    save_model(clf, "%s/int%d_trial%d.pkl" % (out_dir, interac, trial))
# ### Training and Interpreting EBM
# Train an Explainable Boosting Machine (with [interpret.ml](https://github.com/interpretml/interpret/))
#
# For a tutorial see: [[Tutorial](https://nbviewer.jupyter.org/github/interpretml/interpret/blob/master/examples/python/notebooks/Interpretable%20Classification%20Methods.ipynb)]
#
# **Q7**. Report (global) feature importances for EBM as a table or figure. What are the most important three features in EBM? Are they the same as in the linear model?
#
# w_1X + w_2Y + w_3(XY) = Z

# %%
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

(train_features, train_labels,
 dev_features, dev_labels,
 test_features, test_labels) = prepare_load_classification_data()

ebm = ExplainableBoostingClassifier(n_jobs=-1)
ebm.fit(train_features, train_labels)  # EBM

#%%
# Global Explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

#%%
# Local Explanation (first five dev examples)
ebm_local = ebm.explain_local(dev_features[:5], dev_labels[:5], name='EBM')
show(ebm_local)

#%%
# Performance
from interpret.perf import ROC

roc_explainer = ROC(ebm.predict_proba)
ebm_perf = roc_explainer.explain_perf(dev_features, dev_labels, name='EBM')
show(ebm_perf)

# %% [markdown]
# ### Training and Explaining Neural Networks
# Fill missing values with the mean of the matching (Pclass, Sex) group.
train_data = train_data.fillna(
    train_data.groupby(['Pclass', 'Sex']).transform('mean'))
test_data = test_data.fillna(
    test_data.groupby(['Pclass', 'Sex']).transform('mean'))

train_data = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']]
test_data = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

X_train, X_validate, y_train, y_validate = train_test_split(
    train_data.drop('Survived', axis=1), train_data['Survived'], test_size=.25)

ebm = ExplainableBoostingClassifier()
lrm = LogisticRegression()

# The EBM is trained on the raw frame, with 'Sex' still un-encoded.
ebm.fit(X_train, y_train)

# The logistic regression gets an encoded COPY, so the frames used by the EBM
# are not modified behind its back.
le = LabelEncoder()
X_train_lr = X_train.copy()
X_train_lr['Sex'] = le.fit_transform(X_train['Sex'])
lrm.fit(X_train_lr, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)
ebm_local = ebm.explain_local(X_validate, y_validate)
show(ebm_local)

lrm_global = lrm.explain_global()
show(lrm_global)

# Reuse the mapping learned on the training data instead of re-fitting the
# encoder on the validation set.
X_validate_lr = X_validate.copy()
X_validate_lr['Sex'] = le.transform(X_validate['Sex'])
def build_model():
    """Train a random forest and an EBM on the Cleveland heart-disease data.

    Reads ``../datasets/processed.cleveland.data``, binarises the label,
    one-hot encodes the categorical attributes and fits both models on a 70%
    training split (``random_state=64``).

    Returns a 9-tuple: the fitted random forest, the training features as a
    NumPy array, the encoded test frame, the encoded feature names, the class
    names, the positions of the one-hot (categorical) features, the un-encoded
    test frame for display, the original attribute names, and the fitted EBM.
    """
    ucihd_attr = [
        "age",
        "sex",       # 0 = female 1 = male
        "cp",        # chest pain type 1: typical angina 2: atypical angina 3: non-anginal pain 4: asymptomatic
        "trestbps",  # resting blood pressure (in mm Hg on admission to the hospital)
        "chol",      # serum cholestoral in mg/dl
        "fbs",       # (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
        "restecg",   # resting electrocardiographic results 0: normal 1: having ST-T wave abnormality 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
        "thalach",   # maximum heart rate achieved
        "exang",     # exercise induced angina (1 = yes; 0 = no)
        "oldpeak",   # ST depression induced by exercise relative to rest
        "slope",     # the slope of the peak exercise ST segment
        "ca",        # number of major vessels (0-3) colored by flouroscopy
        "thal",      # 3 = normal; 6 = fixed defect; 7 = reversable defect
        "label",     # diagnosis of heart disease (angiographic disease status) 0: < 50% diameter narrowing 1-4: > 50% diameter narrowing
    ]

    ucihd = pd.read_csv("../datasets/processed.cleveland.data",
                        header=None, names=ucihd_attr, na_values="?")

    categorical_attr = ["sex", "cp", "fbs", "restecg", "exang", "thal"]
    for attr in categorical_attr:
        ucihd[attr] = ucihd[attr].astype("category")

    # Clean label: collapse the disease grades into a single positive class.
    ucihd.loc[ucihd["label"] > 1, "label"] = 1

    # sklearn's implementation of RF doesn't allow missing value.
    # For categorical (as string) we can leave one special category for missing,
    # but for numerical we need to do some special encoding or imputation.
    encoded = ucihd.copy()
    encoded.loc[encoded["ca"].isna(), "ca"] = -1  # Encode missing numerical.
    encoded = pd.get_dummies(encoded, columns=categorical_attr, dummy_na=True)
    labels = encoded.pop("label")

    split_options = dict(test_size=.3, random_state=64)
    train, test, train_labels, _ = train_test_split(
        encoded, labels.values, **split_options)
    # horrible hack to reverse effect of pd.get_dummies: repeat the identical
    # split on the un-encoded frame to obtain a human-readable test set.
    _, test_display, _, _ = train_test_split(
        ucihd, labels.values, **split_options)

    ucihd_rf = RandomForestClassifier(n_estimators=100, random_state=64)
    ucihd_rf.fit(train, train_labels)

    feature_names = encoded.columns
    class_names = ["Negative", "Positive"]
    # Dummy columns created by get_dummies carry an underscore in their name.
    categorical_positions = [
        position for position, column in enumerate(feature_names)
        if "_" in column
    ]
    feature_names_display = ucihd_attr

    ucihd_ebm = ExplainableBoostingClassifier(n_estimators=16,
                                              feature_names=encoded.columns,
                                              n_jobs=1)
    ucihd_ebm.fit(train, train_labels)

    return (ucihd_rf, train.values, test, feature_names, class_names,
            categorical_positions, test_display, feature_names_display,
            ucihd_ebm)