Example #1
def train_titanic_binary_classification(interactions, with_categorical=False):
    df = pd.read_csv(
        os.path.join('examples', 'titanic_train.csv'),
        #dtype= {
        #    'Age': np.float32,
        #    'Fare': np.float32,
        #    'Pclass': np.float32, # np.int
        #}
    )
    df = df.dropna()
    df['Old'] = df['Age'] > 65
    feature_types = ['continuous', 'continuous', 'continuous', 'continuous']
    feature_columns = ['Age', 'Fare', 'Pclass', 'Old']
    if with_categorical is True:
        feature_columns.append('Embarked')
        feature_types.append('categorical')
    label_column = "Survived"

    # LabelEncoder expects a 1-D array of labels, so select the column as a Series.
    y = df[label_column]
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
    model = ExplainableBoostingClassifier(interactions=interactions,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)

    return model, x_test, y_test
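# Usage sketch (not part of the original snippet): assumes it is run from the
# repository root so examples/titanic_train.csv resolves, and that interpret is
# installed. show() opens the standard interpret dashboard.
from interpret import show
from sklearn.metrics import accuracy_score

model, x_test, y_test = train_titanic_binary_classification(interactions=5,
                                                             with_categorical=True)
print("Held-out accuracy:", accuracy_score(y_test, model.predict(x_test)))
show(model.explain_global(name='Titanic EBM'))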
Example #2
def run_training_process():
    df = load_and_clean_data()
    X = df[train_cols].reset_index(drop=True)
    y = df["target"].to_numpy()

    clf = ExplainableBoostingClassifier()

    for tr, tst in StratifiedKFold(n_splits=3).split(X, y):
        print("Shape of train data: {:d}\nShape of test data: {:d}\n".format(
            len(tr), len(tst)))
        print(
            "Sum of labels in train: {:d}\nSum of labels in test: {:d}".format(
                y[tr].sum(), y[tst].sum()))

        clf.fit(X.loc[tr], y[tr])
        print("ROC AUC Score: {:4f}".format(
            roc_auc_score(y[tst],
                          clf.predict_proba(X.loc[tst])[:, 1])))

    clf.fit(X, y)

    with open("model_file", "bw") as file:
        pickle.dump(clf, file)

    df.to_csv("features_file.csv", index=False)
    df["preds"] = clf.predict_proba(X)[:, 1]

    df[[
        "inn",
        "preds",
        "target",
    ]].to_csv("score.csv", index=False)
Example #3
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        from interpret.glassbox import (
            ExplainableBoostingClassifier,
            ExplainableBoostingRegressor,
        )

        # HACK: EBM can't handle our custom logger with unknown level 9 (DATA),
        # so force the root logger down to DEBUG (10).
        logging.root.level = logging.DEBUG

        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExplainableBoostingClassifier(**self.params)
        else:
            model = ExplainableBoostingRegressor(**self.params)

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        model.fit(X, y)
        importances = self.get_importances(model, X.shape[1])
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances,
            iterations=self.params["n_estimators"],
        )
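# The comment above captures the imputation trick: replace each column's missing
# values with a sentinel strictly below every observed value, so EBM's binning
# puts them in their own bin. A plain pandas/numpy sketch of the same idea
# (standalone, with an illustrative column name; not the project's code):
import numpy as np
import pandas as pd

def impute_below_min(df):
    """Fill NaNs per column with (min - 1), or -1e10 if the column is all-missing."""
    df = df.copy()
    sentinels = {}
    for col in df.columns:
        col_min = df[col].min()
        sentinels[col] = -1e10 if pd.isna(col_min) else col_min - 1
        df[col] = df[col].fillna(sentinels[col])
    return df, sentinels

imputed, mins = impute_below_min(pd.DataFrame({"x": [1.0, np.nan, 3.0]}))  # NaN -> 0.0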
Example #4
def train_bank_churners_multiclass_classification():
    df = pd.read_csv(os.path.join('examples', 'BankChurners.csv'))
    df = df.dropna()
    feature_types = ['continuous', 'continuous', 'categorical', 'continuous']
    feature_columns = [
        'Customer_Age', 'Dependent_count', 'Education_Level', 'Credit_Limit'
    ]
    label_column = "Income_Category"

    # LabelEncoder expects a 1-D array of labels, so select the column as a Series.
    y = df[label_column]
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
    model = ExplainableBoostingClassifier(interactions=0,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)

    return model, x_test, y_test
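# Hedged sketch: mapping the integer-encoded predictions back to the original
# Income_Category strings. Assumes `model`, `x_test` and the fitted LabelEncoder
# `le` from the function above are in scope (the function only returns the model
# and the test split, so in practice the encoder would also need to be returned).
proba = model.predict_proba(x_test)      # columns follow the order of model.classes_
pred_labels = le.inverse_transform(model.predict(x_test))
print(dict(zip(model.classes_, le.inverse_transform(model.classes_))))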
Example #5
def tune_ebm(X_train, y_train):
    reslist = []
    metric_idx = 1  # index where the mean cross-validated average-precision score is stored
    for interac in [50, 100, 500]:
        clf = ExplainableBoostingClassifier(random_state=seed,
                                            interactions=interac)
        cv_results = cross_validate(clf,
                                    X_train,
                                    y_train,
                                    cv=3,
                                    scoring='average_precision')
        reslist.append((interac, np.mean(cv_results['test_score'])))
    print(*reslist, sep='\n')
    reslist = np.asarray(reslist)
    bestid = int(np.argmax(reslist[:, metric_idx]))
    clf = ExplainableBoostingClassifier(random_state=seed,
                                        interactions=int(reslist[bestid, 0]))
    clf.fit(X_train, y_train)
    return clf
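# Alternative sketch: because ExplainableBoostingClassifier follows the
# scikit-learn estimator API, the same sweep can be written with GridSearchCV
# (same candidate interaction counts and scoring as the loop above).
from sklearn.model_selection import GridSearchCV
from interpret.glassbox import ExplainableBoostingClassifier

def tune_ebm_grid(X_train, y_train, seed=0):
    grid = GridSearchCV(
        ExplainableBoostingClassifier(random_state=seed),
        param_grid={'interactions': [50, 100, 500]},
        scoring='average_precision',
        cv=3,
    )
    grid.fit(X_train, y_train)
    return grid.best_estimator_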
Example #6
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        from interpret.glassbox import (
            ExplainableBoostingClassifier,
            ExplainableBoostingRegressor,
        )

        # HACK: EBM can't handle our custom logger with unknown level 9 (DATA),
        # so force the root logger down to DEBUG (10).
        logging.root.level = logging.DEBUG

        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExplainableBoostingClassifier(**self.params)
        else:
            model = ExplainableBoostingRegressor(**self.params)

        X = self.basic_impute(X)
        X = X.to_numpy()

        model.fit(X, y)
        importances = self.get_importances(model, X.shape[1])
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances,
            iterations=self.params["n_estimators"],
        )
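# basic_impute is not shown in this snippet. A hypothetical reconstruction,
# modeled directly on the explicit loop in Example #3 (sentinel = column min - 1,
# or -1e10 if the column is entirely missing); the real implementation may differ.
import numpy as np
import datatable as dt

def basic_impute(self, X):
    self.min = dict()
    for col in X.names:
        XX = X[:, col]
        self.min[col] = XX.min1()
        if self.min[col] is None or np.isnan(self.min[col]):
            self.min[col] = -1e10
        else:
            self.min[col] -= 1
        XX.replace(None, self.min[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    return X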
Example #7
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model = model.fit(X=X_train, y=y_train)
model.predict(X_train).mean()
model.coef_
X_train.columns
model.intercept_
model.get_params()

# %% Explainable gbm
# interpret's glassbox LogisticRegression (a scikit-learn wrapper that adds
# explain_global/explain_local) deliberately shadows the sklearn import above.
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression
from interpret import show

ebm = ExplainableBoostingClassifier()
ebm.fit(X=X_train, y=y_train)

ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# %%
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)
log_global = log_model.explain_global(name='LogReg')
show(log_global)

show([ebm_global, log_global], share_tables=True)

# %%
from interpret.data import ClassHistogram
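# %% The snippet ends right after importing ClassHistogram; a short sketch of the
# usual way it is used to inspect the training data before modelling.
hist = ClassHistogram().explain_data(X_train, y_train, name='Train Data')
show(hist)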
Example #8
    X_train_neg, X_test_neg = train_test_split(X_neg, test_size=0.2)
    X_train = pd.DataFrame(np.row_stack((X_train_pos, X_train_neg)), columns=feat_names)
    X_test = pd.DataFrame(np.row_stack((X_test_pos, X_test_neg)), columns=feat_names)
    y_test = np.zeros((X_test.shape[0],1))
    y_train = np.zeros((X_train.shape[0],1))
    y_train[range(X_train_pos.shape[0])]=1
    y_test[range(X_test_pos.shape[0])]=1
    print("X size: ",X_train.shape[0],'x',X_train.shape[1])
    print("y size: ",y_train.shape[0],'x',y_train.shape[1])
    print("X-test size: ",X_test.shape[0],'x',X_test.shape[1])
    print("y-test size: ",y_test.shape[0],'x',y_test.shape[1])

    # train and test, performance output    
    #clf = tune_ebm(X_train, y_train)
    clf = ExplainableBoostingClassifier(random_state=seed, interactions=100)
    clf.fit(X_train, y_train.ravel())  # flatten the (n, 1) label array for fitting
    print("Finished training ...")
    curr_perf = []
    y_pred = clf.predict(X_test)
    curr_perf += [metrics.accuracy_score(y_test, y_pred)]
    print(metrics.confusion_matrix(y_test, y_pred))
    y_pred = clf.predict_proba(X_test)
    curr_perf += [get_aucpr(y_test, y_pred[:,1])]
    curr_perf += [get_auc(y_test, y_pred[:,1])]
    print("Performance: ",curr_perf)

    # predict on larger set, output predictions
    print("Predicting on all test pairs now... ")
    scores = (clf.predict_proba(X_neg_all))[:,1]
    neg_pps['score'] = scores   
    neg_pps.to_csv(outfile)
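# get_aucpr and get_auc are project helpers not shown here. A hedged sketch of
# what they presumably compute, using scikit-learn's standard metrics on the
# positive-class scores:
from sklearn import metrics

def get_aucpr(y_true, y_score):
    # area under the precision-recall curve (average precision)
    return metrics.average_precision_score(y_true, y_score)

def get_auc(y_true, y_score):
    # area under the ROC curve
    return metrics.roc_auc_score(y_true, y_score)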
Example #9
def fit_ga2m(configuration, res_dir, predicted_variable='Row', threshold=3):
    """
    Fits a ga2m model, using the data retrieved by the function get_data, and stores the fit object the training data
    and the test set in pickle files. Always fits a two class prediction model for a given predicted_variable, and a
    threshold to separate that variable by. The predicted_variable is assumed to be ordinal. The defaults
    predicted_variable and threshold are set up for predicting the LFS and WT mutation of p53 for the individuals in the
    dataset.

    :param configuration: a dictionary of list of str
    :param res_dir: path to directory to store resulting fit model, test split and train split
    :param predicted_variable: The column in the LFS data which will be predicted.
    :param threshold: threshold for the predicted_variable
    :return: dictionary with keys 'fit', 'train', 'test' with values corresponding to the paths to the respective files.
    """
    seed(7)
    dat = get_data()

    # Label "mutant" observations, comes from the original prediction task though mutant may not be an appropriate label
    # depending on the predicted_variable, but the mutant column will be the binary predicted classes for the fit model.
    dat['mutant'] = dat[predicted_variable] > threshold
    # dat['mutant'] = dat.Column >= (max(dat.Column) - min(dat.Column))/2 + min(dat.Column)

    # Apply given configuration
    if configuration['subset_features'][0] != 'None':
        dat = dat[configuration['subset_features'] + ['mutant']]

    # Drop labelling columns and shuffle data order.
    if configuration['test'][0] == 'random':
        dat = dat.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4', 'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation']).sample(frac=1)

        # Select random train and test sets.
        dat_train = dat.iloc[:floor(len(dat) * 0.9), :]
        dat_test = dat.iloc[floor(len(dat) * 0.9):, :]

    elif all(b.isdigit() for b in configuration['test']):
        # Assume the values in configuration['test'] refer to specific entries which will only be in the test set.
        test_rows = [int(r) for r in configuration['test']]
        # `in` on a pandas Series checks the index, so compare against the values explicitly.
        if not all(r in dat[predicted_variable].values for r in test_rows):
            raise Exception('Not all test values are rows in the data.')
        # Let the test set be a set of entries, for default predicted_variable this corresponds to individuals in our
        # data.
        dat_train = dat.loc[~dat[predicted_variable].isin(test_rows)]
        # These are all indicator/irrelevant variables we don't want to consider, so they are removed from the train
        # and test sets.
        dat_train = dat_train.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4',
                                            'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation'])
        dat_test = dat.loc[dat[predicted_variable].isin(test_rows)]
        dat_test = dat_test.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4',
                                          'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation'])
    else:
        raise Exception('test = x, where x must be "random" or a comma-separated sequence of digits that are valid '
                        'entries of the predicted_variable in the data')

    # Check that the original predicted_variable isn't in the training or testing data

    ebm = ExplainableBoostingClassifier(interactions=int(configuration['num_interaction'][0]))
    ebm.fit(X=dat_train.drop(columns='mutant'), y=dat_train['mutant'])

    with open(res_dir + 'ga2m_fit', 'wb') as ga2m_file:
        pk.dump(ebm, ga2m_file)

    with open(res_dir + 'dat_train', 'wb') as train_file:
        pk.dump(dat_train, train_file)

    with open(res_dir + 'dat_test', 'wb') as test_file:
        pk.dump(dat_test, test_file)

    return {'fit': res_dir + 'ga2m_fit', 'train': res_dir + 'dat_train', 'test': res_dir + 'dat_test'}
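# Usage sketch (not part of the original function): reload the pickled EBM and
# test split and produce a global explanation. Only the file keys come from
# fit_ga2m; `configuration` and 'results/' are illustrative.
import pickle as pk
from interpret import show

paths = fit_ga2m(configuration, res_dir='results/')
with open(paths['fit'], 'rb') as fit_file:
    ebm = pk.load(fit_file)
with open(paths['test'], 'rb') as test_file:
    dat_test = pk.load(test_file)

test_probs = ebm.predict_proba(dat_test.drop(columns='mutant'))[:, 1]
show(ebm.explain_global(name='GA2M'))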
Example #10
X = data_train
y = labels_train.ravel()

iX_train,  iX_test, y_train, y_test = \
    train_test_split(iX, y, test_size=0.25, stratify=y, random_state=0)

X_train, X_test = X[iX_train], X[iX_test]

X_test_out = data_test_out
y_test_out = labels_test_out

#%%
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(data_pts_1, labels_pts_1)

labels_pt_2_pred = ebm.predict(data_pts_2)
#%%

# Try isolation forest for outlier detection
X = data_pts_1

from sklearn.ensemble import IsolationForest

clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)

A = clf.predict(X)

print((A == -1).mean(), (labels != 0).mean(),
      ((A == -1) == (labels != 0)).mean())
Example #11
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RFTree_ROC')
plt.show()

# ### Explainable Boosting Machine

# In[9]:

from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(train_X, train_y)

# In[12]:

# display confusion matrices for train and test data

classificationSummary(train_y, ebm.predict(train_X))
classificationSummary(test_y, ebm.predict(test_X))

# In[10]:

from interpret import show

ebm_global = ebm.explain_global()
show(ebm_global)
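# In[ ]:

# To put the EBM on the same footing as the ROC curve drawn for the random
# forest above, interpret's ROC performance explainer can be used (a sketch,
# reusing the test_X / test_y names from this notebook):
from interpret.perf import ROC

ebm_perf = ROC(ebm.predict_proba).explain_perf(test_X, test_y, name='EBM')
show(ebm_perf)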
Example #12
df_A['category'] = 0
df_B['category'] = 1

# define training df (first 500 elements of each category)
training_columns = ['x', 'y']
training_df = pd.concat([df_A.iloc[:500], df_B.iloc[:500]],
                        ignore_index=True,
                        sort=True)

# define test df (second 500 elements of each category)
test_df = pd.concat([df_A.iloc[500:], df_B.iloc[500:]],
                    ignore_index=True,
                    sort=True)

ebm_clf = ExplainableBoostingClassifier()
ebm_clf.fit(training_df[training_columns], training_df['category'])

probabilities = ebm_clf.predict_proba(test_df[training_columns])
ebm_global = ebm_clf.explain_global()
show(ebm_global)

for prob in range(2):
    test_df['prob_{0}'.format(prob)] = probabilities[:, prob]

figcontur = plt.figure(figsize=(18, 7.5))
contourax = figcontur.add_subplot(111)
xx, yy = make_meshgrid(test_df['x'], test_df['y'])
plot_contours(contourax, ebm_clf, xx, yy, cmap='RdYlBu', alpha=0.8)
contourax.scatter(test_df.x,
                  test_df.y,
                  c=test_df['category'],
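# make_meshgrid and plot_contours are not defined in this snippet; they look like
# the helper functions from scikit-learn's SVM decision-surface example. A sketch
# of compatible definitions (contouring hard predictions is an assumption; the
# original may plot probabilities instead):
import numpy as np

def make_meshgrid(x, y, h=0.02):
    """Dense grid covering the range of two feature columns."""
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def plot_contours(ax, clf, xx, yy, **params):
    """Filled contour of the classifier's predictions over the grid."""
    Z = np.asarray(clf.predict(np.c_[xx.ravel(), yy.ravel()]))
    return ax.contourf(xx, yy, Z.reshape(xx.shape), **params)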
Example #13
            X_train_cov, X_test_cov = X_cov.iloc[
                train_idxes_cov[split], :], X_cov.iloc[
                    test_idxes_cov[split], :]
            y_train_cov, y_test_cov = y_cov[train_idxes_cov[split]], y_cov[
                test_idxes_cov[split]]

            #X_train_cov, y_train_cov = undersample_negatives(X_train_cov, y_train_cov, 50)

            y_train_cov = y_train_cov.ravel()
            #clf = tune_ebm(X_train_cov, y_train_cov)

            if interac == 0:
                clf = ExplainableBoostingClassifier()
            else:
                clf = ExplainableBoostingClassifier(interactions=interac)

            clf.fit(X_train_cov, y_train_cov)
            curr_perf = []
            y_pred_cov = clf.predict(X_test_cov)
            #curr_perf += [metrics.accuracy_score(y_test_cov, y_pred_cov)]
            print(metrics.confusion_matrix(y_test_cov, y_pred_cov))
            y_pred_cov = clf.predict_proba(X_test_cov)
            curr_perf += [get_aucpr_R(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += [get_auc_R(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += [get_fmax(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += get_early_prec(y_test_cov, y_pred_cov[:, 1])
            print(curr_perf)
            splitwise_perf.append(curr_perf)
            # save model
            #save_model(clf,format("models//ebm_covonly_split%d_1to10_int%d.pkl" % (split, interac)))
            save_model(
                clf,
Example #14
    feat_names = X_pos.columns
    X_cov = pd.DataFrame(np.row_stack((X_pos, X_neg)), columns=feat_names)
    y_cov = np.zeros((npos + nneg, 1))
    y_cov[range(npos)] = 1
    print("X size: ", X_cov.shape[0], 'x', X_cov.shape[1])
    print("y size: ", y_cov.shape[0], 'x', y_cov.shape[1])
    #del X_neg

    #for interac in [0]: # [5, 10, 50, 100, 300, 500]:
    if True:
        print("======================== ", interac, " ======================")
        if interac == 0:
            clf = ExplainableBoostingClassifier()
        else:
            clf = ExplainableBoostingClassifier(interactions=interac)

        clf.fit(X_cov, y_cov.ravel())  # flatten the (n, 1) label array for fitting
        # test on everything
        #X_neg = pd.read_csv(negfile, header=0)
        X_cov = pd.DataFrame(np.row_stack((X_pos, X_neg_all)),
                             columns=feat_names)
        print('Predicting on #examples:', X_cov.shape[0])
        y_pred = clf.predict_proba(X_cov)
        y_pred = y_pred[:, 1]
        np.save(
            format("%s/int%d_trial%d_preds.npy") % (out_dir, interac, trial),
            y_pred)
        save_model(clf,
                   format("%s/int%d_trial%d.pkl" % (out_dir, interac, trial)))
Example #15
# ### Training and Interpreting EBM
# Train an Explainable Boosting Machine (with [interpret.ml](https://github.com/interpretml/interpret/))
#
# For a tutorial see: [[Tutorial](https://nbviewer.jupyter.org/github/interpretml/interpret/blob/master/examples/python/notebooks/Interpretable%20Classification%20Methods.ipynb)]
#
# **Q7**. Report (global) feature importances for EBM as a table or figure. What are the most important three features in EBM? Are they the same as in the linear model?
#
# w_1X + w_2Y + w_3(XY) = Z
# %%
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

train_features, train_labels, dev_features, dev_labels, test_features, test_labels = \
    prepare_load_classification_data()
ebm = ExplainableBoostingClassifier(n_jobs=-1)
ebm.fit(train_features, train_labels)
# EBM
#%% # Global Explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
#%% # Local Explanation
ebm_local = ebm.explain_local(dev_features[:5], dev_labels[:5], name='EBM')
show(ebm_local)
#%% # Performance
from interpret.perf import ROC
ebm_perf = ROC(ebm.predict_proba).explain_perf(dev_features,
                                               dev_labels,
                                               name='EBM')
show(ebm_perf)
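#%% # Q7: global feature importances as a table
# A sketch, assuming a recent interpret release (>= 0.3) that exposes
# term_names_ and term_importances(); older releases surface the same numbers
# through the global explanation's data instead.
import pandas as pd

importance_table = (
    pd.DataFrame({'term': ebm.term_names_,
                  'importance': ebm.term_importances()})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
print(importance_table.head(3))  # the three most important terms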
# %% [markdown]
# ### Training and Explaining Neural Networks
Example #16
train_data = train_data.fillna(
    train_data.groupby(['Pclass', 'Sex']).transform('mean'))
test_data = test_data.fillna(
    test_data.groupby(['Pclass', 'Sex']).transform('mean'))

train_data = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']]
test_data = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

X_train, X_validate, y_train, y_validate = train_test_split(
    train_data.drop('Survived', axis=1), train_data['Survived'], test_size=.25)

ebm = ExplainableBoostingClassifier()
lrm = LogisticRegression()

ebm.fit(X_train, y_train)

le = LabelEncoder()
# Work on a copy so the frame used to fit the EBM keeps its original 'Sex' strings.
X_train_lr = X_train.copy()
X_train_lr['Sex'] = le.fit_transform(X_train['Sex'])
lrm.fit(X_train_lr, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)
ebm_local = ebm.explain_local(X_validate, y_validate)
show(ebm_local)

lrm_global = lrm.explain_global()
show(lrm_global)
X_validate_lr = X_validate.copy()
# Reuse the encoder fitted on the training data rather than refitting it here.
X_validate_lr['Sex'] = le.transform(X_validate['Sex'])
Example #17
def build_model():

    ucihd_attr = [
        "age",
        "sex",  # 0 = female 1 = male
        "cp",  # chest pain type 1: typical angina 2: atypical angina 3: non-anginal pain 4: asymptomatic
        # resting blood pressure (in mm Hg on admission to the hospital)
        "trestbps",
        "chol",  # serum cholestoral in mg/dl
        "fbs",  # (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
        "restecg",  # resting electrocardiographic results 0: normal 1: having ST-T wave abnormality 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
        "thalach",  # maximum heart rate achieved
        "exang",  # exercise induced angina (1 = yes; 0 = no)
        "oldpeak",  # ST depression induced by exercise relative to rest
        "slope",  # the slope of the peak exercise ST segment
        "ca",  # number of major vessels (0-3) colored by flouroscopy
        "thal",  # 3 = normal; 6 = fixed defect; 7 = reversable defect
        # diagnosis of heart disease (angiographic disease status) 0: < 50% diameter narrowing 1-4: > 50% diameter narrowing
        "label"
    ]

    ucihd_local_path = "../datasets/processed.cleveland.data"

    ucihd = pd.read_csv(ucihd_local_path,
                        header=None,
                        names=ucihd_attr,
                        na_values="?")

    categorical_attr = ["sex", "cp", "fbs", "restecg", "exang", "thal"]
    for col in categorical_attr:
        ucihd[col] = ucihd[col].astype("category")

    # Clean label.
    ucihd.loc[ucihd["label"] > 1, "label"] = 1

    # sklearn's implementation of RF doesn't allow missing value.
    # For categorical (as string) we can leave one special category for missing,
    # but for numerical we need to do some special encoding or imputation.
    ucihd_2 = ucihd.copy()
    ucihd_2.loc[ucihd_2["ca"].isna(), "ca"] = -1  # Encode missing numerical.

    ucihd_2 = pd.get_dummies(ucihd_2, columns=categorical_attr, dummy_na=True)
    ucihd_y = ucihd_2.pop("label")
    train, test, ucihd_y_train, _ = train_test_split(ucihd_2,
                                                     ucihd_y.values,
                                                     test_size=.3,
                                                     random_state=64)

    # horrible hack to reverse effect of pd.get_dummies
    _, test_display, _, _ = train_test_split(ucihd,
                                             ucihd_y.values,
                                             test_size=.3,
                                             random_state=64)

    ucihd_rf = RandomForestClassifier(n_estimators=100, random_state=64)
    _ = ucihd_rf.fit(train, ucihd_y_train)

    feature_names = ucihd_2.columns
    class_names = ["Negative", "Positive"]
    categorical_features = [
        i for i, col in enumerate(feature_names) if "_" in col
    ]
    feature_names_display = ucihd_attr

    # n_estimators is the parameter name used by older interpret releases
    # (current versions expose the equivalent knob as outer_bags).
    ucihd_ebm = ExplainableBoostingClassifier(n_estimators=16,
                                              feature_names=ucihd_2.columns,
                                              n_jobs=1)
    _ = ucihd_ebm.fit(train, ucihd_y_train)

    return (ucihd_rf, train.values, test, feature_names, class_names,
            categorical_features, test_display, feature_names_display,
            ucihd_ebm)
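# Usage sketch (illustrative only): unpack the tuple returned by build_model and
# pull a global explanation from the EBM.
from interpret import show

(ucihd_rf, train_values, test, feature_names, class_names,
 categorical_features, test_display, feature_names_display,
 ucihd_ebm) = build_model()

show(ucihd_ebm.explain_global(name='UCI Heart Disease EBM'))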