예제 #1
0
    def plot_shap_force_plot(self, games, model, shap_df=None):
        """Build a SHAP force plot for one game or for a group of games.

        Parameters
        ----------
        games : scalar or array-like
            Row label(s) used to index ``shap_df`` and ``model.cv_scores``
            through ``.loc``.
        model : object
            Must expose ``model_dict`` (with a ``'features_list'`` entry),
            ``cv_scores`` and ``model_objects``.
        shap_df : DataFrame-like, optional
            SHAP values with one column per feature plus a ``'bias'``
            column. Obtained from ``self.get_shap_vals`` when omitted.

        Returns
        -------
        The force plot object when ``self.plots_dict['save']['plots']`` is
        not ``True``. Otherwise the plot is written as HTML under
        ``self.plots_dir`` and nothing is returned.
        """
        model_dict = model.model_dict
        features_df = model.cv_scores

        if shap_df is None:
            shap_df = self.get_shap_vals(model_dict, model.model_objects)

        # A 0-d array has an empty shape tuple, i.e. ``games`` is one label.
        multi = bool(np.array(games).shape)
        if multi:
            # Several games share one plot, so their mean bias is the base.
            bias = shap_df.loc[games, 'bias'].mean()
        else:
            games = np.array(games).tolist()
            bias = shap_df.loc[games, 'bias']

        force_plot = shap.force_plot(
            bias,
            shap_df.loc[games, model_dict['features_list']].values,
            features_df.loc[games, model_dict['features_list']],
        )

        if self.plots_dict['save']['plots'] is not True:
            return force_plot

        suffix = 'multi' if multi else games
        img_path = f'{self.plots_dir}/force_plot_{suffix}.html'
        shap.save_html(img_path, force_plot)
예제 #2
0
 def print_shap(self, data_for_pred, outcome):
     """Write the force plot for ``outcome`` to ``individual_shap.html``.

     SHAP values are computed with ``self.explainer`` for
     ``data_for_pred``; ``outcome`` indexes both the explainer's expected
     values and the SHAP values. The file is created under ``self.out``.
     """
     all_values = self.explainer.shap_values(data_for_pred)
     target_path = self.out + "/individual_shap.html"
     plot = shap.force_plot(self.explainer.expected_value[outcome],
                            all_values[outcome], data_for_pred)
     shap.save_html(target_path, plot)
예제 #3
0
    def classify(self,
                 items,
                 probabilities=False,
                 importances=False,
                 importance_cutoff=0.15):
        """Classify one item or a list of items.

        Parameters
        ----------
        items : dict, tuple or list
            A single item is wrapped in a list. The first item must be a
            dict or a tuple.
        probabilities : bool
            Use ``predict_proba`` instead of ``predict``.
        importances : bool
            Also compute SHAP-based feature importances and an HTML force
            plot.
        importance_cutoff : float
            Forwarded to ``self.get_important_features``.

        Returns
        -------
        The (possibly overwritten) classes, or a ``(classes, info)`` tuple
        when ``importances`` is set, where ``info`` has the keys
        ``"importances"`` and ``"html"``.
        """
        assert items is not None
        assert (self.extraction_pipeline is not None and self.clf
                is not None), "The module needs to be initialized first"

        if not isinstance(items, list):
            items = [items]

        assert isinstance(items[0], (dict, tuple))

        X = self.extraction_pipeline.transform(items)
        predict = self.clf.predict_proba if probabilities else self.clf.predict
        classes = self.overwrite_classes(items, predict(X), probabilities)

        if not importances:
            return classes

        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X)

        # TODO: Actually implement feature importance visualization for multiclass problems.
        # A list holds one array per class; collapse it into a single
        # array of summed absolute contributions.
        if isinstance(shap_values, list):
            shap_values = np.sum(np.abs(shap_values), axis=0)

        top_importances = self.get_important_features(importance_cutoff,
                                                      shap_values)
        top_indexes = [int(idx) for _, idx, _ in top_importances]

        feature_names = self.get_feature_names()
        top_names = [feature_names[i] for i in top_indexes]

        with io.StringIO() as out:
            plot = shap.force_plot(
                explainer.expected_value,
                shap_values[:, top_indexes],
                X.toarray()[:, top_indexes],
                feature_names=top_names,
                matplotlib=False,
                show=False,
            )

            # TODO: use full_html=False
            shap.save_html(out, plot)
            html = out.getvalue()

        return classes, {"importances": top_importances, "html": html}
예제 #4
0
def single_force_plot(i, html=True):
    """Create the force plot for row ``i`` of the explained data.

    Reads the module-level ``explainer``, ``shap_values``,
    ``data_to_explain`` and ``feat_used``. With ``html`` set, the plot is
    also saved to ``./result/shap_force_plot_<i>.htm``; otherwise it is
    requested with ``matplotlib=True`` and nothing is written.

    Returns whatever ``shap.force_plot`` returned.
    """
    base_value = explainer.expected_value
    row_values = shap_values[i, :]
    row_features = data_to_explain.iloc[i, :]
    shared = {'feature_names': feat_used, 'show': False, 'link': 'logit'}

    if not html:
        return shap.force_plot(base_value, row_values, row_features,
                               matplotlib=True, **shared)

    fig = shap.force_plot(base_value, row_values, row_features, **shared)
    shap.save_html('./result/shap_force_plot_' + str(i) + '.htm', fig)
    return fig
예제 #5
0
    def heart_disease_risk_factors(model, patient):
        """Explain the model's prediction for ``patient`` with a force plot.

        A ``shap.TreeExplainer`` is built for ``model`` and the SHAP values
        of ``patient`` are computed. The plot uses index 1 of both the
        expected values and the SHAP values (presumably the positive
        class; confirm against the model's class order).

        The plot is saved to ``./test.html`` and the same plot object is
        returned.
        """
        # Get weights of each feature
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(patient)

        # Plot weights. The plot is built once and reused for both the
        # saved file and the return value.
        shap.initjs()
        force_plot = shap.force_plot(explainer.expected_value[1],
                                     shap_values[1], patient)
        shap.save_html("./test.html", force_plot)
        return force_plot
예제 #6
0
def go():
    """Render ``go.html`` for the patient selected by the ``pid`` argument.

    Loads the patient's row, cleans and transforms it, runs the model,
    saves the SHAP force plot to ``../app/templates/shap.html`` and renders
    the template with the first result row, the prediction and the risk
    label.

    Raises:
        ValueError: if ``pid`` is missing or is not an integer.
        IndexError: if no row matches the given patient id.
    """
    # save user input in query
    pid_query = request.args.get('pid', '')

    # ``pid`` comes straight from the request and ends up inside the SQL
    # text, so only a plain integer is accepted. This blocks SQL injection
    # without depending on the DB driver's placeholder style.
    # NOTE(review): prefer a parameterized query once the driver's
    # paramstyle is confirmed.
    patient_id = int(pid_query)

    # Load data (the table name comes from trusted configuration):
    cur = db.cursor()
    cur.execute(
        f"select * from {config.DB_TABLE} where patientId={patient_id}")

    # Convert to json format:
    header = [column[0] for column in cur.description]
    results = cur.fetchall()
    results_json = [dict(zip(header, row)) for row in results]

    # Clean data:
    patient = list(results[0])
    patient_clean = process_data.clean_data(data=patient)

    # Data transform:
    X_transformed = predict.transform_data(model=model, X=patient_clean)

    # Prediction:
    pred = predict.prediction(model=model, X=X_transformed)

    # Model insights. The result is not passed to the template at the
    # moment; the call is kept in case it has side effects.
    decision = predict.model_decision(pred=pred)

    # Risk:
    risk = 'dropping off' if (pred[0][0] < 0.5) else 'engaged'

    # SHAP force_plot in html format:
    shap_plot = predict.shap_plot(model=model, X=X_transformed)
    shap.save_html('../app/templates/shap.html', shap_plot)

    # This will render the go.html Please see that file.
    return render_template(
        'go.html',
        results=results_json[0],
        # decision = decision,
        pred=f'{round(pred[0][0]*100,3)}%',
        risk=risk
        # query=query
    )
예제 #7
0
def generate_shap_html(feature, user_bin, user_id):
    """Return the SHAP force plot for one user as HTML text.

    The XGBoost model is loaded from ``MODEL_DIRECTORY/xgb.model``. The
    plot uses entry ``user_bin - 1`` of the expected values and the first
    row of entry ``user_bin - 1`` of the SHAP values.

    The HTML is produced through a temporary ``User_<user_id>.html`` file
    in ``MODEL_DIRECTORY``, which is removed again even if reading it
    fails.

    Returns:
        tuple: ``(html, values)`` with the HTML text and the SHAP values
        returned by the explainer.
    """
    xgb_clf = XGBClassifier()
    xgb_clf.load_model(os.path.join(MODEL_DIRECTORY, "xgb.model"))
    explainer = shap.TreeExplainer(xgb_clf)
    values = explainer.shap_values(feature)
    shap.initjs()
    fp = shap.force_plot(explainer.expected_value[user_bin - 1],
                         values[user_bin - 1][0],
                         feature,
                         show=False)

    html_path = os.path.join(MODEL_DIRECTORY, f"User_{user_id}.html")
    shap.save_html(html_path, fp)
    try:
        with open(html_path, "r", encoding='utf-8') as f:
            html = f.read()
    finally:
        # Never leave the temporary file behind.
        os.remove(html_path)
    return html, values
def _elapsed_microseconds(delta):
    """Return the full length of the timedelta ``delta`` in microseconds."""
    # ``delta.microseconds`` alone is only the sub-second component.
    return (delta.days * 86_400 + delta.seconds) * 1_000_000 + delta.microseconds


def meta_learning(working_dir):
    """Run leave-one-dataset-out meta learning and plot model explanations.

    Reads ``<working_dir>/RegressionAll.csv``. The last column is the
    label and the remaining columns are the meta features; the ``name``
    column identifies the dataset and is expected to be the first feature
    column, since it is dropped by position.

    For every dataset an XGBoost classifier is trained on all other
    datasets and evaluated on the held-out one. The per-dataset results
    are written to ``meta_learning_final_results.csv``. A final model
    trained on all rows is then used for the importance plots, the SHAP
    force plot (saved to ``SHAP force plot.html``) and the SHAP summary
    plot.
    """
    meta_features_df = pd.read_csv(f'{working_dir}/RegressionAll.csv')
    datasets = list(meta_features_df['name'])

    X = meta_features_df.iloc[:, :-1]
    X = X.fillna(0)
    y = meta_features_df.iloc[:, -1]

    test_data = []

    for dataset_index, dataset_name in enumerate(datasets):
        print(f"Dataset {dataset_name}, {dataset_index+1}/{len(datasets)}")

        test_cross_data = {'Dataset Name': dataset_name, 'Algorithm Name': 'XGBoost meta learning',
                           'Hyper-Parameters Values': None, 'Accuracy': None, 'TPR': None, 'FPR': None,
                           'Precision': None, 'AUC': ' ', 'PR Curve': ' ', 'Predict Probability': None,
                           'Predict Model': None, 'True Label': None, 'Training Time': None, 'Inference Time': None}

        # Hold out the rows of the current dataset; drop the name column.
        X_train = X.loc[X['name'] != dataset_name]
        X_train = X_train.iloc[:, 1:]
        X_test = X.loc[X['name'] == dataset_name]
        X_test = X_test.iloc[:, 1:]

        y_train = y.loc[X_train.index]
        y_test = y.loc[X_test.index]

        meta_learning_model = xgb.XGBClassifier()

        time_before_train = datetime.now()
        meta_learning_model.fit(X_train, y_train)
        train_time = datetime.now() - time_before_train
        test_cross_data['Training Time'] = f"{_elapsed_microseconds(train_time)} microseconds"

        time_before_predict = datetime.now()
        y_pred = meta_learning_model.predict(X_test)
        y_scores = meta_learning_model.predict_proba(X_test)
        predict_time = datetime.now() - time_before_predict
        test_cross_data['Inference Time'] = f"{_elapsed_microseconds(predict_time)} microseconds"

        test_cross_data['Predict Probability'] = f"{y_scores[0][1]:.4f}"
        test_cross_data['Predict Model'] = 'Ensemble Genetic Programming' if y_pred[0] == 1 else 'Extra Tree Regressor'
        test_cross_data['True Label'] = 'Ensemble Genetic Programming' if y_test.values[0] == 1 \
            else 'Extra Tree Regressor'

        test_cross_data['Accuracy'] = accuracy_score(y_test, y_pred)

        # Only the first held-out row is compared here.
        # NOTE(review): TPR/FPR/Precision are derived from that single
        # hit-or-miss result; confirm this is the intended definition.
        hit = y_pred[0] == y_test.values[0]
        test_cross_data['TPR'] = 1 if hit else 0
        test_cross_data['FPR'] = 0 if hit else 1
        test_cross_data['Precision'] = test_cross_data['TPR']

        test_data.append(test_cross_data)

    pd.DataFrame(test_data).to_csv('meta_learning_final_results.csv', index=False)

    # Final model on all datasets, used only for the explanations below.
    X = X.iloc[:, 1:]
    meta_learning_model = xgb.XGBClassifier()
    meta_learning_model.fit(X, y)

    importance_types = ['weight', 'cover', 'gain']
    plt.rcParams["figure.figsize"] = (40, 40)
    for imp_type in importance_types:
        plot_importance(meta_learning_model, max_num_features=167, importance_type=imp_type,
                        title=f'Meta Learning XGBoost {imp_type} importance')
        plt.show()

    shap.initjs()
    explainer = shap.TreeExplainer(meta_learning_model)
    shap_values = explainer.shap_values(X)
    shap.save_html('SHAP force plot.html', shap.force_plot(explainer.expected_value, shap_values, X, figsize=(20, 20)))
    shap.summary_plot(shap_values, X, plot_size=(20, 20), title="SHAP summary plot")
예제 #9
0
def shap_prop(df_cli2_scaled, df_t_scaled, clf_brf_all):
    '''
    Explore the decision mechanism of a fitted tree model with SHAP.

    Produces summary plots, one dependence plot per feature, an interactive
    force plot saved to "index.html", decision plots, and matplotlib force
    plots for the rows labelled 2012 and 2013.

    Input:
        df_cli2_scaled: climatic features, used as X for every explanation.
        df_t_scaled: yield output, used only to derive the severe-loss flag.
        clf_brf_all: fitted model (random forest) handed to TreeExplainer.

    Returns nothing; all results are plots or the saved HTML file.
    '''

    # Flag rows whose yield is more than one standard deviation below the
    # mean as a severe loss (1), everything else as 0.
    df_severe = pd.DataFrame(np.where(
        df_t_scaled < df_t_scaled.mean() - df_t_scaled.std(), True, False),
                             index=df_t_scaled.index,
                             columns=['severe_loss']).astype(int)
    loss_intensity = df_severe
    # NOTE(review): y is not used by any code below.
    X, y = df_cli2_scaled, loss_intensity

    # Explainer used for all plots below.
    explainer = shap.TreeExplainer(clf_brf_all)
    shap_values = explainer.shap_values(X,
                                        approximate=False,
                                        check_additivity=True)

    # Second explainer, built with X as background data (intended for bars
    # and scatters).
    # NOTE(review): its result is only referenced by commented-out code.
    explainer_dif = shap.TreeExplainer(clf_brf_all, X)
    shap_values_dif = explainer_dif(X)

    # get just the explanations for the positive class
    shap_values_dif_one = shap_values_dif[..., 1]

    # Summary plots: all outputs as bars, then index 1 as bars and in the
    # default summary style.
    shap.summary_plot(shap_values, X, plot_type="bar")
    shap.summary_plot(shap_values[1], X, plot_type="bar")
    shap.summary_plot(shap_values[1], X)  # index 1 = failure (severe loss)

    # bar plot priority
    # shap.plots.bar(shap_values_dif_one) # - not sure why it is giving different results

    # One dependence plot per feature.
    for name in X.columns:
        shap.dependence_plot(name, shap_values[1], X)

    # HTML to interact with all predictors
    shap_display_all = shap.force_plot(explainer.expected_value[1],
                                       shap_values[1],
                                       X,
                                       show=False)
    shap.save_html("index.html",
                   shap_display_all)  # open in a browser for the interactive view

    # Decision plots explaining decisions to classify
    # NOTE(review): rows 52 and 53 are assumed to be the positions of the
    # labels 2012 and 2013 in X -- confirm against the index of X.
    shap.decision_plot(explainer.expected_value[1], shap_values[1], X)
    shap.decision_plot(explainer.expected_value[1], shap_values[1][52],
                       X.loc[[2012]])  # year 2012
    shap.decision_plot(explainer.expected_value[1], shap_values[1][53],
                       X.loc[[2013]])  # year 2013

    # Force plots for the single rows labelled 2012 and 2013.
    shap.initjs()
    shap_values_2012 = explainer.shap_values(X.loc[[2012]])
    shap_display = shap.force_plot(explainer.expected_value[1],
                                   shap_values_2012[1],
                                   X.loc[[2012]],
                                   matplotlib=True)
    # NOTE(review): shap_display2013 is never displayed or returned.
    shap_display2013 = shap.force_plot(explainer.expected_value[1],
                                       explainer.shap_values(X.loc[[2013]])[1],
                                       X.loc[[2013]],
                                       matplotlib=True)
    # display is not defined in this function; presumably IPython's display.
    display(shap_display)
def training(train, test, validation_size, estimator, target_variable, drop_list,
             target_type, cv_folds, scoring_cv, cv=True, final=False,
             hypertuning=False):
    """Fit the requested estimator, report its scores and return it.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and hold-out data; both must contain ``target_variable``
        and every column named in ``drop_list``.
    validation_size : float
        Share of the training data split off as validation set. Only used
        for the untuned LightGBM regressor.
    estimator : str
        ``"log_sk"``, ``"gb"`` or ``"rf"`` for ``target_type="bin"``;
        ``"lgbm"``, ``"lin_reg"``, ``"gb"`` or ``"rf"`` for
        ``target_type="con"``.
    target_variable : str
        Name of the target column.
    drop_list : list
        Columns removed from the feature matrices.
    target_type : str
        ``"bin"`` for a binary target, ``"con"`` for a continuous one.
    cv_folds
        Passed as ``cv`` to ``cross_val_score`` / ``GridSearchCV``.
    scoring_cv
        Scoring passed to ``cross_val_score`` / ``GridSearchCV`` and used
        as ``eval_metric`` for LightGBM.
    cv : bool
        Print cross-validation scores (only when not tuning).
    final : bool
        Train on train and test together and skip the hold-out report.
    hypertuning : bool
        Run a grid search (or the LightGBM tuner) instead of fitting with
        fixed hyper-parameters.

    Returns
    -------
    The fitted model, or ``None`` for the LightGBM tuning run, which only
    prints the best parameters and fits no model.

    Raises
    ------
    ValueError
        If no model exists for the requested combination of
        ``target_type``, ``estimator`` and ``hypertuning``.

    Notes
    -----
    For continuous targets with ``"gb"``, ``"rf"`` or ``"lgbm"`` the
    function also writes ``fi_selected.xlsx`` and several plot files to
    the working directory.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import ensemble
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import recall_score
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split

    train_y = train[target_variable]
    train_x = train.drop(columns=drop_list)

    test_y = test[target_variable]
    test_x = test.drop(columns=drop_list)

    if final:
        # Final fit: use every available row for training.
        train_x = pd.concat([train_x, test_x])
        train_y = pd.concat([train_y, test_y])

    fitted_model = None
    valid_x = None

    if target_type == "bin":

        if estimator == "log_sk":
            model = LogisticRegression(max_iter=1000)
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "gb" and not hypertuning:
            model = ensemble.GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100)
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "gb" and hypertuning:
            param_grid = {
                'n_estimators': [100, 200, 400],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.1, 0.05, 0.025, 0.01, 0.001, 0.005],
                'random_state': [42],
            }
            gb_grid = GridSearchCV(ensemble.GradientBoostingClassifier(), param_grid, cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting classifier = ', gb_grid.best_params_)
            fitted_model = gb_grid.best_estimator_

        elif estimator == "rf" and not hypertuning:
            model = ensemble.RandomForestClassifier(max_depth=80, max_features=5, min_samples_leaf=3, min_samples_split=12, n_estimators=100)
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "rf" and hypertuning:
            param_grid = {
                'bootstrap': [True],
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [3],
            }
            rf_grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest classifier = ', rf_grid.best_params_)
            fitted_model = rf_grid.best_estimator_

    if target_type == "con":

        if estimator == "lgbm" and not hypertuning:
            import lightgbm as lgbm

            train_x, valid_x, train_y, valid_y = train_test_split(
                train_x, train_y, test_size=validation_size, shuffle=True, random_state=42)

            model = lgbm.LGBMRegressor(random_state=42, n_estimators=1000)
            fitted_model = model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], eval_metric=scoring_cv, verbose=-1)

        elif estimator == "lin_reg" and not hypertuning:
            # LinearRegression has no ``max_iter`` parameter.
            model = LinearRegression()
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "gb" and not hypertuning:
            model = ensemble.GradientBoostingRegressor(learning_rate=0.001, max_depth=5, n_estimators=100)
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "rf" and not hypertuning:
            model = ensemble.RandomForestRegressor(max_depth=30, max_features=5, min_samples_leaf=3, min_samples_split=8, n_estimators=500, n_jobs=-1)
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "gb" and hypertuning:
            param_grid = {
                'n_estimators': [100, 500, 1000],
                'max_features': ["auto", "sqrt", "log2", 0.6, 0.8],
                'min_samples_leaf': [30, 50, 70],
                'min_samples_split': [10, 20, 500, 100],
                'max_depth': [10, 15, 20, 25],
                'learning_rate': [0.1, 0.01, 0.001],
            }
            gb_grid = GridSearchCV(ensemble.GradientBoostingRegressor(), param_grid, cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting regressor = ', gb_grid.best_params_)
            fitted_model = gb_grid.best_estimator_

        elif estimator == "lgbm" and hypertuning:
            # The tuner only runs when this module is executed as a script.
            if __name__ == "__main__":
                import optuna.integration.lightgbm as lgb

                dtrain = lgb.Dataset(train_x, label=train_y)

                params = {
                    "objective": "regression",
                    "metric": "rmse",
                    "verbosity": -1,
                    "boosting_type": "gbdt",
                }

                tuner = lgb.LightGBMTunerCV(
                    params, dtrain, verbose_eval=100, early_stopping_rounds=100, folds=KFold(n_splits=5)
                )

                tuner.run()

                print("Best score:", tuner.best_score)
                best_params = tuner.best_params
                print("Best params:", best_params)
                print("  Params: ")
                for key, value in best_params.items():
                    print("    {}: {}".format(key, value))

            # The tuning run reports parameters only; there is no fitted
            # model to analyse, evaluate or return.
            return None

        elif estimator == "rf" and hypertuning:
            param_grid = {
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [4],
            }
            rf_grid = GridSearchCV(RandomForestRegressor(), param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest regressor = ', rf_grid.best_params_)
            fitted_model = rf_grid.best_estimator_

    if fitted_model is None:
        raise ValueError(
            f"No model available for target_type={target_type!r}, "
            f"estimator={estimator!r}, hypertuning={hypertuning!r}")

    if cv and not hypertuning:
        cv_scores = cross_val_score(estimator=model, X=train_x, y=train_y, cv=cv_folds, scoring=scoring_cv)
        if target_type == "bin":
            print(f'The average cross validation accuracy of the model is {round(cv_scores.mean(), 2)}')
        else:
            print(f'The average cross validation rmsle of the model is {-1*round(cv_scores.mean(), 2)}')
        print(cv_scores)

    # Feature importance reports are produced for continuous targets only.
    if target_type == "con" and estimator in ("gb", "rf", "lgbm"):
        list_all_Features = train_x.columns.tolist()

        fi_df = pd.DataFrame({"Feature": list_all_Features, "Importance": fitted_model.feature_importances_}).sort_values(by="Importance", ascending=False)
        fi_df[:15].to_excel(r'fi_selected.xlsx')
        feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)

        if estimator in ("gb", "rf"):
            if estimator == "gb":
                figsize, label = (20, 10), "Gradient Boosting"
            else:
                figsize, label = (20, 20), "Random Forest"
            plt.figure(figsize=figsize)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title(f"Feature Importance from {label}")
            plt.savefig(f'Feature Importance from {label}.png', bbox_inches="tight")

        if estimator == "lgbm":
            import shap

            # SHAP values are computed on the validation split.
            explainer = shap.TreeExplainer(fitted_model)
            shap_values = explainer.shap_values(valid_x)

            shap.initjs()

            force_plot = shap.force_plot(explainer.expected_value, shap_values[0, :], valid_x.iloc[0, :])
            shap.save_html("index_force_plot.htm", force_plot)
            force_plot_all = shap.force_plot(explainer.expected_value, shap_values, valid_x)
            shap.save_html("index_force_plot_all.htm", force_plot_all)
            plt.figure(figsize=(10, 20))
            shap.summary_plot(shap_values, valid_x, show=False)
            plt.savefig('summary_plot.png', bbox_inches="tight")

            # One dependence plot for each of the ten most important features.
            for feature in feat_importances.nlargest(10).index.tolist():
                plt.figure(figsize=(20, 20))
                shap.dependence_plot(feature, shap_values, valid_x, show=False)
                plt.savefig(f"dep_plot_{feature}.png", bbox_inches="tight")

    # Hold-out report (skipped for the final fit, which trained on test too).
    if not final and target_type == "con":
        yhat = fitted_model.predict(test_x).astype(float)
        y_pred = list(yhat)
        y_true = list(test_y)
        print(np.sqrt(mean_squared_error(y_true, y_pred)))

    if not final and target_type == "bin":
        yhat = fitted_model.predict(test_x)
        y_pred = list(map(round, yhat))
        cm = confusion_matrix(test_y, y_pred)
        print("Confusion Matrix : \n", cm)
        print('Test accuracy = ', accuracy_score(test_y, y_pred))
        print('Test recall = ', recall_score(test_y, y_pred))

    return fitted_model
예제 #11
0
	import warnings
	warnings.filterwarnings("ignore")
	# Create SHAP explainer
	explainer = shap.TreeExplainer(RFModel)	

	# Get SHAP values for the observation of interest
	shap_values = explainer.shap_values(data_for_prediction.values, check_additivity=False)

	decisionhtml = shap.decision_plot(base_value= explainer.expected_value[1], shap_values= shap_values[1], features= data_for_prediction, feature_names=data_for_prediction.columns.tolist(),show = False)
	plt.savefig('decisionPlot.pdf')
	plt.close()

	onedshap_values = shap_values[1].flatten()
	shap.waterfall_plot(explainer.expected_value[1], onedshap_values, feature_names=data_for_prediction.columns, max_display=10, show=False)
	plt.savefig('waterfallPlot.pdf')
	plt.close()

	# SHAP Plots for Class 1 (sRNA-mRNA Interaction)
	forcehtml = shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)
	shap.save_html(out_file = 'forcePlot.html', full_html=False, plot = forcehtml)

	
elif((len(sys.argv) - 1) < 2):

	print("Error: Required parameters not passed! Please pass two parameters, sRNA ID and mRNA ID.")


else:

	print("Error: Only two parameters can be passed. sRNA ID and mRNA ID.")
# Save the figure that is current at this point as a 600-dpi PNG.
plt.savefig("dB.png", bbox_inches='tight', dpi=600)

# Start from a clean state, then draw and save the dependence plot for the
# "median_stride_length" feature on the training data.
plt.close('all')
shap.dependence_plot("median_stride_length", shap_values, X_train)
plt.savefig("sl-dep.pdf", bbox_inches='tight', dpi=600)

# Force plot for row 0, drawn with matplotlib and not shown (show=False)
# so that it can be written to a PDF.
# NOTE(review): the SHAP values are paired with X_test here but with
# X_train everywhere else in this script -- confirm which set they belong to.
plt.close('all')
shap.force_plot(explainer.expected_value,
                shap_values[0, :],
                X_test.iloc[0, :],
                show=False,
                matplotlib=True)
plt.savefig('tmp.pdf')
# The result of this call is discarded; presumably left over from a
# notebook cell where it rendered the plot inline.
shap.force_plot(explainer.expected_value, shap_values, X_train)

# Interactive force plot over all training rows, saved as HTML.
shap.save_html('explainer.html',
               shap.force_plot(explainer.expected_value, shap_values, X_train))

#XGBoost
best_params = {
    'XGBRegressor__alpha': 0.8,
    'XGBRegressor__colsample_bytree': 0.7,
    'XGBRegressor__eta': 0.05,
    'XGBRegressor__max_depth': 5,
    'XGBRegressor__objective': 'reg:squarederror',
    'XGBRegressor__subsample': 0.5
}
#This set of parameters is obtained by running the main grid search code on Winter cluster.

pipe = make_pipeline(
    RobustScaler(),
    XGBRegressor(
예제 #13
0
def get_shap_force(df, explainer, key):
    """Save the force plot for ``df`` as ``templates/force_plots/<key>.html``.

    SHAP values are computed with ``explainer``; the plot uses entry 0 of
    both the expected values and the SHAP values. Feature names are read
    from the booster of the module-level ``loaded_model``. The HTML is
    written with ``full_html=False``. Nothing is returned.
    """
    shap_values = explainer.shap_values(df)
    base_value = explainer.expected_value[0]
    first_values = shap_values[0]
    names = loaded_model.get_booster().feature_names
    plot = shap.force_plot(base_value, first_values, show=False,
                           feature_names=names)
    target = f'templates/force_plots/{key}.html'
    shap.save_html(target, plot, full_html=False)
# Summary plot of the SHAP values over all rows of X.
shap.summary_plot(shap_values, X)

# # Force Plot - Feature Contribution Visualization Across Observations

# In[ ]:

# load JS visualization code to notebook
shap.initjs()
# Visualize the prediction's explanation (use matplotlib=True to avoid
# JavaScript). Entry 0 of the expected values and SHAP values is plotted:
# impact on Day 0 price. The plot is saved as a standalone HTML file.
output = shap.force_plot(explainer.expected_value[0],
                         shap_values[0],
                         X,
                         plot_cmap=["#FF0000", "#008000"])
shap.save_html("Price Influence by Features Across Observations.html", output)
# Same plot again, with the result not stored; presumably the last
# expression of the notebook cell so that it renders inline.
shap.force_plot(explainer.expected_value[0],
                shap_values[0],
                X,
                plot_cmap=["#FF0000", "#008000"])

# # Key Price Influencer Daily Slider

# In[ ]:

import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
예제 #15
0
def save_plot(plt, name):
    """Save the SHAP plot object ``plt`` as HTML under ``plots/<name>``.

    Note that ``plt`` here is the plot object handed to ``shap.save_html``,
    not the matplotlib ``pyplot`` module.
    """
    destination = "plots/" + name
    shap.save_html(destination, plt)
예제 #16
0
    def shap_plot(self,
                  explainer=None,
                  shap_vals=None,
                  specific_var=None,
                  interactions=False,
                  interaction_vars=None,
                  classwise=True,
                  class_ind=1,
                  num_display=20):
        """
        Draw and save the SHAP summary, dependence, interaction and force plots.

        Every figure is written into ``self.output_dir``; file names start with
        ``<outcome_var>_<type_>_<class_>``. Which plots are produced depends on
        ``self.type_`` ('cls' or 'reg') and ``self.class_``.

        :param explainer: explainer; used for ``shap_interaction_values`` and
            ``expected_value``
        :param shap_vals: vals derived from running the explainer
        :param specific_var: if desired, run the individual feature
            (dependence) plot for this variable
        :param interactions: when truthy, plot the interaction matrix and
            return the interaction values
        :param interaction_vars: at most two entries; forwarded to
            ``shap.dependence_plot`` on the interaction values, and its first
            two entries are used in the saved file name
        :param classwise: classifier only - when truthy (or when
            ``self.class_ == 'lin'``) the bar summary receives all of
            ``shap_vals``, otherwise only ``shap_vals[class_ind]``
        :param class_ind: when plotting classifier results, the class index
            used to pick SHAP values / expected value; also part of file names
        :param num_display: ``max_display`` passed to the summary plots
        :return: ``self.shap_interaction_vals`` when ``interactions`` is
            truthy (these are expensive to compute, so they are cached on the
            instance), otherwise ``None``
        :raises Exception: if ``interaction_vars`` has more than two entries
        """
        if interaction_vars is not None and len(interaction_vars) > 2:
            raise Exception('Interaction vars list cannot be greater than 2.')

        # NOTE(review): ``interaction_vars`` is never mapped onto the
        # ``interaction_index`` of the specific-variable dependence plots, so
        # they always receive None - confirm whether that is intended.
        interaction_var = None

        # Common prefix of every file written by this method.
        file_prefix = (self.outcome_var + '_' + str(self.type_) + '_' +
                       str(self.class_))

        def save_current_figure(suffix):
            """Tighten, save as <prefix><suffix>, show and close the figure."""
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_dir, file_prefix + suffix))
            plt.show()
            plt.close()

        def plot_interactions(data, expl=None, vars_=None, class_index=1):
            """Plot the interaction-strength matrix and, optionally, one pair."""
            # NOTE(review): the cache ignores ``class_index``; a second call
            # with another class reuses the first result - confirm.
            if self.shap_interaction_vals is None:
                if self.type_ == 'cls':
                    self.shap_interaction_vals = expl.shap_interaction_values(
                        data)[class_index]
                elif self.type_ == 'reg':
                    self.shap_interaction_vals = expl.shap_interaction_values(
                        data)

            strength = np.abs(self.shap_interaction_vals).sum(0)
            # Ignore the diagonal so it does not dominate the ranking.
            np.fill_diagonal(strength, 0)
            # Keep at most the 50 columns with the largest total strength.
            top = np.argsort(-strength.sum(0))[:50]
            matrix = strength[top, :][:, top]
            plt.figure(figsize=(12, 12))
            plt.imshow(matrix)
            plt.yticks(range(matrix.shape[0]),
                       data.columns[top],
                       rotation=50.4,
                       horizontalalignment="right")
            plt.xticks(range(matrix.shape[0]),
                       data.columns[top],
                       rotation=50.4,
                       horizontalalignment="left")
            plt.gca().xaxis.tick_top()
            save_current_figure(
                '_interaction_matrix_{}.png'.format(class_index))

            if vars_ is not None:
                shap.dependence_plot(vars_,
                                     self.shap_interaction_vals,
                                     data,
                                     show=False)
                save_current_figure('_interaction_{}_{}_{}.png'.format(
                    vars_[0], vars_[1], class_index))

        def plot_summary(values, plot_type):
            """Draw (without showing) one SHAP summary plot."""
            shap.summary_plot(shap_values=values,
                              features=X_test_plot,
                              max_display=num_display,
                              plot_type=plot_type,
                              show=False)

        def plot_specific_dependence(values):
            """Draw and save the dependence plot of ``specific_var``."""
            shap.dependence_plot(specific_var,
                                 interaction_index=interaction_var,
                                 shap_values=values,
                                 features=X_test_plot,
                                 show=False)
            save_current_figure(
                '_' + str(num_display) +
                '_shap_interaction_summary_{}.png'.format(specific_var))

        # Every cross-validation mode plots against the full feature matrix;
        # assigning unconditionally avoids an unbound local when ``self.k_cv``
        # holds an unexpected value.
        X_test_plot = self.X

        if self.type_ == 'cls':
            if interactions:
                plot_interactions(X_test_plot, explainer, interaction_vars,
                                  class_ind)

            if classwise or (self.class_ == 'lin'):
                bar_values = shap_vals
            else:
                bar_values = shap_vals[class_ind]
            plot_summary(bar_values, 'bar')
            plt.xlabel('mean(|SHAP value|) (impact on output magnitude)')
            save_current_figure('_' + str(num_display) +
                                '_shap_val_summary.png')

            if self.class_ == 'RF':
                plot_summary(shap_vals[class_ind], 'dot')
            elif self.class_ == 'lin':
                plot_summary(shap_vals, 'dot')
            elif self.class_ == 'svm':
                print('not implemented shap for svm yet')
            # The figure is saved even when no dot plot was drawn above.
            save_current_figure('_' + str(class_ind) + '_' +
                                str(num_display) +
                                '_shap_effects_summary.png')

            if specific_var is not None:
                if self.class_ == 'RF':
                    plot_specific_dependence(shap_vals[class_ind])
                else:
                    plot_specific_dependence(shap_vals)

        elif self.type_ == 'reg':
            if interactions:
                plot_interactions(X_test_plot, explainer, interaction_vars,
                                  class_ind)

            plot_summary(shap_vals, 'bar')
            plt.xlabel('mean(|SHAP value|) (impact on output magnitude)')
            save_current_figure('_' + str(num_display) +
                                '_shap_val_summary.png')

            plot_summary(shap_vals, 'dot')
            save_current_figure('_' + str(num_display) +
                                '_shap_effects_summary.png')

            if specific_var is not None:
                plot_specific_dependence(shap_vals)

        # Force plot over all rows of X_test_plot, saved as HTML. The file
        # name has no separator before 'shap_forceplot'; kept as-is so
        # existing consumers of the output still find the file.
        f = os.path.join(
            self.output_dir,
            file_prefix + 'shap_forceplot_{}.html'.format(class_ind))

        if self.type_ == 'cls':
            shap.save_html(
                f,
                shap.force_plot(explainer.expected_value[class_ind],
                                shap_vals[class_ind],
                                X_test_plot,
                                show=False))
        elif self.type_ == 'reg':
            shap.save_html(
                f,
                shap.force_plot(explainer.expected_value,
                                shap_vals,
                                X_test_plot,
                                show=False))
        if interactions:
            return self.shap_interaction_vals
        return None
예제 #17
0
파일: SHAP.py 프로젝트: DrDevSK/MLMD
def main():
    """Train the feed-forward classifier and export its SHAP explanations.

    Relies on module-level names defined elsewhere in the file: ``test``,
    ``val``, ``train`` (pickle paths), ``name`` (a column to drop, also used
    in the saved model's file name), ``Label`` (target column), ``path``
    (output prefix) and ``balance_classes``.

    Steps: load the three pickled splits, scale the features, train and save
    a Keras model, then write SHAP summary, dependence and force plots under
    ``path``.
    """
    print(tf.__version__)

    def load_split(pickle_path):
        """Load one split, drop the unused label columns, print label counts."""
        # NOTE: pd.read_pickle unpickles arbitrary objects - only point it at
        # trusted files.
        frame = pd.read_pickle(pickle_path)
        for column in (name, 'CM_Label', 'PrimaryDx'):
            frame.drop(column, axis=1, inplace=True)
        print(frame['PrimaryDx_Label'].value_counts())
        return frame

    # Loading input data - test, val, train - in the original order so the
    # printed label counts appear in the same sequence.
    test_dat = load_split(test)
    val_dat = load_split(val)
    train_dat = load_split(train)

    train_dat = train_dat.astype('int')
    test_dat = test_dat.astype('int')
    val_dat = val_dat.astype('int')

    # Per the original author's note: upsamples the minority class.
    train_dat = balance_classes(1, train_dat)

    print("Data Loaded")

    # Extract the labels from the dataset; what remains are the features.
    test_y = np.array(test_dat.pop(Label))
    train_y = np.array(train_dat.pop(Label))
    val_y = np.array(val_dat.pop(Label))

    # Getting feature names from column headers (before scaling turns the
    # frames into plain arrays).
    feature = list(train_dat.columns)

    # Scale each feature to a given range; the scaler is fitted on the
    # training split only and reused for the other two.
    sc_X = MinMaxScaler()
    train_x = sc_X.fit_transform(train_dat)
    test_x = sc_X.transform(test_dat)
    val_x = sc_X.transform(val_dat)

    # Share of labels that are not 1, for the train and then the test split.
    for labels in (train_y, test_y):
        not_positive_share = 1 - len([i for i in labels if i == 1
                                      ]) / len(labels)
        print(not_positive_share)

    # Replace NaN/inf in every split; the validation split was previously
    # left out although it feeds the early-stopping monitor.
    train_x = np.nan_to_num(train_x)
    test_x = np.nan_to_num(test_x)
    val_x = np.nan_to_num(val_x)

    # Note a pretrained model can be loaded instead of training a new model
    # here as an option.
    # Neural Network Model
    model = keras.Sequential()
    for units in (2048, 1024, 210, 120):
        model.add(keras.layers.Dense(units, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    US = 1000
    No_US = 1

    # Optimizer and Loss Function
    opt = keras.optimizers.SGD(lr=0.0001,
                               decay=1e-6,
                               momentum=0.9,
                               nesterov=True)
    # NOTE(review): class_weight is normally a `fit` argument and the
    # regularizers are normally layer arguments; depending on the Keras
    # version these compile kwargs may be ignored or rejected. Left untouched
    # because moving them would change how the model trains - confirm intent.
    model.compile(class_weight={
        0: US,
        1: No_US
    },
                  loss="binary_crossentropy",
                  optimizer=opt,
                  metrics=['accuracy'],
                  kernel_regularizer=keras.regularizers.l2(0.05),
                  bias_regularizer=keras.regularizers.l2(0.01))

    early_stopping_monitor = EarlyStopping(patience=20)
    model.fit(train_x,
              train_y,
              epochs=1000,
              batch_size=1000,
              validation_data=(val_x, val_y),
              callbacks=[early_stopping_monitor],
              verbose=1)

    model.save(path + name + '_NN.h5')  # Saving the NN Model

    # Determining the SHAP values and generating SHAP plots
    # https://github.com/slundberg/shap
    background = train_x[np.random.choice(train_x.shape[0], 10, replace=False)]
    e = shap.DeepExplainer(model, background)
    shap_values = e.shap_values(train_x)

    shap.summary_plot(shap_values[0],
                      train_x,
                      feature_names=feature,
                      show=False)
    plt.savefig(path + 'NN_Shap_Summary_plot_3.png',
                bbox_inches='tight',
                dpi=600)
    plt.close()

    shap.summary_plot(shap_values,
                      train_x,
                      feature_names=feature,
                      show=False)
    plt.savefig(path + 'NN_Shap_Bar_plot_3.png', bbox_inches='tight', dpi=600)
    plt.close()

    # (feature, interaction_index, output file) for each dependence plot.
    dependence_specs = (
        ("Sex_F", 'Age', 'Gender_NN_D_plot_3.png'),
        ("Age", None, 'Age_NN_D_plot_3.png'),
        ("Pulse", None, 'Pulse_NN_D_plot_3.png'),
        ("Age", 'Pulse', 'Pulse_Age_NN_D_plot_3.png'),
    )
    for feature_name, interaction, file_name in dependence_specs:
        # show=False keeps the figure alive until it has been saved; closing
        # it afterwards stops figures piling up across iterations.
        shap.dependence_plot(feature_name,
                             shap_values[0],
                             train_x,
                             interaction_index=interaction,
                             feature_names=feature,
                             show=False)
        plt.savefig(path + file_name, bbox_inches='tight', dpi=600)
        plt.close()

    # Selecting individual patients' predictions and generating patient
    # specific SHAP values as examples. The explainer only depends on the
    # model and the background rows, so it is built once and reused.
    background = train_x[0:100, :]
    explainer = shap.DeepExplainer(model, background)
    # (rows of test_x, output file) for each force plot.
    force_plot_specs = (
        (slice(9, 10), "force_plot.html"),
        (slice(22, 23), "force_plot2.html"),
        (slice(100, 101), "force_plot3.html"),
        (slice(8, 9), "force_plot4.html"),
        (slice(68, 69), "force_plot5.html"),
        (slice(0, 100), "summary_force_plot3.html"),
    )
    for rows, file_name in force_plot_specs:
        data_for_prediction = test_x[rows, :]
        shap_values = explainer.shap_values(data_for_prediction)
        force_plot = shap.force_plot(explainer.expected_value[0],
                                     shap_values[0],
                                     data_for_prediction,
                                     feature_names=feature)
        shap.save_html(path + file_name, force_plot)
# Save the collected results as a CSV file (without the index column).
xgb_results_df = pd.DataFrame(results, columns=columns)
result_file = 'xgb_results_T.csv'
xgb_results_df.to_csv(result_file, index=False)

# Train on all of X / y and show the top-10 feature importances for three
# importance types (weight, gain, cover).
print('Trainig based on all')
model, _ = train(X, y)
ax = plot_importance(model, title='Weight', importance_type='weight', max_num_features=10)
plt.show()
ax = plot_importance(model, title='Gain', importance_type='gain', max_num_features=10)
plt.show()
ax = plot_importance(model, title='Cover', importance_type='cover', max_num_features=10)
plt.show()

# Fix tree: replace the booster's save_raw with a function returning the raw
# model bytes minus their first 4 bytes.
# NOTE(review): presumably a workaround so that shap's TreeExplainer can parse
# the raw model for this xgboost/shap version pair - confirm it is still needed.
booster = model.get_booster()
model_bytearray = booster.save_raw()[4:]
# Stand-in for booster.save_raw; `self` is accepted but unused, and the bytes
# captured above are returned on every call.
def fix(self=None):
    return model_bytearray
booster.save_raw = fix

# SHAP: explain the patched booster on X.
import shap
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X)
shap.initjs()
# One HTML force plot per index 0..3 of expected_value / shap_values
# (presumably one per output class - TODO confirm the model has four).
for i in range(4):
    shap.save_html('shap_' + str(i) + '.html', shap.force_plot(explainer.expected_value[i], shap_values[i], X))
shap.summary_plot(shap_values, X)
예제 #19
0
파일: model.py 프로젝트: Felihong/bugbug
    def classify(self,
                 items,
                 probabilities=False,
                 importances=False,
                 importance_cutoff=0.15):
        """Classify ``items`` and optionally explain the prediction with SHAP.

        ``items`` may be a single item or a list; its first element must be a
        dict or a tuple. The items go through ``self.extraction_pipeline`` and
        are classified with ``self.clf`` (``predict_proba`` when
        ``probabilities`` is truthy, ``predict`` otherwise); the result is
        passed through ``self.overwrite_classes``.

        Without ``importances`` only the classes are returned. With it, the
        return value is ``(classes, info)`` where ``info`` holds the important
        features (``"importances"``), a SHAP force plot rendered as HTML
        (``"html"``) and a legend mapping the plot's 1-based feature labels to
        human readable names (``"feature_legend"``).
        """
        assert items is not None
        initialized = not (self.extraction_pipeline is None
                           or self.clf is None)
        assert initialized, "The module needs to be initialized first"

        if not isinstance(items, list):
            items = [items]

        assert isinstance(items[0], (dict, tuple))

        X = self.extraction_pipeline.transform(items)
        predict = self.clf.predict_proba if probabilities else self.clf.predict
        classes = self.overwrite_classes(items, predict(X), probabilities)

        # Guard clause: nothing more to do unless an explanation is requested.
        if not importances:
            return classes

        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X)

        important_features = self.get_important_features(
            importance_cutoff, shap_values)
        important_features["values"] = X

        # Workaround: handle multi class case for force_plot to work correctly
        pred_class_index = 0
        if len(classes[0]) > 2:
            pred_class_index = classes.argmax(axis=-1)[0]
            explainer.expected_value = explainer.expected_value[
                pred_class_index]
            shap_values = shap_values[pred_class_index]

        pred_class = self.class_names[pred_class_index]
        ranked = important_features["classes"][pred_class][0]
        top_indexes = [
            int(feature_index) for _weight, feature_index, _sign in ranked
        ]

        feature_names = self.get_human_readable_feature_names()

        # The plot labels features "1".."n"; the legend maps them to names.
        feature_legend = {
            str(rank): feature_names[feature_index]
            for rank, feature_index in enumerate(top_indexes, start=1)
        }
        plot_labels = [
            str(rank) for rank in range(1,
                                        len(top_indexes) + 1)
        ]

        with io.StringIO() as buffer:
            plot = shap.force_plot(
                explainer.expected_value,
                shap_values[:, top_indexes],
                X.toarray()[:, top_indexes],
                feature_names=plot_labels,
                matplotlib=False,
                show=False,
            )

            # TODO: use full_html=False
            shap.save_html(buffer, plot)

            html = buffer.getvalue()

        explanation = {
            "importances": important_features,
            "html": html,
            "feature_legend": feature_legend,
        }
        return (classes, explanation)
# classifier = model_keras (needs fixing)


# We will use SHAP KernelExplainer to explain the model; the first 100 rows of
# X_train serve as its background data.
explainer = shap.KernelExplainer(model=classifier.predict_proba, data=X_train.iloc[0:100,:])

# Next, we compute the SHAP values for the first 50 rows of X_test.
shap_values= explainer.shap_values(X=X_test.iloc[0:50,:])

# Since this is binary classification, len = 2
print(len(shap_values))
# (50,6) - 50 objects, 6 features
print(shap_values[0].shape)

# Explaining a single prediction for passing (index 1, first row of X_test)
shap.initjs()
plot = shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X_test.iloc[0,:])
shap.save_html('plot_1_instances.html', plot)

# Explaining a single prediction for failing (index 0, first row of X_test)
plot = shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:])
shap.save_html('plot_2_instances.html', plot)

# Explaining predictions for passing for 50 instances of X_test
# NOTE(review): the SHAP values cover X_test.iloc[0:50,:] but the whole of
# X_test is passed as features - confirm the row counts match.
plot = shap.force_plot(explainer.expected_value[1], shap_values[1], X_test)
shap.save_html('plot_X_test_instances.html', plot)

# Shap summary plot
# NOTE(review): this prints the return value of summary_plot, which is
# presumably None - confirm whether the print is wanted.
print(shap.summary_plot(shap_values, X_test))