def training(train, test, validation_size, estimator, target_variable, drop_list, target_type, cv_folds, scoring_cv, cv=True, final=False, hypertuning=False):
    """Fit a model on ``train`` and, unless ``final`` is set, score it on ``test``.

    Side effects: prints scores / best parameters to stdout and, for regression
    with a tree-based estimator, writes ``fi_selected.xlsx`` plus feature
    importance or SHAP plot files into the current working directory.

    Args:
        train: DataFrame with the training rows (features and target).
        test: DataFrame with the hold-out rows, same columns as ``train``.
        validation_size: ``test_size`` handed to ``train_test_split`` when a
            validation set is carved out of the training data (LightGBM only).
        estimator: Model key. ``"log_sk"``, ``"gb"`` or ``"rf"`` for
            ``target_type == "bin"``; ``"lgbm"``, ``"lin_reg"``, ``"gb"`` or
            ``"rf"`` for ``target_type == "con"``.
        target_variable: Name of the target column.
        drop_list: Columns dropped from both frames to obtain the features.
            NOTE(review): presumably contains ``target_variable`` - confirm
            against callers.
        target_type: ``"bin"`` (classification) or ``"con"`` (regression).
        cv_folds: ``cv`` argument forwarded to scikit-learn.
        scoring_cv: ``scoring`` argument for ``cross_val_score`` and
            ``GridSearchCV``; also passed as ``eval_metric`` to LightGBM.
        cv: When truthy and ``hypertuning`` is off, print cross-validation
            scores of the untuned model.
        final: When truthy, the test rows are appended to the training rows
            and no hold-out evaluation is printed.
        hypertuning: When truthy, search hyper-parameters instead of using the
            fixed defaults (not available for ``"lin_reg"``).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``target_type`` is unknown or the
            ``estimator`` / ``hypertuning`` combination is not supported.
    """
    # Only what is actually used is imported: unused names such as
    # ``plot_roc_curve`` no longer exist in recent scikit-learn releases and
    # made the whole function fail at import time.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import ensemble
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.metrics import (accuracy_score, confusion_matrix,
                                 mean_squared_error, recall_score)
    from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                         train_test_split)

    def _grid_search(base_estimator, param_grid, label, features, target):
        # Exhaustive grid search; prints and returns the refitted best estimator.
        search = GridSearchCV(base_estimator, param_grid, cv=cv_folds, scoring=scoring_cv)
        search.fit(features, target)
        print(label, search.best_params_)
        return search.best_estimator_

    def _tune_lightgbm(features, target):
        # Optuna step-wise LightGBM tuning; prints and returns the best params.
        import optuna.integration.lightgbm as lgb

        dtrain = lgb.Dataset(features, label=target)
        params = {
            "objective": "regression",
            "metric": "rmse",
            "verbosity": -1,
            "boosting_type": "gbdt",
        }
        tuner = lgb.LightGBMTunerCV(
            params, dtrain, verbose_eval=100, early_stopping_rounds=100, folds=KFold(n_splits=5)
        )
        tuner.run()

        print("Best score:", tuner.best_score)
        best_params = tuner.best_params
        print("Best params:", best_params)
        print("  Params: ")
        for key, value in best_params.items():
            print("    {}: {}".format(key, value))
        return best_params

    if target_type not in ("bin", "con"):
        raise ValueError(f"Unknown target_type {target_type!r}; expected 'bin' or 'con'")

    train_y = train[target_variable]
    train_x = train.drop(columns=drop_list)

    test_y = test[target_variable]
    test_x = test.drop(columns=drop_list)

    if final:
        # DataFrame.append / Series.append were removed in pandas 2.0.
        train_x = pd.concat([train_x, test_x])
        train_y = pd.concat([train_y, test_y])

    model = None          # unfitted-style estimator reused for cross validation
    fitted_model = None
    valid_x = None        # validation features, only created for LightGBM

    if target_type == "bin":

        if estimator == "log_sk":
            model = LogisticRegression(max_iter=1000)
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "gb":
            if hypertuning:
                param_grid = {
                    'n_estimators': [100, 200, 400],
                    'max_depth': [3, 5, 7],
                    'learning_rate': [0.1, 0.05, 0.025, 0.01, 0.001, 0.005],
                    'random_state': [42],
                }
                fitted_model = _grid_search(
                    ensemble.GradientBoostingClassifier(), param_grid,
                    'Optimal parameters for gradient boosting classifier = ',
                    train_x, train_y)
            else:
                model = ensemble.GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100)
                fitted_model = model.fit(train_x, train_y)

        elif estimator == "rf":
            if hypertuning:
                param_grid = {
                    'bootstrap': [True],
                    'max_depth': [10, 20, 30],
                    'max_features': [2, 3, 5],
                    'min_samples_leaf': [3, 5, 10],
                    'min_samples_split': [8, 12],
                    'n_estimators': [100, 300, 500],
                    'n_jobs': [3],
                }
                fitted_model = _grid_search(
                    ensemble.RandomForestClassifier(), param_grid,
                    'Optimal parameters for random forest classifier = ',
                    train_x, train_y)
            else:
                model = ensemble.RandomForestClassifier(max_depth=80, max_features=5, min_samples_leaf=3, min_samples_split=12, n_estimators=100)
                fitted_model = model.fit(train_x, train_y)

    else:  # target_type == "con"

        if estimator == "lgbm":
            import lightgbm as lgbm

            # The validation split is used for early-evaluation during fitting
            # and later as the sample explained by SHAP.
            train_x, valid_x, train_y, valid_y = train_test_split(
                train_x, train_y, test_size=validation_size, shuffle=True, random_state=42)

            lgbm_params = {"random_state": 42, "n_estimators": 1000}
            if hypertuning:
                # Tuned values first, then the fixed settings of the untuned path.
                lgbm_params = {**_tune_lightgbm(train_x, train_y), **lgbm_params}

            model = lgbm.LGBMRegressor(**lgbm_params)
            fitted_model = model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], eval_metric=scoring_cv, verbose=-1)

        elif estimator == "lin_reg" and not hypertuning:
            # LinearRegression has no ``max_iter`` parameter.
            model = LinearRegression()
            fitted_model = model.fit(train_x, train_y)

        elif estimator == "gb":
            if hypertuning:
                param_grid = {
                    'n_estimators': [100, 500, 1000],
                    # None == all features (what "auto" meant for this
                    # regressor before scikit-learn dropped the alias).
                    'max_features': [None, "sqrt", "log2", 0.6, 0.8],
                    'min_samples_leaf': [30, 50, 70],
                    'min_samples_split': [10, 20, 500, 100],
                    'max_depth': [10, 15, 20, 25],
                    'learning_rate': [0.1, 0.01, 0.001],
                }
                fitted_model = _grid_search(
                    ensemble.GradientBoostingRegressor(), param_grid,
                    'Optimal parameters for gradient boosting regressor = ',
                    train_x, train_y)
            else:
                model = ensemble.GradientBoostingRegressor(learning_rate=0.001, max_depth=5, n_estimators=100)
                fitted_model = model.fit(train_x, train_y)

        elif estimator == "rf":
            if hypertuning:
                param_grid = {
                    'max_depth': [10, 20, 30],
                    'max_features': [2, 3, 5],
                    'min_samples_leaf': [3, 5, 10],
                    'min_samples_split': [8, 12],
                    'n_estimators': [100, 300, 500],
                    'n_jobs': [4],
                }
                fitted_model = _grid_search(
                    ensemble.RandomForestRegressor(), param_grid,
                    'Optimal parameters for random forest regressor = ',
                    train_x, train_y)
            else:
                model = ensemble.RandomForestRegressor(max_depth=30, max_features=5, min_samples_leaf=3, min_samples_split=8, n_estimators=500, n_jobs=-1)
                fitted_model = model.fit(train_x, train_y)

    if fitted_model is None:
        raise ValueError(
            f"Unsupported combination: estimator={estimator!r}, "
            f"target_type={target_type!r}, hypertuning={hypertuning!r}")

    if cv and not hypertuning:
        cv_scores = cross_val_score(estimator=model, X=train_x, y=train_y, cv=cv_folds, scoring=scoring_cv)
        if target_type == "bin":
            print(f'The average cross validation accuracy of the model is {round(cv_scores.mean(), 2)}')
        else:
            print(f'The average cross validation rmsle of the model is {-1*round(cv_scores.mean(), 2)}')
        print(cv_scores)

    # Feature importance reports are only produced for regression models.
    if target_type == "con" and estimator in ("gb", "rf", "lgbm"):
        feature_names = train_x.columns.tolist()
        importances = fitted_model.feature_importances_

        fi_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}).sort_values(by="Importance", ascending=False)
        fi_df[:15].to_excel(r'fi_selected.xlsx')
        feat_importances = pd.Series(importances, index=feature_names)

        bar_plots = {
            "gb": ((20, 10), "Feature Importance from Gradient Boosting"),
            "rf": ((20, 20), "Feature Importance from Random Forest"),
        }

        if estimator in bar_plots:
            figsize, title = bar_plots[estimator]
            plt.figure(figsize=figsize)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title(title)
            plt.savefig(f'{title}.png', bbox_inches="tight")
        else:
            import shap

            explainer = shap.TreeExplainer(fitted_model)
            shap_values = explainer.shap_values(valid_x)

            shap.initjs()

            force_plot = shap.force_plot(explainer.expected_value, shap_values[0, :], valid_x.iloc[0, :])
            shap.save_html("index_force_plot.htm", force_plot)
            force_plot_all = shap.force_plot(explainer.expected_value, shap_values, valid_x)
            shap.save_html("index_force_plot_all.htm", force_plot_all)
            plt.figure(figsize=(10, 20))
            shap.summary_plot(shap_values, valid_x, show=False)
            plt.savefig('summary_plot.png', bbox_inches="tight")

            # One dependence plot per top-10 feature.
            for feature in feat_importances.nlargest(10).index.tolist():
                plt.figure(figsize=(20, 20))
                shap.dependence_plot(feature, shap_values, valid_x, show=False)
                plt.savefig(f"dep_plot_{feature}.png", bbox_inches="tight")

    # Hold-out evaluation (skipped for the final fit, where test was merged
    # into train).
    if not final:
        if target_type == "con":
            yhat = fitted_model.predict(test_x).astype(float)
            y_pred = list(yhat)
            y_true = list(test_y)
            print(np.sqrt(mean_squared_error(y_true, y_pred)))
        else:
            yhat = fitted_model.predict(test_x)
            y_pred = list(map(round, yhat))
            cm = confusion_matrix(test_y, y_pred)
            print("Confusion Matrix : \n", cm)
            print('Test accuracy = ', accuracy_score(test_y, y_pred))
            print('Test recall = ', recall_score(test_y, y_pred))

    return fitted_model
Пример #2
0
def mlflowtisation(
                   train_x,train_y,test_x,test_y,
                   modele=[ElasticNet],
                   params={"random_state":44},
                   nombre_de_lignes="",
                   nombre_de_colonnes="",
                   dataframe_non_qualifié=None
                   ):
    """Train a model, evaluate it as a binary classifier and log the run to MLflow.

    The function temporarily changes the working directory to the parent
    directory; every file it writes (``<Model>.html``, ``<Model>.txt``,
    ``<Model>.csv``) lands there. The original directory is always restored,
    even when an error is raised.

    Args:
        train_x, train_y: Training features and target.
        test_x, test_y: Evaluation features and target.
        modele: One-element list whose first item is the estimator class to
            instantiate (it is called as ``modele[0](**params)``).
        params: Keyword arguments for the estimator; each one is also logged
            as an MLflow parameter. Neither default container is mutated.
        nombre_de_lignes: Free value logged as the "nombre de lignes" parameter.
        nombre_de_colonnes: Free value logged as the "nombre de colonnes" parameter.
        dataframe_non_qualifié: Optional unlabelled data; when given, the raw
            predictions for it are written to ``<Model>.csv``.

    Returns:
        None.
    """
    path = os.getcwd()
    os.chdir("./../")

    try:
        import logging
        logging.basicConfig(level=logging.WARN)

        def categ(x):
            # Threshold the continuous prediction into the two classes.
            return 0 if x <= 0.1 else 1

        with mlflow.start_run():
            mod = modele[0](**params)
            mod.fit(train_x, train_y)
            model_name = type(mod).__name__

            # Best-effort explanation artefacts: failures are reported, not fatal.
            try:
                with open(model_name+'.html', 'w') as f:
                    f.write(str(eli5.show_weights(mod).data))
            except Exception as e:
                print(e)
            try:
                shap.initjs()
                explainer = shap.TreeExplainer(mod)
                observations = mod.transform(train_x.sample(1000))
                shap_values = explainer.shap_values(observations)
                i = 0
                shap.force_plot(explainer.expected_value, shap_values[i], features=observations[i])
            except Exception as e:
                print(e)

            predicted_qualities = np.array([categ(p) for p in mod.predict(test_x)])
            acc = accuracy_score(test_y, predicted_qualities)
            rapport_details = classification_report(test_y, predicted_qualities)
            with open(model_name+'.txt', 'w') as f:
                f.write(rapport_details)
            print("La précision du modèle {} est : {}%".format(str(mod),round(acc*100,2)))

            # Scoring of unlabelled data is optional and best-effort, but a
            # failure is now reported instead of being silently swallowed.
            if dataframe_non_qualifié is not None:
                try:
                    print("Qualification des données...")
                    res = mod.predict(dataframe_non_qualifié)
                    pd.DataFrame.from_dict({"pred":list(res)}).to_csv(model_name+'.csv')
                except Exception as e:
                    print(e)

            mlflow.log_param("Modèle utilisé", model_name)
            for param, value in params.items():
                mlflow.log_param(param, value)
            mlflow.log_param("nombre de lignes", nombre_de_lignes)
            mlflow.log_param("nombre de colonnes", nombre_de_colonnes)
            mlflow.log_param("rapport_details", rapport_details)
            mlflow.log_metric("acc", acc)

            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            # Model registry does not work with file store
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(mod,"model", registered_model_name=str(mod))
            else:
                # Logged once; the previous code logged the same artefact twice.
                mlflow.sklearn.log_model(mod,"model")
    finally:
        os.chdir(str(path))
Пример #3
0
def deepshap_top_feat(data_all, sub_index, to_csv, FLAGS):
    """
    Get the top important features of a subset of patients.

    For every value in ``FLAGS.num_groups`` a trained Keras model is loaded,
    DeepSHAP values are computed for the selected patients, and the
    ``round(FLAGS.dimension * FLAGS.percent)`` features with the largest
    absolute SHAP value are kept per patient. The final feature list of a
    patient is the intersection of its per-model lists.

    ....
    data_all: all data (must contain a 'label' column; the first
        ``FLAGS.dimension`` columns are used as model input)
    sub_index: indices of the patients whose top features are wanted
    to_csv: when truthy, also write the result to 'shaps_top_features.csv'
    FLAGS: configuration object; ``num_groups``, ``dimension`` and ``percent``
        are read here, and it is forwarded to ``import_keras_models``
    ....

    Output: DataFrame with columns 'patient', 'shaps' and 'label'.
    """

    # subset of similar patients
    sub = data_all[data_all.index.isin(sub_index)]
    n_patients = sub.shape[0]

    # Number of features kept per patient. Computed once; slicing a reversed
    # argsort with ``[:top_k]`` correctly yields nothing when top_k is 0
    # (``[-0:]`` would have returned every feature).
    top_k = round(FLAGS.dimension * FLAGS.percent)

    models = []

    # DeepShap to extract features
    for num_groups in FLAGS.num_groups:

        # import trained model
        model = import_keras_models(FLAGS, num_groups, 'train')

        # initialize js methods for visualization
        shap.initjs()

        clustering_part = Model(
            inputs=model.inputs,
            outputs=model.outputs[2],  # specifying a single output for shap usage
        )

        # create an instance of the DeepSHAP which is called DeepExplainer
        explainer_shap = shap.DeepExplainer(
            model=clustering_part, data=data_all.iloc[:, 0:FLAGS.dimension])

        # Explain only the selected subset (explaining everything is slower)
        shap_values = explainer_shap.shap_values(
            X=sub.iloc[:, 0:FLAGS.dimension].values, ranked_outputs=True)

        # top features of each patient, most important first
        features = []
        for i in range(n_patients):
            abso = np.absolute(shap_values[0][0][i])
            ind = abso.argsort()[::-1][:top_k]
            features.append(sub.columns.values[ind])

        models.append(features)

        gc.collect()

    # get intersection of top features of models
    inter_features = []
    for i in range(n_patients):
        per_model = [set(item[i]) for item in models]
        # No model at all means no feature; reduce() would raise on an empty list.
        common = functools.reduce(set.intersection, per_model) if per_model else set()
        inter_features.append(list(common))

    # One row per selected patient (no dict round-trip, so duplicate index
    # values keep their own row and stay aligned with the labels below).
    shap_df = pd.DataFrame(list(zip(sub.index.values, inter_features)),
                           columns=['patient', 'shaps'])

    shap_df['label'] = sub['label'].values

    if to_csv:
        shap_df.to_csv('shaps_top_features.csv', index=True)

    return shap_df
Пример #4
0
def main():
    """Train an LSTM "teacher" per outcome and an XGBoost "student" on its output.

    Reads ``Configuration.json`` from the working directory, loads and
    preprocesses the clustered time series, then for every outcome listed in
    ``configs['data']['classification_outcome']``:

    * trains the teacher with grouped, stratified 5-fold cross-validation;
    * combines the teacher's predictions for the LAST fold's validation groups
      with the static features and writes them to
      ``StuentTrainig<outcome>.csv``;
    * trains the student model on that table.
    """
    shap.initjs()
    ##1. read configuration file
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)
    os.makedirs(configs['model']['save_dir'], exist_ok=True)

    ##2. read data
    clustered_timeseries_path = configs['paths']['clustered_timeseries_path']
    time_series = pd.read_csv(clustered_timeseries_path +
                              "TimeSeriesAggregatedClusteredDeltaTwoDays.csv")

    ##3. impute
    dynamic_features = configs['data']['dynamic_columns']
    grouping = configs['data']['grouping']
    time_series[dynamic_features] = impute(time_series, dynamic_features)

    ##4. generate new features based on delta from baseline
    outcome_columns = configs['data']['classification_outcome']
    baseline_features = configs['data']['baseline_columns']
    static_features = configs['data']['static_columns']

    new_series = generate_trajectory_timeseries(time_series, baseline_features,
                                                static_features,
                                                dynamic_features, grouping,
                                                outcome_columns)

    ##5. scale
    normalized_timeseries = scale(new_series, dynamic_features)

    groups = np.array(time_series[grouping])
    X = normalized_timeseries[dynamic_features]
    # Explicit copy: a column is added below and must not target a slice view.
    X_student = time_series[static_features].copy()
    X_student[grouping] = time_series[grouping]

    print(" AFTER AGGREGATION, DIM OF X_STUDENT: ", X_student.shape)
    ##6. Training/Prediction for all outcomes.
    for outcome in configs['data']['classification_outcome']:

        # NOTE(review): the config key is 'sequence_length' but the value is
        # used as the size of the last (feature) axis, while 'batch_size' is
        # used as the middle (time-step) axis - confirm this is intended.
        number_of_features = configs['data']['sequence_length']
        batch_size = configs['training']['batch_size']

        y = time_series[outcome].astype(int)
        teacher_model = LSTMModel(configs['model']['name'] + outcome)
        teacher_model.build_model(configs)

        student_model = XGBoostModel(configs['model']['name'] + outcome)

        for ffold_ind, (training_ind, testing_ind) in enumerate(
                stratified_group_k_fold(X, y, groups,
                                        k=5)):  # CROSS-VALIDATION
            training_groups = groups[training_ind]
            testing_groups = groups[testing_ind]

            this_X_train, this_X_val = X.iloc[training_ind], X.iloc[testing_ind]
            this_y_train, this_y_val = y.iloc[training_ind], y.iloc[testing_ind]

            print("testing groups!!!!!", len(testing_groups),
                  len(set(testing_groups)))

            # A group must never be split between training and validation.
            assert len(set(training_groups) & set(testing_groups)) == 0

            #(NumberOfExamples, TimeSteps, FeaturesPerStep).
            reshaped_x = this_X_train.values.reshape(-1, batch_size,
                                                     number_of_features)
            reshaped_y = this_y_train.values.reshape(-1, batch_size, 1)
            reshaped_x_val = this_X_val.values.reshape(-1, batch_size,
                                                       number_of_features)
            reshaped_y_val = this_y_val.values.reshape(-1, batch_size, 1)
            print(" THE RESHAPED: ")
            print(" TRAINING X SHAPE: ", reshaped_x.shape)
            print(" TRAINING Y SHAPE: ", reshaped_y.shape)
            print(" VAL X SHAPE: ", reshaped_x_val.shape)
            print(" VAL Y SHAPE: ", reshaped_y_val.shape)
            teacher_model.train(reshaped_x,
                                reshaped_y,
                                reshaped_x_val,
                                reshaped_y_val,
                                epochs=configs['training']['epochs'],
                                batch_size=batch_size,
                                save_dir=configs['model']['save_dir'])

            this_y_val = pd.DataFrame(this_y_val)
            this_y_val[grouping] = testing_groups
            print(" before reshaping:  ")
            print(" TRAINING X SHAPE: ", this_X_train.shape)
            print(" TRAINING Y SHAPE: ", this_y_train.shape)

            # Same validation tensor that was passed to train() above.
            y_pred_val_teacher = teacher_model.predict(reshaped_x_val)
            print(" DIMENSIONS OF WHAT THE TEACHER PREDICTED: ",
                  y_pred_val_teacher.shape)

        # NOTE: everything below uses `y_pred_val_teacher` and `this_y_val`
        # from the LAST fold only (and fails if the splitter yielded nothing).
        # TODO (from the original author): make sure the ys correspond to the
        # xs; don't just use y in this call.

        print(" DIM OF Y PRED BY TEACHER:", y_pred_val_teacher.shape)
        print(" DIM OF THIS Y VAL: ", this_y_val.shape)

        print(" COLUMNS OF THIS Y VAL WHICH IS XGBOOST TRAINING: ")
        print(this_y_val.columns)
        xgboost_y_training = this_y_val

        print(" PRINTING HEAD")
        print(xgboost_y_training.head())
        # One row per group: its first outcome value.
        xgboost_y_training = xgboost_y_training.groupby(grouping).first()
        xgboost_y_training = xgboost_y_training.reset_index()
        # One row of teacher outputs per group, `batch_size` columns wide.
        lstm_output = pd.DataFrame(
            y_pred_val_teacher.reshape(len(xgboost_y_training), batch_size))
        lstm_output = lstm_output.reset_index()
        print(" SHAPES: df SO FAR: ", xgboost_y_training.shape,
              " LSTM OUTPUT: ", lstm_output.shape, type(lstm_output))
        # Positional join: both frames carry a fresh 0..n-1 index.
        xgboost_y_training = pd.merge(xgboost_y_training,
                                      lstm_output,
                                      left_index=True,
                                      right_index=True)

        # Attach the static features of each group.
        static_df = time_series[static_features].copy()
        static_df[grouping] = time_series[grouping]
        static_df = static_df.drop_duplicates(grouping)
        xgboost_y_training = xgboost_y_training.merge(static_df,
                                                      how='left',
                                                      on=grouping)
        xgboost_y_training.to_csv("StuentTrainig" + outcome + ".csv")
        # Columns 0-2 are the group id, the outcome and the helper 'index'
        # column from reset_index(); the rest are the student's inputs.
        student_model.train(xgboost_y_training.iloc[:, 3:],
                            xgboost_y_training[outcome], outcome, configs)
Пример #5
0
def shap_js(model, value):
    """Build a SHAP force plot for ``value`` using a tree explainer on ``model``.

    Loads the SHAP JavaScript helpers, explains ``value`` with
    ``shap.TreeExplainer`` and returns the force plot built from the first
    entry of both the expected values and the SHAP values.
    """
    shap.initjs()
    tree_explainer = shap.TreeExplainer(model)
    contributions = tree_explainer.shap_values(value)
    base_value = tree_explainer.expected_value[0]
    first_contribution = contributions[0]
    return shap.force_plot(base_value, first_contribution, value)
Пример #6
0
 def test_init(self):
     """Smoke test: ``shap.initjs()`` must be callable without raising."""
     shap.initjs()
Пример #7
0
# FLAGS.model values that share the same two-layer setup, mapped to the
# aggregator_type handed to SupervisedGraphsage.
_TWO_LAYER_AGGREGATORS = {
    'graphsage_seq': "seq",
    'graphsage_maxpool': "maxpool",
    'graphsage_meanpool': "meanpool",
    'gat': "gat",
}


def _build_supervised_model(num_classes, placeholders, features, adj_info,
                            degrees):
    """Build the SupervisedGraphsage variant selected by FLAGS.model.

    Raises:
        Exception: if FLAGS.model is not one of the recognised names.
    """
    if FLAGS.model == 'graphsage_mean':
        sampler = UniformNeighborSampler(adj_info)
        # The number of layers follows the deepest non-zero sample count.
        if FLAGS.samples_3 != 0:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2),
                SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)
            ]
        elif FLAGS.samples_2 != 0:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)
            ]
        else:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)
            ]
        # No aggregator_type is passed here, so the class default applies.
        return SupervisedGraphsage(
            num_classes,
            placeholders,
            features,
            adj_info,
            degrees,  # degree of each node
            layer_infos,
            model_size=FLAGS.model_size,
            sigmoid_loss=FLAGS.sigmoid,
            identity_dim=FLAGS.identity_dim,
            logging=True,
            concat=True,
        )

    if FLAGS.model == 'gcn':
        sampler = UniformNeighborSampler(adj_info)
        layer_infos = [
            SAGEInfo("node", sampler, FLAGS.samples_1, 2 * FLAGS.dim_1),
            SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2)
        ]
        return SupervisedGraphsage(num_classes,
                                   placeholders,
                                   features,
                                   adj_info,
                                   degrees,
                                   layer_infos=layer_infos,
                                   aggregator_type="gcn",
                                   model_size=FLAGS.model_size,
                                   concat=False,
                                   sigmoid_loss=FLAGS.sigmoid,
                                   identity_dim=FLAGS.identity_dim,
                                   logging=True)

    aggregator_type = _TWO_LAYER_AGGREGATORS.get(FLAGS.model)
    if aggregator_type is None:
        raise Exception('Error: model name unrecognized.')

    sampler = UniformNeighborSampler(adj_info)
    # Two layers, each described by: neighbour sampler, number of sampled
    # neighbours, output dimension.
    layer_infos = [
        SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
        SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)
    ]
    return SupervisedGraphsage(num_classes,
                               placeholders,
                               features,
                               adj_info,
                               degrees,
                               layer_infos=layer_infos,
                               aggregator_type=aggregator_type,
                               model_size=FLAGS.model_size,
                               sigmoid_loss=FLAGS.sigmoid,
                               identity_dim=FLAGS.identity_dim,
                               logging=True,
                               concat=True)


def _append_eval_features(sess, model, minibatch, placeholders, features,
                          features2, test):
    """Append [input features | node predictions] rows for val or test nodes.

    Walks ``minibatch.incremental_node_val_feed_dict`` until it reports that it
    is finished and returns ``features2`` with one concatenated block per batch.
    """
    batch_index = 0
    finished = False
    while not finished:
        feed_dict_eval, _, finished, _ = minibatch.incremental_node_val_feed_dict(
            FLAGS.batch_size, batch_index, test=test)
        node_outs = sess.run([model.preds, model.loss, model.node_preds],
                             feed_dict=feed_dict_eval)
        tail = tf.cast(node_outs[2], tf.float32)
        hidden = tf.nn.embedding_lookup(
            features, feed_dict_eval[placeholders["batch"]])
        features2 = tf.concat(
            [features2, tf.concat([hidden, tail], axis=1)], axis=0)
        batch_index += 1
    return features2


def train(train_data, test_data=None):
    """Train one supervised GraphSAGE model per hierarchy level and report.

    For each of ``FLAGS.hierarchy`` levels a model is built and trained. On
    every level except the last, one extra pass concatenates each node's input
    features with the model's ``node_preds`` output; that tensor becomes the
    input features of the next level.

    After the last level the function evaluates the final model, writes
    ``val_stats.txt`` and ``test_stats.txt`` under ``log_dir()``, saves
    ``loss.png`` and ``f1.png``, and prints how many entries of the per-KO F1
    vector fall below 0.2 / 0.5 / 0.7. ``test_nodes.txt`` is rewritten on
    every level.

    Args:
        train_data: indexable; the entries read are
            [0] graph ``G``, [1] node feature matrix, [2] id map,
            [3] context pairs (only when ``FLAGS.random_context`` is set),
            [4], [5], [6] class maps for hierarchy levels 0, 1 and 2 (level 3
            reuses [6]).
        test_data: accepted but not used.

    Returns:
        None.

    Raises:
        Exception: if ``FLAGS.model`` is not a recognised model name.
    """
    # G is a NetworkX object; per the original author's note these inputs have
    # all been processed by load_data().
    G = train_data[0]
    features = train_data[1]
    id_map = train_data[2]
    class_map1 = train_data[4]
    class_map2 = train_data[5]
    class_map3 = train_data[6]
    dict_classmap = {
        0: class_map1,
        1: class_map2,
        2: class_map3,
        3: class_map3
    }
    hierarchy = FLAGS.hierarchy
    features_shape1 = None
    # These tensors are not read again below; the calls are kept because
    # construct_class_numpy is defined elsewhere and may have side effects.
    a_class = construct_class_numpy(class_map1)
    b_class = construct_class_numpy(class_map2)
    c_class = construct_class_numpy(class_map3)
    a_class = tf.cast(a_class, tf.float32)
    b_class = tf.cast(b_class, tf.float32)
    c_class = tf.cast(c_class, tf.float32)

    for hi_num in range(hierarchy):
        if hi_num == 0:
            class_map = class_map1
            features_shape1 = features.shape[1]
            # pad with dummy zero vector
            features = np.vstack(
                [features, np.zeros((features.shape[1], ))])
            features = tf.cast(features, tf.float32)
        else:
            print("hierarchy %d finished" % (hi_num), end='\n\n')
            class_map = dict_classmap[hi_num]
            # features2 was assembled at the end of the previous level, and
            # num_classes still holds the previous level's class count here.
            features = features2
            features = tf.cast(features, tf.float32)
            features = tf.concat(
                [features,
                 tf.zeros([1, features_shape1 + num_classes])],
                axis=0)
            features_shape1 = features.shape[1]

        # A list-valued class map is treated as one indicator vector per node;
        # otherwise the distinct label values are counted.
        first_label = list(class_map.values())[0]
        if isinstance(first_label, list):
            num_classes = len(first_label)
        else:
            num_classes = len(set(class_map.values()))

        context_pairs = train_data[3] if FLAGS.random_context else None
        placeholders = construct_placeholders(num_classes)
        minibatch = NodeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          class_map,
                                          num_classes,
                                          batch_size=FLAGS.batch_size,
                                          max_degree=FLAGS.max_degree,
                                          context_pairs=context_pairs)
        with open('test_nodes.txt', 'w') as f:
            json.dump(minibatch.test_nodes, f)

        if hi_num == 0:
            adj_info_ph = tf.placeholder(tf.int32,
                                         shape=minibatch.adj.shape,
                                         name='adj_info_ph')

        # adj_info is a Variable (presumably) because its value is switched
        # between the train and test adjacency via tf.assign() below.
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        shap.initjs()
        model = _build_supervised_model(num_classes, placeholders, features,
                                        adj_info, minibatch.deg)

        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.Session(config=config)

        # Init variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Train model
        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []
        # Per the original note, minibatch.adj and minibatch.test_adj have the
        # same size; adj has its non-train entries replaced by a common value.
        # tf.assign() only builds the op; it takes effect when run via sess.run.
        train_adj_info = tf.assign(adj_info, minibatch.adj)
        val_adj_info = tf.assign(adj_info, minibatch.test_adj)

        train_loss = []
        val_loss = []
        train_f1_mics = []
        val_f1_mics = []
        loss_plt = []
        loss_plt2 = []
        trainf1mi = []
        trainf1ma = []
        valf1mi = []
        valf1ma = []

        if hi_num == 0:
            epochs = FLAGS.epochs
        elif hi_num == 1:
            epochs = FLAGS.epochs2
        elif hi_num == 2:
            epochs = FLAGS.epochs3
        else:
            epochs = FLAGS.epochs4

        # The extra (epochs + 1)-th pass is the feature-concatenation step.
        for epoch in range(epochs + 1):
            if epoch < epochs:
                minibatch.shuffle()
                batch_idx = 0
                print('Epoch: %04d' % (epoch + 1))
                epoch_val_costs.append(0)
                while not minibatch.end():
                    # Construct feed dictionary; each call selects the nodes
                    # of the next minibatch.
                    feed_dict, labels = minibatch.next_minibatch_feed_dict()
                    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                    t = time.time()
                    # Training step
                    outs = sess.run([model.opt_op, model.loss, model.preds],
                                    feed_dict=feed_dict)
                    train_cost = outs[1]
                    loss_plt.append(float(train_cost))
                    if batch_idx % FLAGS.print_every == 0:
                        # Validation: switch to the test adjacency, evaluate,
                        # then switch back to the training adjacency.
                        sess.run(val_adj_info.op)
                        if FLAGS.validate_batch_size == -1:
                            val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, _ = incremental_evaluate(
                                sess, model, minibatch, FLAGS.batch_size)
                        else:
                            val_cost, val_f1_mic, val_f1_mac, duration = evaluate(
                                sess, model, minibatch,
                                FLAGS.validate_batch_size)
                        sess.run(train_adj_info.op)
                        epoch_val_costs[-1] += val_cost

                    avg_time = (avg_time * total_steps + time.time() -
                                t) / (total_steps + 1)
                    # Between validation runs the most recent validation
                    # figures are repeated so the curves share an x-axis with
                    # the per-iteration training loss.
                    loss_plt2.append(float(val_cost))
                    valf1mi.append(float(val_f1_mic))
                    valf1ma.append(float(val_f1_mac))

                    if batch_idx % FLAGS.print_every == 0:
                        train_f1_mic, train_f1_mac, train_f1_none = calc_f1(
                            labels, outs[-1])
                        trainf1mi.append(float(train_f1_mic))
                        trainf1ma.append(float(train_f1_mac))

                        print(
                            "Iter:",
                            '%04d' % batch_idx,
                            # training-set figures
                            "train_loss=",
                            "{:.5f}".format(train_cost),
                            "train_f1_mic=",
                            "{:.5f}".format(train_f1_mic),
                            "train_f1_mac=",
                            "{:.5f}".format(train_f1_mac),
                            # validation figures
                            "val_loss=",
                            "{:.5f}".format(val_cost),
                            "val_f1_mic=",
                            "{:.5f}".format(val_f1_mic),
                            "val_f1_mac=",
                            "{:.5f}".format(val_f1_mac),
                            "time=",
                            "{:.5f}".format(avg_time))
                        train_loss.append(train_cost)
                        val_loss.append(val_cost)
                        train_f1_mics.append(train_f1_mic)
                        val_f1_mics.append(val_f1_mic)
                    batch_idx += 1
                    total_steps += 1
                    if total_steps > FLAGS.max_total_steps:
                        break
                # NOTE(review): breaking here also skips the concatenation
                # pass, so a following hierarchy level would find features2
                # unset -- confirm max_total_steps is never hit mid-hierarchy.
                if total_steps > FLAGS.max_total_steps:
                    break

            elif hi_num == FLAGS.hierarchy - 1:
                print("the last outputs")
            else:
                # Concatenate features for the next hierarchy level.
                # NOTE(review): this pass still runs model.opt_op with dropout
                # enabled, i.e. it also performs training updates -- confirm
                # that is intended.
                batch_idx = 0
                minibatch.shuffle()
                while not minibatch.end():
                    print("Iter:", '%04d' % batch_idx, "concat")
                    feed_dict, labels = minibatch.next_minibatch_feed_dict()
                    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                    outs = sess.run([
                        model.opt_op, model.loss, model.preds, model.node_preds
                    ],
                                    feed_dict=feed_dict)
                    features_tail = tf.cast(outs[3], tf.float32)
                    hidden = tf.nn.embedding_lookup(
                        features, feed_dict[placeholders["batch"]])
                    features_inter = tf.concat([hidden, features_tail], axis=1)

                    if batch_idx == 0:
                        features2 = features_inter
                    else:
                        features2 = tf.concat([features2, features_inter],
                                              axis=0)
                    batch_idx += 1

                # val features & test features
                features2 = _append_eval_features(sess, model, minibatch,
                                                  placeholders, features,
                                                  features2, test=False)
                print("val features finished")
                features2 = _append_eval_features(sess, model, minibatch,
                                                  placeholders, features,
                                                  features2, test=True)
                print("test features finished")

                print("finish features concat")

    print("Optimization Finished!")
    sess.run(val_adj_info.op)
    # NOTE(review): labelled "validation" but evaluated with test=True, the
    # same arguments as the test-set evaluation further down -- confirm.
    val_cost, val_f1_mic, val_f1_mac, duration, otu_f1, ko_none = incremental_evaluate(
        sess, model, minibatch, FLAGS.batch_size, test=True)
    print("Full validation stats:", "loss=", "{:.5f}".format(val_cost),
          "f1_micro=", "{:.5f}".format(val_f1_mic), "f1_macro=",
          "{:.5f}".format(val_f1_mac), "time=", "{:.5f}".format(duration))
    pred = y_ture_pre(sess, model, minibatch, FLAGS.batch_size)
    # Normalise every prediction row in place so that it sums to 1.
    for i in range(pred.shape[0]):
        row_total = 0
        for l in range(pred.shape[1]):
            row_total = row_total + pred[i, l]
        for m in range(pred.shape[1]):
            pred[i, m] = pred[i, m] / row_total
    with open(FLAGS.train_prefix + "-id_map.json") as id_map_file:
        node_ids = json.load(id_map_file)
    # A throwaway session is enough to materialise the feature tensor; the
    # context manager closes it again.
    with tf.Session() as session:
        array = session.run(features)
    # NOTE(review): x_test is sized by pred rows but filled from val_nodes;
    # neither x_test nor x_train is used afterwards.
    x_test = np.empty([pred.shape[0], array.shape[1]])
    x_train = np.empty([len(minibatch.train_nodes), array.shape[1]])
    for row, node in enumerate(minibatch.val_nodes):
        x_test[row] = array[node_ids[node]]
    for row, node in enumerate(minibatch.train_nodes):
        x_train[row] = array[node_ids[node]]

    with open(log_dir() + "val_stats.txt", "w") as fp:
        fp.write(
            "loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".format(
                val_cost, val_f1_mic, val_f1_mac, duration))

    print("Writing test set stats to file (don't peak!)")
    val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, ko_none = incremental_evaluate(
        sess, model, minibatch, FLAGS.batch_size, test=True)
    with open(log_dir() + "test_stats.txt", "w") as fp:
        fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f}".format(
            val_cost, val_f1_mic, val_f1_mac))

    incremental_evaluate_for_each(sess,
                                  model,
                                  minibatch,
                                  FLAGS.batch_size,
                                  test=True)

    # plot loss
    plt.figure()
    plt.plot(loss_plt, label='train_loss')
    plt.plot(loss_plt2, label='val_loss')
    plt.legend(loc=0)
    plt.xlabel('Iteration')
    plt.ylabel('loss')
    plt.title('Loss plot')
    plt.grid(True)
    plt.axis('tight')

    # plot f1 score
    plt.figure()
    plt.subplot(211)
    plt.plot(trainf1mi, label='train_f1_micro')
    plt.plot(valf1mi, label='val_f1_micro')
    plt.legend(loc=0)
    plt.xlabel('Iterations')
    plt.ylabel('f1_micro')
    plt.title('train_val_f1_score')
    plt.grid(True)
    plt.axis('tight')

    plt.subplot(212)
    plt.plot(trainf1ma, label='train_f1_macro')
    plt.plot(valf1ma, label='val_f1_macro')
    plt.legend(loc=0)
    plt.xlabel('Iteration')
    plt.ylabel('f1_macro')
    plt.grid(True)
    plt.axis('tight')

    # Only these two figures are written to disk.
    plt.figure()
    plt.plot(np.arange(len(train_loss)) + 1, train_loss, label='train')
    plt.plot(np.arange(len(val_loss)) + 1, val_loss, label='val')
    plt.legend()
    plt.savefig('loss.png')
    plt.figure()
    plt.plot(np.arange(len(train_f1_mics)) + 1, train_f1_mics, label='train')
    plt.plot(np.arange(len(val_f1_mics)) + 1, val_f1_mics, label='val')
    plt.legend()
    plt.savefig('f1.png')

    # OTU f1
    plt.figure()
    plt.plot(otu_f1, label='otu_f1')
    plt.legend(loc=0)
    plt.xlabel('OTU')
    plt.ylabel('f1_score')
    plt.title('OTU f1 plot')
    plt.grid(True)
    plt.axis('tight')

    # Ko f1 score
    plt.figure()
    plt.plot(ko_none, label='Ko f1 score')
    plt.legend(loc=0)
    plt.xlabel('Ko')
    plt.ylabel('f1_score')
    plt.grid(True)
    plt.axis('tight')

    # Bucket the per-KO F1 scores. The index list is converted to an array
    # only after the loop: converting inside it would make the next .append
    # fail, because ndarrays have no append method.
    bad_ko = []
    b02 = 0
    b05 = 0
    b07 = 0
    for i, ko_score in enumerate(ko_none):
        if ko_score < 0.2:
            bad_ko.append(i)
            b02 += 1
        elif ko_score < 0.5:
            b05 += 1
        elif ko_score < 0.7:
            b07 += 1
    bad_ko = np.array(bad_ko)
    print("ko f1 below 0.2:", b02)
    print("ko f1 below 0.5:", b05)
    print("ko f1 below 0.7:", b07)
Пример #8
0
    def explainable_results(specific_prediction_sample_to_explain: int, X, Y,
                            input_label_index_value,
                            num_labels: int):  # , anamoly_data
        """Fit a small Keras regressor on (X, Y) and explain it with SHAP.

        A two-layer dense network is trained on the rows of ``X`` from
        position 10 onward against ``Y``. A ``shap.KernelExplainer`` built on
        the first 50 rows of ``X`` is then used to produce a force plot for
        the chosen sample, plus a summary plot and a force plot over the
        first 50 rows, all for the label selected by
        ``input_label_index_value``.

        Args:
            specific_prediction_sample_to_explain: positional row index into
                ``X`` of the sample to explain.
            X: feature table; accessed through ``.iloc``, ``.head`` and
                ``.shape``.
            Y: target matrix; ``Y.shape[1]`` is the number of outputs.
            input_label_index_value: index of the label to explain; used as
                the initial value of the label dropdown.
            num_labels: number of ``window_diff_<i>`` column names to create.

        Returns:
            None. The plot objects returned by SHAP are not returned or
            stored. As a side effect ``y_labels.csv`` is written.

        NOTE(review): X is sliced from row 10 while Y is passed whole, so Y
        presumably has 10 fewer rows than X -- confirm against the caller.
        """
        # Quick Clean Hack Suggested by - Cory Randolph @coryroyce
        import shap
        import numpy as np
        import pandas as pd
        from keras.models import Sequential
        from keras.layers import Dense
        import ipywidgets as widgets

        def _sample_row():
            # The single row of X that is being explained.
            return X.iloc[specific_prediction_sample_to_explain, :]

        def _first_fifty():
            # Only the first 50 rows are explained, to keep SHAP's run time
            # manageable.
            return X.iloc[0:50, :]

        # Network width is taken from the data itself.
        n_inputs, n_outputs = X.shape[1], Y.shape[1]

        model_nn = Sequential()
        hidden_layer = Dense(32,
                             input_dim=n_inputs,
                             kernel_initializer='he_uniform',
                             activation='relu')
        model_nn.add(hidden_layer)
        output_layer = Dense(n_outputs, kernel_initializer='he_uniform')
        model_nn.add(output_layer)
        model_nn.compile(loss='mae', optimizer='adam')

        train_inputs = X.iloc[10:, :].values
        model_nn.fit(train_inputs, Y, epochs=30)
        model_nn.evaluate(x=train_inputs, y=Y)

        # A single row comes back 1-D; wrap it so predict() sees a batch.
        predict_input = _sample_row()
        if predict_input.ndim == 1:
            predict_input = np.array([predict_input])
        print(model_nn.predict(predict_input))

        # Model-agnostic explainer, with the first 50 rows as background data.
        explainer = shap.KernelExplainer(model=model_nn.predict,
                                         data=X.head(50),
                                         link="identity")

        # Shapley values for the single chosen example, then its raw values.
        shap_value_single = explainer.shap_values(X=_sample_row(),
                                                  nsamples=100)
        print(_sample_row())

        # Label names for the dropdown; the dropdown could equally be replaced
        # by setting the label index by hand.
        label_cols = ['window_diff_' + str(i) for i in range(num_labels)]
        df_labels = pd.DataFrame(data=Y, columns=label_cols)
        df_labels.to_csv('y_labels.csv')
        list_of_labels = df_labels.columns.to_list()

        # (name, index) pairs so the widget's value is the label's index.
        tuple_of_labels = [(label_name, position)
                           for position, label_name in enumerate(list_of_labels)]

        label_picker = widgets.Dropdown(options=tuple_of_labels,
                                        value=input_label_index_value,
                                        description='Select Label:')
        print(label_picker)

        # Force plot for one example and one label.
        print(f'Current label Shown: {list_of_labels[label_picker.value]}')
        shap.initjs()
        shap.force_plot(
            base_value=explainer.expected_value[label_picker.value],
            shap_values=shap_value_single[label_picker.value],
            features=_sample_row(),
        )

        # Summary plot for the selected label over the first 50 rows.
        shap_values = explainer.shap_values(X=_first_fifty(), nsamples=100)
        print(f'Current Label Shown: {list_of_labels[label_picker.value]}\n')
        shap.initjs()
        shap.summary_plot(
            shap_values=shap_values[label_picker.value],
            features=_first_fifty(),
        )

        # Force plot over the first 50 individual examples.
        print(f'Current Label Shown: {list_of_labels[label_picker.value]}\n')
        shap.initjs()
        shap.force_plot(
            base_value=explainer.expected_value[label_picker.value],
            shap_values=shap_values[label_picker.value],
            features=_first_fifty(),
        )
Пример #9
0
def lightgbm(train_X, test_X, train_Y, args):
    """Fit a LightGBM regressor, predict ``test_X`` and plot SHAP values.

    10% of the training rows are held out (``random_state=4``) as the
    validation set. When ``args.optuna`` is truthy, Optuna's LightGBM
    integration is run first and ``best_params`` is printed and written to
    ``optuna.txt``; otherwise a fixed parameter set is used. The final model
    is always trained with ``best_params``.

    Args:
        train_X: training features.
        test_X: features to predict.
        train_Y: training targets.
        args: object with a boolean-like ``optuna`` attribute.

    Returns:
        Tuple ``(y_pred, model)``: predictions for ``test_X`` at the model's
        best iteration, and the trained booster.
    """
    fit_X, valid_X, fit_Y, valid_Y = train_test_split(train_X,
                                                      train_Y,
                                                      test_size=0.1,
                                                      random_state=4)

    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(fit_X, fit_Y)
    lgb_eval = lgb.Dataset(valid_X, valid_Y, reference=lgb_train)

    if not args.optuna:
        # Fixed hyper-parameters used when no search is requested.
        best_params = dict(
            lambda_l1=3.89081415861961e-06,
            lambda_l2=0.02666349731287391,
            num_leaves=6,
            max_depth=-1,
            feature_fraction=0.8999999999999999,
            bagging_fraction=1.0,
            bagging_freq=0,
            min_child_samples=20,
            objective='regression',
            metric='rmse',
        )
    else:
        print("Using optuna!!")
        import optuna.integration.lightgbm as lgb_optuna

        # Base LightGBM hyper-parameters for the search.
        lgbm_params = {
            'objective': 'regression',  # regression task
            'metric': 'rmse',  # evaluation metric
            'verbosity': -1,  # per the original note: output only on fatal
            "feature_pre_filter": False
        }

        best_params, history = {}, []

        # The booster returned by the search is not kept: the final model is
        # trained below. best_params and history are handed in as containers.
        lgb_optuna.train(
            lgbm_params,
            lgb_train,
            valid_sets=lgb_eval,
            verbose_eval=100,  # log every 100 iterations
            num_boost_round=1000,  # upper bound on boosting rounds
            early_stopping_rounds=100,
            best_params=best_params,
            tuning_history=history,
        )

        print(f'best_params : {best_params}')
        with open('optuna.txt', 'w') as f:
            print(best_params, file=f)

    model = lgb.train(
        best_params,
        lgb_train,
        valid_sets=lgb_eval,
        verbose_eval=50,  # log every 50 iterations
        num_boost_round=1000,  # upper bound on boosting rounds
        early_stopping_rounds=100)

    # Predict the test data with the best iteration.
    y_pred = model.predict(test_X, num_iteration=model.best_iteration)

    # SHAP summary over the rows the model was fitted on.
    shap.initjs()
    tree_explainer = shap.TreeExplainer(model)
    contributions = tree_explainer.shap_values(fit_X)
    shap.summary_plot(contributions, fit_X)

    return y_pred, model
Пример #10
0
def make_shap_interpretation(model, training_set, column_names, ml_name,
                             target, dataset, X, processor):
    """Render a SHAP summary plot and a per-row SHAP force plot in Streamlit.

    A bar-type summary plot (top 10 features) is drawn from the SHAP values
    of ``training_set``. The user then picks one row of ``X`` and one target
    class, and a matplotlib force plot is drawn for that row and class.

    Args:
        model: Fitted model accepted by ``shap.TreeExplainer``; its
            ``classes_`` attribute labels the summary plot.
        training_set: Data the summary SHAP values are computed on.
        column_names: Feature names used by both plots.
        ml_name: Model name shown in the summary plot title.
        target: Unused; kept so existing callers keep working.
        dataset: Table with a 'Target Exit Destination' column; its distinct
            values (most frequent first) fill the class selector.
        X: Table whose index fills the row selector.
        processor: Object whose ``transform`` is applied to the selected row
            before SHAP values are computed.
    """
    # A single explainer serves both the summary and the force plot.
    explainer = shap.TreeExplainer(model)

    # Summary plot
    shap_values = explainer.shap_values(training_set)
    # show=False so that the title and legend added below end up on the
    # figure before anything is displayed.
    shap.summary_plot(shap_values,
                      column_names,
                      class_names=model.classes_,
                      plot_type='bar',
                      max_display=10,
                      show=False,
                      auto_size_plot=True)
    plt.title(f'SHAP Multi Class Values from {ml_name}',
              fontsize=12,
              fontweight='bold')
    plt.legend(loc='lower right')
    st.markdown("#### Shap Summary Plot")
    info_global = st.button("How it is calculated")
    if info_global:
        st.info("""
            The shap summary plot explains how each features impact the output
            of the model to get the overall influence of each class using
            absolute values. The bigger the bar of the class is the more
            influence it has on that particular feature.

            The shap summary plot is only displaying the top 10 features.

            For more information, check out this free course at kaggle:
            [Link](https://www.kaggle.com/dansbecker/shap-values)

            To check out the shap values documentation, click the link:
            [Shap Values Documentation](
                https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
                )
            """)
    st.pyplot()

    st.markdown("#### Shap Force Plot")
    info_local = st.button("How this works")
    if info_local:
        st.info("""
            The shap force plot demonstrates how each individual feature
            influence the prediction outcome. Features in the red are the
            likely ones to be the predicted class whereas the features in blue
            reduces that probabily to be the predicted class. Is sort of like
            hot and cold. Heat rises and cold sinks.

            You can choose one of out the five prediction classes to see the
            effects of a selected feature.

            Please expand the force plot for better readability.

            For more information, check out this free course at kaggle:
            [Link](https://www.kaggle.com/dansbecker/shap-values)

            To check out the shap values documentation, click the link:
            [Shap Values Documentation](
                https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
                )
            """)

    # Force plot
    slider_idx = st.selectbox('Personal ID of Guest', X.index)
    row_p = X.loc[[slider_idx]]
    row = processor.transform(row_p)
    shap_values_force = explainer.shap_values(row)

    class_list = list(dataset['Target Exit Destination'].value_counts().index)
    # Pre-select the second class when there is one; a fixed index of 1
    # is out of range when fewer than two classes are present.
    default_class_index = 1 if len(class_list) > 1 else 0
    target_value = st.selectbox("Choose the class to plot",
                                class_list,
                                index=default_class_index)
    shap.initjs()

    # Position of each class in the explainer's per-class outputs.
    # NOTE(review): this ordering is hard-coded; confirm that it matches
    # the order of model.classes_.
    class_positions = {
        'Unknown/Other': 0,
        'Permanent Exit': 1,
        'Emergency Shelter': 2,
        'Temporary Exit': 3,
        'Transitional Housing': 4,
    }
    class_position = class_positions.get(target_value)
    if class_position is not None:
        # show=False for every class: the figure is handed to Streamlit
        # below rather than displayed by matplotlib itself.
        shap.force_plot(
            base_value=explainer.expected_value[class_position],
            shap_values=shap_values_force[class_position],
            features=row,
            feature_names=column_names,
            link='logit',
            show=False,
            matplotlib=True,
            figsize=(30, 12),
            text_rotation=45,
        )
    # Known bugs:
    # 1. Posx and posy should be finite values. Text and fig
    #    scaling issues.
    # 2. Shap - matplotlib = True is not yet supported for force plots
    #    with multiple samples! Example: Pick [Personal ID 53716]
    # 3. Segmentation fault. It crashes.
    st.pyplot()
Пример #11
0
    def shap_why_connector(self, target, *arg):
        """Explain one prediction with a SHAP force plot saved as an image.

        The query instance is passed as ``"feature:value"`` strings. Its
        values are mapped to numeric codes via ``self.dictionary``, a
        ``shap.KernelExplainer`` is built over ``self.X_train`` and
        ``self.model.predict_proba``, and the force plot for the first class
        is written to ``temp/shap.png``. ``self.explanation`` and
        ``self.certainty`` are set to descriptive texts.

        Args:
            target: Predicted outcome; only used in the explanation text.
            *arg: ``"feature:value"`` strings; every name in
                ``self.featureNames`` must be present.

        Returns:
            The path of the saved image, ``'temp/shap.png'``.

        Raises:
            ValueError: If an element of ``arg`` contains no ``':'``.
            KeyError: If a feature in ``self.featureNames`` is missing
                from ``arg``.
        """
        import os

        # Input: Numpy. Output: Pandas df. Turns numbers into categories.
        def adapter(n):
            d = pd.DataFrame(data=n, columns=self.featureNames)
            for c in self.getCategoricalFeatures():
                d[c] = d[c].map(self.dictionary[c]["values"])
            return d

        # Input: Pandas df. Output: Numpy. Turns categories into numbers.
        def reverse_adapter(p):
            d = p.copy()
            for c in self.getCategoricalFeatures():
                code_by_label = {
                    v: k
                    for k, v in self.dictionary[c]["values"].items()
                }
                d[c] = d[c].map(code_by_label)
            return d

        def predict_fn(x):
            return self.model.predict_proba(adapter(x))

        # Split on the first ':' only, so a value may itself contain colons.
        query_instance = dict(s.split(':', 1) for s in arg)
        # Numeric values are normalised to integer strings ("3.0" -> "3").
        for k, v in query_instance.items():
            print(f"{k}: {v} ({type(v)})")
            try:
                query_instance[k] = f"{int(float(v))}"
            except (ValueError, OverflowError):
                # Not a finite number (e.g. a category label): keep as is.
                pass

        # Order the values the way the model expects its features.
        sorted_query_instance = {
            f: query_instance[f]
            for f in self.featureNames
        }

        original_instance = pd.DataFrame([sorted_query_instance])
        print(original_instance.iloc[0, :])
        # reverse_adapter works on a copy, so original_instance is untouched.
        shap_instance = reverse_adapter(original_instance)
        print(shap_instance.iloc[0, :])
        shap_training = reverse_adapter(self.X_train)

        shap.initjs()
        explainer = shap.KernelExplainer(predict_fn,
                                         shap_training,
                                         link='logit')
        single_shap = explainer.shap_values(
            shap_instance.iloc[0, :].astype("int64"), nsamples=100)
        print(single_shap)

        class_values = self.dictionary["class"]["values"]
        fig = shap.force_plot(
            explainer.expected_value[0],
            single_shap[0],
            original_instance,
            out_names=["Chance of " + class_values[0], class_values[1]],
            link="logit",
            matplotlib=True,
            show=False,
            text_rotation=90)
        # savefig does not create missing directories.
        os.makedirs('temp', exist_ok=True)
        fig.savefig('temp/shap.png', bbox_inches="tight")
        # Release the figure; clearing alone keeps it registered with pyplot.
        plt.close(fig)

        first_target = class_values[0]
        self.explanation = (
            "The plot shows what feature values influenced the prediction "
            f"to become <big>{target}</big>."
            " Particularly, the plot shows the forces that affect the "
            f"decision to predict {first_target}."
            f" Red forces increase the chance of {first_target}."
            f" Blue forces decrease the chance of {first_target}."
            f" The forces push the average chance of {first_target}"
            " (base value) up or down. The boundary where the prediction "
            "outcome switches is 0.5.")
        self.certainty = (
            "That is hard to tell. The computation is based on "
            f"perturbation with {self.X_train.shape[0]} data samples.")
        return 'temp/shap.png'
Пример #12
0
 def __init__(self, model):
     """Create a SHAP tree explainer for ``model`` with empty result slots."""
     shap.initjs()
     self.explainer = shap.TreeExplainer(model)
     # Nothing has been computed yet.
     self.shap_values = self.expected_values = self.feature_importance = None
Пример #13
0
def upload2():
    """Render a LIME explanation for the instance posted in the request form.

    The model and feature table are unpickled from the first two paths in
    the module-level ``ff`` list. Every value of the submitted form is
    converted to ``float`` and the resulting vector is explained with a
    regression-mode ``LimeTabularExplainer``. The matplotlib rendering is
    converted to HTML with ``mpld3``, written to ``templates/lime.html``
    and passed to the ``local_result.html`` template together with LIME's
    own HTML output.

    Returns:
        The rendered ``local_result.html`` template.

    Raises:
        ValueError: If a form value cannot be converted to ``float``.
    """
    from lime.lime_tabular import LimeTabularExplainer

    # NOTE(security): pickle.load can execute arbitrary code. The paths in
    # ``ff`` are read as-is here; only use this with files from a trusted
    # source.
    with open(ff[0], 'rb') as file:
        model = pickle.load(file)

    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)

    print('start')
    print(request.form)
    hh = request.form.to_dict(flat=False)
    print('hh ', hh)
    for file in request.files.getlist("gg"):
        print(file)
    feature_names = list(X_data.columns)
    print(feature_names)

    # Each form field maps to a list of strings; flatten them, in field
    # order, into one numeric vector.
    # NOTE(review): assumes the form fields arrive in feature order - confirm.
    field_values = list(hh.values())
    print('im a he ', field_values)
    series = np.array(
        [float(value) for values in field_values for value in values])
    print('im a she ', series)

    # LIME explanation of the submitted instance.
    explainer = LimeTabularExplainer(X_data,
                                     mode='regression',
                                     feature_names=feature_names,
                                     random_state=42,
                                     discretize_continuous=False,
                                     kernel_width=0.2)

    exp = explainer.explain_instance(series, model.predict)

    print(exp.local_pred)

    fig = exp.as_pyplot_figure(label=feature_names)
    fig_2 = exp.as_html(labels=feature_names)

    plt.tight_layout()

    html_str = mpld3.fig_to_html(fig)
    # The figure is no longer needed once converted; close it so figures
    # do not pile up across requests.
    plt.close(fig)

    # NOTE(review): every request overwrites the same file.
    with open("templates/lime.html", "w") as html_file:
        html_file.write(html_str)

    return render_template('local_result.html',
                           LIME=html_str,
                           SH=fig_2,
                           gh=html_str)
Пример #14
0
def upload():
    """Store the uploaded files and render the requested model explanation.

    Steps visible in this function:

    1. ``str(request.form)`` is split on whitespace; token 1 selects the
       explanation kind and token 3 the model type.
    2. Every file posted under ``"file"`` is saved below ``images/`` and its
       path appended to the global list ``ff``.
    3. ``ff[0]``, ``ff[1]`` and ``ff[2]`` are unpickled as the model, the
       feature table and the target values.
    4. Depending on the selection: ``'GL'`` renders global explanations
       (model types ``'RR'``, ``'RF'``, ``'CC'``), ``'WI'`` and ``'LL'``
       configure Dash tables with SHAP callbacks, ``'BD'`` does nothing, and
       ``'DB'`` starts an ``explainerdashboard`` in a thread for ``'RF'``.

    NOTE(review): the end of this function is not visible here; the final
    return statement is cut off in the file.
    """
    print('eer  0', request.form)
    # NOTE(review): the selection is taken from the printed representation
    # of the form, so it depends on field order and on values without
    # whitespace - confirm against the HTML form.
    dropdown_selection = str(request.form)
    dropdown_selection = dropdown_selection.split()

    print(dropdown_selection)
    model_type = dropdown_selection[3]
    dropdown_selection = dropdown_selection[1]

    print('model type ji ', model_type)

    print(dropdown_selection, "  nuna bhai")

    # Counter used below to build the id of the Dash output component.
    global id_name

    target = 'images/'
    print('tt', target)

    if not os.path.isdir(target):
        os.mkdir(target)
    # Paths of the files saved by this request; upload2() reads the same
    # global list.
    global ff
    ff = []
    for file in request.files.getlist("file"):
        print(file)
        # NOTE(security): the client-supplied file name is used unchanged
        # in the destination path.
        filename = file.filename
        destination = "/".join([target, filename])
        print('des', destination)
        file.save(destination)
        ff.append(destination)

    # NOTE(review): onlyfiles is not used anywhere in the visible code.
    mypath = os.getcwd()
    onlyfiles = [
        os.path.join(mypath, f) for f in os.listdir(mypath)
        if os.path.isfile(os.path.join(mypath, f))
    ]

    print('raJA ', ff)
    import warnings
    # Silences all warnings for the rest of the process.
    warnings.filterwarnings("ignore")

    # NOTE(security): pickle.load can execute arbitrary code, and these are
    # the files that were just uploaded.
    # The upload order is relied on: model, then X data, then y data.
    with open(ff[0], 'rb') as file:
        model = pickle.load(file)

    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)

    with open(ff[2], 'rb') as file:
        y_data = pickle.load(file)

    # ---- 'GL': global explanations -------------------------------------
    if 'GL' in dropdown_selection:

        if 'RR' in model_type:

            # permutation_importance and ind_cond_exp are not defined in
            # this function; their results go straight to the template.
            PI = permutation_importance(model, X_data, y_data)

            # NOTE(review): row_to_show / data_for_prediction are not used
            # below.
            row_to_show = 5

            data_for_prediction = X_data.iloc[row_to_show]

            explainer = shap.Explainer(model,
                                       X_data,
                                       feature_names=X_data.columns)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            # Save the current figure (the SHAP summary) as a static image.
            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            ICE = ind_cond_exp(model, X_data, y_data)

            #global surgat
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            # Surrogate model: a depth-3 tree fitted to the model's own
            # predictions.
            predictions = model.predict(X_data)
            dt = DecisionTreeRegressor(random_state=100, max_depth=3)
            # We fit the shallow tree to the matrix X and the predictions of the random forest model
            dt.fit(X_data, predictions)

            fig, ax = plt.subplots(figsize=(20, 10))

            plot_tree(dt,
                      feature_names=list(X_data.columns),
                      precision=3,
                      filled=True,
                      fontsize=12,
                      impurity=True)
            pl.savefig('static/img/new2_plot.png')
            pl.close()

            return render_template('model_explanation_result.html',
                                   PI=PI,
                                   ICE=ICE,
                                   SH="static/img/new_plot.png",
                                   SM="static/img/new2_plot.png")

        if 'RF' in model_type:
            # Same steps as the 'RR' branch, but with shap.TreeExplainer.
            PI = permutation_importance(model, X_data, y_data)

            explainer = shap.TreeExplainer(model,
                                           X_data,
                                           feature_names=X_data.columns)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            ICE = ind_cond_exp(model, X_data, y_data)

            #global surgat
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            # Surrogate model: a depth-3 tree fitted to the model's own
            # predictions.
            predictions = model.predict(X_data)
            dt = DecisionTreeRegressor(random_state=100, max_depth=3)
            # We fit the shallow tree to the matrix X and the predictions of the random forest model
            dt.fit(X_data, predictions)

            fig, ax = plt.subplots(figsize=(20, 10))

            plot_tree(dt,
                      feature_names=list(X_data.columns),
                      precision=3,
                      filled=True,
                      fontsize=12,
                      impurity=True)
            pl.savefig('static/img/new2_plot.png')
            pl.close()

            return render_template('model_explanation_result.html',
                                   PI=PI,
                                   ICE=ICE,
                                   SH="static/img/new_plot.png",
                                   SM="static/img/new2_plot.png")

        if 'CC' in model_type:
            # This branch explains model.predict_proba with a
            # KernelExplainer and renders no ICE or surrogate tree.
            PI = permutation_importance(model, X_data, y_data)

            explainer = shap.KernelExplainer(model.predict_proba, X_data)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            #ICE = ind_cond_exp(model,X_data,y_data)

            #global surgat
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            # NOTE(review): predictions and the two tree imports above are
            # unused in this branch.
            predictions = model.predict(X_data)

            return render_template(
                'model_explanation_result_classification.html',
                PI=PI,
                SH="static/img/new_plot.png")

    # ---- 'WI': editable one-row table with a SHAP callback -------------
    if 'WI' in dropdown_selection:

        # print(res," resss")

        #
        import dash
        from dash.dependencies import Input, Output
        import dash_table
        import dash_core_components as dcc
        import dash_html_components as html

        # NOTE(review): this local Dash app is not used below; the layout
        # and callback are attached to what_plot, which is defined outside
        # this function.
        app = dash.Dash(__name__)
        import pandas as pd
        #should be X data

        # The table starts with one row holding the rounded mean of every
        # feature.
        mean_list = []
        features = X_data.columns.tolist()
        for i in features:
            mean_list.append(round(X_data[i].mean()))

        explainer = shap.TreeExplainer(model)
        shap.initjs()

        params = features

        # The output component id is derived from the global counter, which
        # is then incremented.
        id_name_str = "my_graph" + str(id_name)
        print('---------------', id_name_str)
        id_name = id_name + 1

        what_plot.layout = html.Div([
            dash_table.DataTable(
                id='table-editing-simple',
                columns=([{
                    'id': 'Model',
                    'name': 'Model'
                }] + [{
                    'id': p,
                    'name': p
                } for p in params]),
                data=[
                    dict(zip(features, mean_list))
                    #dict(Model=i, **{param: mean_list[i] for param in params})
                    # for i in range(0, len(mean_list))
                ],
                editable=True),
            html.Div(id=id_name_str)
        ])

        @what_plot.callback(Output(id_name_str, "children"),
                            Input('table-editing-simple', 'data'),
                            Input('table-editing-simple', 'columns'))
        def update_graphs(rows, columns):
            # Builds a one-row frame from the table's first row (values
            # cast to int) and returns what _force_plot_html2 produces for
            # its SHAP values.
            # NOTE(review): df is not used below.
            df = pd.DataFrame(rows, columns=[c['name'] for c in columns])
            print(rows)

            #
            rows = rows[0]
            col = []
            vvalue = []
            for key in rows:
                print(key, '->', int(rows[key]))
                col.append(key)
                vvalue.append([int(rows[key])])

            ik = dict(zip(col, vvalue))
            instance = pd.DataFrame.from_dict(ik)

            print('instancceee ', instance)

            # NOTE(review): draw_additive_plot is imported but not used.
            from shap.plots._force_matplotlib import draw_additive_plot

            # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models)
            #explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(instance)
            shap.initjs()

            #plt.style.use("_classic_test_patch")

            ytu = model.predict(instance)
            print('ress ', ytu)

            # _force_plot_html2 is not defined in this function.
            koko = _force_plot_html2(explainer.expected_value, shap_values,
                                     instance)

            #print('kkkk ',koko)

            print('Done')

            return koko
    #

        return render_template('local_explain_lime.html', LL=what_plot.index())

    # ---- 'LL': selectable data table with a SHAP callback --------------
    if 'LL' in dropdown_selection:
        None
        #table and plots ========================================================
        import dash
        from dash.dependencies import Input, Output
        import dash_table
        import dash_core_components as dcc
        import dash_html_components as html
        import pandas as pd

        # The output component id is derived from the global counter, which
        # is then incremented.
        id_name_str = "my_graph" + str(id_name)
        print('---------------', id_name_str)
        id_name = id_name + 1

        print('in LL')
        # make graph===============================================================
        # table_plot is defined outside this function.
        table_plot.layout = html.Div([
            dash_table.DataTable(
                id='datatable-interactivity',
                columns=[{
                    "name": i,
                    "id": i,
                    "deletable": True,
                    "selectable": True
                } for i in X_data.columns],
                data=X_data.to_dict('records'),
                editable=True,
                filter_action="native",
                sort_action="native",
                sort_mode="multi",
                column_selectable="single",
                row_selectable="single",
                row_deletable=True,
                selected_columns=[],
                selected_rows=[],
                page_action="native",
                page_current=0,
                page_size=10,
            ),
            html.Div(id=id_name_str)
        ])

        print('miod LL')

        @table_plot.callback(Output(id_name_str, "children"),
                             Input('datatable-interactivity',
                                   "derived_virtual_data"),
                             Input('datatable-interactivity',
                                   "derived_virtual_selected_rows"))
        def update_graphs(rows, derived_virtual_selected_rows):
            # Returns the _force_plot_html output for the selected rows and
            # for the whole of X_data.
            # When the table is first rendered, `derived_virtual_data` and
            # `derived_virtual_selected_rows` will be `None`. This is due to an
            # idiosyncrasy in Dash (unsupplied properties are always None and Dash
            # calls the dependent callbacks when the component is first rendered).
            # So, if `rows` is `None`, then the component was just rendered
            # and its value will be the same as the component's dataframe.
            # Instead of setting `None` in here, you could also set
            # `derived_virtual_data=df.to_rows('dict')` when you initialize
            # the component.
            if derived_virtual_selected_rows is None:
                derived_virtual_selected_rows = []

            dff = X_data if rows is None else pd.DataFrame(rows)

            # NOTE(review): colors is not used below.
            colors = [
                '#7FDBFF' if i in derived_virtual_selected_rows else '#0074D9'
                for i in range(len(dff))
            ]

            print('my value', derived_virtual_selected_rows)
            print('i am row ', X_data.iloc[derived_virtual_selected_rows])
            print(type(derived_virtual_selected_rows))

            # NOTE(review): draw_additive_plot is imported but not used.
            from shap.plots._force_matplotlib import draw_additive_plot

            # NOTE(review): the print above selects by position (.iloc)
            # while this line selects by label (.loc); the two agree only
            # when the index labels equal the row positions.
            ttt = X_data.loc[derived_virtual_selected_rows]
            # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models)
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(ttt)
            shap.initjs()

            plt.style.use("_classic_test_patch")

            # _force_plot_html is not defined in this function.
            bubu = _force_plot_html(explainer.expected_value, shap_values, ttt)

            # SHAP values for the whole table are recomputed on every
            # callback invocation.
            shap_values = explainer.shap_values(X_data)
            #shap.force_plot(explainer.expected_value, shap_values, X_data)
            explain_all = _force_plot_html(explainer.expected_value,
                                           shap_values, X_data)

            print('bubu ', bubu)

            # Two values are returned for the single Output declared above.
            return bubu, explain_all

        return render_template('local_explain_lime.html',
                               LL=table_plot.index())

    # 'BD' is recognised but does nothing.
    if 'BD' in dropdown_selection:
        None

    #FI
    # ---- 'DB': explainerdashboard started in a thread ------------------
    if 'DB' in dropdown_selection:

        #  if 'CC' in model_type:
        #   from explainerdashboard import ClassifierExplainer, ExplainerDashboard
        #  ExplainerDashboard(ClassifierExplainer(model, X_data, y_data)).run()

        if 'RF' in model_type:
            import threading
            import time

            def dashboard_exp(model, X_data, y_data):
                # Builds an ExplainerDashboard around a RegressionExplainer
                # and calls run() on it; used as the thread target below.
                import dash_bootstrap_components as dbc

                from explainerdashboard import RegressionExplainer, ExplainerDashboard
                ExplainerDashboard(
                    RegressionExplainer(model, X_data, y_data),
                    bootstrap=dbc.themes.SANDSTONE,
                    importances=True,
                    model_summary=False,
                    contributions=True,
                    whatif=True,
                    shap_dependence=False,
                    shap_interaction=False,
                    decision_trees=False,
                    hide_whatifindexselector=True,
                    hide_whatifprediction=True,
                    hide_inputeditor=False,
                    hide_whatifcontributiongraph=False,
                    hide_whatifcontributiontable=True,
                    hide_whatifpdp=False,
                    hide_predindexselector=True,
                    hide_predictionsummary=True,
                    hide_contributiongraph=False,
                    hide_pdp=False,
                    hide_contributiontable=True,
                    hide_dropna=True,
                    hide_range=True,
                    hide_depth=True,
                    hide_sort=True,
                    hide_sample=True,  # hide sample size input on pdp component
                    hide_gridlines=True,  # hide gridlines on pdp component
                    hide_gridpoints=True,
                    hide_cats_sort=
                    True,  # hide the sorting option for categorical features
                    hide_cutoff=
                    True,  # hide cutoff selector on classification components
                    hide_percentage=
                    True,  # hide percentage toggle on classificaiton components
                    hide_log_x=
                    True,  # hide x-axis logs toggle on regression plots
                    hide_log_y=
                    True,  # hide y-axis logs toggle on regression plots
                    hide_ratio=True,  # hide the residuals type dropdown
                    hide_points=
                    True,  # hide the show violin scatter markers toggle
                    hide_winsor=True,  # hide the winsorize input
                    hide_wizard=
                    True,  # hide the wizard toggle in lift curve component
                    hide_star_explanation=True,
                ).run()

            # The dashboard runs in its own thread; the thread is started
            # and not joined in the visible code.
            t1 = threading.Thread(target=dashboard_exp,
                                  args=(model, X_data, y_data))

            t1.start()

            return '''<H2>
Пример #15
0
    def saveAndGetSHAP(self, user_all_label, pred, new_row_raw, new_row_norm, initModel):
        start_time = datetime.datetime.now()
        model_results = []

        # 상위 버전 xgboost의 경우, 모델 encoding 버퍼 문제 발생
        # https://github.com/slundberg/shap/issues/1215

        xgb_booster = initModel.get_booster()
        model_bytearray = xgb_booster.save_raw()[4:]
        def byte_error(self=None):
            return model_bytearray

        xgb_booster.save_raw = byte_error

        features = StressModel.feature_df_with_state['features'].values
        feature_state_df = StressModel.feature_df_with_state
        model_accuracy = 0

        # model 성능 평가
        y_pred_proba = initModel.predict_proba(new_row_norm[features])
        model_accuracy = y_pred_proba[0]
        print("model_accuracy: ", model_accuracy)

        # shap setting
        shap.initjs()

        try:
            explainer = shap.TreeExplainer(xgb_booster)
        except Exception as e:
            print("shap tree explainer error: ", e)
        # explainer.feature_perturbation = "tree_path_dependent"

        shap_values = explainer.shap_values(new_row_norm[features])
        # print(shap_values)
        expected_value = explainer.expected_value

        # ## TODO : SHAP Exception 발생 가능 부분 ==> SHAP 에서 적은 빈도수의 Label 해석 안줄때/...혹시나 해서 모델 한번 더 학습
        # try:
        #     print("expected_value: ", type(expected_value))
        #     print("expected_value: ", expected_value.shape[0])
        #     if (expected_value.shape[0]) != len(user_all_label):
        #         print("Shap if statement...")
        #         with open('data_result/' + str(self.uid) + "_features.p", 'rb') as file:
        #             preprocessed = pickle.load(file)
        #
        #         norm_df = StressModel.normalizing(self, "default", preprocessed, None, None, None, None)
        #         StressModel.initModel(self, norm_df)
        #
        #         explainer = shap.TreeExplainer(initModel)
        #         explainer.feature_perturbation = "tree_path_dependent"
        #
        #         features = StressModel.feature_df_with_state['features'].values
        #         feature_state_df = StressModel.feature_df_with_state
        #
        #         ###  model 성능 평가
        #         y_pred_proba = initModel.predict_proba(new_row_norm[features])
        #         model_accuracy = y_pred_proba[0]
        #         print("if model_accuracy: ", model_accuracy)
        #
        #         shap_values = explainer.shap_values(new_row_norm[features])
        #         # print(shap_values)
        #         expected_value = explainer.expected_value
        #         # print("len expected_value: ", len(expected_value))
        # except Exception as e:
        #     print("SHAP label length error: ", e)
        #     pass


        check_label = [0 for i in range(3)]
        # not_user_label_list = list(set(check_label) - set(user_all_label)) # 유저한테 없는 label 계산

        try:
            for label in user_all_label:  # 유저한테 있는 Stress label 에 따라
                feature_list = ""

                index = user_all_label.index(label)
                # shap_accuracy = expected_value[index]
                shap_list = shap_values[index]

                if len(shap_list.shape) == 1: ## EXCEPTION CASE..
                    shap_dict = dict(zip(features, shap_list))
                else:
                    shap_dict = dict(zip(features, shap_list[0]))

                shap_dict_sorted = sorted(shap_dict.items(), key=(lambda x: x[1]), reverse=True)

                # act_features = ['Duration WALKING', 'Duration RUNNING', 'Duration BICYCLE', 'Duration ON_FOOT', 'Duration VEHICLE']
                app_features = ['Social & Communication','Entertainment & Music','Utilities','Shopping', 'Games & Comics',
                                'Health & Wellness', 'Education', 'Travel', 'Art & Design & Photo', 'News & Magazine', 'Food & Drink']

                act_tmp = ""
                for feature_name, s_value in shap_dict_sorted:
                    if s_value > 0: #check
                        feature_id = feature_state_df[feature_state_df['features'] == feature_name]['feature_id'].values[0]
                        feature_value = new_row_norm[feature_name].values[0]
                        ## TODO : 데이터가 전부 다 0인 경우..추가 작업이 필요할 수 있음
                        # 현재는 feature_list가 0일 경우, NO_FEATURES 반환
                        if new_row_raw[feature_name].values[0] != 0:
                            # ACT FEATURE
                            # if feature_name in act_features:
                            #     if act_tmp == "":
                            #         act_tmp += feature_name
                            #
                            #         if feature_value >= 0.5:
                            #             feature_list += str(feature_id) + '-high '
                            #         else:
                            #             feature_list += str(feature_id) + '-low '
                            if feature_name in app_features:
                                # Add package
                                try:
                                    pkg_result = AppUsed.objects.get(uid=self.uid, day_num=self.dayNo, ema_order=self.emaNo)
                                    pkg_text = ""
                                    if feature_name == "Entertainment & Music":
                                        pkg_text = pkg_result.Entertainment_Music
                                    elif feature_name == "Utilities":
                                        pkg_text = pkg_result.Utilities
                                    elif feature_name == "Shopping":
                                        pkg_text = pkg_result.Shopping
                                    elif feature_name == "Games & Comics":
                                        pkg_text = pkg_result.Games_Comics
                                    elif feature_name == "Others":
                                        pkg_text = pkg_result.Others
                                    elif feature_name == "Health & Wellness":
                                        pkg_text = pkg_result.Health_Wellness
                                    elif feature_name == "Social & Communication":
                                        pkg_text = pkg_result.Social_Communication
                                    elif feature_name == "Education":
                                        pkg_text = pkg_result.Education
                                    elif feature_name == "Travel":
                                        pkg_text = pkg_result.Travel
                                    elif feature_name == "Art & Design & Photo":
                                        pkg_text = pkg_result.Art_Photo
                                    elif feature_name == "News & Magazine":
                                        pkg_text = pkg_result.News_Magazine
                                    elif feature_name == "Food & Drink":
                                        pkg_text = pkg_result.Food_Drink

                                    if pkg_text != "":
                                        if feature_value >= 0.5:
                                            feature_list += str(feature_id) + '-high&' + pkg_text + " "
                                        else:
                                            feature_list += str(feature_id) + '-low '

                                except Exception as e:
                                    print("Exception during making feature_list of app...get AppUsed db", e)

                            else:
                                if feature_value >= 0.5:
                                    feature_list += str(feature_id) + '-high '
                                else:
                                    feature_list += str(feature_id) + '-low '

                if feature_list == "":
                    feature_list = "NO_FEATURES"

                try:
                    if label == pred:
                        model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time,
                                                                  day_num=self.dayNo, ema_order=self.emaNo,
                                                                  prediction_result=label, accuracy=model_accuracy[label],
                                                                  feature_ids=feature_list, model_tag=True)
                    else:
                        model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time,
                                                                  day_num=self.dayNo, ema_order=self.emaNo,
                                                                  prediction_result=label, accuracy=model_accuracy[label],
                                                                  feature_ids=feature_list)
                except Exception as e:
                    print("ModelResult.objects.create error: ", e)

                check_label[label] = 1
                model_results.append(model_result)

        except Exception as e:
            print("Exception at saveAndGetSHAP error: ", e)
            pass

        try:
            ## For 문 끝난 후, model_result 에 없는 stress lvl 추가 & 일반적인 문구 추가
            for i in range(3):
                if check_label[i] == 0:
                    # random_acc = random.uniform(0.0, 1.0)
                    # random_acc = round(random_acc, 2)
                    try:
                        if i == 0 : # LOW General message, 마지막 띄어쓰기 조심!
                            feature_list = '0-general_0 7-general_0 11-general_0 17-general_0 28-general_0 '
                            model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time,
                                                                      day_num=self.dayNo, ema_order=self.emaNo,
                                                                      prediction_result=i, accuracy=0,
                                                                      feature_ids=feature_list)
                        else: #LITTLE HIGH, HIGH General message
                            feature_list = '0-general_1 7-general_1 11-general_1 17-general_1 28-general_1 '
                            model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time,
                                                                      day_num=self.dayNo, ema_order=self.emaNo,
                                                                      prediction_result=i, accuracy=0,
                                                                      feature_ids=feature_list)
                    except Exception as e:
                        print("model result에 없는 stress lvl 추가 오류: ", e)

                    model_results.append(model_result)

        except Exception as e:
            print("saveAndGetSHAP general statement error: ",e)

        # print("Total SaveAndGetSHAP Working... ", datetime.datetime.now() - start_time) # 시간 1초도 안 걸림

        return model_results
 def blockbox(self, model, patient):
     """Return a matplotlib SHAP force plot explaining ``patient`` under ``model``.

     A ``shap.TreeExplainer`` is built for ``model`` and evaluated on
     ``patient``.  The plot uses entry 1 of both the explainer's expected
     value and the SHAP values (presumably the positive class of a binary
     classifier -- TODO confirm with callers).  The plot is requested with
     ``matplotlib=True`` and ``show=False``.
     """
     tree_explainer = shap.TreeExplainer(model)
     contributions = tree_explainer.shap_values(patient)
     shap.initjs()
     base_value = tree_explainer.expected_value[1]
     return shap.force_plot(base_value, contributions[1], patient,
                            matplotlib=True, show=False)
Пример #17
0
    def training(self):
        """Train a LightGBM binary classifier and plot SHAP values for it.

        Steps, in order:

        1. Read the balanced train/test CSV files from hard-coded paths.
        2. Fit a GBDT booster on the training split, early-stopping on the
           test split (used here as the validation set).
        3. Print the F1 score of the positive class on the validation split,
           using a 0.5 cut-off.  The score is reported as 0.0 instead of
           raising ``ZeroDivisionError`` when there are no positive labels or
           no positive predictions.
        4. Print ``report(...)`` for the thresholded training predictions
           (presumably sklearn's ``classification_report`` -- confirm).
        5. Draw a SHAP summary plot for the training features.

        Returns nothing; all results are printed or plotted.
        """
        data_train = pd.read_csv(
            "D:/Python_Project/Keywords_extraction/train_balance.csv")
        data_test = pd.read_csv(
            "D:/Python_Project/Keywords_extraction/test_balance.csv")

        # Feature columns fed to the model.  To train on every feature instead:
        # [col for col in data_train.columns if col not in ['id', '关键词', '标签']]
        cols = [
            '词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均', '最大词差',
            '最大句中位置', '平均句中位置', '首次句位置', '出现在第一句', '出现在最后一句', '句子出现频率', '句方差',
            '句差方差', '最大句差', '度中心性', 'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x',
            'g', 'j', 'y', 'un', '包含数字'
        ]
        # '标签' is the label column.
        x_train = data_train.loc[:, cols].reset_index(drop=True)
        y_train = data_train.loc[:, '标签'].reset_index(drop=True)
        x_val = data_test.loc[:, cols].reset_index(drop=True)
        y_val = data_test.loc[:, '标签'].reset_index(drop=True)

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)

        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': {'auc'},
            'learning_rate': 0.025,
            'num_leaves': 100,
            'min_data_in_leaf': 70,
            'bagging_fraction': 0.85,
            'is_unbalance': 'true',
            'seed': 42
        }

        # NOTE(review): LightGBM 4.0 removed the ``early_stopping_rounds`` and
        # ``verbose_eval`` keyword arguments of ``lgb.train`` in favour of
        # callbacks; confirm the pinned LightGBM version before upgrading.
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=5000,
            valid_sets=lgb_eval,
            early_stopping_rounds=30,
            verbose_eval=False,
        )

        # Positive-class precision / recall / F1 on the validation split.
        val_labels = list(y_val)
        val_scores = list(gbm.predict(x_val))
        pos = sum(1 for label in val_labels if label >= 0.5)
        pos_pre = sum(1 for score in val_scores if score >= 0.5)
        pos_acc = sum(1 for label, score in zip(val_labels, val_scores)
                      if label >= 0.5 and score >= 0.5)

        # Guard every ratio: an empty positive set must not crash the run.
        pos_r = pos_acc / pos if pos else 0.0
        pos_a = pos_acc / pos_pre if pos_pre else 0.0
        if pos_a + pos_r:
            f1_score = (pos_a * pos_r) / (pos_a + pos_r) * 2
        else:
            f1_score = 0.0
        print(f1_score)

        # Hard 0/1 predictions on the training split (strict ``> 0.5`` here,
        # unlike the ``>= 0.5`` used for the validation F1 above).
        Y_train = list(y_train)
        y_pred = [1 if score > 0.5 else 0 for score in gbm.predict(x_train)]
        print(report(Y_train, y_pred, digits=4))

        # SimSun font so the Chinese feature names render in the plot.
        plt.rc('font', family='SimSun', size=13)
        explainer = shap.TreeExplainer(gbm)
        shap_values = explainer.shap_values(x_train)
        shap.initjs()
        # Entry 1 of the SHAP values is plotted (presumably the positive
        # class -- confirm against the installed shap version's return shape).
        shap.summary_plot(shap_values[1],
                          x_train,
                          sort=True,
                          color_bar_label=("FEATURE_VALUE1"))
Пример #18
0
def _rename_features(frame, base_path):
    """Return ``frame`` with its columns renamed via the feature documentation.

    Every column name is passed through ``change_feature_name`` together with
    the ``var_name`` -> ``short_name`` mapping read from
    ``<base_path>/Data/feature_documentation.csv``.
    """
    feature_doc_df = pd.read_csv(base_path + '/Data/feature_documentation.csv')
    feature_dict = dict(
        zip(feature_doc_df['var_name'], feature_doc_df['short_name']))
    return frame.rename(
        columns=lambda x: change_feature_name(x, feature_dict))


def _load_transformed_features(model_type, base_path, n_rows):
    """Load the first ``n_rows`` feature rows of the transformed data set.

    Reads ``<base_path>/Data/Transformed/<model_type>_transformed_800.csv``,
    drops the ``fips``, ``value`` and ``year`` columns, renames the remaining
    columns with ``_rename_features`` and returns the leading ``n_rows`` rows.
    """
    frame = pd.read_csv(base_path + '/Data/Transformed/' + model_type +
                        '_transformed_800.csv',
                        index_col=0)
    kept = [
        col for col in frame.columns if col not in ('fips', 'value', 'year')
    ]
    return _rename_features(frame[kept], base_path)[:n_rows]


def importance_plot(model_type, model_name, base_path):
    """Build the SHAP summary image and importance bar chart for one model.

    Args:
        model_type: Name used to locate the model file and its data CSV.
        model_name: One of ``'RNN'``, ``'NN'``, ``'RF'``, ``'KNN'``, ``'LR'``;
            selects both the loader and the SHAP explainer.
        base_path: Root directory containing ``Model/``, ``Result/`` and
            ``Data/``.

    Returns:
        A 2-tuple ``([html.Div(...)], figure)``: a one-element list holding a
        ``html.Div`` with the summary plot embedded as a base64 PNG, and the
        ``px.bar`` figure built from ``cumulativeImportance``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
            (Previously this fell through and failed later with an
            ``UnboundLocalError``, possibly after reading a stale image.)

    Side effects:
        Writes ``temp.png`` into the current working directory.
    """
    shap.initjs()
    model_path = base_path + '/Model/' + model_name + '/' + model_type

    if model_name == 'RNN':
        result_stem = base_path + '/Result/' + model_name + '/' + model_type
        rnn = rnnForward().double()
        rnn.load_state_dict(torch.load(result_stem + '.shap'))
        # The RNN branch explains every row of its own result CSV.
        sample_df = _rename_features(
            pd.read_csv(result_stem + '.csv', index_col=0), base_path)
        shap_values = shap.DeepExplainer(rnn, torch.tensor(
            sample_df.values)).shap_values(torch.tensor(sample_df.values))

    elif model_name == 'NN':
        net = nNet().double()
        net.load_state_dict(torch.load(model_path))
        sample_df = _load_transformed_features(model_type, base_path, 1000)
        shap_values = shap.DeepExplainer(net, torch.tensor(
            sample_df.values)).shap_values(torch.tensor(sample_df.values))

    elif model_name == 'RF':
        sample_df = _load_transformed_features(model_type, base_path, 1000)
        rf = joblib.load(model_path)
        shap_values = shap.TreeExplainer(rf, sample_df.values).shap_values(
            sample_df.values, check_additivity=False)

    elif model_name == 'KNN':
        # KNN is explained with the kernel method, on only 10 rows.
        sample_df = _load_transformed_features(model_type, base_path, 10)
        knn = joblib.load(model_path)
        shap_values = shap.KernelExplainer(
            knn.predict, sample_df.values).shap_values(sample_df.values)

    elif model_name == 'LR':
        sample_df = _load_transformed_features(model_type, base_path, 1000)
        lr = joblib.load(model_path)
        shap_values = shap.LinearExplainer(
            lr, sample_df.values).shap_values(sample_df.values)

    else:
        raise ValueError(
            "Unsupported model_name {!r}; expected one of "
            "'RNN', 'NN', 'RF', 'KNN', 'LR'".format(model_name))

    # Shared tail: render the summary plot to disk and rank the features.
    shap.summary_plot(shap_values, sample_df, show=False)
    plt.savefig('temp.png', bbox_inches='tight')
    plt.close()
    df = cumulativeImportance(sample_df, shap_values)

    # Context manager so the image file handle is always closed.
    with open('temp.png', 'rb') as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('ascii')

    return [
        html.Div([
            html.Img(src='data:image/png;base64,{}'.format(encoded_image),
                     style={
                         'width': '80%',
                         'height': '80%'
                     })
        ])
    ], px.bar(df,
              x='importance',
              y='feature',
              color='Correlation',
              category_orders={'feature': list(df['feature'].iloc[::-1])})
Пример #19
0
def patient_analysis(model, patient):
    """Return a SHAP force plot explaining ``patient`` under ``model``.

    Builds a ``shap.TreeExplainer`` for ``model`` and plots entry 1 of the
    expected value and of the SHAP values (presumably the positive class of
    a binary classifier -- TODO confirm).
    """
    tree_explainer = shap.TreeExplainer(model)
    contributions = tree_explainer.shap_values(patient)
    shap.initjs()
    return shap.force_plot(
        tree_explainer.expected_value[1],
        contributions[1],
        patient,
    )
Пример #20
0
def shap_js_bar_plot(model, values):
    """Draw a bar-type SHAP summary plot of ``values`` for a tree ``model``.

    Returns whatever ``shap.summary_plot`` returns for ``plot_type="bar"``.
    """
    shap.initjs()
    contributions = shap.TreeExplainer(model).shap_values(values)
    return shap.summary_plot(contributions, values, plot_type="bar")
Пример #21
0
def main():
    """Explain a combine-data random forest with SHapley Additive exPlanations.

    Loads ``combine_data_since_2000_PROCESSED_2018-04-26.csv`` from this
    module's directory, fits a median-imputed random-forest regressor that
    predicts the per-position ``AV`` percentile of ``DE`` players from their
    combine measurables (training on years <= 2011), and computes SHAP values
    for the 2012-2015 rows.

    Returns:
        The ``shap.force_plot`` object explaining the first test-set
        prediction.  (It used to be built and discarded.)
    """
    # ``Imputer`` no longer exists in current scikit-learn; ``SimpleImputer``
    # is its replacement.  Imported locally so this block is self-contained.
    from sklearn.impute import SimpleImputer

    datapath = os.path.join(
        os.path.dirname(__file__),
        "combine_data_since_2000_PROCESSED_2018-04-26.csv")
    data_df = pd.read_csv(datapath)

    # Keep draft years up to 2015 (original note: only players that have
    # been in the league for 3 years).
    data_df2 = data_df.loc[data_df.Year <= 2015].copy()

    # Player AV percentile within each position group.
    data_df2['AV_pctile'] = data_df2.groupby('Pos').AV.rank(pct=True,
                                                            method='min',
                                                            ascending=True)

    # Restrict to the position being modelled, in this case DE.
    pos_df = data_df2.loc[data_df2.Pos == 'DE'].copy().reset_index(drop=True)

    # Combine measurables used as model inputs.
    features = [
        'Forty', 'Wt', 'Ht', 'Vertical', 'BenchReps', 'BroadJump', 'Cone',
        'Shuttle'
    ]
    # What we want to predict.
    target = 'AV_pctile'

    # Split by year: train on <= 2011, explain 2012-2015.
    train_df = pos_df.loc[pos_df.Year <= 2011]
    test_df = pos_df.loc[pos_df.Year.isin([2012, 2013, 2014, 2015])]

    X = train_df[features].values
    y = train_df[target].values
    X_test = test_df[features].values

    # Best parameter set (per the original author's note).
    pipe = Pipeline([("imputer", SimpleImputer(strategy='median')),
                     ("estimator",
                      RandomForestRegressor(max_features=6,
                                            min_samples_split=63,
                                            n_estimators=500,
                                            random_state=420))])

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        pipe.fit(X, y)

    estimator = pipe.named_steps['estimator']
    imputer = pipe.named_steps['imputer']

    # SHAP explainer over the fitted forest only; imputation is applied
    # to the test features once, up front.
    shap_explainer = shap.TreeExplainer(estimator)
    test_X_imp = imputer.transform(X_test)
    test_shap_vals = shap_explainer.shap_values(test_X_imp)

    # Load JS so the shap plotting functions work inside a notebook.
    shap.initjs()

    test_X_imp_df = pd.DataFrame(test_X_imp, columns=features)

    # Explanation of the first test-set prediction.
    return shap.force_plot(shap_explainer.expected_value,
                           test_shap_vals[0, :],
                           test_X_imp_df.iloc[0, :])
Пример #22
0
def main():
    """Render the Streamlit app that predicts CO2 solubility in brine.

    The sidebar selects one of two modes:

    * "Single value prediction": three numeric inputs are collected and, when
      the "Predict" button is pressed, passed to ``predict`` with the
      module-level ``model``.
    * "Multiple value prediction": an uploaded CSV is scored with
      ``predict_model``; a CatBoost regressor is then fitted on the uploaded
      columns against the predicted ``Label`` and explained with SHAP
      (summary plot, ``ABS_SHAP`` plot, individual force plots and
      dependence plots).

    Force plots for rows that do not exist in the uploaded data are skipped
    with a warning instead of raising ``IndexError``.
    """
    st.sidebar.info('This app is created to predict CO2 Solubility in Brine')
    st.sidebar.success('https://www.pycaret.org')

    add_selectbox = st.sidebar.selectbox(
        "How would you like to predict?",
        ("Single value prediction", "Multiple value prediction"))

    st.title("CO2 Solubility in Brine Prediction App")
    st.subheader("Created by: Khoirrashif")
    image_CCS = Image.open('CCS.jpg')
    st.image(image_CCS, use_column_width=False)
    st.text("(Image source: Global CCS Institute)")
    st.set_option('deprecation.showPyplotGlobalUse', False)

    # --- Single value prediction ---
    if add_selectbox == 'Single value prediction':
        mNaCl = st.number_input(
            'mNaCl (mol/kg) | min = 0.016 mol/kg, max = 6.14 mol/kg',
            value=3.25, min_value=0.016, max_value=6.14)
        Pressure = st.number_input(
            'Pressure (bar) | min = 0.98 bar, max = 1400.00 bar',
            value=500.00, min_value=0.98, max_value=1400.00)
        Temperature = st.number_input(
            'Temperature (K) | min = 273.15 K, max = 723.15 K',
            value=425.00, min_value=273.15, max_value=723.15)

        output = ""

        input_dict = {'mNaCl (mol/kg)': mNaCl, 'Pressure (bar)': Pressure,
                      'Temperature (K)': Temperature}
        input_df = pd.DataFrame([input_dict])

        if st.button("Predict"):
            output = predict(model=model, input_df=input_df)
            output = str(output) + 'mol/kg'

        # Shown on every rerun; ``output`` stays empty until the button is pressed.
        st.success('The CO2 solubility is {}'.format(output))

    # --- Multiple value prediction ---
    elif add_selectbox == 'Multiple value prediction':
        file_upload = st.file_uploader("Upload csv file for predictions", type=["csv"])

        if file_upload is not None:
            data = pd.read_csv(file_upload)
            prediction = predict_model(estimator=model, data=data)
            st.write(prediction)

            shap.initjs()

            # Fit a CatBoost surrogate on the uploaded features against the
            # predicted labels, so a tree explainer can be used.
            X, y = data, prediction['Label']
            mod = CatBoostRegressor(iterations=100, learning_rate=0.1, random_seed=123)
            mod.fit(X, y, verbose=False, plot=False)

            # Explain the surrogate's predictions using SHAP.
            explainer = shap.TreeExplainer(mod)
            shap_values = explainer.shap_values(X)

            st.title("Feature Importance and Prediction Explanation based on the SHAP values")
            st.write("For a complete explanation about SHAP (SHapley Additive exPlanations) values and their impacts on machine learning models interpretability please refer to  Lundberg and Lee (2016), and their GitHub (https://github.com/slundberg/shap/blob/master/README.md)")

            st.header("Total distribution of observations based on the SHAP values, colored by Target Value")
            st.write("The plot below sorts features by the sum of SHAP value magnitudes all over samples, and uses SHAP values to show the distribution of the impacts each feature has on the model output. The colour represents the feature value (e.g: red shows high impact, while blue shows negative impact.")
            shap.summary_plot(shap_values, X)
            st.pyplot()
            plt.clf()

            st.header("Feature Importance according to the SHAP values (simplified plot)")
            st.write("The following plot is the simplified version of the plot above which is basically built by taking the mean absolute value of the SHAP value for each feature. It also shows the feature importance in descending order and highlights the correlation in colours.")
            ABS_SHAP(shap_values, X)
            st.pyplot()
            plt.clf()

            st.header("Prediction explanation for a single observation")
            st.write("The following plots are the Individual Force Plots. Each of them shows how each feature affects the model output from the base value for a single prediction. Features pushing the prediction higher are shown in red, while those pushing the prediction lower are in blue. A set of samples are provided below from the 3rd, 7th, and 10th observation from the dataset.")
            # NOTE(review): the labels say 3rd/7th/10th but positional
            # indices 3/7/10 are plotted (i.e. the 4th/8th/11th rows);
            # kept as in the original -- confirm which one is intended.
            for row, ordinal in ((3, "3rd"), (7, "7th"), (10, "10th")):
                st.subheader("Example on the {} observation".format(ordinal))
                if row >= len(X):
                    # Small uploads would otherwise crash with IndexError.
                    st.warning("The uploaded data has only {} rows; this example is skipped.".format(len(X)))
                    continue
                shap.force_plot(explainer.expected_value, shap_values[row, :],
                                X.iloc[row, :], matplotlib=True, show=False,
                                figsize=(16, 5))
                st.pyplot()
                plt.clf()

            st.header("Dependence plots for each feature")
            st.write("The following plots are the partial dependence plots which each of them shows the marginal effect one or two features have on the predicted outcome of a machine learning model (J.H. Friedman, 2001). The partial dependence plot tells wether the relationship between the target and a feature is linear, monotonic or more complex.")
            for title, column in (("Pressure", "Pressure (bar)"),
                                  ("Temperature", "Temperature (K)"),
                                  ("mNaCl", "mNaCl (mol/kg)")):
                st.subheader(title)
                shap.dependence_plot(column, shap_values, X, show=False)
                st.pyplot()
                plt.clf()
Пример #23
0
def _lgb_fit_auc(lgb, params, train_x, train_y, val_x, val_y, verbose_eval):
    """Train one LightGBM booster and report its best AUC scores.

    ``lgb`` is the already-imported ``lightgbm`` module. The booster is
    trained for at most 4000 rounds with early stopping after 100 rounds and
    is evaluated on both the training and the validation data.

    Returns ``(booster, validation_auc, training_auc)``.
    """
    d_train = lgb.Dataset(train_x, label=train_y)
    d_val = lgb.Dataset(val_x, label=val_y)
    print("begin to train ")
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords are
    # kept exactly as the original passed them; confirm they are still
    # accepted by the installed LightGBM version.
    booster = lgb.train(params,
                        d_train,
                        4000,
                        valid_sets=[d_train, d_val],
                        early_stopping_rounds=100,
                        verbose_eval=verbose_eval)
    scores = booster.best_score
    return booster, scores["valid_1"]["auc"], scores["training"]["auc"]


def get_Kbest_feature_lgb(train_x,
                          train_y,
                          val_x,
                          val_y,
                          method="gain",
                          span=(0, 1000, 1),
                          sorted_feature_list=None,
                          verbose=True):
    """Select the best top-K feature prefix by repeatedly retraining LightGBM.

    Features are ranked by importance, then models are retrained on
    progressively shorter prefixes of that ranking. The prefix whose model
    reaches the highest validation AUC is returned.

    Args:
        train_x, train_y: training features (indexable by a list of column
            names and exposing ``.shape`` / ``.columns``) and labels.
        val_x, val_y: validation features and labels.
        method: how to rank features when ``sorted_feature_list`` is None;
            one of ``"split"``, ``"gain"`` or ``"shap"``. Ignored otherwise.
        span: ``(start, end, step)`` for the number of features to try;
            ``end`` is clamped to the number of columns of ``train_x``.
        sorted_feature_list: optional pre-computed ranking; must support
            ``["features"]`` returning the feature names in ranked order.
        verbose: print the ranking and the best score.

    Returns:
        ``(best_features, eli_cols)`` where ``best_features`` is the ranked
        prefix with the best validation AUC and ``eli_cols`` lists features
        whose removal improved both the training and the validation AUC.

    Raises:
        ValueError: if a ranking has to be computed and ``method`` is not
            one of the supported names.

    Side effects:
        When the ranking is computed here it is written to
        ``../work/feature_importance_eli_cor.csv``.
    """
    assert span[1] > span[0], "span should be a 3-gram tuple, span[1] > span[0]"
    supported_methods = ("split", "gain", "shap")
    # Fail before any training happens; previously an unknown method was only
    # noticed after the full baseline model had been fitted.
    if sorted_feature_list is None and method not in supported_methods:
        raise ValueError("unsupported method {!r}; expected one of {}".format(
            method, list(supported_methods)))

    import gc
    import lightgbm as lgb

    span = list(span)
    span[1] = min((train_x.shape[1], span[1]))
    score_ls = []
    num_feature_ls = []
    eli_cols = []
    params = {
        # "max_bin": 512,
        "learning_rate": 0.01,
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "num_leaves": 31,
        "max_depth": -1,
        "verbose": 200,
        "subsample": 0.8,
        "colsample_bytree": 0.9,
        "subsample_freq": 1,
        "reg_alpha": 0,
        "min_child_weight": 25,
        "random_state": 2018,
        "reg_lambda": 1,
        "n_jobs": -1,
    }

    if sorted_feature_list is None:
        import pandas as pd

        # Baseline model on every column; its importances define the ranking.
        clf_lgb, pre_score_val, pre_score_train = _lgb_fit_auc(
            lgb, params, train_x, train_y, val_x, val_y, 200)
        score_ls.append(pre_score_val)
        # NOTE(review): the baseline uses all columns of train_x but is
        # recorded as span[1] + 1 features; the two differ when span[1] is
        # smaller than the column count. Confirm the intent.
        num_feature_ls.append(span[1] + 1)

        if method == "shap":
            import shap
            import numpy as np
            shap.initjs()
            explainer = shap.TreeExplainer(clf_lgb)
            # shap_sample = val_x.sample(frac=1.0)
            shap_sample = train_x.sample(frac=0.6)
            shap_values = explainer.shap_values(shap_sample)
            shap.summary_plot(shap_values, shap_sample, plot_type="bar")
            # NOTE(review): assumes shap_values is a single 2-D array
            # (samples x features); verify for the installed shap version.
            importance_values = np.mean(np.abs(shap_values), axis=0)
        else:
            # "gain" and "split" are LightGBM importance types themselves.
            importance_values = clf_lgb.feature_importance(
                importance_type=method)

        ranked = sorted(zip(train_x.columns, importance_values),
                        key=lambda x: x[1],
                        reverse=True)
        feature_importances = pd.DataFrame([list(f) for f in ranked],
                                           columns=["features", "importance"])
        feature_importances.to_csv("../work/feature_importance_eli_cor.csv",
                                   header=True,
                                   index=False)
        del clf_lgb
        gc.collect()
        if verbose:
            print(feature_importances)
            print("feature {} to {}, score {}".format(0, span[1], score_ls[0]))
        num_turn = max((0, int((span[1] - span[0]) / span[2])))
        # The baseline already covered the full set, so start one step below.
        feature_counts = [span[1] - span[2] * (i + 1) for i in range(num_turn)]
        verbose_eval = 200
    else:
        feature_importances = sorted_feature_list
        if verbose:
            print(feature_importances)
        num_turn = max((1, int((span[1] - span[0]) / span[2])))
        feature_counts = [span[1] - span[2] * i for i in range(num_turn)]
        # No baseline here: the first turn only seeds the previous scores.
        pre_score_val = None
        pre_score_train = None
        verbose_eval = 100

    feature_all = feature_importances["features"].unique().tolist()
    for i, num_feature in enumerate(feature_counts):
        if num_feature < 1:
            # A model cannot be trained on an empty feature set (this is
            # reached with the default span, whose start is 0).
            break
        print("the {}th turn ".format(i))
        temp_features = feature_all[0:num_feature]
        clf_temp, temp_score_val, temp_score_train = _lgb_fit_auc(
            lgb, params, train_x[temp_features], train_y,
            val_x[temp_features], val_y, verbose_eval)
        # Both scores improved after dropping the last `step` features, so
        # those features are reported as unhelpful.
        if (pre_score_val is not None and temp_score_val > pre_score_val
                and temp_score_train > pre_score_train):
            eli_cols.extend(feature_all[num_feature:num_feature + span[2]])
            print("features do not help:", eli_cols)
        pre_score_train = temp_score_train
        pre_score_val = temp_score_val
        score_ls.append(temp_score_val)
        num_feature_ls.append(num_feature)
        del clf_temp

    best_score = max(score_ls)
    best_num_feature = num_feature_ls[score_ls.index(best_score)]
    if verbose:
        print("best score {}, best number of feature span {} to {}".format(
            best_score, 0, best_num_feature))
    return feature_all[0:best_num_feature], eli_cols
Пример #24
0
    print(_+1, "Model Evaluation Result:", "\n", classification_report(y_test, cat_predict)) # 전체적인 성능 평가

# Majority-vote aggregation of the per-model bagging predictions, followed by
# a SHAP explanation of a freshly fitted CatBoost classifier.
# Relies on names defined earlier in the script: X_test, y_test,
# bagging_predict_result, X_train_res2, y_train_res2.
bagging_predict = [] # create an empty list
for lst2_index in range(X_test.shape[0]): # iterate once per test sample
    temp_predict = [] # temporary list for this iteration
    for lst_index in range(len(bagging_predict_result)): # iterate over the bagging result lists
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # collect each bagging run's prediction for the same index
    if np.mean(temp_predict) >= 0.5: # binary 0/1 classification: predict 1 when the mean prediction is at least 0.5 (majority vote)
        bagging_predict.append(1)
    elif np.mean(temp_predict) < 0.5: # store 0 when the mean prediction is below 0.5
        bagging_predict.append(0)
print("Confusion_Matrix: \n", confusion_matrix(y_test, bagging_predict)) # confusion matrix
print('\n')
print("Model Evaluation Result: \n", classification_report(y_test, bagging_predict)) # overall performance report


import shap
cat_model = CatBoostClassifier(n_estimators = 50, # 50 estimators
                           max_depth = 10, # maximum tree depth of 10
                           random_state = 42, # fixed random seed
                           verbose = True) # show training progress
cat_model.fit(X_train_res2, y_train_res2) # fit the model
explainer = shap.TreeExplainer(cat_model) # explainer object that computes SHAP values for the tree model
shap_values = explainer.shap_values(X_test) # compute the SHAP values

shap.initjs() # initialise the JavaScript used by the plots
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test[0,:]) # visualise the SHAP values of the first X_test instance
# Original note: red marks high influence, blue marks low influence.
# NOTE(review): in SHAP force plots the colour usually encodes direction
# (red pushes the prediction up, blue pushes it down) - confirm.

shap.summary_plot(shap_values, X_test)
shap.summary_plot(shap_values, X_test, plot_type = "bar") # gauge each variable's importance from the absolute SHAP values
# (label, colour, marker, extra keyword arguments used in the error plots)
# for the three regressors, in the order their predictions are passed around.
_MODEL_STYLES = (
    ('random forest', 'g', '*', {}),
    ('XGBoost', 'c', 's', {'markerfacecolor': 'none'}),
    ('SVM', 'y', 'o', {'markerfacecolor': 'none'}),
)


def _save_relative_error_plot(predictions, truth, stride, path):
    """Plot (prediction - truth) / truth for every model and save it to ``path``."""
    # A fresh figure per plot: drawing on the current figure would overlay
    # this plot on top of the previously saved ones.
    plt.figure(figsize=(10, 4))
    for (label, color, marker, extra), pred in zip(_MODEL_STYLES, predictions):
        plt.plot(((pred - truth) / truth)[::stride],
                 color=color,
                 marker=marker,
                 label=label,
                 **extra)
    plt.legend(loc='upper right')
    plt.savefig(path)
    plt.close()


def _save_prediction_plot(predictions, truth, stride, path):
    """Plot the true values and every model's predictions and save to ``path``."""
    plt.figure(figsize=(10, 4))
    plt.plot(truth[::stride], color='b', label='value')
    for (label, color, marker, _), pred in zip(_MODEL_STYLES, predictions):
        plt.plot(pred[::stride],
                 color=color,
                 marker=marker,
                 markerfacecolor='none',
                 label=label,
                 linestyle='None')
    plt.legend(loc='upper right')
    plt.savefig(path)
    plt.close()


def func():
    """Fit a random forest, XGBoost and SVR regressor and compare them.

    Reads the module-level ``data`` and ``label``, splits them 75/25,
    mean-imputes missing values, fits the three models, prints MAE, RMSE,
    correlation coefficient and R2 on the test split for each model (in the
    order random forest, XGBoost, SVM), saves comparison plots and finally
    draws a SHAP summary plot for the XGBoost model.

    Side effects:
        Writes ``junk.jpg`` (three times, so only the last plot - the test
        predictions - survives) and ``junk2.jpg`` (training predictions).
        NOTE(review): the repeated ``junk.jpg`` name is kept from the
        original; confirm whether distinct file names were intended.
    """
    # split data into train and test sets
    seed = 7
    test_size = .25

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        label,
                                                        test_size=test_size,
                                                        random_state=seed)
    original_col = X_train.columns
    # NOTE(review): `Imputer` comes from module scope; newer scikit-learn
    # releases provide `SimpleImputer` instead - confirm the pinned version.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)

    # Random Forest
    regr_rf = RandomForestRegressor(max_depth=30, random_state=2)
    regr_rf.fit(X_train, y_train)
    y_pred_train1 = regr_rf.predict(X_train)
    y_pred1 = regr_rf.predict(X_test)

    # XGBoost
    xgdmat = xgb.DMatrix(X_train, y_train)
    our_params = {
        'eta': .03,
        'seed': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'max_depth': 7,
        'min_child_weight': .5,
    }
    final_gb = xgb.train(our_params, xgdmat, num_boost_round=1500)
    testmat = xgb.DMatrix(X_test)
    trainmat = xgb.DMatrix(X_train)
    y_pred2 = final_gb.predict(testmat)
    y_pred_train2 = final_gb.predict(trainmat)

    # SVM
    clf = svm.SVR(kernel='rbf',
                  degree=3,
                  gamma='auto',
                  coef0=0.0,
                  tol=0.1,
                  C=1.0,
                  epsilon=0.1,
                  shrinking=True,
                  cache_size=200,
                  verbose=False,
                  max_iter=-1)
    clf.fit(X_train, y_train)
    y_pred_train3 = clf.predict(X_train)
    y_pred3 = clf.predict(X_test)

    test_predictions = (y_pred1, y_pred2, y_pred3)
    train_predictions = (y_pred_train1, y_pred_train2, y_pred_train3)

    ###### Evaluation ######
    # Same four metrics for each model: random forest, XGBoost, SVM.
    for y_pred in test_predictions:
        mae = mean_absolute_error(y_test.values, y_pred)
        print("MAE: %.5f" % mae)
        rmse = np.sqrt(mean_squared_error(y_test.values, y_pred))
        print("RMSE: %.5f" % rmse)
        R = np.corrcoef(y_test.values, y_pred)
        print("Correlation Coef: %.5f" % R[0, 1])
        r2 = r2_score(y_test.values, y_pred)
        print("r2 score: %.5f" % r2)

    ###### Visualization ######
    # plot predict error
    _save_relative_error_plot(test_predictions, y_test.values, 8, 'junk.jpg')
    # plot training error
    _save_relative_error_plot(train_predictions, y_train.values, 20,
                              'junk.jpg')
    # plot predictions on test split
    _save_prediction_plot(test_predictions, y_test.values, 3, 'junk.jpg')
    # plot predictions on training split
    _save_prediction_plot(train_predictions, y_train.values, 10, 'junk2.jpg')

    # shap the value for better visualization
    shap.initjs()
    # SHAP values are computed on the imputed array before it is wrapped
    # back into a DataFrame with (renamed) column labels for display.
    shap_values = shap.TreeExplainer(final_gb).shap_values(X_train)
    X_train = pd.DataFrame(data=X_train, columns=original_col)
    X_train = X_train.rename(
        columns={
            "X2": "X7",
            "X3": "X6",
            "X4": "X14",
            "X5": "X4",
            "X6": "X8",
            "X7": "X9",
            "X8": "X10",
            "X9": "X12",
            "X10": "X11",
            "X11": "X13",
            "X12": "X5",
            "X13": "X1",
            "X14": "X2",
            "X15": "X3"
        })
    shap.summary_plot(shap_values, X_train)
def multiclass_s_lightGBM(merge_data3, outnameimp, outname, dayname, taitol_1):
    """Train an 8-class LightGBM model and export its evaluation as CSV.

    The last 14% of ``merge_data3`` (in row order, no shuffling) is held out
    as the test set; the remaining rows are shuffled into an 80/20
    train/validation split used for training with early stopping.

    Args:
        merge_data3: DataFrame with a ``"target"`` column and feature columns.
        outnameimp: unused; kept for interface compatibility.
        outname: suffix for the output file names.
        dayname: sub-directory of ``./output/`` the CSV files are written to.
        taitol_1: prefix for output column names, also part of the file names.

    Side effects:
        Prints shapes and scores, draws a SHAP bar summary plot, calls
        ``display`` on the importance table and writes
        ``result_matrix…csv``, ``rezurt_1…csv`` and ``importance…csv`` under
        ``./output/<dayname>/`` (shift-jis encoded).
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import cohen_kappa_score
    from sklearn.metrics import confusion_matrix
    import lightgbm as lgb
    import shap

    def split_by_fraction(df, X, Y, train_fraction, **split_kwargs):
        """Split X/Y so that ``train_fraction`` of ``len(df)`` rows train."""
        n_train = int(len(df) * train_fraction)
        n_test = len(df) - n_train
        return train_test_split(X, Y, test_size=n_test, **split_kwargs)

    # Separate the target variable from the features.
    X = merge_data3.drop("target", axis=1).values
    y = merge_data3["target"].values
    columns_name = merge_data3.drop("target", axis=1).columns

    # Hold out the tail of the data as the test set, without shuffling.
    X_train, X_test, y_train, y_test = split_by_fraction(merge_data3,
                                                         X,
                                                         y,
                                                         0.86,
                                                         shuffle=False)

    # Rebuild a DataFrame from the training part (target first, then the
    # features) so it can be split again below.
    train = pd.concat(
        [pd.DataFrame(y_train),
         pd.DataFrame(X_train, columns=columns_name)],
        axis=1)
    print("train shape", train.shape)
    print("train", train)
    merge_data_p = train.rename(columns={0: "target"})

    # Shuffled train/validation split of the remaining data.
    X = merge_data_p.drop("target", axis=1).values
    y = merge_data_p["target"].values
    X_train, X_val, y_train, y_val = split_by_fraction(merge_data_p,
                                                       X,
                                                       y,
                                                       0.80,
                                                       random_state=42)

    # Shape check.
    print("train shape", X_train.shape)
    print("X_train", X_train)
    print("test shape", X_test.shape)
    print("validation shape", X_val.shape)
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)
    print("y_val", y_val)

    shap.initjs()
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val)

    # Model parameters.
    params = {
        'reg_lambda': 0.2,
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 8,
        'reg_alpha': 0.1,
        # NOTE(review): 'min_data_leaf' does not look like a LightGBM
        # parameter name or alias (the documented one is 'min_data_in_leaf');
        # kept unchanged so the trained model does not change - confirm.
        'min_data_leaf': 100,
        'learning_rate': 0.025,
        #     'feature_fraction': 0.8,
        #     'bagging_fraction': 0.8
    }

    # Train the model.
    model = lgb.train(params,
                      train_set,
                      valid_sets=valid_set,
                      num_boost_round=5000,
                      early_stopping_rounds=500)

    # Predict: pick the class with the highest predicted probability.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred = np.argmax(y_pred, axis=1)

    # -------------------------- model evaluation --------------------------
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap.summary_plot(shap_values, X_test, plot_type="bar")

    # Confusion matrix.
    result_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred))

    # Per-class score: diagonal count divided by the total of that column.
    class_accuracy = [
        result_matrix[i][i] / result_matrix[i].sum()
        for i in range(len(result_matrix))
    ]
    # Append the scores as a new column after the last class column. The
    # previous hard-coded index 7 overwrote the counts of class 7 whenever
    # all 8 classes were present; for a 7x7 matrix this is still column 7.
    result_matrix[len(result_matrix.columns)] = class_accuracy

    # Overall accuracy.
    accuracy = sum(y_test == y_pred) / len(y_test)
    print('accuracy:', accuracy)
    # Cohen's kappa.
    kappa = cohen_kappa_score(y_test, y_pred)
    print("kappa score:", kappa)

    output_dir = "./output/" + dayname + '/'
    result_matrix.to_csv(output_dir + "result_matrix" + taitol_1 + outname +
                         '.csv',
                         encoding='shift-jis')

    accuracy_score1 = accuracy_score(y_test, y_pred)
    rezurt_1 = pd.DataFrame({
        taitol_1 + 'accuracy_score': accuracy_score1,
        taitol_1 + "y_test": y_test,
        taitol_1 + "y_pred": y_pred
    })

    #rezurt_1[taitol_1+"y_train"] =y_train
    #rezurt_1[taitol_1+"y_val"] =y_val

    rezurt_1.to_csv(output_dir + "rezurt_1" + taitol_1 + outname + '.csv',
                    encoding='shift-jis')

    importance = pd.DataFrame(model.feature_importance(),
                              columns=[taitol_1 + 'importance'])
    display(importance)
    importance.to_csv(output_dir + "importance" + taitol_1 + outname + '.csv',
                      encoding='shift-jis')
Пример #27
0
def compute_predictor_importance():
    """Draw a SHAP bar summary plot for the first 100 rows of ``test_X``.

    Uses the module-level ``model`` (via ``predict_proba``) and ``test_X``;
    the same 100-row slice serves as the explainer's background data and as
    the data being explained.
    """
    shap.initjs()
    first_rows = test_X[0:100, :]
    kernel_explainer = shap.KernelExplainer(model.predict_proba, first_rows)
    contributions = kernel_explainer.shap_values(first_rows)
    shap.summary_plot(contributions, first_rows, plot_type="bar")
Пример #28
0
def train(train_data, test_data=None):
    G = train_data[0]  # G 是一个Networkx里的对象,这几个都是经过load_data()处理过的
    features = train_data[1]
    id_map = train_data[2]
    class_map = train_data[4]
    class_map2 = train_data[5]
    class_map3 = train_data[6]
    #class_map = class_map
    hierarchy = FLAGS.hierarchy

    degreelist = []
    countnode = 0
    sumedge = 0
    for key in G.edge:
        if len(G.edge[key]) > 1:
            countnode += 1
            sumedge = sumedge + len(G.edge[key])
            degreelist.append(key)
    avg_edge = sumedge/countnode

    if features is not None:
        # pad with dummy zero vector
        features = np.vstack([features, np.zeros((features.shape[1],))])
    features = tf.cast(features, tf.float32)
    for hi_num in range(hierarchy):
        if hi_num == 0:
            class_map = class_map
            if isinstance(list(class_map.values())[0], list):
                num_classes = len(list(class_map.values())[0])
            else:
                num_classes = len(set(class_map.values()))
            class_map_ko_0 = construct_class_numpy(class_map)
        elif hi_num == 1:
            class_map = class_map2
            if isinstance(list(class_map.values())[0], list):
                num_classes = len(list(class_map.values())[0])
            else:
                num_classes = len(set(class_map.values()))
            class_map_ko_1 = construct_class_numpy(class_map)

        elif hi_num == 2:
            class_map = class_map3
            if isinstance(list(class_map.values())[0], list):
                num_classes = len(list(class_map.values())[0])
            else:
                num_classes = len(set(class_map.values()))
            class_map_ko_2 = construct_class_numpy(class_map)


        class_map_ko = construct_class_numpy(class_map)

        OTU_ko_num = class_map_ko.sum(axis=1)
        ko_samle_num = class_map_ko.sum(axis=0)
        count = 0
        for num in OTU_ko_num:
            if num < 100:
                count += 1
        ko_cb = construct_class_para(class_map_ko, 0, FLAGS.beta1)
        ko_cb = tf.cast(ko_cb, tf.float32)
        f1_par = construct_class_para(class_map_ko, 1, FLAGS.beta2)


        context_pairs = train_data[3] if FLAGS.random_context else None
        placeholders = construct_placeholders(num_classes)
        minibatch = NodeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          class_map,
                                          num_classes,
                                          batch_size=FLAGS.batch_size,
                                          max_degree=FLAGS.max_degree,
                                          context_pairs=context_pairs)
        ctrain = 0
        cval =0
        ctest = 0
        for i in minibatch.train_nodes:
            if i in degreelist:
                ctrain += 1
        for i in minibatch.val_nodes:
            if i in degreelist:
                cval += 1
        for i in minibatch.test_nodes:
            if i in degreelist:
                ctest += 1
        #pdb.set_trace()
        with open('test_nodes.txt', 'w') as f:
            json.dump(minibatch.test_nodes, f)
    ###########
        list_node = minibatch.nodes
        for otu in minibatch.train_nodes:
            if otu in list_node:
                list_node.remove(otu)
        for otu in minibatch.val_nodes:
            if otu in list_node:
                list_node.remove(otu)
        for otu in minibatch.test_nodes:
            if otu in list_node:
                list_node.remove(otu)
    ###########
        if hi_num == 0:
            adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape)
        # 把adj_info设成Variable应该是因为在训练和测试时会改变adj_info的值,所以
        # 用Varible然后用tf.assign()赋值。
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")
        shap.initjs()
        if FLAGS.model == 'graphsage_mean':
            # Create model
            sampler = UniformNeighborSampler(adj_info)

            if FLAGS.samples_3 != 0:
                layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                               SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2),
                               SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)]


            elif FLAGS.samples_2 != 0:
                layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                               SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]


            else:
                layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)]


            model = SupervisedGraphsage(num_classes, placeholders,
                                        features,
                                        adj_info,
                                        minibatch.deg,  # 每一个的度
                                        layer_infos,
                                        ko_cb, hi_num,
                                        model_size=FLAGS.model_size,
                                        sigmoid_loss=FLAGS.sigmoid,
                                        identity_dim=FLAGS.identity_dim,
                                        logging=True,
                                        concat=False
                                        )

        elif FLAGS.model == 'gcn':
            # Create model
            sampler = UniformNeighborSampler(adj_info)
            layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, 2 * FLAGS.dim_1),
                           SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2)]

            model = SupervisedGraphsage(num_classes, placeholders,
                                        features,
                                        adj_info,
                                        minibatch.deg,
                                        layer_infos=layer_infos,
                                        aggregator_type="gcn",
                                         model_size=FLAGS.model_size,
                                        concat=False,
                                        sigmoid_loss=FLAGS.sigmoid,
                                        identity_dim=FLAGS.identity_dim,
                                        logging=True)

        elif FLAGS.model == 'graphsage_seq':
            sampler = UniformNeighborSampler(adj_info)
            layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                           SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]

            model = SupervisedGraphsage(num_classes, placeholders,
                                        features,
                                        adj_info,
                                        minibatch.deg,
                                        layer_infos=layer_infos,
                                        aggregator_type="seq",
                                        model_size=FLAGS.model_size,
                                        sigmoid_loss=FLAGS.sigmoid,
                                        identity_dim=FLAGS.identity_dim,
                                        logging=True,
                                        concat=True)

        elif FLAGS.model == 'graphsage_maxpool':
            sampler = UniformNeighborSampler(adj_info)
            layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                           SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]

            model = SupervisedGraphsage(num_classes, placeholders,
                                        features,
                                        adj_info,
                                        minibatch.deg,
                                        layer_infos=layer_infos,
                                        aggregator_type="maxpool",
                                        model_size=FLAGS.model_size,
                                        sigmoid_loss=FLAGS.sigmoid,
                                        identity_dim=FLAGS.identity_dim,
                                        logging=True,
                                        concat=True)

        elif FLAGS.model == 'graphsage_meanpool':
            sampler = UniformNeighborSampler(adj_info)
            layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                           SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]

            model = SupervisedGraphsage(num_classes, placeholders,
                                        features,
                                        adj_info,
                                        minibatch.deg,
                                        layer_infos=layer_infos,
                                        aggregator_type="meanpool",
                                        model_size=FLAGS.model_size,
                                        sigmoid_loss=FLAGS.sigmoid,
                                        identity_dim=FLAGS.identity_dim,
                                        logging=True,
                                        concat=True)
        elif FLAGS.model == 'gat':
            sampler = UniformNeighborSampler(adj_info)
            # 建立两层网络 采样邻居、邻居个数、输出维度
            layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                           SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]

            model = SupervisedGraphsage(num_classes, placeholders,
                                        features,
                                        adj_info,
                                        minibatch.deg,
                                        concat=True,
                                        layer_infos=layer_infos,
                                        aggregator_type="gat",
                                        model_size=FLAGS.model_size,
                                        sigmoid_loss=FLAGS.sigmoid,
                                        identity_dim=FLAGS.identity_dim,
                                        logging=True,
                                        )
        else:
            raise Exception('Error: model name unrecognized.')

        config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.Session(config=config)
        # sess = tf_dbg.LocalCLIDebugWrapperSession(sess)
        #merged = tf.summary.merge_all()  # 将所有东西保存到磁盘,可视化会用到
        #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)  # 记录信息,可视化,可以用tensorboard查看

        # Init variables
        sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj})
        #sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph2: minibatch2.adj})

        # Train model
        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []
        epoch_val_costs2 = []
        # minibatch.adj and minibatch.test_adj have the same size; in adj, every entry that is not a train value is set to the same value
        # "val" here stands for validation
        train_adj_info = tf.assign(adj_info, minibatch.adj)  # tf.assign()是为一个tf.Variable赋值,返回值是一个Variable,是赋值后的值
        val_adj_info = tf.assign(adj_info, minibatch.test_adj)  # assign()是一个Opration,要用sess.run()才能执行
        it = 0
        train_loss = []
        val_loss = []
        train_f1_mics = []
        val_f1_mics = []
        loss_plt = []
        loss_plt2 = []
        trainf1mi = []
        trainf1ma = []
        valf1mi = []
        valf1ma = []
        iter_num = 0

        for epoch in range(FLAGS.epochs*2):
            if epoch < FLAGS.epochs:
                minibatch.shuffle()
                iter = 0
                print('Epoch: %04d' % (epoch + 1))
                epoch_val_costs.append(0)
                while not minibatch.end():
                    # Construct feed dictionary
                    # The nodes of each minibatch are changed by updating feed_dict
                    feed_dict, labels = minibatch.next_minibatch_feed_dict()  # feed_dict是mibatch修改过的placeholder
                    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                    t = time.time()
                    # Training step
                    outs = sess.run([model.opt_op, model.loss, model.preds], feed_dict=feed_dict)
                    train_cost = outs[1]
                    iter_num = iter_num + 1
                    loss_plt.append(float(train_cost))
                    if iter % FLAGS.print_every == 0:
                        # Validation 验证集
                        sess.run(val_adj_info.op)  # sess.run()  fetch参数是一个Opration,代表执行这个操作。
                        if FLAGS.validate_batch_size == -1:
                            val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, _, val_preds, __, val_accuracy, val_mi_roc_auc = incremental_evaluate(sess, model, minibatch, f1_par,
                                                                                                        FLAGS.batch_size)
                        else:
                            val_cost, val_f1_mic, val_f1_mac, duration, val_accuracy, val_mi_roc_auc = evaluate(sess, model, minibatch, f1_par,
                                                                                  FLAGS.validate_batch_size)
                        sess.run(train_adj_info.op)  # 每一个tensor都有op属性,代表产生这个张量的opration。
                        epoch_val_costs[-1] += val_cost

                    #if iter % FLAGS.print_every == 0:
                        #summary_writer.add_summary(outs[0], total_steps)

                    # Print results
                    avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1)
                    loss_plt2.append(float(val_cost))
                    valf1mi.append(float(val_f1_mic))
                    valf1ma.append(float(val_f1_mac))

                    if iter % FLAGS.print_every == 0:
                        train_f1_mic, train_f1_mac, train_f1_none, train_accuracy, train_mi_roc_auc = calc_f1(labels, outs[-1], f1_par)
                        trainf1mi.append(float(train_f1_mic))
                        trainf1ma.append(float(train_f1_mac))

                        print("Iter:", '%04d' % iter,
                              # Loss and other metrics on the training set
                              "train_loss=", "{:.5f}".format(train_cost),
                              "train_f1_mic=", "{:.5f}".format(train_f1_mic),
                              "train_f1_mac=", "{:.5f}".format(train_f1_mac),
                              "train_accuracy=", "{:.5f}".format(train_accuracy),
                              "train_ra_mi=", "{:.5f}".format(train_mi_roc_auc),

                              # Loss and other metrics on the validation set
                              "val_loss=", "{:.5f}".format(val_cost),
                              "val_f1_mic=", "{:.5f}".format(val_f1_mic),
                              "val_f1_mac=", "{:.5f}".format(val_f1_mac),
                              "val_accuracy=", "{:.5f}".format(val_accuracy),
                              "val_ra_mi=", "{:.5f}".format(val_mi_roc_auc),

                              "time=", "{:.5f}".format(avg_time))
                        train_loss.append(train_cost)
                        val_loss.append(val_cost)
                        train_f1_mics.append(train_f1_mic)
                        val_f1_mics.append(val_f1_mic)

                    iter += 1
                    total_steps += 1

                    if total_steps > FLAGS.max_total_steps:
                        break

                if total_steps > FLAGS.max_total_steps:
                    break
    ###################################################################################################################
            # begin second degree training
    ###################################################################################################################


        print("Optimization Finished!")
        sess.run(val_adj_info.op)
        if hi_num == 1:
            last_preds = test_preds
            last_labels = test_labels
        val_cost, val_f1_mic, val_f1_mac, duration, otu_f1, ko_none, test_preds, test_labels, test_accuracy, test_mi_roc_auc = incremental_evaluate(sess, model, minibatch, f1_par, FLAGS.batch_size, test=True)
        print("Full validation stats:",
              "loss=", "{:.5f}".format(val_cost),
              "f1_micro=", "{:.5f}".format(val_f1_mic),
              "f1_macro=", "{:.5f}".format(val_f1_mac),
              "accuracy=", "{:.5f}".format(test_accuracy),
              "roc_auc_mi=", "{:.5f}".format(test_mi_roc_auc),

              "time=", "{:.5f}".format(duration),)



        pred = y_ture_pre(sess, model, minibatch, FLAGS.batch_size)
        for i in range(pred.shape[0]):
            sum = 0
            for l in range(pred.shape[1]):
                sum = sum + pred[i, l]
            for m in range(pred.shape[1]):
                pred[i, m] = pred[i, m]/sum
        id = json.load(open(FLAGS.train_prefix + "-id_map.json"))
        # x_train = np.empty([pred.shape[0], array.s)
        num = 0
        session = tf.Session()
        array = session.run(features)
        x_test = np.empty([pred.shape[0], array.shape[1]])
        x_train = np.empty([len(minibatch.train_nodes), array.shape[1]])
        for node in minibatch.val_nodes:
            x_test[num] = array[id[node]]
            num = num + 1
        num1 = 0
        for node in minibatch.train_nodes:
            x_train[num1] = array[id[node]]
            num1 = num1 + 1

        with open(log_dir() + "val_stats.txt", "w") as fp:
            fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".
                     format(val_cost, val_f1_mic, val_f1_mac, duration))

        print("Writing test set stats to file (don't peak!)")
        val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, ko_none, _, __, test_accuracy, test_mi_roc_auc = incremental_evaluate(sess, model, minibatch, f1_par, FLAGS.batch_size,
                                                                                    test=True)
        with open(log_dir() + "test_stats.txt", "w") as fp:
            fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f}".
                     format(val_cost, val_f1_mic, val_f1_mac))

        incremental_evaluate_for_each(sess, model, minibatch, FLAGS.batch_size,
                                      test=True)


##################################################################################################################
    # plot loss
    plt.figure()
    plt.plot(loss_plt, label='train_loss')
    plt.plot(loss_plt2, label='val_loss')
    plt.legend(loc=0)
    plt.xlabel('Iteration')
    plt.ylabel('loss')
    plt.title('Loss plot')
    plt.grid(True)
    plt.axis('tight')
    #plt.savefig("./graph/HMC_SAGE_CB_loss.png")
    # plt.show()

    # plot f1 score
    plt.figure()
    plt.subplot(211)
    plt.plot(trainf1mi, label='train_f1_micro')
    plt.plot(valf1mi, label='val_f1_micro')
    plt.legend(loc=0)
    plt.xlabel('Iterations')
    plt.ylabel('f1_micro')
    plt.title('train_val_f1_score')
    plt.grid(True)
    plt.axis('tight')

    plt.subplot(212)
    plt.plot(trainf1ma, label='train_f1_macro')
    plt.plot(valf1ma, label='val_f1_macro')
    plt.legend(loc=0)
    plt.xlabel('Iteration')
    plt.ylabel('f1_macro')
    plt.grid(True)
    plt.axis('tight')
   # plt.savefig("./graph/HMC_SAGE_CB_f1.png")
    # plt.show()

    plt.figure()
    plt.plot(np.arange(len(train_loss)) + 1, train_loss, label='train')
    plt.plot(np.arange(len(val_loss)) + 1, val_loss, label='val')
    plt.legend()
    plt.savefig('loss.png')
    plt.figure()
    plt.plot(np.arange(len(train_f1_mics)) + 1, train_f1_mics, label='train')
    plt.plot(np.arange(len(val_f1_mics)) + 1, val_f1_mics, label='val')
    plt.legend()
    #plt.savefig('f1.png')

    # OTU f1
    plt.figure()
    plt.plot(otu_f1, label='otu_f1')
    plt.legend(loc=0)
    plt.xlabel('OTU')
    plt.ylabel('f1_score')
    plt.title('OTU f1 plot')
    plt.grid(True)
    plt.axis('tight')
    #plt.savefig("./graph/below_1500_CECB15_otu_f1.png")
    # plt.show()

    # Ko f1 score
    plt.figure()
    plt.plot(ko_none, label='Ko f1 score')
    plt.legend(loc=0)
    plt.xlabel('Ko')
    plt.ylabel('f1_score')
    plt.grid(True)
    plt.axis('tight')
    #plt.savefig("./graph/below1500_CECB15_ko_f1.png")
    bad_ko = []
    b02 = 0
    b05 = 0
    b07 = 0
    for i in range(len(ko_none)):
        if ko_none[i] < 0.2:
            bad_ko.append(i)
            b02 += 1
        elif ko_none[i] < 0.5:
            b05 += 1
        elif ko_none[i] < 0.7:
            b07 += 1
    print("ko f1 below 0.2:", b02)
    print("ko f1 below 0.5:", b05)
    print("ko f1 below 0.7:", b07)
    print("ko f1 over 0.7:", num_classes-b02-b05-b07)
    bad_ko = np.array(bad_ko)
    with open('./new_data_badko/graph7 ko below zero point two .txt', 'w') as f:
        np.savetxt(f, bad_ko, fmt='%d', delimiter=",")

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sample_performance")
    for row in range(num_classes):
        sheet.write(row, 0, str(ko_samle_num[row]))
        sheet.write(row, 1, str(train_f1_none[row]))
    workbook.save('./graph/sample_performance11.xls')
def heart_disease_risk_factors(model, patient):
    """Build a SHAP force plot for ``patient`` and pass it to ``plot``.

    A ``shap.TreeExplainer`` is created for ``model`` and asked for the SHAP
    values of ``patient``. Entry ``[1]`` of both the explainer's expected
    value and the SHAP values is fed to ``shap.force_plot`` (presumably the
    positive class of a binary classifier -- confirm against the model).

    Returns whatever ``plot`` returns for the resulting force plot; ``plot``
    is defined outside this function.
    """
    tree_explainer = shap.TreeExplainer(model)
    patient_contributions = tree_explainer.shap_values(patient)

    # Load SHAP's JavaScript support before the force plot is created.
    shap.initjs()

    force = shap.force_plot(
        tree_explainer.expected_value[1],
        patient_contributions[1],
        patient,
    )
    return plot(force)
Пример #30
0
import numpy as np
import shap
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split

# Example script: fit a CatBoost regressor and explain it with SHAP force plots.

# Print NumPy arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)

# Load the Boston dataset shipped with shap and keep 80% of the rows for
# training; the remainder is used as the evaluation set during fitting.
# NOTE(review): shap.datasets.boston() is reported to be deprecated/removed in
# newer shap releases -- confirm it exists in the installed version.
X, y = shap.datasets.boston()
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.8, random_state=12)

# 700 boosting iterations with a small learning rate, RMSE as evaluation
# metric, a fixed seed and CatBoost's own logging silenced.
model = CatBoostRegressor(iterations=700,
                          learning_rate=0.001,
                          eval_metric='RMSE',
                          random_seed=12,
                          silent=True)

# plot=True asks CatBoost to draw its training plot
# (presumably requires a notebook environment -- confirm).
model.fit(X_train, y_train, eval_set=(X_validation, y_validation), plot=True)

# Load SHAP's JavaScript support used by the interactive force plots.
shap.initjs()

# SHAP values are computed for the full dataset X, not only the training split.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# Force plot for the first row only, rendered through matplotlib.
shap.force_plot(explainer.expected_value,
                shap_values[0, :],
                X.iloc[0, :],
                matplotlib=True)
# Force plot over all rows. The returned object is not stored.
# NOTE(review): outside a notebook this value is simply discarded -- confirm
# that is intended.
shap.force_plot(explainer.expected_value, shap_values, X)
Пример #31
0
 def Proof_of_concept(self,clf,scaler,explainer,imputer,imputer_raw,
                      X,ids_events,ts,t,plot=True):
     """Make dynamic plots for local feature importance / predictions.

     For every distinct value in the last column of ``X`` (presumably the
     patient identifier -- confirm with the caller) the method prints
     progress information, computes predictions and SHAP values, saves SHAP
     force plots for the largest jumps between consecutive predictions,
     computes a NEWS score per row and passes the results to ``subplot``.

     NOTE(review): the body reads ``label``, ``t_event``, ``features_tot``,
     ``X_raw`` and ``X_raw_imputed``, none of which is assigned inside this
     method. Unless the enclosing module provides them the method raises
     ``NameError``; the code deriving them per patient appears to be
     missing. ``features_tot`` may be meant to be ``total_features`` --
     confirm before relying on this method.

     Parameters
     ----------
     clf : model passed to ``predict`` together with ``X``.
     explainer : SHAP explainer; ``expected_value[1]`` and
         ``shap_values(...)[1]`` are used (presumably the positive class of a
         binary classifier -- confirm).
     X : 2-D array whose last column is the index the patients are split on.
     ts, t : forwarded unchanged to ``subplot``.
     plot : when true, force plots are saved under ``results/`` and
         ``subplot`` is called.
     scaler, imputer, imputer_raw, ids_events : accepted for interface
         compatibility; not used by the visible body.

     Returns
     -------
     ``X`` stacked along axis 0 once per processed patient (the visible code
     never narrows ``X`` to a single patient before stacking). An empty slice
     of ``X`` is returned when there is no patient to process.
     """
     import shap
     import matplotlib.pyplot as plt
     shap.initjs()

     # All available indexes: the distinct values of the last column of X.
     # (The original bound this to `idx` but iterated over `idxs`.)
     idxs = np.unique(X[:, -1])

     # Bound up front so the final return cannot hit an unbound local when
     # there is nothing to iterate over.
     X_overall = X[:0]

     for i, patient_id in enumerate(idxs):
         print('\n Patient',i)

         # Rows of the current patient, without the trailing index column.
         patient = X[X[:, -1] == patient_id][:, :-1]
         print(patient.shape)

         total_features = make_total_features(self.features,self.specs)
         print(total_features.shape)

         patient = pd.DataFrame(patient,columns=total_features)

         # NOTE(review): `label` and `t_event` are not assigned in this method.
         if label == 'pos':
             print('to ICU:',t_event)
         else:
             print('discharge:',t_event)

         # Calculate model risks and plot SHAP force plots.
         predictions = predict(clf, X)

         if len(predictions) > 2:
             # Absolute change between consecutive predictions.
             diff = np.asarray([np.abs(predictions[p+1]-predictions[p])
                                for p in range(len(predictions)-1)])
             n = len(predictions)
             if label == 'pos':
                 # diff has n-1 entries, so this keeps every step.
                 diff_idx = diff.argsort()[-(n-1):]
             else:
                 # Only the three largest changes.
                 diff_idx = diff.argsort()[-3:]

             # Feature names with their unit appended, used as plot labels.
             feature_inc_units = np.asarray(
                 [feature+' '+self.dict_unit[feature] for feature in features_tot])

             if plot:
                 # A dedicated loop name keeps the patient index list intact
                 # (the original reused `idx` here).
                 for count, step in enumerate(diff_idx, start=1):
                     # step+1 is the row reached after the jump in prediction.
                     shap.force_plot(
                         explainer.expected_value[1],
                         explainer.shap_values(X[step+1,:])[1],
                         features=np.round(X_raw.iloc[step+1,:],2),
                         feature_names=feature_inc_units,
                         text_rotation=30,
                         matplotlib=True,show=False,
                         )

                     plt.savefig('results/POC_plot_FORCE_'+ str(i) + '_' + str(count) + '.png',bbox_inches='tight',dpi=300)

         # Calculate feature impacts: one row of SHAP values per row of X.
         feature_impacts = [explainer.shap_values(X[j,:])[1]
                            for j in range(X.shape[0])]
         feature_impacts = np.array([np.array(x) for x in feature_impacts])
         feature_impacts = pd.DataFrame(feature_impacts)
         feature_impacts.columns = features_tot

         # Calculate NEWS score for every row of the imputed raw data.
         news = [NEWS(X_raw_imputed.loc[v,'SpO2'],
                      X_raw_imputed.loc[v,'HR'],
                      X_raw_imputed.loc[v,'BP'],
                      X_raw_imputed.loc[v,'RR'],
                      X_raw_imputed.loc[v,'Temp'])
                 for v in range(X_raw_imputed.shape[0])]

         # 'Global' SHAPs for specific patient: rank features by mean |SHAP|.
         shap_values = explainer.shap_values(X)
         shap_mean = np.mean(np.abs(shap_values[1]),axis=0)
         sorted_idx = list(shap_mean.argsort())
         # Feature positions 0 and 1 are excluded from the ranking.
         # NOTE(review): the reason is not visible here -- confirm.
         sorted_idx.remove(0)
         sorted_idx.remove(1)
         sorted_idx = np.asarray(sorted_idx)

         # argsort is ascending, so the last eight are the highest ranked.
         features_to_plot = features_tot[sorted_idx][-8:]

         if plot:
             # The result is deliberately not bound to `plt`: rebinding the
             # pyplot alias would break plt.savefig for the next patient.
             subplot(X_raw,ts,predictions,news,features_to_plot,i,t_event,feature_impacts,label,t,self.dict_unit,self.specs)

         if i == 0:
             X_overall = X
         else:
             X_overall = np.concatenate([X_overall,X],axis=0)

     return X_overall