# Required interpret imports (module level)
from interpret import show
from interpret.data import Marginal
from interpret.glassbox import (
    ExplainableBoostingRegressor,
    LinearRegression,
    RegressionTree,
)
from interpret.perf import RegressionPerf


def ga2m(self):
    # Explore the data
    marginal = Marginal().explain_data(self.x, self.y, name="Raw Data")

    # Train the Explainable Boosting Machine (EBM) and baseline models
    lr = LinearRegression()
    lr.fit(self.x, self.y)

    rt = RegressionTree()
    rt.fit(self.x, self.y)

    ebm = ExplainableBoostingRegressor()  # For classification, use ExplainableBoostingClassifier()
    ebm.fit(self.x, self.y)

    # How does the EBM model perform?
    ebm_perf = RegressionPerf(ebm.predict).explain_perf(self.x, self.y, name="EBM")
    lr_perf = RegressionPerf(lr.predict).explain_perf(self.x, self.y, name="Linear Regression")
    rt_perf = RegressionPerf(rt.predict).explain_perf(self.x, self.y, name="Regression Tree")

    # Global interpretability - what the model says for all the data
    ebm_global = ebm.explain_global(name="EBM")
    lr_global = lr.explain_global(name="Linear Regression")
    rt_global = rt.explain_global(name="Regression Tree")

    # Put everything into a single dashboard
    show([marginal, lr_global, lr_perf, rt_global, rt_perf, ebm_perf, ebm_global])
def blackbox_show_performance(self, method, predictions="default", show=True):
    """
    Plots an interpretable display of your model based on a performance metric.

    The metric can be 'ROC' or 'PR' (precision-recall) for classification
    problems, or 'regperf' for regression problems.

    Parameters
    ----------
    method : str
        Performance metric: 'roc', 'PR', or 'regperf'
    predictions : str, optional
        Prediction type, can either be 'default' (.predict) or 'probability'
        if the model can predict probabilities, by default 'default'
    show : bool, optional
        False to not display the plot, by default True

    Returns
    -------
    Interpret
        Interpretable dashboard of your model
    """
    if predictions == "probability":
        predict_fn = self.model.predict_proba
    else:
        predict_fn = self.model.predict

    if self.problem in INTERPRET_EXPLAINERS["problem"]:
        if method.lower() in INTERPRET_EXPLAINERS["problem"][self.problem]:
            blackbox_perf = INTERPRET_EXPLAINERS["problem"][self.problem][
                method.lower()
            ](predict_fn).explain_perf(self.x_test, self.y_test, name=method.upper())
        else:
            raise ValueError(
                "Supported blackbox explainers are only {} for classification problems and {} for regression problems".format(
                    ",".join(INTERPRET_EXPLAINERS["problem"]["classification"].keys()),
                    ",".join(INTERPRET_EXPLAINERS["problem"]["regression"].keys()),
                )
            )
    else:
        # Guard: without this, `blackbox_perf` below would be unbound for an
        # unregistered problem type and raise a confusing NameError.
        raise ValueError("Unsupported problem type: {}".format(self.problem))

    if show:
        interpret.show(blackbox_perf)

    self.trained_blackbox_explainers[method.lower()] = blackbox_perf

    return blackbox_perf
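# --- Usage sketch (illustrative, not from the original module) ---
# A self-contained example of the perf explainer this method dispatches to,
# using interpret's `ROC` from `interpret.perf` (as used elsewhere in this
# document). The dataset and model below are illustrative assumptions.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from interpret import show
from interpret.perf import ROC

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)

# ROC takes the probability function, mirroring predictions="probability" above
roc_perf = ROC(clf.predict_proba).explain_perf(X_te, y_te, name="ROC")
show(roc_perf)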
def test_interpret_dashboard(self, mimic_explainer):
    # Validate our explanation works with the interpret dashboard
    x_train, x_test, y_train, y_test, feature_names, target_names = create_cancer_data()

    # Fit an SVM model
    model = create_sklearn_svm_classifier(x_train, y_train)

    explainer = mimic_explainer(
        model,
        x_train,
        LGBMExplainableModel,
        features=feature_names,
        classes=target_names,
    )
    explanation = explainer.explain_global(x_test)
    show(explanation)
def blackbox_global_explanation(self,
                                method="morris",
                                predictions="default",
                                show=True,
                                **kwargs):
    """
    Provides an interpretable summary of your model's behaviour based on an explainer.

    Can either be 'morris' for Morris sensitivity analysis or 'dependence' for
    partial dependence.

    Parameters
    ----------
    method : str, optional
        Explainer type, can either be 'morris' or 'dependence', by default 'morris'
    predictions : str, optional
        Prediction type, can either be 'default' (.predict) or 'probability'
        if the model can predict probabilities, by default 'default'
    show : bool, optional
        False to not display the plot, by default True

    Returns
    -------
    Interpret
        Interpretable dashboard of your model
    """
    if predictions == "probability":
        predict_fn = self.model.predict_proba
    else:
        predict_fn = self.model.predict

    if method.lower() in INTERPRET_EXPLAINERS["global"]:
        sensitivity = INTERPRET_EXPLAINERS["global"][method.lower()](
            predict_fn=predict_fn, data=self.x_train, **kwargs
        )
    else:
        raise ValueError(
            'Supported blackbox global explainers are only "morris" and "dependence" (partial dependence).'
        )

    sensitivity_global = sensitivity.explain_global(name=method.upper())

    self.trained_blackbox_explainers[method.lower()] = sensitivity_global

    if show:
        interpret.show(sensitivity_global)

    return sensitivity_global
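# --- Standalone sketch (illustrative) of the two global explainers this method
# dispatches to, assuming the predict_fn/data constructor style the method body
# above uses (older interpret API). Model and data are illustrative assumptions.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from interpret import show
from interpret.blackbox import MorrisSensitivity, PartialDependence

X, y = load_breast_cancer(return_X_y=True)
clf = RandomForestClassifier(random_state=0).fit(X, y)

# Morris sensitivity analysis over the training data
msa = MorrisSensitivity(predict_fn=clf.predict_proba, data=X)
show(msa.explain_global(name="MORRIS"))

# Partial dependence of the prediction on each feature
pdp = PartialDependence(predict_fn=clf.predict_proba, data=X)
show(pdp.explain_global(name="DEPENDENCE"))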
def create_dashboard(self):  # pragma: no cover
    """
    Displays an interpretable dashboard of already created interpretable plots.

    If a plot hasn't been created yet, it is generated with default parameters
    for the dashboard.
    """
    dashboard_plots = []

    for explainer_type in INTERPRET_EXPLAINERS:
        if explainer_type == "problem":
            temp_explainer_type = INTERPRET_EXPLAINERS[explainer_type][self.problem]
        else:
            temp_explainer_type = INTERPRET_EXPLAINERS[explainer_type]

        for explainer in temp_explainer_type:
            if explainer in self.trained_blackbox_explainers:
                dashboard_plots.append(self.trained_blackbox_explainers[explainer])
            else:
                if explainer_type == "problem":
                    dashboard_plots.append(
                        self.blackbox_show_performance(explainer, show=False)
                    )
                elif explainer_type == "local":
                    dashboard_plots.append(
                        self.blackbox_local_explanation(method=explainer, show=False)
                    )
                else:
                    dashboard_plots.append(
                        self.blackbox_global_explanation(method=explainer, show=False)
                    )

    interpret.show(dashboard_plots)
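# --- Usage sketch (illustrative; `model` is a hypothetical instance of the
# class above, with x_test/y_test set). A single call renders every registered
# explainer in one dashboard, computing any missing plots with defaults:
#
#     model.create_dashboard()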
X_train, X_validate, y_train, y_validate = train_test_split(
    train_data.drop('Survived', axis=1), train_data['Survived'], test_size=0.25)

ebm = ExplainableBoostingClassifier()
lrm = LogisticRegression()

ebm.fit(X_train, y_train)

# Logistic regression needs 'Sex' encoded numerically; work on a copy so the
# frame already used to fit the EBM is not mutated
le = LabelEncoder()
X_train_lr = X_train.copy()
X_train_lr['Sex'] = le.fit_transform(X_train['Sex'])
lrm.fit(X_train_lr, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)

ebm_local = ebm.explain_local(X_validate, y_validate)
show(ebm_local)

lrm_global = lrm.explain_global()
show(lrm_global)

# Reuse the fitted encoder on the validation split (transform, not fit_transform)
X_validate_lr = X_validate.copy()
X_validate_lr['Sex'] = le.transform(X_validate['Sex'])

lrm_local = lrm.explain_local(X_validate_lr, y_validate)
show(lrm_local)

## Age binning
ages = pd.DataFrame({'ages': [10, 20, 24, 25, 29, 41, 45, 55, 56]})
ages['ages2'] = pd.cut(ages.ages, bins=[0, 20, 40, 60], include_lowest=True)
ages
# # **Q7**. Report (global) feature importances for EBM as a table or figure.
# What are the most important three features in EBM? Are they the same as in
# the linear model?
#
# $w_1 X + w_2 Y + w_3 (XY) = Z$

# %%
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

train_features, train_labels, dev_features, dev_labels, test_features, test_labels = prepare_load_classification_data()

# Train the EBM
ebm = ExplainableBoostingClassifier(n_jobs=-1)
ebm.fit(train_features, train_labels)

# %%
# Global Explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# %%
# Local Explanation
ebm_local = ebm.explain_local(dev_features[:5], dev_labels[:5], name='EBM')
show(ebm_local)

# %%
# Performance
from interpret.perf import ROC

ebm_perf = ROC(ebm.predict_proba).explain_perf(dev_features, dev_labels, name='EBM')
show(ebm_perf)

# %% [markdown]
# ### Training and Explaining Neural Networks
# Train two Neural Networks (a sketch follows below):
# 1. One-layer MLP (ReLU activation function + 50 hidden neurons)
# 2. Two-layer MLP (ReLU activation function + (20, 20) hidden neurons)
#
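# %%
# A minimal sketch of the two MLPs described above. scikit-learn's
# MLPClassifier is an assumed implementation choice; the original notebook's
# library is not shown in this excerpt.
from sklearn.neural_network import MLPClassifier

mlp_one = MLPClassifier(hidden_layer_sizes=(50,), activation='relu', max_iter=500, random_state=0)
mlp_two = MLPClassifier(hidden_layer_sizes=(20, 20), activation='relu', max_iter=500, random_state=0)

mlp_one.fit(train_features, train_labels)
mlp_two.fit(train_features, train_labels)

print("One-layer MLP dev accuracy:", mlp_one.score(dev_features, dev_labels))
print("Two-layer MLP dev accuracy:", mlp_two.score(dev_features, dev_labels))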
# Inspect the fitted linear model (bare expressions display as notebook cell output)
model = model.fit(X=X_train, y=y_train)
model.predict(X_train).mean()
model.coef_
X_train.columns
model.intercept_
model.get_params()

# %% Explainable Boosting Machine
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression
from interpret import show

ebm = ExplainableBoostingClassifier()
ebm.fit(X=X_train, y=y_train)
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# %%
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)
log_global = log_model.explain_global(name='LogReg')
show(log_global)
show([ebm_global, log_global], share_tables=True)

# %%
from interpret.data import ClassHistogram

hist = ClassHistogram().explain_data(X_train, y_train, name='Train Data')
show(hist)
from interpret.blackbox import LimeTabular
from interpret import show

# %% Load and preprocess data
data_loader = DataLoader()
data_loader.load_dataset()
data_loader.preprocess_data()

# Split the data for evaluation
X_train, X_test, y_train, y_test = data_loader.get_data_split()

# Oversample the train data
X_train, y_train = data_loader.oversample(X_train, y_train)
print(X_train.shape)
print(X_test.shape)

# %% Fit blackbox model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Apply LIME
# Initialize LIME for tabular data
lime = LimeTabular(predict_fn=rf.predict_proba, data=X_train, random_state=1)
# Get local explanations for the last 20 test samples
lime_local = lime.explain_local(X_test[-20:], y_test[-20:], name='LIME')
show(lime_local)

# %%
def blackbox_local_explanation(
    self,
    num_samples=0.5,
    sample_no=None,
    method="lime",
    predictions="default",
    show=True,
    **kwargs,
):
    """
    Plots an interpretable display that explains individual predictions of your model.

    Supported explainers are either 'lime' or 'shap'.

    Parameters
    ----------
    num_samples : int, float, or 'all', optional
        Number of samples to display; if less than 1 it is treated as a
        percentage, 'all' will include all samples, by default 0.5
    sample_no : int, optional
        Sample number to isolate and analyze; if provided it overrides
        num_samples, by default None
    method : str, optional
        Explainer type, can either be 'lime' or 'shap', by default 'lime'
    predictions : str, optional
        Prediction type, can either be 'default' (.predict) or 'probability'
        if the model can predict probabilities, by default 'default'
    show : bool, optional
        False to not display the plot, by default True

    Returns
    -------
    Interpret
        Interpretable dashboard of your model
    """
    if predictions == "probability":
        predict_fn = self.model.predict_proba
    else:
        predict_fn = self.model.predict

    # Determine the background data each explainer needs
    if method.lower() in INTERPRET_EXPLAINERS["local"]:
        if method.lower() == "lime":
            data = self.x_train
        elif method.lower() == "shap":
            # SHAP uses the feature-wise median as the background reference
            data = np.median(self.x_train, axis=0).reshape(1, -1)
        else:
            raise ValueError  # defensive: unreachable for the registered explainers

        explainer = INTERPRET_EXPLAINERS["local"][method.lower()](
            predict_fn=predict_fn, data=data, **kwargs
        )
    else:
        raise ValueError(
            'Supported blackbox local explainers are only "lime" and "shap".'
        )

    if sample_no is not None:
        if sample_no < 1 or not isinstance(sample_no, int):
            raise ValueError("Sample number must be a positive integer (1-based).")
        samples = slice(sample_no - 1, sample_no)
    else:
        if num_samples == "all":
            samples = slice(0, len(self.x_test))
        elif num_samples <= 0:
            raise ValueError(
                "Number of samples must be greater than 0. If it is less than 1, it will be treated as a percentage."
            )
        elif num_samples < 1:
            # Fractional values select that percentage of the test set
            samples = slice(0, int(num_samples * len(self.x_test)))
        else:
            samples = slice(0, num_samples)

    explainer_local = explainer.explain_local(
        self.x_test[samples], self.y_test[samples], name=method.upper()
    )

    self.trained_blackbox_explainers[method.lower()] = explainer_local

    if show:
        interpret.show(explainer_local)

    return explainer_local
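# --- Usage sketch (illustrative; `model` is a hypothetical instance of the
# class defining the methods above) ---
#
#     model.blackbox_local_explanation(method="lime", num_samples=0.1)  # first 10% of x_test
#     model.blackbox_local_explanation(method="shap", sample_no=3)      # isolate the 3rd sample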
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]
y = df[label]
# X, y = datasets.load_boston(return_X_y=True)

seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

from interpret import show
from interpret.data import ClassHistogram

hist = ClassHistogram().explain_data(X_train, y_train, name='Train Data')
show(hist)
print(type(hist))

from interpret.glassbox import ExplainableBoostingRegressor, LogisticRegression, ClassificationTree, DecisionListClassifier

ebm = ExplainableBoostingRegressor(random_state=seed)
ebm.fit(X_train, y_train)  # Works on dataframes and numpy arrays

ebm_global = ebm.explain_global(name='EBM')

# Export the first seven feature graphs as standalone HTML files
for i in range(7):
    ebm_global.visualize(i).write_html('Concrete_Strength/CS_' + df.columns[i] + '.html')

preds = ebm.predict(X_test)
# for i in range(len(preds)):
#     print(preds[i], y_test[i])
print(preds)
print(y_test)
X_train, y_train = data_loader.oversample(X_train, y_train)
print("After oversampling:", X_train.shape)

# %% Fit logistic regression model
lr = LogisticRegression(random_state=2021,
                        feature_names=X_train.columns,
                        penalty='l1',
                        solver='liblinear')
lr.fit(X_train, y_train)
print("Training finished.")

# %% Evaluate logistic regression model
y_pred = lr.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Explain local prediction
lr_local = lr.explain_local(X_test[:100], y_test[:100], name='Logistic Regression')
show(lr_local)

# %% Explain global logistic regression model
lr_global = lr.explain_global(name='Logistic Regression')
show(lr_global)

# %% Fit decision tree model
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training finished.")
y_pred = tree.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Explain local prediction
tree_local = tree.explain_local(X_test[:100], y_test[:100], name='Tree')
training_columns = ['x', 'y']

# define training df (first 500 elements of each category)
training_df = pd.concat([df_A.iloc[:500], df_B.iloc[:500]], ignore_index=True, sort=True)
# define test df (second 500 elements of each category)
test_df = pd.concat([df_A.iloc[500:], df_B.iloc[500:]], ignore_index=True, sort=True)

ebm_clf = ExplainableBoostingClassifier()
ebm_clf.fit(training_df[training_columns], training_df['category'])
probabilities = ebm_clf.predict_proba(test_df[training_columns])

ebm_global = ebm_clf.explain_global()
show(ebm_global)

# Attach the per-class probabilities to the test frame
for prob in range(2):
    test_df['prob_{0}'.format(prob)] = probabilities[:, prob]

figcontur = plt.figure(figsize=(18, 7.5))
contourax = figcontur.add_subplot(111)
xx, yy = make_meshgrid(test_df['x'], test_df['y'])
plot_contours(contourax, ebm_clf, xx, yy, cmap='RdYlBu', alpha=0.8)
contourax.scatter(test_df.x, test_df.y, c=test_df['category'], cmap='RdYlBu', s=20, edgecolors='k')
contourax.set_xlim(xx.min(), xx.max())
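# The `make_meshgrid` and `plot_contours` helpers used above are not defined in
# this snippet; a plausible sketch, adapted from the common scikit-learn
# decision-surface plotting recipe, is:
import numpy as np

def make_meshgrid(x, y, h=0.02):
    # Build a dense grid spanning the data range, padded by one unit on each side
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def plot_contours(ax, clf, xx, yy, **params):
    # Color each grid point by the classifier's predicted class
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    return ax.contourf(xx, yy, Z, **params)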