def test_display_dataset_analysis_3(self, mock_correlation_matrix):
    """
    Check that the dataset analysis runs when every feature is categorical.

    ``x1`` is replaced by a constant string and ``x2`` is cast to ``str``;
    both are ordinal-encoded before the classifier is fitted. The injected
    ``mock_correlation_matrix`` must not have been called afterwards.
    """
    data = self.df.copy()
    data['x1'] = 'a'
    data['x2'] = data['x2'].astype(str)

    ordinal = OrdinalEncoder(
        cols=['x1', 'x2'],
        handle_unknown='ignore',
        return_df=True,
    )
    data = ordinal.fit(data).transform(data)

    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(data[['x1', 'x2']], data['y'])

    explainer = SmartExplainer()
    explainer.compile(model=classifier, x=data[['x1', 'x2']])

    metadata_file = os.path.join(current_path, '../../data/metadata.yaml')
    report = ProjectReport(
        explainer=explainer,
        project_info_file=metadata_file,
        x_train=data[['x1', 'x2']],
    )
    report.display_dataset_analysis()

    self.assertEqual(mock_correlation_matrix.call_count, 0)
def test_compile_3(self):
    """
    Unit test compile 3
    compile is expected to raise ValueError when it is given a model, a
    SHAP tree explainer and a precomputed contributions frame together
    (the frame here is 3 rows x 4 columns while x is 21 rows x 2 columns).
    """
    data = pd.DataFrame(range(0, 21), columns=['id'])
    data['y'] = data['id'].apply(lambda value: 1 if value < 10 else 0)
    data['x1'] = np.random.randint(1, 123, data.shape[0])
    data['x2'] = np.random.randint(1, 3, data.shape[0])
    data = data.set_index('id')

    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(data[['x1', 'x2']], data['y'])
    tree_explainer = shap.TreeExplainer(classifier)

    contribution_columns = [
        'contribution_0',
        'contribution_1',
        'contribution_2',
        'contribution_3',
    ]
    contributions = pd.DataFrame(
        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
        columns=contribution_columns,
        index=[0, 1, 2],
    )

    explainer = SmartExplainer()
    with self.assertRaises(ValueError):
        explainer.compile(
            model=classifier,
            x=data[['x1', 'x2']],
            explainer=tree_explainer,
            contributions=contributions,
        )
def compile_shapash_model(x, model):
    """
    Create a SmartExplainer and compile it on the given data and model.

    Parameters
    ----------
    x :
        Dataset forwarded to ``SmartExplainer.compile`` as ``x``.
    model :
        Model forwarded to ``SmartExplainer.compile`` as ``model``.

    Returns
    -------
    SmartExplainer
        The compiled explainer.
    """
    explainer = SmartExplainer()
    explainer.compile(x=x, model=model)
    return explainer
def test_to_smartpredictor_1(self):
    """
    Unit test 1 to_smartpredictor
    Builds a predictor before and after mask_params is set on the
    explainer and checks the attributes carried over to each predictor.
    """
    data = pd.DataFrame(range(0, 5), columns=['id'])
    data['y'] = data['id'].apply(lambda value: 1 if value < 2 else 0)
    data['x1'] = np.random.randint(1, 123, data.shape[0])
    data['x2'] = ["S", "M", "S", "D", "M"]
    data = data.set_index('id')

    encoder = ce.OrdinalEncoder(cols=["x2"], handle_unknown="None")
    encoder_fitted = encoder.fit(data[["x1", "x2"]])
    encoded = encoder_fitted.transform(data[["x1", "x2"]])

    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(encoded[['x1', 'x2']], data['y'])

    postprocessing = {
        "x2": {
            "type": "transcoding",
            "rule": {"S": "single", "M": "married", "D": "divorced"},
        }
    }

    xpl = SmartExplainer(features_dict={"x1": "age", "x2": "family_situation"})
    xpl.compile(
        model=classifier,
        x=encoded[['x1', 'x2']],
        preprocessing=encoder_fitted,
        postprocessing=postprocessing,
    )

    # First predictor is created before mask_params is assigned, the second after.
    predictor_1 = xpl.to_smartpredictor()
    xpl.mask_params = {
        'features_to_hide': None,
        'threshold': None,
        'positive': True,
        'max_contrib': 1,
    }
    predictor_2 = xpl.to_smartpredictor()

    expected_attributes = (
        'model',
        'explainer',
        'features_dict',
        'label_dict',
        '_case',
        '_classes',
        'columns_dict',
        'features_types',
        'preprocessing',
        'postprocessing',
        'mask_params',
    )
    for attribute in expected_attributes:
        assert hasattr(predictor_1, attribute)
    assert hasattr(predictor_2, 'mask_params')

    copied_attributes = (
        'model',
        'explainer',
        'features_dict',
        'label_dict',
        '_case',
        '_classes',
        'columns_dict',
        'preprocessing',
        'postprocessing',
    )
    for attribute in copied_attributes:
        assert getattr(predictor_1, attribute) == getattr(xpl, attribute)

    # features_types must hold the dtype name of every column of x_pred.
    for feature in xpl.x_pred.columns:
        assert predictor_1.features_types[feature] == str(xpl.x_pred[feature].dtypes)

    assert predictor_2.mask_params == xpl.mask_params
def test_compile_0(self, mock_apply_preprocessing, mock_choose_state):
    """
    Unit test compile
    Runs compile with the two injected mocks configured and checks both
    the attributes set on the explainer and the calls made on the mocks.

    Parameters
    ----------
    mock_apply_preprocessing : mock object
        Configured here to return the contributions frame.
        (Presumably injected by a patch decorator outside this view.)
    mock_choose_state : mock object
        Configured here to return a stub state object.
        (Presumably injected by a patch decorator outside this view.)
    """
    contribution_frame = pd.DataFrame([[-0.1, 0.2, -0.3], [0.1, -0.2, 0.3]])
    features = pd.DataFrame([[1, 2, 3], [1, 2, 3]])

    # Stub state handed back by the state-chooser mock.
    stub_state = Mock()
    stub_state.rank_contributions.return_value = 1, 2, 3
    stub_state.validate_contributions.return_value = contribution_frame
    mock_choose_state.return_value = stub_state
    mock_apply_preprocessing.return_value = contribution_frame

    # A bare function object stands in for the model so that a bound
    # ``predict`` method can be attached to it.
    model = lambda: None
    model.predict = types.MethodType(self.predict, model)

    xpl = SmartExplainer()
    xpl.compile(x=features, model=model, contributions=contribution_frame)

    assert hasattr(xpl, 'state')
    assert xpl.state == stub_state
    assert hasattr(xpl, 'x_pred')
    pd.testing.assert_frame_equal(xpl.x_pred, features)
    assert hasattr(xpl, 'contributions')
    pd.testing.assert_frame_equal(xpl.contributions, contribution_frame)

    mock_choose_state.assert_called()
    stub_state.validate_contributions.assert_called()
    mock_apply_preprocessing.assert_called()
    stub_state.rank_contributions.assert_called()
    assert xpl._case == "regression"
def test_load_smartpredictor_1(self):
    """
    Unit test load_smartpredictor 1
    Loads the pickled predictor fixture matching the running Python
    version and checks that it exposes exactly the same attribute names
    as a freshly built predictor.

    The test is skipped when no fixture file exists for the running
    interpreter version.
    """
    # Local import: only needed for the skip path below.
    import unittest

    # Derive the fixture suffix from version_info rather than slicing
    # sys.version, which yields '3.1' on Python 3.10 and later.
    data_root = Path(path.abspath(__file__)).parent.parent.parent
    version_tag = '{}{}'.format(sys.version_info[0], sys.version_info[1])
    pkl_file = path.join(
        data_root, 'data/predictor_to_load_{}.pkl'.format(version_tag))
    if not path.exists(pkl_file):
        # Previously pkl_file stayed unbound on any version other than
        # 3.6 / 3.7 and the test crashed with an UnboundLocalError.
        raise unittest.SkipTest(
            'No pickled predictor fixture for Python {}.{}'.format(
                sys.version_info[0], sys.version_info[1]))

    xpl = SmartExplainer(features_dict={})
    y_pred = pd.DataFrame(data=np.array([1, 2]), columns=['pred'])
    dataframe_x = pd.DataFrame([[1, 2, 4], [1, 2, 3]])
    clf = cb.CatBoostClassifier(n_estimators=1).fit(dataframe_x, y_pred)
    xpl.compile(x=dataframe_x, y_pred=y_pred, model=clf)
    predictor = xpl.to_smartpredictor()

    predictor2 = load_smartpredictor(pkl_file)

    # Both objects must carry exactly the same attribute names.
    attrib_predictor = set(predictor.__dict__)
    attrib_predictor2 = set(predictor2.__dict__)
    assert attrib_predictor == attrib_predictor2
def test_to_pandas_2(self):
    """
    Unit test to_pandas :
    test to_pandas method in classification case with
    predict_proba output and column_dict attribute
    """
    xpl = SmartExplainer()
    contrib = pd.DataFrame(
        [[0.32230754, 0.1550689, 0.10183475, 0.05471339],
         [-0.58547512, -0.37050409, -0.07249285, 0.00171975],
         [-0.48666675, 0.25507156, -0.16968889, 0.0757443]],
        index=[0, 1, 2])

    # A bare function object stands in for the model so that attributes
    # and bound methods can be attached to it.
    model = lambda: None
    model._classes = np.array([1, 3])
    model.predict = types.MethodType(self.predict, model)
    model.predict_proba = types.MethodType(self.predict_proba, model)

    x = pd.DataFrame(
        [[3., 1., 22., 1.],
         [1., 2., 38., 2.],
         [3., 2., 26., 1.]],
        index=[0, 1, 2])
    pred = pd.DataFrame([3, 1, 1], columns=['pred'], index=[0, 1, 2])
    xpl.compile(contributions=contrib, x=x, model=model, y_pred=pred)
    xpl.columns_dict = {0: 'Pclass', 1: 'Sex', 2: 'Age', 3: 'Embarked'}
    xpl.features_dict = {
        'Pclass': 'Pclass',
        'Sex': 'Sex',
        'Age': 'Age',
        'Embarked': 'Embarked',
    }

    output = xpl.to_pandas(max_contrib=3, positive=True, proba=True)

    expected = pd.DataFrame(
        [[3, 0.8, 'Pclass', 3.0, 0.32230754, 'Sex', 1.0, 0.1550689,
          'Age', 22.0, 0.10183475],
         [1, 0.3, 'Pclass', 1.0, 0.58547512, 'Sex', 2.0, 0.37050409,
          'Age', 38.0, 0.07249285],
         [1, 0.4, 'Pclass', 3.0, 0.48666675, 'Age', 26.0, 0.16968889,
          np.nan, np.nan, np.nan]],
        columns=['pred', 'proba',
                 'feature_1', 'value_1', 'contribution_1',
                 'feature_2', 'value_2', 'contribution_2',
                 'feature_3', 'value_3', 'contribution_3'],
        index=[0, 1, 2],
        dtype=object)
    expected['pred'] = expected['pred'].astype(int)
    expected['proba'] = expected['proba'].astype(float)

    # assert_frame_equal raises on mismatch and returns None; wrapping it
    # in ``assert not ...`` only passed because of that None and hid the
    # real check, so it is called directly.
    pd.testing.assert_frame_equal(expected, output)
def test_predict_2(self):
    """
    Test predict method 2
    compile is called without y_pred; after predict(), y_pred must match
    the target the linear model was fitted on (dtype is not compared).
    """
    features = pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    target = pd.DataFrame(data=np.array([1, 2, 3]), columns=['pred'])
    regressor = LinearRegression().fit(features, target)

    explainer = SmartExplainer()
    explainer.compile(x=features, model=regressor)
    explainer.predict()

    pd.testing.assert_frame_equal(explainer.y_pred, target, check_dtype=False)
def test_predict_1(self):
    """
    Test predict method 1
    compile is given a deliberately wrong y_pred; predict() must replace
    it with the model predictions, which equal the fitted target here
    (dtype is not compared).
    """
    features = pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    target = pd.DataFrame(data=np.array([1, 2, 3]), columns=['pred'])
    wrong_predictions = pd.DataFrame(data=np.array([1, 2, 4]), columns=['pred'])
    regressor = LinearRegression().fit(features, target)

    explainer = SmartExplainer()
    explainer.compile(x=features, y_pred=wrong_predictions, model=regressor)
    explainer.predict()

    # The wrong y_pred must have been overwritten by the real predictions.
    pd.testing.assert_frame_equal(explainer.y_pred, target, check_dtype=False)
class TestWebappSettings(unittest.TestCase):
    """
    Unit tests for webapp settings class
    Checks that the webapp settings remain valid whether the user input is valid or not
    """

    def setUp(self):
        """
        Build a small compiled and filtered SmartExplainer for each test.

        The fixture lives in setUp rather than in an overridden __init__
        so that a failure while building it is reported as an error of
        the test being run instead of breaking test collection.
        """
        self.xpl = SmartExplainer()
        contributions = pd.DataFrame([[-0.1, 0.2, -0.3], [0.1, -0.2, 0.3]])
        y_pred = pd.DataFrame(data=np.array([1, 2]), columns=['pred'])
        dataframe_x = pd.DataFrame([[1, 2, 3], [1, 2, 3]])
        self.xpl.compile(
            contributions=contributions,
            x=dataframe_x,
            y_pred=y_pred,
            model=LinearRegression(),
        )
        self.xpl.filter(max_contrib=2)

    def test_settings_types(self):
        """
        Test settings dtypes (must be ints)
        """
        settings = {'rows': None, 'points': 5200.4, 'violin': -1, 'features': "oui"}
        self.xpl.init_app(settings)
        assert all(
            isinstance(value, int)
            for value in self.xpl.smartapp.settings.values()
        )

    def test_settings_values(self):
        """
        Test settings values (must be >0)
        """
        settings = {'rows': 0, 'points': 5200.4, 'violin': -1, 'features': "oui"}
        self.xpl.init_app(settings)
        assert all(value > 0 for value in self.xpl.smartapp.settings.values())

    def test_settings_keys(self):
        """
        Test settings keys : every key of the final settings dict must be
        one of the expected keys, whatever the user input is
        """
        settings = {'oui': 1, 1: 2, "a": []}
        self.xpl.init_app(settings)
        expected_keys = ['rows', 'points', 'violin', 'features']
        assert all(key in expected_keys for key in self.xpl.smartapp.settings)
def test_compile_1(self):
    """
    Unit test compile 1
    compile is called with only a fitted classifier and x; the explainer
    must report a classification case with classes [0, 1].
    """
    data = pd.DataFrame(range(0, 21), columns=['id'])
    data['y'] = data['id'].apply(lambda value: 1 if value < 10 else 0)
    data['x1'] = np.random.randint(1, 123, data.shape[0])
    data['x2'] = np.random.randint(1, 3, data.shape[0])
    data = data.set_index('id')

    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(data[['x1', 'x2']], data['y'])

    explainer = SmartExplainer()
    explainer.compile(model=classifier, x=data[['x1', 'x2']])

    assert explainer._case == "classification"
    self.assertListEqual(explainer._classes, [0, 1])
def test_run_app_1(self, mock_get_host_name, mock_custom_thread, mock_smartapp):
    """
    Test that when y_pred is not given, y_pred is automatically computed.

    The three mock arguments are not used in the body; they are
    presumably injected by patch decorators outside this view.
    """
    features = pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    target = pd.DataFrame(data=np.array([1, 2, 3]), columns=['pred'])
    contribution_rows = [[0.1, -0.2, 0.3], [0.1, -0.2, 0.3], [0.1, -0.2, 0.3]]
    contributions = pd.DataFrame(contribution_rows)
    regressor = LinearRegression().fit(features, target)

    explainer = SmartExplainer()
    explainer.compile(contributions=contributions, x=features, model=regressor)
    explainer.run_app()

    assert explainer.y_pred is not None
def compute_contributions(self, x, model, methods, preprocessing):
    """
    Compute contributions for each requested backend.

    One SmartExplainer is created and compiled once per entry of
    ``methods``; the contributions it produces are collected by backend
    name.

    Parameters
    ----------
    x : pandas.DataFrame
        Prediction set, forwarded to ``SmartExplainer.compile`` as ``x``.
    model : model object
        Model forwarded to ``SmartExplainer.compile``.
    methods : list
        Backend names; each one is passed to ``compile`` as ``backend``.
    preprocessing : category_encoders, ColumnTransformer, list, dict
        Forwarded unchanged to ``SmartExplainer.compile``.

    Returns
    -------
    contributions : dict
        Dict whose keys are method names and values are the corresponding
        contributions. For a two-class classifier only the entry at
        index 1 of the explainer's contributions is kept.

    Raises
    ------
    AssertionError
        If the compiled explainer reports a classification case with
        more than two classes.
    """
    results = {}
    explainer = SmartExplainer()
    for method_name in methods:
        explainer.compile(
            x=x,
            model=model,
            preprocessing=preprocessing,
            backend=method_name,
        )

        # Anything that is not classification keeps the contributions as they are.
        if explainer._case != "classification":
            results[method_name] = explainer.contributions
            continue

        class_count = len(explainer._classes)
        if class_count > 2:
            raise AssertionError(
                "Multi-class classification is not supported")
        if class_count == 2:
            results[method_name] = explainer.contributions[1]
        else:
            results[method_name] = explainer.contributions
    return results
def test_get_interaction_values_1(self):
    """
    Check the first dimension of get_interaction_values: it equals
    n_samples_max when given (10 here) and the number of rows otherwise.
    """
    data = pd.DataFrame({
        "y": np.random.randint(2, size=50),
        "a": np.random.rand(50),
        "b": np.random.rand(50),
    })
    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(data[['a', 'b']], data['y'])

    explainer = SmartExplainer()
    explainer.compile(x=data.drop('y', axis=1), model=classifier)

    sampled = explainer.get_interaction_values(n_samples_max=10)
    assert sampled.shape[0] == 10

    complete = explainer.get_interaction_values()
    assert complete.shape[0] == data.shape[0]
def init_sme_to_pickle_test():
    """
    Init sme to pickle test
    Builds a compiled, filtered SmartExplainer together with the pickle
    path it is meant to be used with.

    Returns
    -------
    pkl_file : str
        Path ``data/xpl.pkl`` under the directory three levels above
        this file.
    xpl : SmartExplainer
        Explainer compiled on a 2x3 frame and filtered with max_contrib=2.
    """
    root_dir = Path(path.abspath(__file__)).parent.parent.parent
    pkl_file = path.join(root_dir, 'data/xpl.pkl')

    features = pd.DataFrame([[1, 2, 3], [1, 2, 3]])
    predictions = pd.DataFrame(data=np.array([1, 2]), columns=['pred'])
    contribution_frame = pd.DataFrame([[-0.1, 0.2, -0.3], [0.1, -0.2, 0.3]])

    xpl = SmartExplainer()
    xpl.compile(
        contributions=contribution_frame,
        x=features,
        y_pred=predictions,
        model=LinearRegression(),
    )
    xpl.filter(max_contrib=2)
    return pkl_file, xpl
def init_sme_to_pickle_test():
    """
    Init sme to pickle test
    Builds a predictor from a compiled SmartExplainer together with the
    pickle path it is meant to be used with.

    Returns
    -------
    pkl_file : str
        Path ``data/predictor.pkl`` under the directory three levels
        above this file.
    predictor :
        Object returned by ``SmartExplainer.to_smartpredictor``.
    """
    root_dir = Path(path.abspath(__file__)).parent.parent.parent
    pkl_file = path.join(root_dir, 'data/predictor.pkl')

    features = pd.DataFrame([[1, 2, 4], [1, 2, 3]])
    predictions = pd.DataFrame(data=np.array([1, 2]), columns=['pred'])
    classifier = cb.CatBoostClassifier(n_estimators=1).fit(features, predictions)

    explainer = SmartExplainer(features_dict={})
    explainer.compile(x=features, y_pred=predictions, model=classifier)
    return pkl_file, explainer.to_smartpredictor()
def test_display_model_explainability_2(self):
    """
    Tests multiclass case
    The target takes four values (0 to 3); the test only checks that
    display_model_explainability runs without raising.
    """
    def to_class(value):
        # Same bucketing as a chained conditional: four bands of ids.
        if value < 5:
            return 0
        if 5 <= value < 10:
            return 1
        if 10 <= value < 15:
            return 2
        return 3

    data = pd.DataFrame(range(0, 21), columns=['id'])
    data['y'] = data['id'].apply(to_class)
    data['x1'] = np.random.randint(1, 123, data.shape[0])
    data['x2'] = np.random.randint(1, 3, data.shape[0])
    data = data.set_index('id')

    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(data[['x1', 'x2']], data['y'])

    explainer = SmartExplainer()
    explainer.compile(model=classifier, x=data[['x1', 'x2']])

    metadata_file = os.path.join(current_path, '../../data/metadata.yaml')
    report = ProjectReport(explainer=explainer, project_info_file=metadata_file)
    report.display_model_explainability()
def test_compile_2(self):
    """
    Unit test compile 2
    checking new attributes added to the compile method

    Three explainers are compiled: one with postprocessing keyed by the
    column name, one keyed by the feature label from features_dict, and
    one with neither preprocessing nor postprocessing.
    """
    data = pd.DataFrame(range(0, 5), columns=['id'])
    data['y'] = data['id'].apply(lambda value: 1 if value < 2 else 0)
    data['x1'] = np.random.randint(1, 123, data.shape[0])
    data['x2'] = ["S", "M", "S", "D", "M"]
    data = data.set_index('id')

    encoder = ce.OrdinalEncoder(cols=["x2"], handle_unknown="None")
    encoder_fitted = encoder.fit(data)
    encoded = encoder_fitted.transform(data)

    # x_pred expected once the transcoding rule has been applied to x2.
    expected_x_pred = data[["x1", "x2"]].copy()
    expected_x_pred["x2"] = ["single", "married", "single", "divorced", "married"]

    classifier = cb.CatBoostClassifier(n_estimators=1)
    classifier = classifier.fit(encoded[['x1', 'x2']], encoded['y'])

    postprocessing_1 = {
        "x2": {
            "type": "transcoding",
            "rule": {"S": "single", "M": "married", "D": "divorced"},
        }
    }
    postprocessing_2 = {
        "family_situation": {
            "type": "transcoding",
            "rule": {"S": "single", "M": "married", "D": "divorced"},
        }
    }

    xpl_by_column = SmartExplainer()
    xpl_by_label = SmartExplainer(features_dict={"x1": "age", "x2": "family_situation"})
    xpl_plain = SmartExplainer()

    xpl_by_column.compile(
        model=classifier,
        x=encoded[['x1', 'x2']],
        preprocessing=encoder_fitted,
        postprocessing=postprocessing_1,
    )
    xpl_by_label.compile(
        model=classifier,
        x=encoded[['x1', 'x2']],
        preprocessing=encoder_fitted,
        postprocessing=postprocessing_2,
    )
    xpl_plain.compile(
        model=classifier,
        x=encoded[['x1', 'x2']],
        preprocessing=None,
        postprocessing=None,
    )

    for explainer in (xpl_by_column, xpl_by_label, xpl_plain):
        assert hasattr(explainer, "preprocessing")
        assert hasattr(explainer, "postprocessing")

    for explainer in (xpl_by_column, xpl_by_label):
        pd.testing.assert_frame_equal(explainer.x_pred, expected_x_pred)

    for explainer in (xpl_by_column, xpl_by_label):
        assert explainer.preprocessing == encoder_fitted

    # Both explainers are expected to end up with the column-keyed form,
    # including the one that was given the label-keyed dict.
    assert xpl_by_column.postprocessing == postprocessing_1
    assert xpl_by_label.postprocessing == postprocessing_1
]

# Fit an ordinal encoder on the categorical columns and encode the frame.
encoder = OrdinalEncoder(cols=categorical_features, handle_unknown='ignore', return_df=True).fit(X_df)
X_df = encoder.transform(X_df)

# 75 % of the rows go to the training split; random_state fixes the split.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_df, y_df, train_size=0.75, random_state=1)

regressor = LGBMRegressor(n_estimators=200).fit(Xtrain, ytrain)

# Predictions on the test split, indexed like Xtest, in a single 'pred' column.
y_pred = pd.DataFrame(regressor.predict(Xtest), columns=['pred'], index=Xtest.index)

# Compile the explainer on the test split and build the web app object.
xpl = SmartExplainer(features_dict=house_dict)
xpl.compile(x=Xtest, model=regressor, preprocessing=encoder, y_pred=y_pred, title_story='House Prices - Lightgbm Regressor')
xpl.init_app()

# Module-level ``app`` taken from the initialised smartapp.
app = xpl.smartapp.app

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    app.run_server(debug=False, host="0.0.0.0", port=8080)
# Result is discarded here (NOTE(review): looks like a notebook leftover — confirm).
house_df.head()

# Columns whose dtype is 'object' are treated as categorical features.
categorical_features = [
    col for col in X_df.columns if X_df[col].dtype == 'object'
]

# Fit an ordinal encoder on the categorical columns and encode the frame.
encoder = OrdinalEncoder(cols=categorical_features, handle_unknown='ignore', return_df=True).fit(X_df)
X_df = encoder.transform(X_df)

# 75 % of the rows go to the training split; random_state fixes the split.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_df, y_df, train_size=0.75, random_state=1)

regressor = LGBMRegressor(n_estimators=200).fit(Xtrain, ytrain)

# Predictions on the test split, indexed like Xtest, in a single 'pred' column.
y_pred = pd.DataFrame(regressor.predict(Xtest), columns=['pred'], index=Xtest.index)

# Compile the explainer on the test split and build the web app object.
xpl = SmartExplainer(features_dict=house_dict)
xpl.compile(x=Xtest, model=regressor, preprocessing=encoder, y_pred=y_pred)
xpl.init_app()

# Module-level ``app`` taken from the initialised smartapp.
app = xpl.smartapp.app

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    app.run_server(debug=False, host="0.0.0.0", port=8080)
titanic_enc, y, test_size=0.2, )

# Rows of X and of the raw titanic frame that belong to the test split.
X_test_ini = X.loc[X_test.index, :]
df = titanic[features + y.columns.to_list()]
df = df.loc[X_test.index, :]
df.reset_index(level=0, inplace=True)

# Train the model, then predict on the test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

xpl = SmartExplainer()

# Wrap the predictions in a frame indexed like X_test, with y's column names.
y_pred = pd.DataFrame(data=y_pred, columns=y.columns.to_list(), index=X_test.index)

# Compile the explainer; the story title is looked up in ``cases`` by ``CASE``.
xpl.compile(X_test, model, y_pred=y_pred, preprocessing=encoder, title_story=cases[CASE])
xpl.init_app()

# Module-level ``app`` taken from the initialised smartapp.
app = xpl.smartapp.app

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    app.run_server(debug=False, host="0.0.0.0", port=8080)
class TestGeneration(unittest.TestCase):
    """
    Tests for execute_report and export_and_save_report.

    Each test works in its own temporary directory, which is removed
    even when the report call or an assertion fails.
    """

    def setUp(self):
        """Fit a one-tree CatBoost classifier and compile an explainer on it."""
        df = pd.DataFrame(range(0, 21), columns=['id'])
        df['y'] = df['id'].apply(lambda x: 1 if x < 10 else 0)
        df['x1'] = np.random.randint(1, 123, df.shape[0])
        df['x2'] = np.random.randint(1, 3, df.shape[0])
        df = df.set_index('id')
        clf = cb.CatBoostClassifier(n_estimators=1).fit(
            df[['x1', 'x2']], df['y'])
        self.xpl = SmartExplainer()
        self.xpl.compile(model=clf, x=df[['x1', 'x2']])
        self.df = df

    def _assert_files_exist(self, directory, *filenames):
        """Assert that every name in ``filenames`` exists inside ``directory``."""
        for filename in filenames:
            assert os.path.exists(os.path.join(directory, filename))

    def test_exexcute_report_1(self):
        """Report with the explainer only."""
        # The context manager removes the directory even if the test fails;
        # the previous mkdtemp/rmtree pair leaked it on any failure.
        with tempfile.TemporaryDirectory() as tmp_dir_path:
            execute_report(
                working_dir=tmp_dir_path,
                explainer=self.xpl,
                project_info_file=os.path.join(current_path, '../data/metadata.yaml'),
                config=None,
                notebook_path=None,
            )
            self._assert_files_exist(
                tmp_dir_path, 'smart_explainer.pickle', 'base_report.ipynb')

    def test_exexcute_report_2(self):
        """Report with x_train: x_train.csv must also be written."""
        with tempfile.TemporaryDirectory() as tmp_dir_path:
            execute_report(
                working_dir=tmp_dir_path,
                explainer=self.xpl,
                project_info_file=os.path.join(current_path, '../data/metadata.yaml'),
                x_train=self.df[['x1', 'x2']],
                config=None,
                notebook_path=None,
            )
            self._assert_files_exist(
                tmp_dir_path,
                'x_train.csv',
                'smart_explainer.pickle',
                'base_report.ipynb',
            )

    def test_exexcute_report_3(self):
        """Report with x_train and y_test: both CSV files must be written."""
        with tempfile.TemporaryDirectory() as tmp_dir_path:
            execute_report(
                working_dir=tmp_dir_path,
                explainer=self.xpl,
                project_info_file=os.path.join(current_path, '../data/metadata.yaml'),
                x_train=self.df[['x1', 'x2']],
                y_test=self.df['y'],
                config=None,
                notebook_path=None,
            )
            self._assert_files_exist(
                tmp_dir_path,
                'x_train.csv',
                'y_test.csv',
                'smart_explainer.pickle',
                'base_report.ipynb',
            )

    def test_exexcute_report_4(self):
        """Report with x_train, y_train and y_test: three CSV files must be written."""
        with tempfile.TemporaryDirectory() as tmp_dir_path:
            execute_report(
                working_dir=tmp_dir_path,
                explainer=self.xpl,
                project_info_file=os.path.join(current_path, '../data/metadata.yaml'),
                x_train=self.df[['x1', 'x2']],
                y_train=self.df['y'],
                y_test=self.df['y'],
                config=None,
                notebook_path=None,
            )
            self._assert_files_exist(
                tmp_dir_path,
                'x_train.csv',
                'y_test.csv',
                'y_train.csv',
                'smart_explainer.pickle',
                'base_report.ipynb',
            )

    def test_export_and_save_report_1(self):
        """Exporting after execute_report must create the requested HTML file."""
        with tempfile.TemporaryDirectory() as tmp_dir_path:
            execute_report(
                working_dir=tmp_dir_path,
                explainer=self.xpl,
                project_info_file=os.path.join(current_path, '../data/metadata.yaml'),
            )
            outfile = os.path.join(tmp_dir_path, 'report.html')
            export_and_save_report(working_dir=tmp_dir_path, output_file=outfile)
            assert os.path.exists(outfile)
shap_interaction_values = shap.TreeExplainer(XGBoostModel).shap_interaction_values(X)
shap.summary_plot(shap_interaction_values, X)

# Choose the second feature yourself instead of letting shap pick it automatically
shap.dependence_plot(("Var1", "Var1"), shap_interaction_values, X)  # Var1 only
shap.dependence_plot(("Var1", "Var2"), shap_interaction_values, X)  # Var2 chosen instead of the automatic choice

### SHAPASH ###
# https://www.analyticsvidhya.com/blog/2021/04/shapash-python-library-to-make-machine-learning-interpretable/
# It lets you quickly understand a machine learning model through a simple web app.
# Install it from a shell first (this is a shell command, not Python):
#     pip install shapash
from shapash.explainer.smart_explainer import SmartExplainer  # Import shapash module

# Initialise the shapash class, then call its built-in compile method
# with the data set and the model.
SE = SmartExplainer()
SE.compile(
    x=xtest,  # test set
    model=regressor,  # black-box model
)

# Run the web app; the returned object is kept so the app can be stopped later
app = SE.run_app(title_story='Concrete_Data')

# Stop the web app (uses the object returned by run_app above)
app.kill()

# Prediction: export a predictor and save it to disk
prediction = SE.to_smartpredictor()
prediction.save('./predictor.pkl')

from shapash.utils.load_smartpredictor import load_smartpredictor

# Reload the saved predictor and compute detailed contributions with it
predictor_load = load_smartpredictor('./predictor.pkl')
predictor_load.add_input(x=x, ypred=y)
detailed = predictor_load.detail_contributions()
detailed.head()
class TestProjectReport(unittest.TestCase):
    """
    Tests for ProjectReport, built on a two-class CatBoost classifier
    fitted on two random integer features.
    """

    @staticmethod
    def _metadata_file():
        """Return the path of the metadata YAML file given to every report."""
        return os.path.join(current_path, '../../data/metadata.yaml')

    def setUp(self):
        """Create the data, the classifier, the explainer and two reports."""
        frame = pd.DataFrame(range(0, 21), columns=['id'])
        frame['y'] = frame['id'].apply(lambda value: 1 if value < 10 else 0)
        frame['x1'] = np.random.randint(1, 123, frame.shape[0])
        frame['x2'] = np.random.randint(1, 3, frame.shape[0])
        self.df = frame.set_index('id')

        classifier = cb.CatBoostClassifier(n_estimators=1)
        self.clf = classifier.fit(self.df[['x1', 'x2']], self.df['y'])

        self.xpl = SmartExplainer()
        self.xpl.compile(model=self.clf, x=self.df[['x1', 'x2']])

        self.report1 = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        self.report2 = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
        )

    def test_init_1(self):
        """Explainer and metadata file only."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        for name in expected_attrs:
            assert hasattr(report, name)

    def test_init_2(self):
        """With x_train."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
        )
        for name in expected_attrs:
            assert hasattr(report, name)

    def test_init_3(self):
        """With x_train and y_test."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
            y_test=self.df['y'],
        )
        for name in expected_attrs:
            assert hasattr(report, name)

    def test_init_4(self):
        """With x_train, y_test and an empty config dict."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
            y_test=self.df['y'],
            config={},
        )
        for name in expected_attrs:
            assert hasattr(report, name)

    def test_init_5(self):
        """A metrics list made of dicts with a 'path' key must be accepted."""
        metrics_config = {'metrics': [{'path': 'sklearn.metrics.mean_squared_error'}]}
        ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
            y_test=self.df['y'],
            config=metrics_config,
        )

    def test_init_6(self):
        """A metrics list made of bare strings must raise ValueError."""
        with self.assertRaises(ValueError):
            ProjectReport(
                self.xpl,
                self._metadata_file(),
                self.df[['x1', 'x2']],
                self.df['y'],
                {'metrics': ['sklearn.metrics.mean_squared_error']},
            )

    @patch('shapash.report.project_report.print_html')
    def test_display_title_description_1(self, mock_print_html):
        """Without title config, print_html is called exactly once."""
        self.report1.display_title_description()
        mock_print_html.assert_called_once()

    @patch('shapash.report.project_report.print_html')
    def test_display_title_description_2(self, mock_print_html):
        """With a title and a description in config, print_html is called twice."""
        title_config = {
            'title_story': "My project report",
            'title_description': """This document is a data science project report.""",
        }
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
            y_test=self.df['y'],
            config=title_config,
        )
        report.display_title_description()
        self.assertEqual(mock_print_html.call_count, 2)

    @patch('shapash.report.project_report.print_md')
    def test_display_general_information_1(self, mock_print_html):
        """display_project_information must call the patched print_md."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        report.display_project_information()
        # The argument is named mock_print_html but it is the print_md patch.
        self.assertTrue(mock_print_html.called)

    @patch('shapash.report.project_report.print_md')
    def test_display_model_information_1(self, mock_print_md):
        """display_model_analysis must call the patched print_md."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        report.display_model_analysis()
        self.assertTrue(mock_print_md.called)

    def test_display_dataset_analysis_1(self):
        """Dataset analysis with x_train must run without raising."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            x_train=self.df[['x1', 'x2']],
        )
        report.display_dataset_analysis()

    def test_display_dataset_analysis_2(self):
        """Dataset analysis without x_train must run without raising."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        report.display_dataset_analysis()

    @patch('shapash.report.project_report.generate_correlation_matrix_fig')
    def test_display_dataset_analysis_3(self, mock_correlation_matrix):
        """
        Test we don't have a problem when only categorical features
        """
        data = self.df.copy()
        data['x1'] = 'a'
        data['x2'] = data['x2'].astype(str)

        ordinal = OrdinalEncoder(
            cols=['x1', 'x2'],
            handle_unknown='ignore',
            return_df=True,
        )
        data = ordinal.fit(data).transform(data)

        classifier = cb.CatBoostClassifier(n_estimators=1)
        classifier = classifier.fit(data[['x1', 'x2']], data['y'])

        explainer = SmartExplainer()
        explainer.compile(model=classifier, x=data[['x1', 'x2']])

        report = ProjectReport(
            explainer=explainer,
            project_info_file=self._metadata_file(),
            x_train=data[['x1', 'x2']],
        )
        report.display_dataset_analysis()

        self.assertEqual(mock_correlation_matrix.call_count, 0)

    def test_display_model_explainability_1(self):
        """Model explainability on the two-class fixture must run without raising."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        report.display_model_explainability()

    def test_display_model_explainability_2(self):
        """
        Tests multiclass case
        """
        def to_class(value):
            # Same bucketing as a chained conditional: four bands of ids.
            if value < 5:
                return 0
            if 5 <= value < 10:
                return 1
            if 10 <= value < 15:
                return 2
            return 3

        data = pd.DataFrame(range(0, 21), columns=['id'])
        data['y'] = data['id'].apply(to_class)
        data['x1'] = np.random.randint(1, 123, data.shape[0])
        data['x2'] = np.random.randint(1, 3, data.shape[0])
        data = data.set_index('id')

        classifier = cb.CatBoostClassifier(n_estimators=1)
        classifier = classifier.fit(data[['x1', 'x2']], data['y'])

        explainer = SmartExplainer()
        explainer.compile(model=classifier, x=data[['x1', 'x2']])

        report = ProjectReport(
            explainer=explainer,
            project_info_file=self._metadata_file(),
        )
        report.display_model_explainability()

    @patch('shapash.report.project_report.logging')
    def test_display_model_performance_1(self, mock_logging):
        """
        No y_test given
        """
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
        )
        report.display_model_performance()
        mock_logging.info.assert_called_once()

    @patch('shapash.report.project_report.logging')
    def test_display_model_performance_2(self, mock_logging):
        """y_test and one metric given."""
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            y_test=self.df['y'],
            config={'metrics': [{'path': 'sklearn.metrics.mean_squared_error'}]},
        )
        report.display_model_performance()
        # NOTE(review): this counts calls made on the patched module object
        # itself, not on mock_logging.info — confirm this is the intended check.
        self.assertEqual(mock_logging.call_count, 0)

    @patch('shapash.report.project_report.logging')
    def test_display_model_performance_3(self, mock_logging):
        """
        No metrics given in ProjectReport
        """
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            y_test=self.df['y'],
        )
        report.display_model_performance()
        mock_logging.info.assert_called_once()

    @patch('shapash.report.project_report.logging')
    def test_display_model_performance_4(self, mock_logging):
        """
        Test use of proba values.
        """
        proba_metric = {'path': 'sklearn.metrics.log_loss', 'use_proba_values': True}
        report = ProjectReport(
            explainer=self.xpl,
            project_info_file=self._metadata_file(),
            y_test=self.df['y'],
            config={'metrics': [proba_metric]},
        )
        report.display_model_performance()
        # NOTE(review): this counts calls made on the patched module object
        # itself, not on mock_logging.info — confirm this is the intended check.
        self.assertEqual(mock_logging.call_count, 0)