def test_tree_shap():
    """Round-trip a TreeShap explanation through JSON and check its metadata."""
    np.random.seed(0)  # explainer sampling must be deterministic for the test
    saved_explainer = make_tree_shap()
    runtime = TreeShap(saved_explainer)
    dataset = fetch_adult()
    held_out = dataset.data[30001:, :]
    result = runtime.explain(held_out[0:1].tolist())
    payload = json.loads(result.to_json())
    assert payload["meta"]["name"] == "TreeShap"
def test_tree_shap():
    """Load a pickled TreeShap explainer from storage and explain one Adult row.

    Fix: the test previously only printed the explanation and asserted
    nothing, so it could never fail on wrong explainer output. It now also
    asserts on the explanation metadata (same check as the sibling
    in-process TreeShap test).
    """
    os.environ.clear()
    alibi_model = os.path.join(
        kfserving.Storage.download(ADULT_EXPLAINER_URI), EXPLAINER_FILENAME
    )
    with open(alibi_model, "rb") as f:
        alibi_model = dill.load(f)
    tree_shap = TreeShap(alibi_model)
    adult = fetch_adult()
    X_test = adult.data[30001:, :]
    np.random.seed(0)  # make any sampling in explain() deterministic
    explanation = tree_shap.explain(X_test[0:1].tolist())
    exp_json = json.loads(explanation.to_json())
    print(exp_json)
    # A test must assert something; verify the explainer identity round-trips.
    assert exp_json["meta"]["name"] == "TreeShap"
def __init__(self):
    """Fetch the Adult dataset, shuffle it, and store a train/test split."""
    dataset = fetch_adult()
    features = dataset.data
    labels = dataset.target
    self.feature_names = dataset.feature_names
    self.category_map = dataset.category_map
    # Fixed seed so the permutation (and hence the split) is reproducible.
    np.random.seed(0)
    shuffled = np.random.permutation(np.c_[features, labels])
    features = shuffled[:, :-1]
    labels = shuffled[:, -1]
    split = 30000
    self.X_train, self.Y_train = features[:split, :], labels[:split]
    self.X_test, self.Y_test = features[split + 1:, :], labels[split + 1:]
def test_adult(return_X_y):
    """Check shape/contract of fetch_adult for both return conventions."""
    fetched = fetch_adult(return_X_y=return_X_y)
    if not return_X_y:
        # Bunch-style object: five attributes, features/labels on fields.
        assert len(fetched) == 5
        X, y = fetched.data, fetched.target
    else:
        # Plain (X, y) tuple.
        assert len(fetched) == 2
        X, y = fetched
    assert X.ndim == ADULT_DIM
    assert X.shape[1] == ADULT_FEATURES
    assert len(X) == len(y)
    assert len(set(y)) == ADULT_CLASSES
def adult_dataset():
    """Loads and preprocesses Adult dataset.

    Returns a dict with the train/test split, a fitted sklearn
    preprocessor (impute+scale for ordinal columns, impute+one-hot for
    categorical columns), and dataset metadata.
    """
    bunch = fetch_adult()
    features, labels = bunch.data, bunch.target
    names, cat_map = bunch.feature_names, bunch.category_map

    # Plain positional split at a fixed index.
    split = 30000
    X_train, Y_train = features[:split, :], labels[:split]
    X_test, Y_test = features[split + 1:, :], labels[split + 1:]

    # Columns listed in the category map are categorical; the rest ordinal.
    cat_cols = list(cat_map.keys())
    num_cols = [i for i in range(len(names)) if i not in cat_cols]

    num_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    cat_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ])
    preprocessor.fit(X_train)

    return {
        'X_train': X_train,
        'y_train': Y_train,
        'X_test': X_test,
        'y_test': Y_test,
        'preprocessor': preprocessor,
        'metadata': {
            'feature_names': names,
            'category_map': cat_map,
            'name': 'adult',
        },
    }
def test_kernel_shap():
    """Load a pickled KernelShap explainer, wrap the sklearn model, explain one row.

    Fix: the test previously only printed the explanation and asserted
    nothing, so it could never fail on wrong explainer output. It now also
    asserts on the explanation metadata.
    """
    os.environ.clear()
    alibi_model = os.path.join(
        kfserving.Storage.download(ADULT_EXPLAINER_URI), EXPLAINER_FILENAME
    )
    with open(alibi_model, "rb") as f:
        skmodel = SKLearnServer(ADULT_MODEL_URI)
        skmodel.load()
        alibi_model = dill.load(f)
    kernel_shap = KernelShap(skmodel.predict, alibi_model)
    adult = fetch_adult()
    X_test = adult.data[30001:, :]
    np.random.seed(0)  # KernelShap samples coalitions; pin the seed
    explanation = kernel_shap.explain(X_test[0:1].tolist())
    exp_json = json.loads(explanation.to_json())
    print(exp_json)
    # A test must assert something; verify the explainer identity round-trips.
    assert exp_json["meta"]["name"] == "KernelShap"
def tf_keras_adult(tf_keras_adult_model):
    """Preprocess Adult (OHE categoricals + scaled numericals), train the
    provided Keras model for 5 epochs, and return (X_train, model, cat_vars_ohe).
    """
    adult = fetch_adult()
    raw = adult.data
    # Reorder columns so the 8 categorical features come first and the
    # 4 numerical features last.
    ordered = np.c_[raw[:, 1:8], raw[:, 11], raw[:, 0], raw[:, 8:11]]
    labels = adult.target

    # Min-max scale the numerical block into the range (-1, 1).
    numeric = ordered[:, -4:].astype(np.float32, copy=False)
    lo, hi = numeric.min(axis=0), numeric.max(axis=0)
    rng = (-1., 1.)
    numeric_scaled = (numeric - lo) / (hi - lo) * (rng[1] - rng[0]) + rng[0]

    # One-hot encode the categorical block.
    categorical = ordered[:, :-4].copy()
    encoder = OneHotEncoder()
    encoder.fit(categorical)
    categorical_ohe = encoder.transform(categorical)

    # Recombine into a single dense float32 design matrix.
    combined = np.c_[categorical_ohe.todense(), numeric_scaled].astype(
        np.float32, copy=False)

    # Train on the first 30k rows only.
    split = 30000
    X_train, y_train = combined[:split, :], labels[:split]
    assert X_train.shape[1] == 57

    # Seed both numpy and TF before fitting for reproducible training.
    np.random.seed(1)
    tf.set_random_seed(1)
    model = tf_keras_adult_model
    model.fit(X_train, to_categorical(y_train),
              batch_size=128, epochs=5, verbose=0)

    # Map each of the 8 ordinal categorical columns to its category count,
    # then convert to the one-hot-encoded variable description.
    n_categories = 8
    cat_vars_ord = {i: len(np.unique(ordered[:, i]))
                    for i in range(n_categories)}
    cat_vars_ohe = ord_to_ohe(ordered, cat_vars_ord)[1]
    return X_train, model, cat_vars_ohe
def test_anchor_tabular():
    """Load a pickled AnchorTabular explainer plus the sklearn model and
    check the anchor produced for the first held-out Adult row."""
    explainer_path = os.path.join(
        kfserving.Storage.download(ADULT_EXPLAINER_URI), EXPLAINER_FILENAME
    )
    with open(explainer_path, "rb") as fh:
        skmodel = SKLearnServer(ADULT_MODEL_URI)
        skmodel.load()
        saved_explainer = dill.load(fh)
    runtime = AnchorTabular(skmodel.predict, saved_explainer)
    dataset = fetch_adult()
    held_out = dataset.data[30001:, :]
    np.random.seed(0)  # anchor search samples; pin the seed
    result = runtime.explain(held_out[0:1].tolist())
    payload = json.loads(result.to_json())
    assert payload["data"]["anchor"][0] == "Marital Status = Never-Married"
def test_adult(return_X_y):
    """Check shape/contract of fetch_adult; skip cleanly if the URL is down."""
    try:
        fetched = fetch_adult(return_X_y=return_X_y)
    except RequestException:
        # Dataset is fetched over HTTP; don't fail the suite on an outage.
        pytest.skip('Adult dataset URL down')
    if not return_X_y:
        # Bunch-style object: five attributes, features/labels on fields.
        assert len(fetched) == 5
        X, y = fetched.data, fetched.target
    else:
        # Plain (X, y) tuple.
        assert len(fetched) == 2
        X, y = fetched
    assert X.ndim == ADULT_DIM
    assert X.shape[1] == ADULT_FEATURES
    assert len(X) == len(y)
    assert len(set(y)) == ADULT_CLASSES
def test_anchor_tabular():
    """kserve variant: load model + pickled explainer, explain one Adult row,
    and accept either of the two anchors the search is known to produce."""
    os.environ.clear()
    explainer_path = os.path.join(
        kserve.Storage.download(ADULT_EXPLAINER_URI), EXPLAINER_FILENAME
    )
    with open(explainer_path, "rb") as fh:
        skmodel = SKLearnModel("adult", ADULT_MODEL_URI)
        skmodel.load()
        predictor = Predictor(skmodel)
        saved_explainer = dill.load(fh)
    runtime = AnchorTabular(predictor.predict_fn, saved_explainer)
    dataset = fetch_adult()
    held_out = dataset.data[30001:, :]
    np.random.seed(0)  # anchor search samples; pin the seed
    result = runtime.explain(held_out[0:1].tolist())
    payload = json.loads(result.to_json())
    first_anchor = payload["data"]["anchor"][0]
    assert first_anchor in ("Relationship = Own-child", "Age <= 28.00")
def make_tree_shap(dirname: Optional[Path] = None) -> TreeShap:
    """Train an XGBoost classifier on a shuffled Adult split and fit a
    TreeShap explainer on the training features.

    If ``dirname`` is given, the fitted explainer is also saved there.
    """
    np.random.seed(0)  # reproducible shuffle and training
    adult = fetch_adult()
    shuffled = np.random.permutation(np.c_[adult.data, adult.target])
    features, labels = shuffled[:, :-1], shuffled[:, -1]
    split = 30000
    X_train, y_train = features[:split, :], labels[:split]
    X_test, y_test = features[split + 1:, :], labels[split + 1:]

    d_train = xgboost.DMatrix(X_train, label=y_train)
    d_test = xgboost.DMatrix(X_test, label=y_test)
    params = {
        "eta": 0.01,
        "objective": "binary:logistic",
        "subsample": 0.5,
        "base_score": np.mean(y_train),
        "eval_metric": "logloss",
    }
    # Up to 5000 boosting rounds, stopping early once the test logloss
    # fails to improve for 20 rounds.
    model = xgboost.train(
        params,
        d_train,
        5000,
        evals=[(d_test, "test")],
        verbose_eval=100,
        early_stopping_rounds=20,
    )

    explainer = TreeShap(model, model_output="raw", task="classification")
    explainer.fit(X_train)
    if dirname is not None:
        explainer.save(dirname)
    return explainer
def make_anchor_tabular_income(
        dirname: Optional[Path] = None) -> AnchorTabular:
    """Fit a RandomForest pipeline on a shuffled Adult split and wrap it in
    an AnchorTabular explainer, optionally saving the explainer to ``dirname``.

    Adapted from:
    https://docs.seldon.io/projects/alibi/en/latest/examples/anchor_tabular_adult.html
    """
    np.random.seed(0)  # reproducible shuffle and forest training

    # Fetch and shuffle the data, keeping rows and labels together.
    adult = fetch_adult()
    feature_names = adult.feature_names
    category_map = adult.category_map
    shuffled = np.random.permutation(np.c_[adult.data, adult.target])
    features, labels = shuffled[:, :-1], shuffled[:, -1]

    split = 30000
    X_train, Y_train = features[:split, :], labels[:split]
    X_test, Y_test = features[split + 1:, :], labels[split + 1:]

    # Columns listed in the category map are categorical; the rest ordinal.
    cat_cols = list(category_map.keys())
    num_cols = [i for i in range(len(feature_names)) if i not in cat_cols]
    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ])

    # End-to-end pipeline: preprocessing + 50-tree random forest.
    model_pipeline = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=50)),
    ])
    model_pipeline.fit(X_train, Y_train)

    explainer = AnchorTabular(model_pipeline.predict,
                              feature_names,
                              categorical_names=category_map,
                              seed=1)
    explainer.fit(X_train, disc_perc=[25, 50, 75])
    if dirname is not None:
        explainer.save(dirname)
    return explainer
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from alibi.datasets import fetch_adult
import joblib
import dill
from sklearn.pipeline import Pipeline
import alibi

# load data
adult = fetch_adult()
data = adult.data
targets = adult.target
feature_names = adult.feature_names
category_map = adult.category_map

# define train and test set
np.random.seed(0)
data_perm = np.random.permutation(np.c_[data, targets])
data = data_perm[:, :-1]
labels = data_perm[:, -1]
idx = 30000
# BUG FIX: the split previously indexed `targets` (the un-shuffled labels)
# even though `data` had been re-assigned to the shuffled rows, pairing every
# row with the wrong label. Use `labels` — the permuted label column extracted
# from the same shuffle — so rows and labels stay aligned.
X_train, Y_train = data[:idx, :], labels[:idx]
X_test, Y_test = data[idx + 1:, :], labels[idx + 1:]