def train_explainer(regressor: LogisticRegression, feature_names: List[str],
                    X_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray):
    # Wrap the fitted classifier so AnchorTabular receives class predictions.
    predict_fn = lambda x: regressor.predict(x)
    explainer = AnchorTabular(predict_fn, feature_names)
    explainer.fit(X_train)

    # Persist the explainer and log it as an MLflow artifact.
    with open("explainer.dill", "wb") as file:
        dill.dump(explainer, file)
    mlflow.log_artifact("explainer.dill", "model")

    # Explain a hand-picked probe instance (alternatively, a row of X_test).
    print(np.where(y_test == 1)[0])
    probe = np.array([40.316667556762695, 0.5605325219195545, 0.350, 0, 3, 1, 5], dtype=float)
    # probe = np.array(X_test[700], dtype=float)
    explanation = explainer.explain(probe)
    print('Anchor: %s' % (' AND '.join(explanation['names'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])
    print(explanation)
    return explainer

# kedro install
# kedro run
# kedro viz
def atab_explainer(lr_classifier, adult_data):
    predictor = predict_fcn(predict_type='class',
                            clf=lr_classifier,
                            preproc=adult_data['preprocessor'])
    atab = AnchorTabular(
        predictor=predictor,
        feature_names=adult_data['metadata']['feature_names'],
        categorical_names=adult_data['metadata']['category_map'])
    atab.fit(adult_data['X_train'], disc_perc=(25, 50, 75))
    return atab
def train_explainer(artifacts_folder: str, data: AdultData,
                    model: RandomForestClassifier) -> AnchorTabular:

    def predict_fn(x):
        return model.predict(x)

    explainer = AnchorTabular(predict_fn, data.feature_names,
                              categorical_names=data.category_map, seed=1)
    explainer.fit(data.X_train, disc_perc=(25, 50, 75))

    # Strip the (non-picklable) predictor references before serialising the explainer.
    with open(f"{artifacts_folder}/{EXPLAINER_FOLDER}/explainer.dill", "wb") as f:
        explainer.predictor = None
        explainer.samplers[0].predictor = None
        dill.dump(explainer, f)
    return explainer
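# Hedged loading sketch for the artifact written by train_explainer above (not part of the
# original pipeline): the predictor is stripped before pickling, so a callable has to be
# re-attached after deserialisation before the explainer can be used again. Newer alibi
# versions also expose explainer.reset_predictor(...) for this purpose.
import dill


def load_explainer_artifact(artifacts_folder: str, model: RandomForestClassifier) -> AnchorTabular:
    with open(f"{artifacts_folder}/{EXPLAINER_FOLDER}/explainer.dill", "rb") as f:
        explainer = dill.load(f)
    # Re-attach the prediction function on the explainer and its sampler,
    # mirroring what was nulled out before dill.dump above.
    explainer.predictor = model.predict
    explainer.samplers[0].predictor = model.predict
    return explainer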
class Anchors(FeatureImportance):
    """
    Feature importance method by [RIB]_.

    References
    ----------
    .. [RIB] Ribeiro, et al., "Anchors: High-precision model-agnostic explanations",
       Proceedings of the AAAI Conference on Artificial Intelligence, Volume 32, 2018.
    """

    def __init__(self, model: Any, seed: int = SEED):
        super().__init__(seed=seed)
        self._model = assign_model(model=model)
        self._explainer = None

    def fit(self, X: Any) -> None:
        self._explainer = AnchorTabular(
            predictor=self._model.predict_proba,
            feature_names=list(range(X.shape[1])),
            seed=self._seed)
        self._explainer.fit(train_data=X)
        # Alternative discretisation grids that were tried:
        # disc_perc=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
        # disc_perc=(0.1, 0.3, 0.5, 0.7, 0.9)
        # disc_perc=(0.2, 0.4, 0.6, 0.8)

    def _compute_anchors_per_sample(self, X: np.ndarray, idx: int) -> List:
        result = self._explainer.explain(X=X[idx, :])
        return result.data['raw']['feature']

    @staticmethod
    def _calculate_importance(anchors: List, output_shape: Tuple) -> np.ndarray:
        # Mark the features that appear in each anchor with a 1.
        importance = np.zeros(shape=output_shape)
        for k, anchor in enumerate(anchors):
            if isinstance(anchor, list):
                importance[k, anchor] = 1
            else:
                importance[anchor] = 1
        return importance

    def _compute_anchors(self, X: np.ndarray, num_jobs: int) -> List:
        return Parallel(n_jobs=num_jobs)(
            delayed(self._compute_anchors_per_sample)(X, sample_idx)
            for sample_idx in range(X.shape[0]))

    def explain(self, X: np.ndarray, sample_idx: int) -> np.ndarray:
        anchors = self._compute_anchors_per_sample(X=X, idx=sample_idx)
        return self._calculate_importance(anchors=anchors,
                                          output_shape=(X.shape[1],))

    def explain_batch(self, X: np.ndarray, num_jobs: int = 2) -> np.ndarray:
        anchors = self._compute_anchors(X=X, num_jobs=num_jobs)
        return self._calculate_importance(anchors=anchors, output_shape=X.shape)
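# Minimal usage sketch for the Anchors wrapper above (hypothetical data and classifier;
# assumes `assign_model` accepts a fitted scikit-learn estimator exposing predict_proba):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(random_state=0).fit(X, y)

anchors = Anchors(model=clf)
anchors.fit(X)
single_mask = anchors.explain(X, sample_idx=0)   # shape (n_features,); 1 marks a feature in the anchor
batch_masks = anchors.explain_batch(X[:10])      # shape (10, n_features), one mask per row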
def at_iris_explainer(get_iris_dataset, rf_classifier, request):
    """
    Instantiates and fits an AnchorTabular explainer for the Iris dataset.
    """
    predict_type = request.param
    data = get_iris_dataset
    clf, _ = rf_classifier  # preprocessor not necessary

    # instantiate and fit explainer
    pred_fn = predict_fcn(predict_type, clf)
    explainer = AnchorTabular(pred_fn, data['metadata']['feature_names'])
    explainer.fit(data['X_train'], disc_perc=(25, 50, 75))

    return data['X_test'], explainer, pred_fn, predict_type
def at_adult_explainer(get_adult_dataset, rf_classifier, request):
    """
    Instantiates and fits an AnchorTabular explainer for the Adult dataset.
    """
    # fit random forest classifier
    predict_type = request.param
    data = get_adult_dataset
    clf, preprocessor = rf_classifier

    # instantiate and fit explainer
    pred_fn = predict_fcn(predict_type, clf, preprocessor)
    explainer = AnchorTabular(
        pred_fn,
        data['metadata']['feature_names'],
        categorical_names=data['metadata']['category_map'])
    explainer.fit(data['X_train'], disc_perc=(25, 50, 75))

    return data['X_test'], explainer, pred_fn, predict_type
def fit(self, x, y):
    self.dim = x.shape[1]

    # clf = sklearn.svm.SVC(kernel=self.kernel, probability=True)
    clf = RandomForestClassifier()
    clf.fit(x, y)
    y_pred = clf.predict(x)
    print("Clf model accuracy: [{:.4f}]".format(
        sklearn.metrics.accuracy_score(y, y_pred)))

    self.ano_idx = np.where(y == 1)[0]
    print(self.ano_idx.shape)

    n_f = x.shape[1]
    feature_names = ["A" + str(i) for i in range(n_f)]

    # use anchor
    predict_fn = lambda xx: clf.predict_proba(xx)
    explainer = AnchorTabular(predict_fn, feature_names)
    explainer.fit(x, disc_perc=(25, 50, 75))

    exp_sub_lst = []
    for i in tqdm(range(len(self.ano_idx))):
        ano = x[self.ano_idx[i]]
        explanation = explainer.explain(ano, threshold=0.95)
        anchor = explanation['anchor']

        # Parse the feature indices ("A<idx>") out of the anchor conditions.
        f_sub = []
        for a in anchor:
            for item in a.split(" "):
                if item.startswith("A"):
                    f_sub.append(int(item[1:]))
        # print(anchor, f_sub)

        # Fall back to all features if the anchor is empty.
        if len(f_sub) == 0:
            f_sub = np.arange(n_f)
        exp_sub_lst.append(f_sub)

    return exp_sub_lst
def make_anchor_tabular(dirname: Optional[Path] = None) -> AnchorTabular:
    # train model
    iris_data = load_iris()
    clf = LogisticRegression(solver="liblinear", multi_class="ovr")
    clf.fit(iris_data.data, iris_data.target)

    # create explainer
    explainer = AnchorTabular(clf.predict, feature_names=iris_data.feature_names)
    explainer.fit(iris_data.data, disc_perc=(25, 50, 75))

    if dirname is not None:
        explainer.save(dirname)
    return explainer
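# A possible way to reload the explainer persisted by make_anchor_tabular above, assuming
# alibi's saving utilities; the predictor is not serialised, so it is supplied again at load time.
from pathlib import Path
from alibi.saving import load_explainer


def load_anchor_tabular(dirname: Path, predictor) -> AnchorTabular:
    # `predictor` should be the same kind of callable used at save time (e.g. clf.predict).
    return load_explainer(dirname, predictor=predictor)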
def retrain_classifier_final(self, args, nn_model_ref):
    nn_model_ref.epochs = args.num_epch_2
    nn_model_ref.batch_size_2 = args.batch_size_2
    nn_model_ref.net.freeze()
    X_train_proba_feat, X_eval_proba_feat = nn_model_ref.all_intermediaire, nn_model_ref.all_intermediaire_val
    Y_train_proba = nn_model_ref.Y_train_nn_binaire
    Y_eval_proba = nn_model_ref.Y_val_nn_binaire
    print("START RETRAIN LINEAR NN GOHR")
    print()
    """net_retrain, h = train_speck_distinguisher(args, X_train_proba_feat.shape[1], X_train_proba_feat,
                                                  Y_train_proba, X_eval_proba_feat, Y_eval_proba,
                                                  bs=args.batch_size_2, epoch=args.num_epch_2,
                                                  name_ici="retrain_nn_gohr", wdir=self.path_save_model)"""
    from alibi.explainers import AnchorTabular
    # from alibi.explainers import AnchorImage
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_train_proba_feat, Y_train_proba)

    predict_fn = lambda x: clf.predict_proba(x)
    feature_names = [i for i in range(X_train_proba_feat.shape[1])]
    explainer = AnchorTabular(predict_fn, feature_names)

    idx = 0
    explainer.fit(X_train_proba_feat, disc_perc=(25,))
    print('Prediction: ', explainer.predictor(X_eval_proba_feat[idx].reshape(1, -1))[0])
    # print('Prediction: ', explainer.predict_fn(X_eval_proba_feat[idx].reshape(1, -1))[0])
    explanation = explainer.explain(X_eval_proba_feat[idx], threshold=0.8)
    print('Anchor: %s' % (' AND '.join(explanation['names'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])

    # net_retrain is produced by the commented-out train_speck_distinguisher call above.
    return net_retrain
import numpy as np
from sklearn.datasets import load_iris
from alibi.explainers import AnchorTabular
import requests

dataset = load_iris()
feature_names = dataset.feature_names
iris_data = dataset.data

model_url = "http://localhost:8003/seldon/seldon/iris/api/v1.0/predictions"


def predict_fn(X):
    # Forward the instances to the deployed Seldon model and return its predictions.
    data = {"data": {"ndarray": X.tolist()}}
    r = requests.post(model_url, json=data)
    return np.array(r.json()["data"]["ndarray"])


explainer = AnchorTabular(predict_fn, feature_names)
explainer.fit(iris_data, disc_perc=(25, 50, 75))
explainer.save("./explainer/")
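# Example query against the fitted explainer above (a sketch, assuming the Seldon endpoint
# at model_url is reachable); the fields follow alibi's Explanation attribute access.
explanation = explainer.explain(iris_data[0], threshold=0.95)
print("Anchor:", " AND ".join(explanation.anchor))
print("Precision: %.2f" % explanation.precision)
print("Coverage: %.2f" % explanation.coverage)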
    clf = DecisionTreeClassifier(random_state=42)
else:
    clf = RandomForestClassifier(random_state=42)
# st.sidebar.write(selected_model)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

st.write("""### Metrics:""")
st.write('Train accuracy:', accuracy_score(y_train, clf.predict(X_train)))
st.write('Test accuracy:', accuracy_score(y_test, clf.predict(X_test)))

confusion_matrix(y_test, y_pred)
st.write('Confusion matrix:')
plot_confusion_matrix(clf, X_test, y_test)
st.pyplot()
# st.write(classification_report(y_test, y_pred))

predict_fn = lambda x: clf.predict_proba(x)
explainer = AnchorTabular(predict_fn, feature_names)
explainer.fit(X_train)

idx = st.sidebar.slider(label='Select an instance:', min_value=1, max_value=len(y_test))
st.write("""### Selected instance:""")
st.write(X_test_df.iloc[[idx - 1]], height=150)
print(y_train_df.iloc[[idx - 1]])

st.write('Prediction: ', class_names[explainer.predictor(X_test[idx - 1].reshape(1, -1))[0]])
st.write("""### Prediction Explained:""")
with st.spinner('Calculating'):
    explanation = explainer.explain(X_test[idx - 1], threshold=0.70)
st.write('Anchor (instance explanation): %s' % (' AND '.join(explanation.anchor)))
st.write('Precision: %.2f' % explanation.precision)
st.write('Coverage: %.2f' % explanation.coverage)

# st.write("""### Trust score:""")
ts = TrustScore(k_filter=10, alpha=.05,
def make_anchor_tabular_income(dirname: Optional[Path] = None) -> AnchorTabular:
    # adapted from:
    # https://docs.seldon.io/projects/alibi/en/latest/examples/anchor_tabular_adult.html
    np.random.seed(0)

    # prepare data
    adult = fetch_adult()
    data = adult.data
    target = adult.target
    feature_names = adult.feature_names
    category_map = adult.category_map

    data_perm = np.random.permutation(np.c_[data, target])
    data = data_perm[:, :-1]
    target = data_perm[:, -1]

    # build model
    idx = 30000
    X_train, Y_train = data[:idx, :], target[:idx]
    X_test, Y_test = data[idx + 1:, :], target[idx + 1:]

    ordinal_features = [
        x for x in range(len(feature_names)) if x not in list(category_map.keys())
    ]
    ordinal_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_features = list(category_map.keys())
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", ordinal_transformer, ordinal_features),
        ("cat", categorical_transformer, categorical_features),
    ])

    clf = RandomForestClassifier(n_estimators=50)
    model_pipeline = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("classifier", clf),
    ])
    model_pipeline.fit(X_train, Y_train)

    explainer = AnchorTabular(model_pipeline.predict, feature_names,
                              categorical_names=category_map, seed=1)
    explainer.fit(X_train, disc_perc=[25, 50, 75])

    if dirname is not None:
        explainer.save(dirname)
    return explainer
def fit(self, X: Any) -> None:
    self._explainer = AnchorTabular(
        predictor=self._model.predict_proba,
        feature_names=list(range(X.shape[1])),
        seed=self._seed)
    self._explainer.fit(train_data=X)
def anchors_connector(self, *arg):
    query_instance = dict(s.split(':') for s in arg)

    # anchor instance to model instance. Input: Numpy. Output: Pandas df.
    # Turns numbers into categories.
    def adapter(n):
        d = pd.DataFrame(data=n, columns=self.featureNames)
        categories = self.getCategoricalFeatures()
        for c in categories:
            d[c] = d[c].map(self.dictionary[c]["values"])
        # d['Sex'] = d['Sex'].map({0: 'Male', 1: 'Female'})
        # d['Embarked'] = d['Embarked'].map({0: 'Southampton', 1: 'Cherbourg', 2: 'Queenstown'})
        # d['Pclass'] = d['Pclass'].map({0: 'First', 1: 'Second', 2: 'Third'})
        return d

    # model instance to anchor instance. Input: Pandas df. Output: Numpy.
    # Turns categories into numbers.
    def reverse_adapter(p):
        d = p.copy()
        categories = self.getCategoricalFeatures()
        for c in categories:
            d[c] = d[c].map({v: k for k, v in self.dictionary[c]["values"].items()})
        # d['Sex'] = d['Sex'].map({'Male': 0, 'Female': 1})
        # d['Embarked'] = d['Embarked'].map({'Southampton': 0, 'Cherbourg': 1, 'Queenstown': 2})
        # d['Pclass'] = d['Pclass'].map({'First': 0, 'Second': 1, 'Third': 2})
        return d.to_numpy().astype(float)

    predict_fn = lambda x: self.model.predict(adapter(x))

    # create the category map
    categories = self.getCategoricalFeatures()
    category_map = {}
    for i in range(len(self.featureNames)):
        if self.featureNames[i] in categories:
            category_map[i] = [
                str(k) for k in list(self.dictionary[self.featureNames[i]]["values"].values())
            ]
    # category_map = {0: ['First', 'Second', 'Third'], 1: ['Male', 'Female'],
    #                 4: ['Southampton', 'Cherbourg', 'Queenstown']}

    print("-------")
    print(query_instance)
    print(reverse_adapter(pd.DataFrame([query_instance])))

    # sort query_instance so the feature order matches the training data
    sorted_query_instance = {}
    for f in self.featureNames:
        sorted_query_instance[f] = query_instance[f]
    print(sorted_query_instance)
    print(reverse_adapter(pd.DataFrame([sorted_query_instance])))

    explainer = AnchorTabular(predict_fn,
                              feature_names=self.featureNames,
                              categorical_names=category_map)
    anchor_training = reverse_adapter(self.X_train)
    explainer.fit(anchor_training, disc_perc=[25, 50, 75])
    explanation = explainer.explain(
        reverse_adapter(pd.DataFrame([sorted_query_instance])),
        threshold=0.90, max_anchor_size=3, batch_size=2000)

    print('Anchor: %s' % (' AND '.join(explanation['data']['anchor'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])

    # build rule: attribute to each anchor condition its marginal precision gain
    rule = ""
    names = explanation['data']['anchor']
    precision = np.asarray(explanation['raw']['precision'])
    precision[1:] -= precision[:-1].copy()
    precision = [round(elem, 2) for elem in precision.tolist()]
    for i in range(0, len(names)):
        rule = rule + names[i]
        importance = round(precision[i] / sum(precision) * 100, 2)
        rule = rule + " (" + str(importance) + "%)"
        if i < len(names) - 1:
            rule = rule + " AND "

    self.explanation = ('I generated the following rule for you. It describes the boundaries '
                        'under which the current prediction remains stable: <br> <br> <big>'
                        + rule + '</big>. <br> <br> Each rule condition has an importance score '
                        'which shows how critical the condition is for the prediction outcome to stay stable.')
    self.certainty = ('I tested the rule on many sample data instances. The rule applies to %.2f'
                      % explanation['coverage']
                      + ' of the instances. In these cases, the rule was accurate in %.2f'
                      % explanation['precision'] + ' of the cases.')
    return True
def main(unused_args):
    # Read hypertuning values from file
    if args.component == 'training':
        timestamp = str(args.timestamp)
        filename = "/mnt/Model_Blerssi/hpv-" + timestamp + ".txt"
        f = open(filename, "r")
        args.tf_batch_size = int(f.readline())
        args.learning_rate = float(f.readline())
        print("****************")
        print("Optimized hyperparameter values")
        print("Batch-size = " + str(args.tf_batch_size))
        print("Learning rate = " + str(args.learning_rate))
        print("****************")

    # Feature columns
    COLUMNS = list(BLE_RSSI.columns)
    FEATURES = COLUMNS[2:]
    LABEL = [COLUMNS[0]]
    b3001 = tf.feature_column.numeric_column(key='b3001', dtype=tf.float64)
    b3002 = tf.feature_column.numeric_column(key='b3002', dtype=tf.float64)
    b3003 = tf.feature_column.numeric_column(key='b3003', dtype=tf.float64)
    b3004 = tf.feature_column.numeric_column(key='b3004', dtype=tf.float64)
    b3005 = tf.feature_column.numeric_column(key='b3005', dtype=tf.float64)
    b3006 = tf.feature_column.numeric_column(key='b3006', dtype=tf.float64)
    b3007 = tf.feature_column.numeric_column(key='b3007', dtype=tf.float64)
    b3008 = tf.feature_column.numeric_column(key='b3008', dtype=tf.float64)
    b3009 = tf.feature_column.numeric_column(key='b3009', dtype=tf.float64)
    b3010 = tf.feature_column.numeric_column(key='b3010', dtype=tf.float64)
    b3011 = tf.feature_column.numeric_column(key='b3011', dtype=tf.float64)
    b3012 = tf.feature_column.numeric_column(key='b3012', dtype=tf.float64)
    b3013 = tf.feature_column.numeric_column(key='b3013', dtype=tf.float64)
    feature_columns = [
        b3001, b3002, b3003, b3004, b3005, b3006, b3007, b3008, b3009, b3010,
        b3011, b3012, b3013
    ]

    df_full = pd.read_csv("/opt/iBeacon_RSSI_Labeled.csv")  # Labeled dataset

    # Input data preprocessing
    df_full = df_full.drop(['date'], axis=1)
    df_full[FEATURES] = (df_full[FEATURES]) / (-200)

    # Output data preprocessing: map location labels to integer class ids
    location_dict = {
        'O02': 0, 'P01': 1, 'P02': 2, 'R01': 3, 'R02': 4, 'S01': 5, 'S02': 6,
        'T01': 7, 'U02': 8, 'U01': 9, 'J03': 10, 'K03': 11, 'L03': 12, 'M03': 13,
        'N03': 14, 'O03': 15, 'P03': 16, 'Q03': 17, 'R03': 18, 'S03': 19, 'T03': 20,
        'U03': 21, 'U04': 22, 'T04': 23, 'S04': 24, 'R04': 25, 'Q04': 26, 'P04': 27,
        'O04': 28, 'N04': 29, 'M04': 30, 'L04': 31, 'K04': 32, 'J04': 33, 'I04': 34,
        'I05': 35, 'J05': 36, 'K05': 37, 'L05': 38, 'M05': 39, 'N05': 40, 'O05': 41,
        'P05': 42, 'Q05': 43, 'R05': 44, 'S05': 45, 'T05': 46, 'U05': 47, 'S06': 48,
        'R06': 49, 'Q06': 50, 'P06': 51, 'O06': 52, 'N06': 53, 'M06': 54, 'L06': 55,
        'K06': 56, 'J06': 57, 'I06': 58, 'F08': 59, 'J02': 60, 'J07': 61, 'I07': 62,
        'I10': 63, 'J10': 64, 'D15': 65, 'E15': 66, 'G15': 67, 'J15': 68, 'L15': 69,
        'R15': 70, 'T15': 71, 'W15': 72, 'I08': 73, 'I03': 74, 'J08': 75, 'I01': 76,
        'I02': 77, 'J01': 78, 'K01': 79, 'K02': 80, 'L01': 81, 'L02': 82, 'M01': 83,
        'M02': 84, 'N01': 85, 'N02': 86, 'O01': 87, 'I09': 88, 'D14': 89, 'D13': 90,
        'K07': 91, 'K08': 92, 'N15': 93, 'P15': 94, 'I15': 95, 'S15': 96, 'U15': 97,
        'V15': 98, 'S07': 99, 'S08': 100, 'L09': 101, 'L08': 102, 'Q02': 103, 'Q01': 104
    }
    df_full['location'] = df_full['location'].map(location_dict)
    df_train = df_full.sample(frac=0.8, random_state=200)
    df_valid = df_full.drop(df_train.index)

    location_counts = BLE_RSSI.location.value_counts()
    x1 = np.asarray(df_train[FEATURES])
    y1 = np.asarray(df_train['location'])
    x2 = np.asarray(df_valid[FEATURES])
    y2 = np.asarray(df_valid['location'])

    def formatFeatures(features):
        # Turn the (n_samples, 13) array into a {column_name: column_values} dict.
        formattedFeatures = {}
        numColumns = features.shape[1]
        for i in range(0, numColumns):
            formattedFeatures["b" + str(3001 + i)] = features[:, i]
        return formattedFeatures
    trainingFeatures = formatFeatures(x1)
    trainingCategories = y1
    testFeatures = formatFeatures(x2)
    testCategories = y2

    # Train input function
    def train_input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((trainingFeatures, y1))
        dataset = dataset.repeat(args.epochs).batch(args.tf_batch_size)
        return dataset

    # Test input function
    def eval_input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((testFeatures, y2))
        return dataset.repeat(args.epochs).batch(args.tf_batch_size)

    # Distribution strategy across the devices used to train the model
    distribution = tf.distribute.experimental.ParameterServerStrategy()
    print('Number of devices: {}'.format(distribution.num_replicas_in_sync))

    # Configuration of the training run
    config = tf.estimator.RunConfig(train_distribute=distribution,
                                    model_dir=args.tf_model_dir,
                                    save_summary_steps=100,
                                    save_checkpoints_steps=100)

    # Build a 3-layer DNN classifier
    model = tf.estimator.DNNClassifier(hidden_units=[13, 65, 110],
                                       feature_columns=feature_columns,
                                       optimizer=tf.train.AdamOptimizer(
                                           learning_rate=args.learning_rate,
                                           beta1=args.beta1,
                                           beta2=args.beta2),
                                       model_dir=args.tf_model_dir,
                                       n_classes=105,
                                       config=config)

    export_final = tf.estimator.FinalExporter(
        args.tf_export_dir, serving_input_receiver_fn=serving_input_receiver_fn)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.tf_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=100,
                                      exporters=export_final,
                                      throttle_secs=1,
                                      start_delay_secs=1)

    # Train and evaluate the model
    tf.estimator.train_and_evaluate(model, train_spec, eval_spec)

    MODEL_EXPORT_PATH = args.tf_model_dir

    def predict(request):
        """
        Define a custom predict function to be used for local prediction and by the
        explainer. Set the anchor_tabular predict function so it always returns the
        predicted class probabilities.
        """
        # Get the exported model path (first numeric subdirectory of the model dir)
        for dir in os.listdir(args.tf_model_dir):
            if re.match('[0-9]', dir):
                exported_path = os.path.join(args.tf_model_dir, dir)
                break
        else:
            raise Exception("Model path not found")

        # Prepare model input data
        feature_cols = [
            "b3001", "b3002", "b3003", "b3004", "b3005", "b3006", "b3007",
            "b3008", "b3009", "b3010", "b3011", "b3012", "b3013"
        ]
        input = {col: [] for col in feature_cols}
        X = request
        if np.ndim(X) != 2:
            for i in range(len(X)):
                input[feature_cols[i]].append(X[i])
        else:
            for i in range(len(X)):
                for j in range(len(X[i])):
                    input[feature_cols[j]].append(X[i][j])

        # Open a session and run the exported model
        with tf.Session() as sess:
            tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                       exported_path)
            predictor = tf.contrib.predictor.from_saved_model(
                exported_path, signature_def_key='predict')
            output_dict = predictor(input)

        output = {}
        output["predictions"] = {
            "probabilities": output_dict["probabilities"].tolist()
        }
        return np.asarray(output['predictions']["probabilities"])

    # Initialize and fit the explainer
    feature_cols = [
        "b3001", "b3002", "b3003", "b3004", "b3005", "b3006", "b3007",
        "b3008", "b3009", "b3010", "b3011", "b3012", "b3013"
    ]
    explainer = AnchorTabular(predict, feature_cols)
    explainer.fit(x1, disc_perc=(25, 50, 75))

    # Save the explainer with a .dill extension; it is used when creating the InferenceService
    if not os.path.exists(args.explainer_dir):
        os.mkdir(args.explainer_dir)
    with open("%s/explainer.dill" % args.explainer_dir, 'wb') as f:
        dill.dump(explainer, f)