def test_explain_linear_dense():
    clf = LogisticRegression(random_state=42)
    data = [{'day': 'mon', 'moon': 'full'},
            {'day': 'tue', 'moon': 'rising'},
            {'day': 'tue', 'moon': 'rising'},
            {'day': 'mon', 'moon': 'rising'}]
    vec = DictVectorizer(sparse=False)
    X = vec.fit_transform(data)
    clf.fit(X, [0, 1, 1, 0])
    test_day = {'day': 'tue', 'moon': 'full'}
    target_names = ['sunny', 'shady']
    res1 = explain_prediction(clf, test_day, vec=vec, target_names=target_names)
    expl_text, expl_html = format_as_all(res1, clf)
    assert 'day=tue' in expl_text
    assert 'day=tue' in expl_html
    [test_day_vec] = vec.transform(test_day)
    res2 = explain_prediction(
        clf, test_day_vec, target_names=target_names,
        vectorized=True, feature_names=vec.get_feature_names())
    assert res1 == res2
def explainELI5(rows, name, classes, XY):
    import eli5

    cols = XY[2]
    clf = loadModel(name + "-" + classes)
    ExplainPath_Specific = ExplainPath + name + "_" + classes
    weights = eli5.explain_weights(clf, feature_names=cols, top=(len(cols) + 1))
    predictionA = eli5.explain_prediction(clf, XY[0][0], feature_names=cols,
                                          top=(len(cols) + 1))
    predictionB = eli5.explain_prediction(clf, XY[0][1], feature_names=cols,
                                          top=(len(cols) + 1))

    html = eli5.format_as_html(weights)
    with open(uniquify(ExplainPath_Specific + "/" + classes + "_ELI5_WEIGHTS.html"), 'w') as f:
        f.write(html)
    html = eli5.format_as_html(predictionA)
    with open(uniquify(ExplainPath_Specific + "/" + classes + "_ELI5_PREDICTION_A.html"), 'w') as f:
        f.write(html)
    html = eli5.format_as_html(predictionB)
    with open(uniquify(ExplainPath_Specific + "/" + classes + "_ELI5_PREDICTION_B.html"), 'w') as f:
        f.write(html)
def assert_binary_linear_classifier_explained(newsgroups_train_binary, clf,
                                              explain_prediction):
    docs, y, target_names = newsgroups_train_binary
    vec = TfidfVectorizer()
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    assert y[2] == 1
    cg_document = docs[2]
    res = explain_prediction(clf, cg_document, vec=vec,
                             target_names=target_names, top=20)
    expl_text, expl_html = format_as_all(res, clf)
    for expl in [expl_text, expl_html]:
        assert 'software' in expl or 'thanks' in expl
        assert target_names[1] in expl

    assert y[15] == 0
    atheism_document = docs[15]
    res = explain_prediction(clf, atheism_document, vec=vec,
                             target_names=target_names, top=20)
    expl_text, expl_html = format_as_all(res, clf)
    for expl in [expl_text, expl_html]:
        assert 'god' in expl
        assert target_names[0] in expl

    assert_correct_class_explained_binary(clf, X[::10])
def test_explain_prediction_unsupported():
    clf = BaseEstimator()
    doc = 'doc'
    res = explain_prediction(clf, doc)
    assert 'BaseEstimator' in res.error
    with pytest.raises(TypeError):
        explain_prediction(clf, doc, unknown_argument=True)
def test_explain_hashing_vectorizer(newsgroups_train_binary):
    # test that we can pass InvertableHashingVectorizer explicitly
    vec = HashingVectorizer(n_features=1000)
    ivec = InvertableHashingVectorizer(vec)
    clf = LogisticRegression(random_state=42)
    docs, y, target_names = newsgroups_train_binary
    ivec.fit([docs[0]])
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    get_res = lambda **kwargs: explain_prediction(
        clf, docs[0], vec=ivec, target_names=target_names, top=20, **kwargs)
    res = get_res()
    check_explain_linear_binary(res, clf)
    assert res == get_res()
    res_vectorized = explain_prediction(
        clf, vec.transform([docs[0]])[0], vec=ivec,
        target_names=target_names, top=20, vectorized=True)
    pprint(res_vectorized)
    assert res_vectorized == _without_weighted_spans(res)
    assert res == get_res(
        feature_names=ivec.get_feature_names(always_signed=False))
def test_explain_prediction_booster_multitarget(newsgroups_train):
    docs, ys, target_names = newsgroups_train
    vec = CountVectorizer(stop_words='english', dtype=np.float64)
    xs = vec.fit_transform(docs)
    clf = lightgbm.train(
        params={
            'objective': 'multiclass',
            'verbose_eval': -1,
            'max_depth': 2,
            'n_estimators': 100,
            'min_child_samples': 1,
            'min_child_weight': 1,
            'num_class': len(target_names),
        },
        train_set=lightgbm.Dataset(xs.toarray(), label=ys))

    doc = 'computer graphics in space: a new religion'
    res = explain_prediction(clf, doc, vec=vec, target_names=target_names)
    format_as_all(res, clf)
    check_targets_scores(res)

    graphics_weights = res.targets[1].feature_weights
    assert 'computer' in get_all_features(graphics_weights.pos)
    religion_weights = res.targets[3].feature_weights
    assert 'religion' in get_all_features(religion_weights.pos)

    top_target_res = explain_prediction(clf, doc, vec=vec, top_targets=2)
    assert len(top_target_res.targets) == 2
    assert sorted(t.proba for t in top_target_res.targets) == sorted(
        t.proba for t in res.targets)[-2:]
def assert_linear_regression_explained(boston_train, reg, explain_prediction):
    X, y, feature_names = boston_train
    reg.fit(X, y)
    res = explain_prediction(reg, X[0])
    expl_text, expl_html = format_as_all(res, reg)

    assert len(res.targets) == 1
    target = res.targets[0]
    assert target.target == 'y'
    pos, neg = (get_all_features(target.feature_weights.pos),
                get_all_features(target.feature_weights.neg))
    assert 'x11' in pos or 'x11' in neg

    if has_intercept(reg):
        assert '<BIAS>' in pos or '<BIAS>' in neg
        assert '<BIAS>' in expl_text
        assert '<BIAS>' in expl_html
    else:
        assert '<BIAS>' not in pos and '<BIAS>' not in neg
        assert '<BIAS>' not in expl_text
        assert 'BIAS' not in expl_html

    for expl in [expl_text, expl_html]:
        assert 'x11' in expl
        assert '(score' in expl
    assert "'y'" in expl_text
    assert '<b>y</b>' in strip_blanks(expl_html)

    assert res == explain_prediction(reg, X[0])
def assert_multitarget_linear_regression_explained(reg, explain_prediction):
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg.fit(X, y)
    res = explain_prediction(reg, X[0])
    expl_text, expl_html = format_as_all(res, reg)

    assert len(res.targets) == 3
    target = res.targets[1]
    assert target.target == 'y1'
    pos, neg = (get_all_features(target.feature_weights.pos),
                get_all_features(target.feature_weights.neg))
    assert 'x8' in pos or 'x8' in neg
    if has_intercept(reg):
        assert '<BIAS>' in pos or '<BIAS>' in neg

    assert 'x8' in expl_text
    if has_intercept(reg):
        assert '<BIAS>' in expl_text
    assert "'y2'" in expl_text

    assert res == explain_prediction(reg, X[0])
    check_targets_scores(res)

    top_targets_res = explain_prediction(reg, X[0], top_targets=1)
    assert len(top_targets_res.targets) == 1
def assert_linear_regression_explained(boston_train, reg, explain_prediction,
                                       atol=1e-8, reg_has_intercept=None):
    X, y, feature_names = boston_train
    reg.fit(X, y)
    res = explain_prediction(reg, X[0], feature_names=feature_names)
    expl_text, expl_html = expls = format_as_all(res, reg)

    assert len(res.targets) == 1
    target = res.targets[0]
    assert target.target == 'y'
    get_pos_neg_features = lambda fw: (
        get_all_features(fw.pos, with_weights=True),
        get_all_features(fw.neg, with_weights=True))
    pos, neg = get_pos_neg_features(target.feature_weights)
    assert 'LSTAT' in pos or 'LSTAT' in neg

    if reg_has_intercept is None:
        reg_has_intercept = has_intercept(reg)
    if reg_has_intercept:
        assert '<BIAS>' in pos or '<BIAS>' in neg
        assert '<BIAS>' in expl_text
        assert '<BIAS>' in expl_html
    else:
        assert '<BIAS>' not in pos and '<BIAS>' not in neg
        assert '<BIAS>' not in expl_text
        assert 'BIAS' not in expl_html

    for expl in [expl_text, expl_html]:
        assert 'LSTAT' in expl
        assert '(score' in expl
    assert "'y'" in expl_text
    assert '<b>y</b>' in strip_blanks(expl_html)
    for expl in expls:
        assert_feature_values_present(expl, feature_names, X[0])

    assert res == explain_prediction(reg, X[0], feature_names=feature_names)
    check_targets_scores(res, atol=atol)

    flt_res = explain_prediction(
        reg, X[0], feature_names=feature_names,
        feature_filter=lambda name, v: name != 'LSTAT')
    format_as_all(flt_res, reg)
    flt_target = flt_res.targets[0]
    flt_pos, flt_neg = get_pos_neg_features(flt_target.feature_weights)
    assert 'LSTAT' not in flt_pos and 'LSTAT' not in flt_neg
    flt_all = dict(flt_pos, **flt_neg)
    expected = dict(pos, **neg)
    expected.pop('LSTAT')
    assert flt_all == expected
def test_unsupported():
    vec = CountVectorizer()
    clf = BaseEstimator()
    doc = 'doc'
    res = explain_prediction(clf, doc, vec=vec)
    assert 'BaseEstimator' in res.error
    for expl in format_as_all(res, clf):
        assert 'Error' in expl
        assert 'BaseEstimator' in expl
    with pytest.raises(TypeError):
        explain_prediction(clf, doc, unknown_argument=True)
def test_explain_tree_regressor_multitarget(reg):
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg.fit(X, y)
    res = explain_prediction(reg, X[0])
    for expl in format_as_all(res, reg):
        for target in ['y0', 'y1', 'y2']:
            assert target in expl
        assert 'BIAS' in expl
        assert any('x%d' % i in expl for i in range(10))
    check_targets_scores(res)

    top_targets_res = explain_prediction(reg, X[0], top_targets=1)
    assert len(top_targets_res.targets) == 1
def test_explain_regression_hashing_vectorizer(newsgroups_train_binary):
    docs, y, target_names = newsgroups_train_binary
    vec = HashingVectorizer(norm=None)
    clf = LinearRegression()
    clf.fit(vec.fit_transform(docs), y)

    # Set a large "top" in order to compare with CountVectorizer below
    # (due to small differences in the coefficients they might have cutoffs
    # at different points).
    res = explain_prediction(
        clf, docs[0], vec=vec, target_names=[target_names[1]], top=1000)
    expl, _ = format_as_all(res, clf)
    assert len(res.targets) == 1
    e = res.targets[0]
    assert e.target == 'comp.graphics'
    neg = get_all_features(e.feature_weights.neg)
    assert 'objective' in neg
    assert 'that' in neg
    assert 'comp.graphics' in expl
    assert 'objective' in expl
    assert 'that' in expl

    # HashingVectorizer with norm=None is "the same" as CountVectorizer,
    # so we can compare them and check that the explanations are almost
    # the same.
    count_vec = CountVectorizer()
    count_clf = LinearRegression()
    count_clf.fit(count_vec.fit_transform(docs), y)
    count_res = explain_prediction(
        count_clf, docs[0], vec=count_vec,
        target_names=[target_names[1]], top=1000)
    pprint(count_res)
    count_expl, _ = format_as_all(count_res, count_clf)
    print(count_expl)

    for key in ['pos', 'neg']:
        values, count_values = [
            sorted(get_names_coefs(getattr(r.targets[0].feature_weights, key)))
            for r in [res, count_res]]
        assert len(values) == len(count_values)
        for (name, coef), (count_name, count_coef) in zip(values, count_values):
            assert name == count_name
            assert abs(coef - count_coef) < 0.05
def test_explain_tree_clf_multiclass(clf, iris_train):
    X, y, feature_names, target_names = iris_train
    clf.fit(X, y)
    res = explain_prediction(
        clf, X[0], target_names=target_names, feature_names=feature_names)
    for expl in format_as_all(res, clf):
        for target in target_names:
            assert target in expl
        assert 'BIAS' in expl
        assert any(f in expl for f in feature_names)
        assert_feature_values_present(expl, feature_names, X[0])
    check_targets_scores(res)

    top_targets_res = explain_prediction(clf, X[0], top_targets=1)
    assert len(top_targets_res.targets) == 1
def assert_tree_explain_prediction_single_target(clf, X, feature_names):
    get_res = lambda _x, **kwargs: explain_prediction(
        clf, _x, feature_names=feature_names, **kwargs)
    res = get_res(X[0])
    for expl in format_as_all(res, clf):
        assert_feature_values_present(expl, feature_names, X[0])

    checked_flt = False
    all_expls = []
    for x in X[:5]:
        res = get_res(x)
        text_expl = format_as_text(res, show=fields.WEIGHTS)
        print(text_expl)
        assert '<BIAS>' in text_expl
        check_targets_scores(res)
        all_expls.append(text_expl)

        get_all = lambda fw: get_all_features(fw.pos) | get_all_features(fw.neg)
        all_features = get_all(res.targets[0].feature_weights)
        if len(all_features) > 1:
            f = list(all_features - {'<BIAS>'})[0]
            flt_res = get_res(x, feature_filter=lambda name, _: name != f)
            flt_features = get_all(flt_res.targets[0].feature_weights)
            assert flt_features == (all_features - {f})
            checked_flt = True

    assert checked_flt
    assert any(f in ''.join(all_expls) for f in feature_names)
def explain(model_path):
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"
    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path, binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()
    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)
    pd.set_option('display.max_rows', None)
    explanation = eli5.format_as_text(eli5.explain_weights(model, top=None))
    print(explanation)

    feature_names = []
    column_names = ["qid", "ql", "qr"]
    # reader = pd.read_csv(in_path, sep="\t", dtype="str", names=column_names, chunksize=100)
    reader = pd.read_csv(sys.stdin, sep="\t", dtype="str", names=column_names, chunksize=1)
    first_chunk = True
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model, ner_dict, syn_dict)
    for data in reader:
        data.fillna("", inplace=True)
        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        X_features = X.apply(pd.Series)
        feature_names = X_features.columns.values.tolist()
        X_features = X_features[feature_names]
        y_preds = model.predict_proba(X_features, ntree_limit=model.best_ntree_limit)
        # keep only the probability of the positive class
        y_preds = [o[1] for o in y_preds]
        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)
        # if first_chunk:
        #     data.to_csv(in_path + ".predict", header=True, sep="\t", mode="w")
        #     first_chunk = False
        # else:
        #     data.to_csv(in_path + ".predict", header=False, sep="\t", mode="a")
        data.to_csv(sys.stdout, header=False, sep="\t")

    explanation = eli5.format_as_text(eli5.explain_prediction(model, X_features.iloc[0]))
    print(explanation)
    print(X_features.iloc[0])
def test_explain_linear_classifiers_unsupported_kernels(
        clf, newsgroups_train_binary):
    docs, y, target_names = newsgroups_train_binary
    vec = TfidfVectorizer()
    clf.fit(vec.fit_transform(docs), y)
    res = explain_prediction(clf, docs[0], vec=vec)
    assert 'supported' in res.error
def explore_final_model():
    # https://github.com/gameofdimension/xgboost_explainer/blob/master/xgboost_explainer_demo.ipynb
    nr_labels = len(y)
    value_counts = y.value_counts()
    perc_per_label = {k: round(100 * v / float(nr_labels), 2)
                      for k, v in value_counts.items()}
    print('value counts:', value_counts)
    print('perc per label:', perc_per_label)

    model = pickle.load(open(filename_model, "rb"))
    model_feature_names = model.attr('feature_names').split('|')
    index_to_class = json.loads(model.attr('index_to_class'))
    print(index_to_class)
    classes = [index_to_class[k] for k in sorted(index_to_class.keys())]
    print(classes)
    print('eli5 explain weights (gain):\n',
          eli5.format_as_text(eli5.explain_weights(model, top=10)))  # gain

    df_test = pd.read_json(open(test_filename, "r"))
    df_test = df_test.head(5)
    feature_extractor = FeatureExtractor(df_test)
    X_test, X_test_featurenames = feature_extractor.get_features_pred_instances(
        df_test, model_feature_names)
    print(X_test)
    print(set(X_test.dtypes))
    # print(X_test.iloc[0])
    print(eli5.format_as_text(eli5.explain_prediction(
        model, X_test.head(1), target_names=classes, top=10,
        feature_names=X_test_featurenames)))
def print_eli5(click_data, category):
    pred = pd.read_csv(category + '_xy.csv')
    model = joblib.load(category + ".h5")
    pred = pred.loc[(pred['grid_x'] == click_data['points'][0]['lon']) &
                    (pred['grid_y'] == click_data['points'][0]['lat']), :]
    pred_sqr = pred['eurogrid_0250_1'].values[0]
    dane_model = df.loc[df['eurogrid_0250_1'] == pred_sqr, :]
    weights_df = eli5.format_as_dataframe(eli5.explain_weights(model))
    cols = weights_df['feature'].values
    # map generic feature names ('x0', 'x1', ...) back to column names
    maping = {'x' + str(i): cols[i] for i in range(len(cols))}
    # print(dane_model.columns)
    expl = dane_model.loc[:, cols]
    # print(expl.head())
    # try column orders until one matches the order the model was trained with
    all_cols = itertools.permutations(cols)
    for cols in all_cols:
        try:
            expl = expl.loc[:, list(cols)]
            expl = eli5.formatters.format_as_dataframe(
                eli5.explain_prediction(model, expl))
            break
        except Exception:
            continue
    expl['feature'] = expl['feature'].apply(lambda x: map_x(x, maping))
    return generate_table(expl)
def result():
    # request.data is bytes; decode it rather than calling str(),
    # which would produce a "b'...'" literal
    tweet = request.data.decode('utf-8')
    explanation = explain_prediction(
        gnb, tweet, vec=tfid, target_names=['known weird', 'less weird'])
    return str(format_as_text(explanation))
def assert_multiclass_linear_classifier_explained(newsgroups_train, clf,
                                                  explain_prediction):
    docs, y, target_names = newsgroups_train
    vec = TfidfVectorizer()
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    get_res = lambda: explain_prediction(
        clf, docs[0], vec=vec, target_names=target_names, top=20)
    res = get_res()
    pprint(res)
    expl_text, expl_html = format_as_all(res, clf)

    for e in res.targets:
        if e.target != 'comp.graphics':
            continue
        pos = get_all_features(e.feature_weights.pos)
        assert 'file' in pos

    for expl in [expl_text, expl_html]:
        for label in target_names:
            assert str(label) in expl
        assert 'file' in expl

    assert res == get_res()
def test_explain_one_class_svm():
    X = np.array([[0, 0], [0, 1], [5, 3], [93, 94], [90, 91]])
    clf = OneClassSVM(kernel='linear', random_state=42).fit(X)
    res = explain_prediction(clf, X[0])
    assert res.targets[0].score < 0
    for expl in format_as_all(res, clf):
        assert 'BIAS' in expl
        assert 'x0' not in expl
        assert 'x1' not in expl

    res = explain_prediction(clf, X[4])
    assert res.targets[0].score > 0
    for expl in format_as_all(res, clf):
        assert 'BIAS' in expl
        assert 'x0' in expl
        assert 'x1' in expl
def test_explain_prediction_clf_binary(newsgroups_train_binary_big):
    docs, ys, target_names = newsgroups_train_binary_big
    vec = CountVectorizer(stop_words='english', dtype=np.float64)
    clf = LGBMClassifier(n_estimators=100, max_depth=2,
                         min_child_samples=1, min_child_weight=1)
    xs = vec.fit_transform(docs)
    clf.fit(xs, ys)

    get_res = lambda **kwargs: explain_prediction(
        clf, 'computer graphics in space: a sign of atheism',
        vec=vec, target_names=target_names, **kwargs)
    res = get_res()
    for expl in format_as_all(res, clf, show_feature_values=True):
        assert 'graphics' in expl
    check_targets_scores(res)

    weights = res.targets[0].feature_weights
    pos_features = get_all_features(weights.pos)
    neg_features = get_all_features(weights.neg)
    assert 'graphics' in pos_features
    assert 'computer' in pos_features
    assert 'atheism' in neg_features

    flt_res = get_res(feature_re='gra')
    flt_pos_features = get_all_features(flt_res.targets[0].feature_weights.pos)
    assert 'graphics' in flt_pos_features
    assert 'computer' not in flt_pos_features
def fi_eli(self, instance, prediction, model):
    """
    Parameters
    ----------
    instance: array
        The instance whose prediction Altruist will investigate.
    prediction: None or any
        Not used in this function; kept for interface consistency.
    model:
        The machine learning model that made the prediction.

    Returns
    -------
    list
        The feature importances provided by Eli5.
    """
    fn = [i for i in range(len(instance))]
    temp = format_as_dataframe(explain_prediction(model, instance, top=None))
    temp.drop(['target', 'value'], axis=1, inplace=True)
    temp = temp[temp.feature != '<BIAS>']

    def remove_x(x):
        # eli5 names features 'x0', 'x1', ...; recover the integer index
        return int(x.replace('x', ''))

    temp['feature'] = temp['feature'].apply(remove_x)
    # features eli5 dropped are added back with zero weight
    # (DataFrame.append was removed in pandas 2.0, so use pd.concat)
    zero = [j for j in fn if j not in temp['feature'].values]
    for z in zero:
        temp = pd.concat(
            [temp, pd.DataFrame([{'feature': z, 'weight': 0}])],
            ignore_index=True)
    temp = temp.sort_values(by=['feature'])
    return temp.values[:, 1]
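# Hedged usage sketch (added for illustration, not from the original source):
# shows how fi_eli above might be called. It assumes fi_eli is reachable as a
# plain function (self passed explicitly as None), that explain_prediction and
# format_as_dataframe come from eli5, and that a scikit-learn linear model is
# an acceptable `model`; the data below is illustrative.
def demo_fi_eli_usage():
    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.RandomState(0)
    X = rng.rand(20, 4)
    y = X @ np.array([1.0, -2.0, 0.0, 0.5])
    model = LinearRegression().fit(X, y)
    # fi_eli returns one weight per feature, ordered by feature index
    weights = fi_eli(None, X[0], None, model)
    assert len(weights) == X.shape[1]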
def explain_pred(input_data, model):
    y_preds = []
    y_probs = []
    encoded_htmls = []
    for i in input_data:
        expl = eli5.explain_prediction(
            model.steps[-1][1], i, model.steps[0][1],
            target_names=['Compliant', 'Not Compliant'], top=10)
        html_explanation = format_as_html(
            expl, force_weights=False,
            show_feature_values=True).replace("\n", "").strip()
        encoded_html = base64.b64encode(
            bytes(html_explanation, encoding='utf-8'))
        encoded_htmls.append(encoded_html)
        expl_dict = format_as_dict(expl)
        targets = expl_dict['targets'][0]
        target = targets['target']
        y_pred = 1 if target.startswith('N') else 0
        y_prob = targets['proba']
        if len(i.split()) < 3:
            # one or two words can't be non-compliant
            y_pred = 0
            y_prob = 1.0
        # round after converting to a percentage, so the string doesn't show
        # float artifacts such as '12.299999999999999%'
        y_prob = f'{round(y_prob * 100, 1)}%'
        y_preds.append(y_pred)
        y_probs.append(y_prob)
    inferences = np.column_stack((y_probs, y_preds, encoded_htmls))
    return inferences
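# Hedged sketch (added for illustration, not from the original source): the
# third value produced per row in explain_pred above is a base64-encoded eli5
# HTML explanation; this shows one way a caller might decode it back to HTML
# for display. `encoded_html` is assumed to be the bytes value appended to
# encoded_htmls above, before np.column_stack.
def demo_decode_explanation(encoded_html):
    import base64
    return base64.b64decode(encoded_html).decode('utf-8')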
def test_explain_tree_classifier_text(clf, newsgroups_train_big):
    docs, y, target_names = newsgroups_train_big
    vec = CountVectorizer(binary=True, stop_words='english')
    X = vec.fit_transform(docs)
    clf.fit(X, y)
    res = explain_prediction(clf, docs[0], vec=vec, target_names=target_names)
    check_targets_scores(res)
    format_as_all(res, clf)
def test_unsupported():
    vec = CountVectorizer()
    clf = BaseEstimator()
    res = explain_prediction(clf, 'hello, world', vec=vec)
    assert 'BaseEstimator' in res.error
    for expl in format_as_all(res, clf):
        assert 'Error' in expl
        assert 'BaseEstimator' in expl
def explain_prediction_df(estimator, doc, **kwargs):
    # type: (...) -> pd.DataFrame
    """ Explain prediction and export explanation to ``pandas.DataFrame``.

    All keyword arguments are passed to :func:`eli5.explain_prediction`.
    Weights of all features are exported by default.
    """
    kwargs = _set_defaults(kwargs)
    return format_as_dataframe(
        eli5.explain_prediction(estimator, doc, **kwargs))
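# Hedged usage sketch (added for illustration, not from the original source):
# explain_prediction_df above returns one row per (target, feature) weight;
# the Ridge regressor and random data below are illustrative assumptions.
def demo_explain_prediction_df():
    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(42)
    X = rng.rand(50, 3)
    y = X[:, 0] - 2 * X[:, 1]
    reg = Ridge().fit(X, y)
    df = explain_prediction_df(reg, X[0])
    # typical columns per eli5's format_as_dataframe: 'target',
    # 'feature' ('x0', 'x1', ..., '<BIAS>'), 'weight', and 'value'
    print(df.head())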
def test_explain_prediction_single_leaf_tree(iris_train):
    X, y, feature_names, target_names = iris_train
    clf = LGBMClassifier(n_estimators=100)
    clf.fit(X, y)
    # at least one of the trees has only a single leaf
    res = explain_prediction(clf, X[0], target_names=target_names)
    format_as_all(res, clf)
    check_targets_scores(res)
def explain_score(lead_no):
    nb_feat = 20
    lead_no = int(lead_no)
    get = test[test[ID] == lead_no].reset_index(drop=True)
    exps = eli5.explain_prediction(
        xgb_clf, get[xgb_clf.booster().feature_names].iloc[0], top=nb_feat)
    score_explain = eli5.format_as_html(
        exps, show=('targets', 'feature_importances'),
        show_feature_values=True)
    target_flag = get[TARGET].values[0]
    proba_target = get[TARGET_PROBA].values[0]
    proba_target = round(proba_target, 3)
    return render_template(
        'explain.html', lead_no=lead_no, score_explain=score_explain,
        target_flag=target_flag, proba_target=proba_target, nb_feat=nb_feat)
def test_explain_prediction_pandas(reg, boston_train):
    pd = pytest.importorskip('pandas')
    X, y, feature_names = boston_train
    df = pd.DataFrame(X, columns=feature_names)
    reg.fit(df, y)
    res = explain_prediction(reg, df.iloc[0])
    for expl in format_as_all(res, reg):
        assert 'PTRATIO' in expl
        if has_intercept(reg):
            assert 'BIAS' in expl