def test_explain_prediction_clf_multitarget(
        newsgroups_train, filter_missing, use_booster):
    """Explain a multiclass XGBoost prediction for a short text document.

    The model is either a raw booster from ``xgboost.train``
    (``use_booster`` true) or a fitted ``XGBClassifier``. When
    ``filter_missing`` is true, a feature filter that drops NaN-valued
    features is passed to ``explain_prediction`` and the target score
    check is skipped.
    """
    docs, ys, target_names = newsgroups_train
    vec = CountVectorizer(stop_words='english')
    xs = vec.fit_transform(docs)

    if not use_booster:
        clf = XGBClassifier(n_estimators=100, max_depth=2)
        clf.fit(xs, ys)
    else:
        booster_params = {
            'objective': 'multi:softprob',
            'num_class': len(target_names),
            'silent': True,
            'max_depth': 2,
        }
        train_matrix = xgboost.DMatrix(xs, label=ys, missing=np.nan)
        clf = xgboost.train(
            params=booster_params,
            dtrain=train_matrix,
            num_boost_round=100,
        )

    if filter_missing:
        def feature_filter(_, v):
            return not np.isnan(v)
    else:
        feature_filter = None

    doc = 'computer graphics in space: a new religion'
    res = explain_prediction(
        clf, doc, vec=vec, target_names=target_names,
        feature_filter=feature_filter)
    format_as_all(res, clf)
    if not filter_missing:
        check_targets_scores(res)

    # Each (target index, word) pair must show up among that target's
    # positively weighted features.
    # NOTE(review): indices 1 and 3 are presumably the graphics and
    # religion classes of the fixture -- confirm against the fixture.
    for target_idx, expected_word in ((1, 'computer'), (3, 'religion')):
        positive = res.targets[target_idx].feature_weights.pos
        assert expected_word in get_all_features(positive)

    # With top_targets=2 only two targets are returned, and their
    # probabilities must equal the two largest of the full explanation.
    top_target_res = explain_prediction(clf, doc, vec=vec, top_targets=2)
    assert len(top_target_res.targets) == 2
    top_probas = sorted(t.proba for t in top_target_res.targets)
    all_probas = sorted(t.proba for t in res.targets)
    assert top_probas == all_probas[-2:]
def test_explain_prediction_clf_multitarget(newsgroups_train, filter_missing):
    """Explain a multiclass ``XGBClassifier`` prediction for a text document.

    When ``filter_missing`` is true, a feature filter that drops NaN-valued
    features is passed to ``explain_prediction`` and the target score
    check is skipped.
    """
    docs, ys, target_names = newsgroups_train
    vec = CountVectorizer(stop_words='english')
    xs = vec.fit_transform(docs)
    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)

    if filter_missing:
        def feature_filter(_, v):
            return not np.isnan(v)
    else:
        feature_filter = None

    doc = 'computer graphics in space: a new religion'
    res = explain_prediction(
        clf, doc, vec=vec, target_names=target_names,
        feature_filter=feature_filter)
    format_as_all(res, clf)
    if not filter_missing:
        check_targets_scores(res)

    # Each (target index, word) pair must show up among that target's
    # positively weighted features.
    # NOTE(review): indices 1 and 3 are presumably the graphics and
    # religion classes of the fixture -- confirm against the fixture.
    for target_idx, expected_word in ((1, 'computer'), (3, 'religion')):
        positive = res.targets[target_idx].feature_weights.pos
        assert expected_word in get_all_features(positive)

    # With top_targets=2 only two targets are returned, and their
    # probabilities must equal the two largest of the full explanation.
    top_target_res = explain_prediction(clf, doc, vec=vec, top_targets=2)
    assert len(top_target_res.targets) == 2
    top_probas = sorted(t.proba for t in top_target_res.targets)
    all_probas = sorted(t.proba for t in res.targets)
    assert top_probas == all_probas[-2:]
def test_explain_prediction_clf_xor():
    """Explain predictions of a classifier fitted on noisy two-bit data.

    100 pairs of random bits are drawn; the features are those bits with
    Gaussian noise (standard deviation 0.2) added, and the label is True
    when the two underlying bits are equal.
    """
    true_xs = [
        [np.random.randint(2), np.random.randint(2)]
        for _ in range(100)
    ]
    # Noise is drawn pair by pair, first coordinate then second, so the
    # sequence of random draws stays fixed.
    noisy_rows = []
    for first, second in true_xs:
        noisy_rows.append(
            [np.random.normal(first, 0.2), np.random.normal(second, 0.2)])
    xs = np.array(noisy_rows)
    ys = np.array([first == second for first, second in true_xs])

    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)

    # All formatters must cope with the explanation of one sample.
    format_as_all(explain_prediction(clf, np.array([1, 1])), clf)

    # Explain every corner of the unit square and check the scores.
    for corner in ([0, 1], [1, 0], [0, 0], [1, 1]):
        res = explain_prediction(clf, np.array(corner))
        print(corner)
        print(format_as_text(res, show=fields.WEIGHTS))
        check_targets_scores(res)
def explain_prediction_me(x, model, feature_name_list):
    """Print eli5's explanation of ``model``'s prediction for ``x``.

    For every target in the explanation, one summary line is printed
    (class, probability, score), followed by the positively and then the
    negatively weighted features with their weight and actual value.

    :param x: the sample to explain; passed to eli5 unchanged.
    :param model: object exposing ``get_booster()``; the returned booster
        is what eli5 explains.
    :param feature_name_list: feature names, passed to eli5 as
        ``feature_names``.
    :return: None; the explanation is written to standard output.
    """
    from eli5.explain import explain_prediction

    # `top` is not passed, so eli5's own default applies.
    expl = explain_prediction(
        model.get_booster(), x, feature_names=feature_name_list)

    # Every print() call takes exactly one string argument, so the output
    # is the same whether print is a statement (Python 2) or a function
    # (Python 3).
    for target_explanation in expl.targets:
        print("class " + str(target_explanation.target)
              + " probability: " + str(target_explanation.proba)
              + " score: " + str(target_explanation.score))
        weights = target_explanation.feature_weights
        sections = (("Positive:", weights.pos), ("Negative:", weights.neg))
        for header, feature_weights in sections:
            print(header)
            for feature_weight in feature_weights:
                print(str(feature_weight.feature)
                      + ": weight: " + str(feature_weight.weight)
                      + " actual value: " + str(feature_weight.value))
def test_explain_prediction_clf_binary(newsgroups_train_binary_big, missing):
    """Explain a binary ``XGBClassifier`` prediction for a text document.

    Checks the formatted output, the sign of selected feature weights,
    filtering features by regular expression (``feature_re``) and
    filtering out NaN-valued features (``feature_filter``).
    """
    docs, ys, target_names = newsgroups_train_binary_big
    vec = CountVectorizer(stop_words='english')
    clf = XGBClassifier(n_estimators=100, max_depth=2, missing=missing)
    xs = vec.fit_transform(docs)
    clf.fit(xs, ys)

    def get_res(**kwargs):
        # Explain the same document every time; callers only vary the
        # extra keyword arguments.
        return explain_prediction(
            clf, 'computer graphics in space: a sign of atheism',
            vec=vec, target_names=target_names, **kwargs)

    res = get_res()
    formatted = format_as_all(res, clf, show_feature_values=True)
    for expl in formatted:
        assert 'graphics' in expl
        assert 'Missing' in expl
    check_targets_scores(res)

    first_target_weights = res.targets[0].feature_weights
    pos_features = get_all_features(first_target_weights.pos)
    neg_features = get_all_features(first_target_weights.neg)
    for word in ('graphics', 'computer'):
        assert word in pos_features
    assert 'atheism' in neg_features

    # A feature regex of 'gra' keeps 'graphics' but drops 'computer'.
    flt_res = get_res(feature_re='gra')
    flt_pos_features = get_all_features(
        flt_res.targets[0].feature_weights.pos)
    assert 'graphics' in flt_pos_features
    assert 'computer' not in flt_pos_features

    # With NaN-valued features filtered out, nothing is shown as missing.
    def keep_not_nan(_, v):
        return not np.isnan(v)

    flt_value_res = get_res(feature_filter=keep_not_nan)
    for expl in format_as_all(flt_value_res, clf, show_feature_values=True):
        assert 'Missing' not in expl
def test_explain_prediction_clf_interval():
    """Explain a classifier whose label depends on an interval of feature 0.

    1000 pairs of random integers are drawn (first in ``[0, 3)``, second
    in ``[0, 10)``); the features are those integers with Gaussian noise
    (standard deviation 0.2) added, and the label is True when the first
    underlying integer equals 1.
    """
    true_xs = [
        [np.random.randint(3), np.random.randint(10)]
        for _ in range(1000)
    ]
    # Noise is drawn pair by pair, first coordinate then second, so the
    # sequence of random draws stays fixed.
    noisy_rows = []
    for first, second in true_xs:
        noisy_rows.append(
            [np.random.normal(first, 0.2), np.random.normal(second, 0.2)])
    xs = np.array(noisy_rows)
    ys = np.array([first == 1 for first, _ in true_xs])

    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)

    # The default feature name and the feature value must both be shown.
    res = explain_prediction(clf, np.array([1.23, 1.45]))
    for expl in format_as_all(res, clf, show_feature_values=True):
        assert 'x0' in expl
        assert '1.23' in expl

    samples = ([0, 1], [1, 1], [2, 1], [0.8, 5], [1.2, 5])
    for sample in samples:
        res = explain_prediction(clf, np.array(sample))
        print(sample)
        print(format_as_text(res, show=fields.WEIGHTS))
        check_targets_scores(res)
def test_dense_missing():
    """Explain an ``XGBRegressor`` on dense data where 0 marks a missing value.

    The explained sample ``[2, 0]`` has the missing marker in its second
    feature, so the unfiltered explanation must mention both features and
    ``'Missing'``, while the explanation filtered on non-NaN values must
    mention neither ``'x1'`` nor ``'Missing'``.
    """
    base_rows = [[0, 1], [0, 2], [1, 2], [1, 0], [0.1, 0.1]]
    base_targets = [0, 0, 3, 2, 0.2]
    xs = np.array(base_rows * 10)
    ys = np.array(base_targets * 10)

    # n_estimators is set too high on purpose, to check empty trees too.
    reg = XGBRegressor(n_estimators=100, max_depth=2, missing=0)
    reg.fit(xs, ys)

    res = explain_prediction(reg, np.array([2, 0]))
    check_targets_scores(res)
    for expl in format_as_all(res, reg, show_feature_values=True):
        for expected in ('x0', 'x1', 'Missing'):
            assert expected in expl

    def keep_not_nan(_, v):
        return not np.isnan(v)

    flt_res = explain_prediction(
        reg, np.array([2, 0]), feature_filter=keep_not_nan)
    for expl in format_as_all(flt_res, reg, show_feature_values=True):
        for unexpected in ('x1', 'Missing'):
            assert unexpected not in expl
def explain_prediction(self, x, column_id, feature_names):
    """Return a text explanation of one per-column model's prediction for ``x``.

    The model is looked up as ``self.model[column_id]``. The explanation
    is computed by eli5 with the given ``feature_names`` and ``top=5``,
    then rendered by ``format_as_text`` with feature values shown.
    """
    from eli5.explain import explain_prediction as eli5_explain_prediction
    from eli5.formatters import format_as_text

    explanation = eli5_explain_prediction(
        self.model[column_id], x, feature_names=feature_names, top=5)
    return format_as_text(explanation, show_feature_values=True)
def explain_prediction(self, x, model):
    """Return a text explanation of ``model``'s prediction for ``x``.

    The explanation is computed by eli5 with ``self.feature_name_list``
    as feature names and ``top=5``, then rendered by ``format_as_text``
    with feature values shown.
    """
    from eli5.explain import explain_prediction as eli5_explain_prediction
    from eli5.formatters import format_as_text

    explanation = eli5_explain_prediction(
        model, x, feature_names=self.feature_name_list, top=5)
    return format_as_text(explanation, show_feature_values=True)
def test_explain_prediction_pandas_dot_in_feature_name(boston_train):
    """Check that DataFrame column names containing a dot are explained.

    Every feature name gets ``.<index>`` appended before the data is
    wrapped in a pandas DataFrame. The test is skipped when pandas is
    not installed.
    """
    pd = pytest.importorskip('pandas')
    X, y, feature_names = boston_train

    dotted_names = []
    for idx, name in enumerate(feature_names):
        dotted_names.append("%s.%s" % (name, idx))
    df = pd.DataFrame(X, columns=dotted_names)

    reg = XGBRegressor()
    reg.fit(df, y)

    # Explain the first row, passed as a pandas Series.
    first_row = df.iloc[0]
    res = explain_prediction(reg, first_row)
    for expl in format_as_all(res, reg):
        assert 'PTRATIO.1' in expl
def test_explain_prediction_feature_union_sparse(newsgroups_train_binary):
    """FeatureUnion with sparse features and text highlighting.

    Word counts and character 3-gram counts are combined in a
    ``FeatureUnion``; the explanation must prefix feature names with the
    transformer name and carry weighted spans for the first target.
    """
    docs, ys, target_names = newsgroups_train_binary

    word_vec = CountVectorizer(stop_words='english')
    char_vec = CountVectorizer(ngram_range=(3, 3))
    vec = FeatureUnion([('word', word_vec), ('char', char_vec)])

    clf = XGBClassifier(n_estimators=100, max_depth=2, missing=0)
    xs = vec.fit_transform(docs)
    clf.fit(xs, ys)

    res = explain_prediction(
        clf, 'computer graphics in space: a sign of atheism',
        vec=vec, target_names=target_names)
    format_as_all(res, clf)
    check_targets_scores(res)

    first_target = res.targets[0]
    pos_features = get_all_features(first_target.feature_weights.pos)
    assert 'word__graphics' in pos_features
    assert first_target.weighted_spans
def test_explain_prediction_clf_binary(
        newsgroups_train_binary_big, missing, use_booster):
    """Explain a binary XGBoost prediction for a short text document.

    The model is either a raw booster from ``xgboost.train``
    (``use_booster`` true) or a fitted ``XGBClassifier``. For the booster,
    ``missing`` and ``is_regression=False`` are also passed explicitly to
    every ``explain_prediction`` call. Checks the formatted output, the
    sign of selected feature weights, filtering by ``feature_re`` and
    filtering out NaN-valued features.
    """
    docs, ys, target_names = newsgroups_train_binary_big
    vec = CountVectorizer(stop_words='english')
    xs = vec.fit_transform(docs)

    if not use_booster:
        explain_kwargs = {}
        clf = XGBClassifier(n_estimators=100, max_depth=2, missing=missing)
        clf.fit(xs, ys)
    else:
        booster_params = {
            'objective': 'binary:logistic',
            'silent': True,
            'max_depth': 2,
        }
        train_matrix = xgboost.DMatrix(xs, label=ys, missing=missing)
        clf = xgboost.train(
            params=booster_params,
            dtrain=train_matrix,
            num_boost_round=100,
        )
        explain_kwargs = {'missing': missing, 'is_regression': False}

    def get_res(**kwargs):
        # Per-call arguments first; the booster-specific ones are merged
        # on top of them.
        merged = dict(kwargs)
        merged.update(explain_kwargs)
        return explain_prediction(
            clf, 'computer graphics in space: a sign of atheism',
            vec=vec, target_names=target_names, **merged)

    res = get_res()
    formatted = format_as_all(res, clf, show_feature_values=True)
    for expl in formatted:
        assert 'graphics' in expl
        assert 'Missing' in expl
    check_targets_scores(res)

    first_target_weights = res.targets[0].feature_weights
    pos_features = get_all_features(first_target_weights.pos)
    neg_features = get_all_features(first_target_weights.neg)
    for word in ('graphics', 'computer'):
        assert word in pos_features
    assert 'atheism' in neg_features

    # A feature regex of 'gra' keeps 'graphics' but drops 'computer'.
    flt_res = get_res(feature_re='gra')
    flt_pos_features = get_all_features(
        flt_res.targets[0].feature_weights.pos)
    assert 'graphics' in flt_pos_features
    assert 'computer' not in flt_pos_features

    # With NaN-valued features filtered out, nothing is shown as missing.
    def keep_not_nan(_, v):
        return not np.isnan(v)

    flt_value_res = get_res(feature_filter=keep_not_nan)
    for expl in format_as_all(flt_value_res, clf, show_feature_values=True):
        assert 'Missing' not in expl
def test_explain_prediction_feature_union_dense():
    """Test FeatureUnion handling and missing features in a dense matrix.

    Samples are dicts with ``'x'`` and ``'y'`` keys; the first 50 lose
    ``'x'`` and the last 50 lose ``'y'``, and an absent key becomes NaN
    in its single-column feature. The explanation of the first sample
    must show the custom feature names and a ``'Missing'`` value.
    """
    def transformer(key):
        # One-column extractor: the dict value for `key`, NaN if absent.
        def pick_column(xs):
            return np.array([[x.get(key, np.nan)] for x in xs])
        return FunctionTransformer(pick_column, validate=False)

    vec = FeatureUnion([('x', transformer('x')), ('y', transformer('y'))])

    gauss = np.random.normal
    data = [(gauss(1), 2 + 10 * gauss(1)) for _ in range(200)]
    ys = [-3 * first + second for first, second in data]
    xs = [{'x': gauss(first), 'y': gauss(second)} for first, second in data]

    # Knock out one key in the first and the last 50 samples.
    for sample in xs[:50]:
        del sample['x']
    for sample in xs[-50:]:
        del sample['y']

    reg = XGBRegressor()
    reg.fit(vec.transform(xs), ys)

    res = explain_prediction(
        reg, xs[0], vec=vec, feature_names=['_x_', '_y_'])
    check_targets_scores(res)
    for expl in format_as_all(res, reg, show_feature_values=True):
        for expected in ('Missing', '_y_', '_x_'):
            assert expected in expl