def get_decision_path_explanation(estimator, doc, vec, vectorized, x, feature_names, feature_filter, feature_re, top, original_display_names, target_names, targets, top_targets, is_regression, is_multiclass, proba, get_score_weights):
    # type: (...) -> Explanation
    """Assemble a decision-path :class:`Explanation` for one document.

    ``get_score_weights`` is invoked with a label id and must return a
    ``(score, per_feature_weights)`` pair; the weights are then filtered
    with ``feature_filter`` / ``feature_re`` and truncated to ``top``
    entries before being attached to each target.
    """
    display_names = get_target_display_names(
        original_display_names, target_names, targets, top_targets, proba)
    flt_feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re, x)

    def _filtered_top(weights, scale=1.0):
        # Apply the feature filter, then keep only the ``top`` weights.
        return get_top_features_filtered(
            x, flt_feature_names, flt_indices, weights, top, scale)

    # Pick the human-readable description for this estimator kind;
    # an unexpected (regression, multiclass) combination raises KeyError,
    # matching the strictness of the mapping.
    description = {
        (False, False): DESCRIPTION_CLF_BINARY,
        (False, True): DESCRIPTION_CLF_MULTICLASS,
        (True, False): DESCRIPTION_REGRESSION,
    }[is_regression, is_multiclass]
    explanation = Explanation(
        estimator=repr(estimator),
        method='decision paths',
        description=description,
        is_regression=is_regression,
        targets=[],
    )
    assert explanation.targets is not None

    if is_multiclass:
        # One TargetExplanation per displayed class label.
        for label_id, label in display_names:
            score, all_feature_weights = get_score_weights(label_id)
            target_expl = TargetExplanation(
                target=label,
                feature_weights=_filtered_top(all_feature_weights),
                score=score,
                proba=proba[label_id] if proba is not None else None,
            )
            add_weighted_spans(doc, vec, vectorized, target_expl)
            explanation.targets.append(target_expl)
    else:
        # Regression and binary classification expose a single target.
        score, all_feature_weights = get_score_weights(0)
        if is_regression:
            target, scale, label_id = display_names[-1][1], 1.0, 1
        else:
            target, scale, label_id = get_binary_target_scale_label_id(
                score, display_names, proba)
        target_expl = TargetExplanation(
            target=target,
            feature_weights=_filtered_top(all_feature_weights, scale),
            score=score,
            proba=proba[label_id] if proba is not None else None,
        )
        add_weighted_spans(doc, vec, vectorized, target_expl)
        explanation.targets.append(target_expl)
    return explanation
def explain_prediction_tree_classifier(clf, doc, vec=None, top=None, top_targets=None, target_names=None, targets=None, feature_names=None, feature_re=None, feature_filter=None, vectorized=False):
    """ Explain prediction of a tree classifier.

    See :func:`eli5.explain_prediction` for description of
    ``top``, ``top_targets``, ``target_names``, ``targets``,
    ``feature_names``, ``feature_re`` and ``feature_filter`` parameters.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the classifier ``clf``
    (e.g. a fitted CountVectorizer instance); you can pass it
    instead of ``feature_names``.

    ``vectorized`` is a flag which tells eli5 if ``doc`` should be
    passed through ``vec`` or not. By default it is False, meaning that
    if ``vec`` is not None, ``vec.transform([doc])`` is passed to the
    classifier. Set it to True if you're passing ``vec``,
    but ``doc`` is already vectorized.

    Method for determining feature importances follows an idea from
    http://blog.datadive.net/interpreting-random-forests/.
    Feature weights are calculated by following decision paths in trees
    of an ensemble (or a single tree for DecisionTreeClassifier).
    Each node of the tree has an output score, and contribution of a feature
    on the decision path is how much the score changes from parent to child.
    Weights of all features sum to the output score or proba of the estimator.
    """
    vec, feature_names = handle_vec(clf, doc, vec, vectorized, feature_names)
    X = get_X(doc, vec=vec, vectorized=vectorized)
    if feature_names.bias_name is None:
        # Tree estimators have no intercept, but decision-path weights
        # include a baseline contribution, so we surface it as a bias.
        feature_names.bias_name = '<BIAS>'

    proba = predict_proba(clf, X)
    if hasattr(clf, 'decision_function'):
        (score,) = clf.decision_function(X)
    else:
        score = None

    n_classes = clf.n_classes_
    is_multiclass = n_classes > 2
    feature_weights = _trees_feature_weights(clf, X, feature_names, n_classes)
    x = get_X0(add_intercept(X))
    flt_feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re, x)

    def _target_weights(label_id, scale=1.0):
        # Per-class weight column, filtered and truncated to ``top``.
        return get_top_features_filtered(
            x, flt_feature_names, flt_indices,
            feature_weights[:, label_id], top, scale)

    explanation = Explanation(
        estimator=repr(clf),
        method='decision path',
        targets=[],
        description=(DESCRIPTION_TREE_CLF_MULTICLASS if is_multiclass
                     else DESCRIPTION_TREE_CLF_BINARY),
    )
    assert explanation.targets is not None
    display_names = get_target_display_names(
        clf.classes_, target_names, targets, top_targets,
        score=score if score is not None else proba)

    if is_multiclass:
        # One TargetExplanation per displayed class label.
        for label_id, label in display_names:
            target_expl = TargetExplanation(
                target=label,
                feature_weights=_target_weights(label_id),
                score=score[label_id] if score is not None else None,
                proba=proba[label_id] if proba is not None else None,
            )
            add_weighted_spans(doc, vec, vectorized, target_expl)
            explanation.targets.append(target_expl)
    else:
        # Binary classification: a single target, possibly with a
        # sign flip (scale) depending on which class is displayed.
        target, scale, label_id = get_binary_target_scale_label_id(
            score, display_names, proba)
        target_expl = TargetExplanation(
            target=target,
            feature_weights=_target_weights(label_id, scale=scale),
            score=score if score is not None else None,
            proba=proba[label_id] if proba is not None else None,
        )
        add_weighted_spans(doc, vec, vectorized, target_expl)
        explanation.targets.append(target_expl)
    return explanation