def test_transition_features():
    """Check dataframe export of an explanation that has transition features.

    ``format_as_dataframes`` must return a dict holding both the ``targets``
    and the ``transition_features`` frames, and ``format_as_dataframe`` must
    emit a ``UserWarning`` while returning the transition frame.
    """
    target_names = ('class1', 'class2')
    explanation = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                name,
                feature_weights=FeatureWeights(
                    pos=[FeatureWeight('pos', 13, value=1)],
                    neg=[],
                ),
            )
            for name in target_names
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ),
    )

    frames = format_as_dataframes(explanation)
    assert isinstance(frames, dict)
    assert set(frames) == {'targets', 'transition_features'}
    assert frames['targets'].equals(format_as_dataframe(explanation.targets))

    transition_df = frames['transition_features']
    print(transition_df)
    print(format_as_text(explanation))

    # NOTE(review): the expected text is reproduced verbatim from the existing
    # assertion; confirm its column padding matches pandas' rendering.
    expected_text = ('to class2 class1\n'
                     'from \n'
                     'class2 1.5 2.5\n'
                     'class1 3.5 4.5')
    assert str(transition_df) == expected_text

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(explanation)
    assert single_df.equals(transition_df)
def test_feature_importances(with_std, with_value):
    """Check dataframe export of feature importances.

    ``with_std`` / ``with_value`` toggle whether the feature weights carry
    ``std`` / ``value``, and therefore whether the expected frame has the
    matching columns.
    """
    def weight(name, amount, std, value):
        # Optional fields are passed as None when their flag is off.
        return FeatureWeight(
            name, amount,
            std=std if with_std else None,
            value=value if with_value else None)

    explanation = Explanation(
        estimator='some estimator',
        feature_importances=FeatureImportances(
            importances=[
                weight('a', 1, 0.1, 1),
                weight('b', 2, 0.2, 3),
            ],
            remaining=10,
        ),
    )

    frames = format_as_dataframes(explanation)
    assert isinstance(frames, dict)
    assert list(frames) == ['feature_importances']
    actual = frames['feature_importances']

    expected = pd.DataFrame({'weight': [1, 2]}, index=['a', 'b'])
    optional_columns = (
        (with_std, 'std', [0.1, 0.2]),
        (with_value, 'value', [1, 3]),
    )
    for enabled, column, data in optional_columns:
        if enabled:
            expected[column] = data

    print(actual, expected, sep='\n')
    assert expected.equals(actual)
    assert expected.equals(format_as_dataframe(explanation))
def test_targets_with_value():
    """Check the single-dataframe export of targets whose weights have values.

    The expected frame is indexed by ``(target, feature)`` and carries
    ``weight`` and ``value`` columns.
    """
    y_weights = FeatureWeights(
        pos=[
            FeatureWeight('a', 13, value=1),
            FeatureWeight('b', 5, value=2),
        ],
        neg=[
            FeatureWeight('neg1', -10, value=3),
            FeatureWeight('neg2', -1, value=4),
        ],
    )
    y2_weights = FeatureWeights(
        pos=[FeatureWeight('f', 1, value=5)],
        neg=[],
    )
    explanation = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('y', feature_weights=y_weights),
            TargetExplanation('y2', feature_weights=y2_weights),
        ],
    )

    actual = format_as_dataframe(explanation)

    expected_index = pd.MultiIndex.from_tuples(
        [('y', 'a'), ('y', 'b'), ('y', 'neg2'), ('y', 'neg1'), ('y2', 'f')],
        names=['target', 'feature'])
    expected = pd.DataFrame(
        {'weight': [13, 5, -1, -10, 1], 'value': [1, 2, 4, 3, 5]},
        columns=['weight', 'value'],
        index=expected_index)

    print(actual, expected, sep='\n')
    assert expected.equals(actual)
def _features(indices, feature_names, coef, x):
    """Build ``FeatureWeight`` objects for the entries selected by ``indices``.

    Names and weights are picked from ``feature_names`` and ``coef`` with
    ``mask``. When ``x`` is not None, the matching entries of ``x`` are
    attached to each item as ``value``.
    """
    selected_names = mask(feature_names, indices)
    selected_weights = mask(coef, indices)
    if x is None:
        return [
            FeatureWeight(name, weight)
            for name, weight in zip(selected_names, selected_weights)
        ]
    selected_values = mask(x, indices)
    return [
        FeatureWeight(name, weight, value=value)
        for name, weight, value
        in zip(selected_names, selected_weights, selected_values)
    ]
def test_targets(with_std, with_value):
    """Check dataframe export of per-target feature weights.

    ``with_std`` / ``with_value`` toggle whether the weights of target ``y``
    carry ``std`` / ``value``; target ``y2`` never does, so its row is
    expected to hold ``None`` in those columns.
    """
    def weight(name, amount, std, value):
        # Optional fields are passed as None when their flag is off.
        return FeatureWeight(
            name, amount,
            std=std if with_std else None,
            value=value if with_value else None)

    explanation = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y',
                feature_weights=FeatureWeights(
                    pos=[
                        weight('a', 13, 0.13, 2),
                        weight('b', 5, 0.5, 1),
                    ],
                    neg=[
                        weight('neg1', -10, 0.2, 5),
                        weight('neg2', -1, 0.3, 4),
                    ],
                ),
            ),
            TargetExplanation(
                'y2',
                feature_weights=FeatureWeights(
                    pos=[FeatureWeight('f', 1)],
                    neg=[],
                ),
            ),
        ],
    )

    frames = format_as_dataframes(explanation)
    assert isinstance(frames, dict)
    assert list(frames) == ['targets']
    actual = frames['targets']

    expected = pd.DataFrame(
        {
            'target': ['y', 'y', 'y', 'y', 'y2'],
            'feature': ['a', 'b', 'neg2', 'neg1', 'f'],
            'weight': [13, 5, -1, -10, 1],
        },
        columns=['target', 'feature', 'weight'])
    if with_std:
        expected['std'] = [0.13, 0.5, 0.3, 0.2, None]
    if with_value:
        expected['value'] = [2, 1, 4, 5, None]

    print(actual, expected, sep='\n')
    assert expected.equals(actual)
    assert expected.equals(format_as_dataframe(explanation))
def __init__(self, *args, **kwargs):
    """Initialise the extra attributes, then delegate to ``FeatureWeight``.

    The optional keyword arguments ``dictionary``, ``formatted_value`` and
    ``score`` are stored on the instance (``None`` when not supplied) and
    are removed from ``kwargs``; all remaining positional and keyword
    arguments are forwarded to ``FeatureWeight.__init__``.
    """
    # ``kwargs`` is a fresh dict built for this call, so popping from it
    # never touches a mapping owned by the caller.
    self.dictionary = kwargs.pop('dictionary', None)
    self.formatted_value = kwargs.pop('formatted_value', None)
    self.score = kwargs.pop('score', None)
    FeatureWeight.__init__(self, *args, **kwargs)
def test_transition_features():
    """Check dataframe export of an explanation that has transition features.

    The transition frame is expected in long form, one row per
    ``(from, to)`` pair with its ``coef``. ``format_as_dataframe`` must emit
    a ``UserWarning`` while returning that same frame.
    """
    target_names = ('class1', 'class2')
    explanation = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                name,
                feature_weights=FeatureWeights(
                    pos=[FeatureWeight('pos', 13, value=1)],
                    neg=[],
                ),
            )
            for name in target_names
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ),
    )

    frames = format_as_dataframes(explanation)
    assert isinstance(frames, dict)
    assert set(frames) == {'targets', 'transition_features'}
    assert frames['targets'].equals(format_as_dataframe(explanation.targets))

    transition_df = frames['transition_features']
    print(transition_df)
    print(format_as_text(explanation))

    columns = ['from', 'to', 'coef']
    rows = [
        ('class2', 'class2', 1.5),
        ('class2', 'class1', 2.5),
        ('class1', 'class2', 3.5),
        ('class1', 'class1', 4.5),
    ]
    expected = pd.DataFrame(
        [dict(zip(columns, row)) for row in rows],
        columns=columns)
    assert transition_df.equals(expected)

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(explanation)
    assert single_df.equals(transition_df)
def _get_other(feature_weights, named_found_features):
    # type: (FeatureWeights, List[Tuple[str, FoundFeatures]]) -> FeatureWeights
    """Collect the weights that were not matched by any found-features mapping.

    The result contains:

    * every entry of ``feature_weights.pos`` / ``feature_weights.neg`` whose
      ``(group, index)`` key is absent from all mappings in
      ``named_found_features``;
    * one summary entry per non-empty mapping, labelled
      ``'<name>: Highlighted in text (sum)'`` (the ``'<name>: '`` prefix is
      omitted when the name is falsy), whose weight is the sum of the
      mapping's values.

    Entries are ordered by decreasing absolute weight and then split by sign
    (zero counts as positive). ``pos_remaining`` and ``neg_remaining`` are
    copied from ``feature_weights``.
    """
    highlighted_keys = set()  # type: Set[Tuple[str, int]]
    for _, found_features in named_found_features:
        highlighted_keys.update(found_features)

    # Each (group, idx) key is produced exactly once by this enumeration, so
    # no separate "already accounted" bookkeeping is needed.
    other_items = []  # type: List[FeatureWeight]
    for group in ('pos', 'neg'):
        for idx, fw in enumerate(getattr(feature_weights, group)):
            if (group, idx) not in highlighted_keys:
                other_items.append(fw)

    for vec_name, found_features in named_found_features:
        if not found_features:
            continue
        prefix = '{}: '.format(vec_name) if vec_name else ''
        other_items.append(FeatureWeight(
            feature=FormattedFeatureName(
                '{}Highlighted in text (sum)'.format(prefix)),
            weight=sum(found_features.values())))

    other_items.sort(key=lambda x: abs(x.weight), reverse=True)
    return FeatureWeights(
        pos=[fw for fw in other_items if fw.weight >= 0],
        neg=[fw for fw in other_items if fw.weight < 0],
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
def explain_decision_tree(estimator,
                          vec=None,
                          top=_TOP,
                          target_names=None,
                          targets=None,  # ignored
                          feature_names=None,
                          feature_re=None,
                          feature_filter=None,
                          **export_graphviz_kwargs):
    """
    Explain a decision tree: its feature importances plus tree information.

    ``top``, ``target_names``, ``feature_names``, ``feature_re`` and
    ``feature_filter`` are described in :func:`eli5.explain_weights`.
    The ``targets`` parameter is ignored.

    ``vec`` is a vectorizer instance used to transform raw features to
    the input of the estimator (e.g. a fitted CountVectorizer instance);
    it can be passed instead of ``feature_names``.

    Any other keyword arguments are forwarded to the
    `sklearn.tree.export_graphviz`_ function.

    .. _sklearn.tree.export_graphviz: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    all_names = get_feature_names(estimator, vec,
                                  feature_names=feature_names)
    importances = estimator.feature_importances_

    # The tree info always uses the unfiltered names; the filter only
    # restricts which importances are reported.
    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    if kept is not None:
        importances = importances[kept]

    top_idx = argsort_k_largest_positive(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]

    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        estimator,
        feature_names=all_names,
        class_names=target_names,
        **export_graphviz_kwargs)

    importance_items = [
        FeatureWeight(name, value)
        for name, value in zip(top_names, top_values)
    ]
    return Explanation(
        feature_importances=FeatureImportances(
            importance_items,
            remaining=np.count_nonzero(importances) - len(top_idx),
        ),
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(estimator),
        method='decision tree',
    )
def test_format_as_dict():
    """Check the plain-dict form of a minimal single-target explanation.

    The weight is given as ``np.float32(13.0)`` and is expected to compare
    equal to ``13.0`` in the output.
    """
    explanation = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y',
                feature_weights=FeatureWeights(
                    pos=[FeatureWeight('a', np.float32(13.0))],
                    neg=[],
                ),
            ),
        ],
    )

    expected_feature_weights = {
        'pos': [{
            'feature': 'a',
            'weight': 13.0,
            'std': None,
            'value': None,
        }],
        'pos_remaining': 0,
        'neg': [],
        'neg_remaining': 0,
    }
    expected_target = {
        'target': 'y',
        'feature_weights': expected_feature_weights,
        'score': None,
        'proba': None,
        'weighted_spans': None,
        'heatmap': None,
    }
    expected = {
        'estimator': 'some estimator',
        'targets': [expected_target],
        'decision_tree': None,
        'description': None,
        'error': None,
        'feature_importances': None,
        'highlight_spaces': None,
        'is_regression': False,
        'method': None,
        'transition_features': None,
        'image': None,
    }

    assert format_as_dict(explanation) == expected
def explain_weights_xgboost(xgb,
                            vec=None,
                            top=20,
                            target_names=None,  # ignored
                            targets=None,  # ignored
                            feature_names=None,
                            feature_re=None,
                            feature_filter=None,
                            importance_type='gain',
                            ):
    """
    Explain an XGBoost estimator (scikit-learn wrapper XGBClassifier or
    XGBRegressor) through its feature importances.

    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter`` are
    described in :func:`eli5.explain_weights`.

    The ``target_names`` and ``targets`` parameters are ignored.

    Parameters
    ----------
    importance_type : str, optional
        A way to get feature importance. Possible values are:

        - 'gain' - the average gain of the feature when it is used in trees
          (default)
        - 'weight' - the number of times a feature is used to split the data
          across all trees
        - 'cover' - the average coverage of the feature when it is used in
          trees
    """
    importances = _xgb_feature_importances(
        xgb, importance_type=importance_type)
    all_names = get_feature_names(
        xgb, vec,
        feature_names=feature_names,
        num_features=importances.shape[-1])

    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    if kept is not None:
        importances = importances[kept]

    top_idx = argsort_k_largest_positive(importances, top)
    top_pairs = zip(shown_names[top_idx], importances[top_idx])
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(name, value) for name, value in top_pairs],
            remaining=np.count_nonzero(importances) - len(top_idx),
        ),
        description=DESCRIPTION_XGBOOST,
        estimator=repr(xgb),
        method='feature importances',
        is_regression=isinstance(xgb, XGBRegressor),
    )
def explain_decision_tree(clf,
                          vec=None,
                          top=_TOP,
                          target_names=None,
                          targets=None,  # ignored
                          feature_names=None,
                          feature_re=None,
                          **export_graphviz_kwargs):
    """
    Explain a decision tree classifier. The result has the following
    format (compatible with random forest explanations)::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            decision_tree={...tree information},
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )

    The standard deviation reported for every feature is zero.
    """
    all_names = get_feature_names(clf, vec, feature_names=feature_names)
    importances = clf.feature_importances_

    # The tree info always uses the unfiltered names; ``feature_re`` only
    # restricts which importances are reported.
    shown_names = all_names
    if feature_re is not None:
        shown_names, kept = all_names.filtered_by_re(feature_re)
        importances = importances[kept]

    top_idx = argsort_k_largest(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]
    zero_std = np.zeros_like(top_values)

    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        clf,
        feature_names=all_names,
        class_names=target_names,
        **export_graphviz_kwargs)

    return Explanation(
        feature_importances=[
            FeatureWeight(name, value, deviation)
            for name, value, deviation
            in zip(top_names, top_values, zero_std)
        ],
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(clf),
        method='decision tree',
    )
def explain_rf_feature_importance(estimator,
                                  vec=None,
                                  top=_TOP,
                                  target_names=None,  # ignored
                                  targets=None,  # ignored
                                  feature_names=None,
                                  feature_re=None,
                                  feature_filter=None,
                                  ):
    """
    Explain a tree-based ensemble estimator through its feature importances.

    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter`` are
    described in :func:`eli5.explain_weights`.

    The ``target_names`` and ``targets`` parameters are ignored.

    ``vec`` is a vectorizer instance used to transform raw features to
    the input of the estimator (e.g. a fitted CountVectorizer instance);
    it can be passed instead of ``feature_names``.
    """
    all_names = get_feature_names(estimator, vec,
                                  feature_names=feature_names)
    importances = estimator.feature_importances_

    # The reported std is the spread of each feature's importance across
    # the individual members of the ensemble.
    members = np.array(estimator.estimators_).ravel()
    spread = np.std(
        [member.feature_importances_ for member in members], axis=0)

    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    if kept is not None:
        importances = importances[kept]
        spread = spread[kept]

    top_idx = argsort_k_largest_positive(importances, top)
    top_rows = zip(shown_names[top_idx], importances[top_idx],
                   spread[top_idx])
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(name, value, deviation)
             for name, value, deviation in top_rows],
            remaining=np.count_nonzero(importances) - len(top_idx),
        ),
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(estimator),
        method='feature importances',
    )
def explain_rf_feature_importance(clf,
                                  vec=None,
                                  top=_TOP,
                                  target_names=None,  # ignored
                                  targets=None,  # ignored
                                  feature_names=None,
                                  feature_re=None):
    """
    Explain a tree-based ensemble classifier. The result has the
    following format::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )
    """
    shown_names = get_feature_names(clf, vec, feature_names=feature_names)
    importances = clf.feature_importances_

    # The reported std is the spread of each feature's importance across
    # the individual members of the ensemble.
    members = np.array(clf.estimators_).ravel()
    spread = np.std(
        [member.feature_importances_ for member in members], axis=0)

    if feature_re is not None:
        shown_names, kept = shown_names.filtered_by_re(feature_re)
        importances = importances[kept]
        spread = spread[kept]

    top_idx = argsort_k_largest(importances, top)
    top_rows = zip(shown_names[top_idx], importances[top_idx],
                   spread[top_idx])
    return Explanation(
        feature_importances=[
            FeatureWeight(name, value, deviation)
            for name, value, deviation in top_rows
        ],
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(clf),
        method='feature importances',
    )
def _get_other(feature_weights, feature_weights_dict, found_features):
    """Collect the weights that were not matched by ``found_features``.

    Every ``(group, index)`` key referenced by ``feature_weights_dict`` that
    is absent from ``found_features`` contributes its entry from
    ``feature_weights`` once. When ``found_features`` is non-empty, a
    ``'Highlighted in text (sum)'`` entry holding the sum of its values is
    added as well.

    Entries are ordered by decreasing absolute weight and then split by sign
    (zero counts as positive). ``pos_remaining`` and ``neg_remaining`` are
    copied from ``feature_weights``.
    """
    leftovers = []
    seen_keys = set()  # type: Set[Tuple[str, int]]
    for _, key in feature_weights_dict.values():
        # Several dict entries may point at the same key; take it once.
        if key in found_features or key in seen_keys:
            continue
        group, idx = key
        leftovers.append(getattr(feature_weights, group)[idx])
        seen_keys.add(key)

    if found_features:
        summary_name = FormattedFeatureName('Highlighted in text (sum)')
        leftovers.append(
            FeatureWeight(summary_name, sum(found_features.values())))

    leftovers.sort(key=lambda item: abs(item.weight), reverse=True)
    positive = [item for item in leftovers if item.weight >= 0]
    negative = [item for item in leftovers if item.weight < 0]
    return FeatureWeights(
        pos=positive,
        neg=negative,
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
def _features(indices, feature_names, coef):
    """Build ``FeatureWeight`` objects for the entries selected by ``indices``.

    Names and weights are picked from ``feature_names`` and ``coef`` with
    ``mask`` and paired up in order.
    """
    selected = zip(mask(feature_names, indices), mask(coef, indices))
    return [FeatureWeight(name, weight) for name, weight in selected]