def test_weighted_spans_feature_union():
    """Check get_weighted_spans on a FeatureUnion of two CountVectorizers.

    One vectorizer reads the lower-cased 'text' field word by word, the
    other reads the 'url' field as character 4-grams. Feature names carry
    the '<vec_name>__' prefix of the union member they belong to.
    """
    doc = {'text': 'I see: a leaning lemon tree', 'url': 'http://example.com'}
    text_vec = CountVectorizer(
        analyzer='word',
        preprocessor=lambda x: x['text'].lower(),
    )
    url_vec = CountVectorizer(
        analyzer='char',
        ngram_range=(4, 4),
        preprocessor=lambda x: x['url'],
    )
    vec = FeatureUnion([('text', text_vec), ('url', url_vec)])
    vec.fit([doc])

    weights = FeatureWeights(
        pos=[
            FW('text__see', 2),
            FW('text__lemon', 4),
            FW('bias', 8),
            FW('url__ampl', 10),
            FW('url__mple', 7),
        ],
        neg=[
            FW('text__tree', -6),
            FW('url__exam', -10),
        ],
        neg_remaining=10,
    )
    w_spans = get_weighted_spans(doc, vec, weights)

    # Spans are (start, end) character offsets into the preprocessed document.
    expected_text = DocWeightedSpans(
        document='i see: a leaning lemon tree',
        spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6),
        ],
        preserve_density=False,
        vec_name='text',
    )
    expected_url = DocWeightedSpans(
        document='http://example.com',
        spans=[
            ('exam', [(7, 11)], -10),
            ('ampl', [(9, 13)], 10),
            ('mple', [(10, 14)], 7),
        ],
        preserve_density=True,
        vec_name='url',
    )
    # 'bias' matches no span; the two summary rows hold per-vectorizer sums
    # of the highlighted weights (url: -10 + 10 + 7, text: 2 + 4 - 6).
    expected_other = FeatureWeights(
        pos=[
            FW('bias', 8),
            FW(FormattedFeatureName('url: Highlighted in text (sum)'), 7),
            FW(FormattedFeatureName('text: Highlighted in text (sum)'), 0),
        ],
        neg=[],
        neg_remaining=10,
    )

    assert w_spans == WeightedSpans(
        [expected_text, expected_url],
        other=expected_other,
    )
def _get_other(feature_weights, named_found_features):
    # type: (FeatureWeights, List[Tuple[str, FoundFeatures]]) -> FeatureWeights
    """Build the feature weights that were not highlighted in any document.

    Every entry of ``feature_weights.pos`` / ``feature_weights.neg`` whose
    ``(group, index)`` key is absent from all found-feature collections is
    carried over unchanged. In addition, each non-empty found-feature
    collection contributes one summary row, named
    ``'<vec_name>: Highlighted in text (sum)'`` (no prefix when ``vec_name``
    is falsy), whose weight is the sum of that collection's values.

    The combined rows are ordered by decreasing absolute weight and split
    into ``pos`` (weight >= 0) and ``neg`` (weight < 0); the ``*_remaining``
    counters are copied from ``feature_weights``.
    """
    # Keys highlighted by at least one vectorizer.
    # NOTE(review): presumably each found_features is a dict keyed by
    # (group, idx) with weights as values -- confirm against FoundFeatures.
    highlighted_keys = set()  # type: Set[Tuple[str, int]]
    for _, found in named_found_features:
        highlighted_keys.update(found)

    leftovers = []  # type: List[FeatureWeight]
    seen_keys = set()  # type: Set[Tuple[str, int]]
    for group in ('pos', 'neg'):
        for idx, fw in enumerate(getattr(feature_weights, group)):
            key = (group, idx)
            if key in highlighted_keys or key in seen_keys:
                continue
            seen_keys.add(key)
            leftovers.append(fw)

    # One summary row per vectorizer that highlighted anything.
    for vec_name, found in named_found_features:
        if not found:
            continue
        prefix = '{}: '.format(vec_name) if vec_name else ''
        label = FormattedFeatureName(
            '{}Highlighted in text (sum)'.format(prefix))
        leftovers.append(
            FeatureWeight(feature=label, weight=sum(found.values())))

    # sorted() is stable, so rows with equal |weight| keep insertion order.
    ranked = sorted(leftovers, key=lambda item: abs(item.weight), reverse=True)
    positive = [item for item in ranked if item.weight >= 0]
    negative = [item for item in ranked if item.weight < 0]
    return FeatureWeights(
        pos=positive,
        neg=negative,
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
def _get_other(feature_weights, feature_weights_dict, found_features):
    """Build the feature weights that were not highlighted in the document.

    ``feature_weights_dict`` values are pairs whose second item is a
    ``(group, index)`` key into ``feature_weights.pos`` /
    ``feature_weights.neg``. Every key absent from ``found_features`` has
    its feature weight carried over once. When ``found_features`` is
    non-empty, a single 'Highlighted in text (sum)' row holding the sum of
    its values is appended.

    The rows are ordered by decreasing absolute weight and split into
    ``pos`` (weight >= 0) and ``neg`` (weight < 0); the ``*_remaining``
    counters are copied from ``feature_weights``.
    """
    leftovers = []
    seen_keys = set()  # type: Set[Tuple[str, int]]
    # Only the key of each entry matters here; the feature name is unused.
    for _, key in feature_weights_dict.values():
        if key in found_features or key in seen_keys:
            continue
        group, idx = key
        leftovers.append(getattr(feature_weights, group)[idx])
        seen_keys.add(key)

    if found_features:
        highlighted_total = sum(found_features.values())
        leftovers.append(
            FeatureWeight(
                FormattedFeatureName('Highlighted in text (sum)'),
                highlighted_total))

    # sorted() is stable, so rows with equal |weight| keep insertion order.
    ranked = sorted(leftovers, key=lambda item: abs(item.weight), reverse=True)
    positive = [item for item in ranked if item.weight >= 0]
    negative = [item for item in ranked if item.weight < 0]
    return FeatureWeights(
        pos=positive,
        neg=negative,
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
def test_format_formatted_feature():
    """Check how format_feature treats pre-formatted versus plain names.

    A FormattedFeatureName must come back verbatim, while a plain string
    must be altered and must match what format_single_feature produces.
    """
    plain_name = 'a b'
    preformatted = FormattedFeatureName('a b')

    assert format_feature(preformatted) == 'a b'
    assert format_feature(plain_name) != 'a b'
    assert format_feature(plain_name) == format_single_feature(plain_name)
from sklearn.feature_extraction.text import CountVectorizer

from eli5.base import WeightedSpans, FeatureWeights, FeatureWeight as FW
from eli5.formatters import FormattedFeatureName
from eli5.sklearn.text import get_weighted_spans


# Name of the summary row holding the sum of all highlighted weights.
hl_in_text = FormattedFeatureName('Highlighted in text (sum)')


def test_weighted_spans_word():
    """Check get_weighted_spans with a word-level CountVectorizer."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])

    weights = FeatureWeights(
        pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
        neg=[FW('tree', -6)],
        neg_remaining=10,
    )
    w_spans = get_weighted_spans(doc, vec, weights)

    # 'bias' matches no span; the highlighted weights sum to 2 + 4 - 6 = 0.
    expected_other = FeatureWeights(
        pos=[FW('bias', 8), FW(hl_in_text, 0)],
        neg=[],
        neg_remaining=10,
    )
    # Spans are (start, end) character offsets into the lower-cased document.
    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6),
        ],
        other=expected_other,
    )
    assert w_spans == expected
def test_format_formatted_feature():
    """Check how format_feature treats pre-formatted versus plain names.

    A FormattedFeatureName must come back verbatim, while in a plain
    string the space is expected to be replaced by ``_SPACE``.
    """
    name = 'a b'

    assert format_feature(FormattedFeatureName(name)) == 'a b'
    assert format_feature(name) == 'a{}b'.format(_SPACE)