Пример #1
0
def test_weighted_spans_feature_union():
    """Check get_weighted_spans for a FeatureUnion of two vectorizers.

    The document is a dict: its lower-cased ``text`` field goes to a
    word-level CountVectorizer and its ``url`` field to a char 4-gram
    CountVectorizer.  Feature names carry the ``text__`` / ``url__``
    prefixes, and one DocWeightedSpans per vectorizer is expected.
    """
    doc = {'text': 'I see: a leaning lemon tree', 'url': 'http://example.com'}

    text_vec = CountVectorizer(
        analyzer='word',
        preprocessor=lambda x: x['text'].lower(),
    )
    url_vec = CountVectorizer(
        analyzer='char',
        ngram_range=(4, 4),
        preprocessor=lambda x: x['url'],
    )
    vec = FeatureUnion([('text', text_vec), ('url', url_vec)])
    vec.fit([doc])

    positive = [
        FW('text__see', 2),
        FW('text__lemon', 4),
        FW('bias', 8),
        FW('url__ampl', 10),
        FW('url__mple', 7),
    ]
    negative = [
        FW('text__tree', -6),
        FW('url__exam', -10),
    ]
    weights = FeatureWeights(pos=positive, neg=negative, neg_remaining=10)
    w_spans = get_weighted_spans(doc, vec, weights)

    expected_text = DocWeightedSpans(
        document='i see: a leaning lemon tree',
        spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6),
        ],
        preserve_density=False,
        vec_name='text',
    )
    expected_url = DocWeightedSpans(
        document='http://example.com',
        spans=[
            ('exam', [(7, 11)], -10),
            ('ampl', [(9, 13)], 10),
            ('mple', [(10, 14)], 7),
        ],
        preserve_density=True,
        vec_name='url',
    )
    # 'bias' matches no span; the two summary entries equal the totals of
    # the span weights above: -10 + 10 + 7 == 7 and 2 + 4 - 6 == 0.
    expected_other = FeatureWeights(
        pos=[
            FW('bias', 8),
            FW(FormattedFeatureName('url: Highlighted in text (sum)'), 7),
            FW(FormattedFeatureName('text: Highlighted in text (sum)'), 0),
        ],
        neg=[],
        neg_remaining=10,
    )

    assert w_spans == WeightedSpans(
        [expected_text, expected_url],
        other=expected_other,
    )
Пример #2
0
def _get_other(feature_weights, named_found_features):
    # type: (FeatureWeights, List[Tuple[str, FoundFeatures]]) -> FeatureWeights
    """Collect the feature weights that were not matched by any vectorizer.

    :param feature_weights: object with ``pos`` and ``neg`` sequences of
        feature weights plus ``pos_remaining`` / ``neg_remaining`` values.
    :param named_found_features: ``(vec_name, found_features)`` pairs, where
        ``found_features`` maps ``(group, idx)`` keys to weights.
    :return: a FeatureWeights holding every weight whose ``(group, idx)`` key
        is absent from all ``found_features``, plus one
        "Highlighted in text (sum)" entry per non-empty ``found_features``
        (prefixed with ``"<vec_name>: "`` when ``vec_name`` is truthy).
        Items are ordered by descending absolute weight and split into
        ``pos`` (weight >= 0) and ``neg`` (weight < 0); the remaining
        counters are copied from ``feature_weights``.
    """
    # Keys of everything found by any of the vectorizers.
    all_found_features = set()  # type: Set[Tuple[str, int]]
    for _, found_features in named_found_features:
        all_found_features.update(found_features)

    # Items that were not accounted for at all.  Each (group, idx) key is
    # produced exactly once by enumerate(), so no extra de-duplication set
    # is needed here.
    other_items = []  # type: List[FeatureWeight]
    other_items.extend(
        fw
        for group in ('pos', 'neg')
        for idx, fw in enumerate(getattr(feature_weights, group))
        if (group, idx) not in all_found_features
    )

    # One summary entry per vectorizer that highlighted something.
    for vec_name, found_features in named_found_features:
        if found_features:
            prefix = '{}: '.format(vec_name) if vec_name else ''
            other_items.append(FeatureWeight(
                feature=FormattedFeatureName(
                    '{}Highlighted in text (sum)'.format(prefix)),
                weight=sum(found_features.values()),
            ))

    other_items.sort(key=lambda x: abs(x.weight), reverse=True)
    return FeatureWeights(
        pos=[fw for fw in other_items if fw.weight >= 0],
        neg=[fw for fw in other_items if fw.weight < 0],
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
Пример #3
0
def _get_other(feature_weights, feature_weights_dict, found_features):
    """Build the FeatureWeights for features that were not found in text.

    ``feature_weights_dict`` values are unpacked as ``(_, key)`` pairs with
    ``key == (group, idx)``; every key missing from ``found_features`` is
    looked up once in ``getattr(feature_weights, group)[idx]``.  When
    ``found_features`` is non-empty, a "Highlighted in text (sum)" entry
    carrying the sum of its values is added.  The result is ordered by
    descending absolute weight and split into ``pos`` (>= 0) and ``neg``
    (< 0), with the remaining counters copied from ``feature_weights``.
    """
    unmatched = []
    seen_keys = set()  # type: Set[Tuple[str, int]]
    for _, key in feature_weights_dict.values():
        # Skip keys that were highlighted or have already been collected
        # (the same key may be reached more than once through the dict).
        if key in found_features or key in seen_keys:
            continue
        group, idx = key
        unmatched.append(getattr(feature_weights, group)[idx])
        seen_keys.add(key)

    if found_features:
        highlighted_total = FeatureWeight(
            FormattedFeatureName('Highlighted in text (sum)'),
            sum(found_features.values()),
        )
        unmatched.append(highlighted_total)

    ranked = sorted(unmatched, key=lambda x: abs(x.weight), reverse=True)
    positive = [fw for fw in ranked if fw.weight >= 0]
    negative = [fw for fw in ranked if fw.weight < 0]
    return FeatureWeights(
        pos=positive,
        neg=negative,
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
Пример #4
0
def test_format_formatted_feature():
    """FormattedFeatureName is returned as is; plain strings are not."""
    raw = 'a b'
    # An already formatted name comes back unchanged.
    assert format_feature(FormattedFeatureName(raw)) == raw
    # A plain string is altered, exactly as format_single_feature alters it.
    assert format_feature(raw) != raw
    assert format_feature(raw) == format_single_feature(raw)
Пример #5
0
from sklearn.feature_extraction.text import CountVectorizer

from eli5.base import WeightedSpans, FeatureWeights, FeatureWeight as FW
from eli5.formatters import FormattedFeatureName
from eli5.sklearn.text import get_weighted_spans

# Pre-formatted name of the summary entry that the expected ``other``
# feature weights in this module contain next to the highlighted spans.
hl_in_text = FormattedFeatureName('Highlighted in text (sum)')


def test_weighted_spans_word():
    """Check get_weighted_spans for a word-level CountVectorizer.

    'see', 'lemon' and 'tree' are expected as spans over the lower-cased
    document; 'bias' is expected in ``other`` together with the
    ``hl_in_text`` summary entry.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])

    weights = FeatureWeights(
        pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
        neg=[FW('tree', -6)],
        neg_remaining=10,
    )
    w_spans = get_weighted_spans(doc, vec, weights)

    expected_spans = [
        ('see', [(2, 5)], 2),
        ('lemon', [(17, 22)], 4),
        ('tree', [(23, 27)], -6),
    ]
    # The summary weight is 0 because the span weights cancel: 2 + 4 - 6.
    expected_other = FeatureWeights(
        pos=[FW('bias', 8), FW(hl_in_text, 0)],
        neg=[],
        neg_remaining=10,
    )
    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=expected_spans,
        other=expected_other,
    )
    assert w_spans == expected
Пример #6
0
def test_format_formatted_feature():
    """FormattedFeatureName is kept verbatim; a plain string's space
    is replaced by ``_SPACE``."""
    raw = 'a b'
    already_formatted = FormattedFeatureName(raw)
    assert format_feature(already_formatted) == raw
    # For a plain string the space is expected to become _SPACE.
    with_marked_space = 'a{}b'.format(_SPACE)
    assert format_feature(raw) == with_marked_space