Пример #1
0
def test_feature_names_filtered():
    """Exercise ``FeatureNames.filtered`` on several kinds of name storage.

    Every case checks the returned index list; most also check the names
    (and, where given, the bias name / unknown-name template) carried by
    the filtered ``FeatureNames``.
    """
    # List-backed names; the predicate only looks at the name.
    plain = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = plain.filtered(lambda name: 'two' in name)
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Sparse dict-backed names with a bias. The bias name 'foo' does not
    # satisfy the predicate, and the result is expected to have no bias.
    sparse = FeatureNames(
        {1: 'two', 3: 'twenty-two', 5: 'two-thirds'},
        unkn_template='%d',
        n_features=6,
        bias_name='foo')
    kept, kept_idx = sparse.filtered(lambda name: name.startswith('two'))
    assert kept_idx == [1, 5]
    assert kept.bias_name is None
    assert kept.unkn_template == '%d'
    assert list(kept) == ['two', 'two-thirds']

    # Here the bias name itself satisfies the predicate, so it is expected
    # to survive and to be reported at index 2.
    with_bias = FeatureNames(['a', 'b'], bias_name='bias')
    kept, kept_idx = with_bias.filtered(lambda name: 'b' in name)
    assert kept_idx == [1, 2]
    assert kept.bias_name == 'bias'
    assert list(kept) == ['b', 'bias']

    # A predicate that rejects everything yields no indices.
    template_only = FeatureNames(unkn_template='x%d', n_features=6)
    kept, kept_idx = template_only.filtered(lambda name: False)
    assert kept_idx == []

    # Two-argument predicate: it also receives a value taken from ``x``.
    valued = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = valued.filtered(
        lambda name, value: 't' in name and value <= 1, x=[0, 1, 2])
    assert kept_idx == [1]
    assert list(kept) == ['two']
Пример #2
0
def get_feature_names(clf, vec=None, bias_name='<BIAS>', feature_names=None):
    """
    Build a FeatureNames object for ``clf``, with a bias entry if applicable.

    The bias name is dropped when ``has_intercept(clf)`` is false.

    When ``feature_names`` is None, names come from ``vec.get_feature_names()``
    if ``vec`` is truthy and has that method; otherwise they are generated
    from the ``'x%d'`` template (x0, x1, ...) for ``get_num_features(clf)``
    features.

    When ``feature_names`` is given (a FeatureNames instance or any sized
    sequence), its size must equal ``get_num_features(clf)``; a ValueError
    is raised otherwise.
    """
    effective_bias = bias_name if has_intercept(clf) else None

    if feature_names is not None:
        expected = get_num_features(clf)

        if not isinstance(feature_names, FeatureNames):
            actual = len(feature_names)
            if actual != expected:
                raise ValueError("feature_names has a wrong length: "
                                 "expected=%d, got=%d" % (expected, actual))
            return FeatureNames(feature_names, bias_name=effective_bias)

        actual = feature_names.n_features
        if actual != expected:
            raise ValueError("feature_names has a wrong n_features: "
                             "expected=%d, got=%d" % (expected, actual))
        # Shallow copy of the given instance, but with the bias name that
        # matches this estimator.
        return FeatureNames(feature_names.feature_names,
                            n_features=expected,
                            bias_name=effective_bias,
                            unkn_template=feature_names.unkn_template)

    if vec and hasattr(vec, 'get_feature_names'):
        return FeatureNames(vec.get_feature_names(), bias_name=effective_bias)

    # No usable vectorizer: fall back to template-generated names.
    return FeatureNames(n_features=get_num_features(clf),
                        unkn_template='x%d',
                        bias_name=effective_bias)
Пример #3
0
def test_get_feature_names():
    """Check ``get_feature_names`` for multiclass and binary classifiers.

    Covers names taken from a vectorizer, template-generated names,
    user-supplied names (list, numpy array, FeatureNames), size validation,
    and an estimator fitted without an intercept.
    """
    corpus = ['hello world', 'hello', 'world']

    def name_set(*args, **kwargs):
        # These checks do not care about ordering, so compare as sets.
        return set(get_feature_names(*args, **kwargs))

    multiclass_labels = [0, 1, 2]
    binary_labels = [0, 1, 0]
    for labels in (multiclass_labels, binary_labels):
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)

        model = LogisticRegression()
        model.fit(X, labels)

        # Names taken from the vectorizer, plus the bias.
        from_vec = get_feature_names(model, vectorizer)
        assert isinstance(from_vec, FeatureNames)
        assert repr(from_vec) == '<FeatureNames: 2 features with bias>'
        assert name_set(model, vectorizer) == {'hello', 'world', '<BIAS>'}
        assert name_set(model, vectorizer, 'B') == {'hello', 'world', 'B'}

        # No vectorizer and no explicit names: template-generated names.
        assert name_set(model) == {'x0', 'x1', '<BIAS>'}

        # Explicit names in different containers.
        with_default_bias = {'a', 'b', '<BIAS>'}
        assert name_set(model, feature_names=['a', 'b']) == with_default_bias
        assert name_set(
            model, feature_names=['a', 'b'], bias_name='bias',
        ) == {'a', 'b', 'bias'}
        assert name_set(
            model, feature_names=np.array(['a', 'b']),
        ) == with_default_bias
        assert name_set(
            model, feature_names=FeatureNames(['a', 'b']),
        ) == with_default_bias
        assert name_set(
            model,
            feature_names=FeatureNames(n_features=2, unkn_template='F%d'),
        ) == {'F0', 'F1', '<BIAS>'}

        # A wrong number of names must be rejected.
        for wrong_size in (['a'], ['a', 'b', 'c']):
            with pytest.raises(ValueError):
                get_feature_names(model, feature_names=wrong_size)

        with pytest.raises(ValueError):
            get_feature_names(
                model, feature_names=FeatureNames(['a', 'b', 'c']))

        # Without an intercept no bias name is expected.
        no_intercept = LogisticRegression(fit_intercept=False)
        no_intercept.fit(X, labels)
        assert name_set(no_intercept, vectorizer) == {'hello', 'world'}
        assert name_set(
            no_intercept, feature_names=['hello', 'world'],
        ) == {'hello', 'world'}
Пример #4
0
def _invhashing_union_feature_names_scale(vec_union):
    # type: (FeatureUnion) -> Tuple[FeatureNames, np.ndarray]
    """Merge feature names of all transformers in ``vec_union``.

    Walks ``vec_union.transformer_list`` in order, prefixing every feature
    name with ``<transformer name>__`` and offsetting its index by the
    number of features contributed by the preceding transformers.

    Returns a ``(feature_names, coef_scale)`` pair. ``coef_scale`` has one
    entry per feature: the ``column_signs_`` of each
    InvertableHashingVectorizer in its index range, NaN everywhere else.
    """
    names_by_index = {}  # type: Dict[int, Union[str, List]]
    unkn_template = None
    offset = 0
    sign_segments = []  # (start index, column signs) per hashing vectorizer

    for vec_name, vec in vec_union.transformer_list:
        if not isinstance(vec, InvertableHashingVectorizer):
            plain_names = vec.get_feature_names()
            for local_idx, fname in enumerate(plain_names):
                names_by_index[offset + local_idx] = (
                    '{}__{}'.format(vec_name, fname))
            offset += len(plain_names)
            continue

        hashed_names = vec.get_feature_names(always_signed=False)
        # The template of the last hashing vectorizer seen is the one used.
        unkn_template = hashed_names.unkn_template
        for local_idx, entries in hashed_names.feature_names.items():
            # Copy each entry so the vectorizer's own dicts stay untouched.
            names_by_index[local_idx + offset] = [
                dict(entry, name='{}__{}'.format(vec_name, entry['name']))
                for entry in entries
            ]
        sign_segments.append((offset, vec.column_signs_))
        offset += hashed_names.n_features

    n_features = offset
    feature_names = FeatureNames(feature_names=names_by_index,
                                 n_features=n_features,
                                 unkn_template=unkn_template)

    coef_scale = np.full(n_features, np.nan)
    for start, signs in sign_segments:
        coef_scale[start:start + len(signs)] = signs
    return feature_names, coef_scale
Пример #5
0
def test_init():
    """Check which ``FeatureNames`` constructor arguments are rejected."""
    # (expected exception, positional args, keyword args), in call order.
    rejected = [
        (ValueError, (), {}),
        (ValueError, (), {'unkn_template': '%d'}),
        (ValueError, (), {'n_features': 10}),
        (ValueError, (['a'],), {'n_features': 10}),
        (TypeError, ({'a', 'b'},), {}),
        (ValueError, ({0: 'a', 1: 'b'},), {'n_features': 10}),
    ]
    for expected_error, args, kwargs in rejected:
        with pytest.raises(expected_error):
            FeatureNames(*args, **kwargs)

    # These combinations must construct without raising.
    accepted = [
        ((), {'unkn_template': '%d', 'n_features': 10}),
        ((['a', 'b'],), {}),
        (({0: 'a', 1: 'b'},), {}),
        (({0: 'a', 1: 'b'},), {'n_features': 10, 'unkn_template': 'x%d'}),
    ]
    for args, kwargs in accepted:
        FeatureNames(*args, **kwargs)
Пример #6
0
def get_feature_names(clf,
                      vec=None,
                      bias_name='<BIAS>',
                      feature_names=None,
                      num_features=None,
                      estimator_feature_names=None):
    # type: (Any, Any, Optional[str], Any, Optional[int], Any) -> FeatureNames
    """
    Return a FeatureNames instance holding all feature names of ``clf``
    and, when ``has_intercept(clf)`` is true, a bias feature.

    With ``feature_names`` left as None, names are looked up in this order:
    ``vec.get_feature_names()`` (if ``vec`` is truthy and has that method),
    then ``estimator_feature_names``, then names generated from the
    ``'x%d'`` template (x0, x1, x2, etc.).

    With ``feature_names`` given, its size must match the feature count,
    else ValueError is raised. The feature count is ``num_features`` when
    that is truthy and ``get_num_features(clf)`` otherwise.
    """
    effective_bias = bias_name if has_intercept(clf) else None

    if feature_names is None:
        if vec and hasattr(vec, 'get_feature_names'):
            return FeatureNames(vec.get_feature_names(),
                                bias_name=effective_bias)
        if estimator_feature_names is not None:
            return FeatureNames(estimator_feature_names,
                                bias_name=effective_bias)
        return FeatureNames(
            n_features=num_features or get_num_features(clf),
            unkn_template='x%d',
            bias_name=effective_bias)

    expected = num_features or get_num_features(clf)

    if not isinstance(feature_names, FeatureNames):
        actual = len(feature_names)
        if actual != expected:
            raise ValueError("feature_names has a wrong length: "
                             "expected=%d, got=%d" % (expected, actual))
        return FeatureNames(feature_names, bias_name=effective_bias)

    actual = feature_names.n_features
    if actual != expected:
        raise ValueError("feature_names has a wrong n_features: "
                         "expected=%d, got=%d" % (expected, actual))
    # Shallow copy of the given instance with the bias name that matches
    # this estimator.
    return FeatureNames(feature_names.feature_names,
                        n_features=expected,
                        bias_name=effective_bias,
                        unkn_template=feature_names.unkn_template)
Пример #7
0
def test_feature_names_handle_filter():
    """Check ``FeatureNames.handle_filter`` with a callable, a regex,
    neither, and both (the last is expected to raise ValueError)."""
    # Callable filter only.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.handle_filter(
        lambda name: 'two' in name, feature_re=None)
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Regular expression only.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.handle_filter(
        feature_filter=None, feature_re='two')
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Neither: all names are kept and no index list is returned.
    names = FeatureNames(['one', 'two'])
    kept, kept_idx = names.handle_filter(None, None)
    assert kept_idx is None
    assert list(kept) == ['one', 'two']

    # Both at once is rejected.
    with pytest.raises(ValueError):
        FeatureNames(['one', 'two']).handle_filter(lambda name: True, '.*')
Пример #8
0
def explain_weights_sklearn_crfsuite(crf,
                                     top=20,
                                     target_names=None,
                                     targets=None,
                                     feature_re=None,
                                     feature_filter=None):
    """ Explain sklearn_crfsuite.CRF weights.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``target_names``, ``targets``,
    ``feature_re`` and ``feature_filter`` parameters.

    The returned Explanation holds one TargetExplanation per displayed
    label (top state features) and the transition weights between the
    displayed labels.
    """
    all_names = np.array(crf.attributes_)
    state_weights = crf_state_coef(crf).todense().A
    transition_weights = crf_transition_coef(crf)

    state_names = all_names
    if not (feature_filter is None and feature_re is None):
        # Keep only the matching features, both in the name array and in
        # the columns of the state weight matrix.
        filtered_names, kept_columns = FeatureNames(all_names).handle_filter(
            feature_filter, feature_re)
        state_names = np.array(filtered_names.feature_names)
        state_weights = state_weights[:, kept_columns]

    if targets is None:
        targets = sorted_for_ner(crf.classes_)

    display_names = get_target_display_names(
        crf.classes_, target_names, targets)
    label_ids, labels = zip(*display_names)
    transition_weights = filter_transition_coefs(transition_weights,
                                                 label_ids)

    target_explanations = [
        TargetExplanation(
            target=label,
            feature_weights=get_top_features(
                state_names, state_weights[label_id], top),
        )
        for label_id, label in zip(label_ids, labels)
    ]
    transitions = TransitionFeatureWeights(
        class_names=labels,
        coef=transition_weights,
    )
    return Explanation(
        targets=target_explanations,
        transition_features=transitions,
        estimator=repr(crf),
        method='CRF',
    )
Пример #9
0
def test_feature_names_filter_by_re():
    """Check ``FeatureNames.filtered_by_re`` on list-, dict- and
    template-backed names."""
    # List-backed names, unanchored pattern.
    plain = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = plain.filtered_by_re('two')
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Sparse dict-backed names with a bias, anchored pattern. The bias name
    # 'foo' is expected to be kept even though its index is not reported.
    sparse = FeatureNames(
        {1: 'two', 3: 'twenty-two', 5: 'two-thirds'},
        unkn_template='%d',
        n_features=6,
        bias_name='foo')
    kept, kept_idx = sparse.filtered_by_re('^two')
    assert kept_idx == [1, 5]
    assert kept.bias_name == 'foo'
    assert kept.unkn_template == '%d'
    assert list(kept) == ['two', 'two-thirds', 'foo']

    # Only template-generated names: no indices are expected back.
    template_only = FeatureNames(unkn_template='x%d', n_features=6)
    kept, kept_idx = template_only.filtered_by_re('x')
    assert kept_idx == []
Пример #10
0
    def get_feature_names(self, always_signed=True, always_positive=False):
        """Return a FeatureNames object describing the known columns.

        Each known column id maps to a list of ``{'name': ..., 'sign': ...}``
        dicts built from ``self._get_collision_info()``.

        With ``always_positive`` every sign is 1. Otherwise the stored signs
        are used, negated for a column when ``always_signed`` is false and
        ``_invert_signs(signs)`` is true.
        """
        self.recalculate_attributes()

        # Parallel sequences: column id, term names and term signs.
        column_ids, term_names, term_signs = self._get_collision_info()

        names_by_column = {}
        for col_id, names, signs in zip(column_ids, term_names, term_signs):
            if always_positive:
                named_signs = ((name, 1) for name in names)
            elif not always_signed and _invert_signs(signs):
                named_signs = zip(names, [-sign for sign in signs])
            else:
                named_signs = zip(names, signs)
            names_by_column[col_id] = [
                {'name': name, 'sign': sign} for name, sign in named_signs
            ]

        return FeatureNames(
            names_by_column,
            n_features=self.n_features,
            unkn_template=self.unkn_template)
Пример #11
0
    assert FN(['one', 'two', 'three'])[1:] == ['two', 'three']
    assert FN({1: 'one'}, n_features=3, unkn_template='x%d')[:] \
        == ['x0', 'one', 'x2']
    assert FN({1: 'one'}, n_features=3, unkn_template='x%d',
              bias_name='bias')[-3:] \
        == ['one', 'x2', 'bias']
    assert FN(['one', 'two', 'three'], bias_name='bias')[-1:] == ['bias']
    assert FN(np.array(['one', 'two', 'three']), bias_name='bias')[-1:] \
        == ['bias']
    assert FN(np.array(['one', 'two', 'three']), bias_name='bias')[-2:] \
        == ['three', 'bias']
    assert list(FN(np.array(['one', 'two', 'three']))[-2:]) == ['two', 'three']


# Each case is a one-element list because a single argument name is
# parametrized; the instances are created once, when the module is imported.
@pytest.mark.parametrize(['feature_names'], [
    # list-backed names
    [FeatureNames(['x1', 'x2', 'x3'])],
    # list-backed names with a bias
    [FeatureNames(['x1', 'x2', 'x3'], bias_name='<BIAS>')],
    # numpy-array-backed names
    [FeatureNames(np.array(['x1', 'x2', 'x3']))],
    # dict-backed names
    [FeatureNames({
        0: 'x1',
        1: 'x2'
    })],
    # template-only names
    [FeatureNames(n_features=5, unkn_template='%d')],
])
def test_add_feature(feature_names):
    """Check that ``add_feature`` grows the length by one and that the
    returned index resolves to the newly added name."""
    len_before = len(feature_names)
    # NOTE(review): ``storage`` is captured before the mutation but is not
    # used in the lines visible here - the test may continue beyond this
    # excerpt; confirm before removing it.
    storage = feature_names.feature_names
    new_feature = 'new'
    new_idx = feature_names.add_feature(new_feature)
    assert len(feature_names) == len_before + 1
    assert feature_names[new_idx] == new_feature