def test_feature_names_filtered():
    """FeatureNames.filtered() yields the kept names plus their indices."""
    # Dense list of names, one-argument predicate.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.filtered(lambda name: 'two' in name)
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Sparse (dict) names with a bias that does not match the predicate:
    # the bias is expected to be dropped, the template to be carried over.
    names = FeatureNames(
        {1: 'two', 3: 'twenty-two', 5: 'two-thirds'},
        unkn_template='%d',
        n_features=6,
        bias_name='foo',
    )
    kept, kept_idx = names.filtered(lambda name: name.startswith('two'))
    assert kept_idx == [1, 5]
    assert kept.bias_name is None
    assert kept.unkn_template == '%d'
    assert list(kept) == ['two', 'two-thirds']

    # A bias that matches the predicate is expected to be kept and to be
    # reported with the index right after the regular features.
    names = FeatureNames(['a', 'b'], bias_name='bias')
    kept, kept_idx = names.filtered(lambda name: 'b' in name)
    assert kept_idx == [1, 2]
    assert kept.bias_name == 'bias'
    assert list(kept) == ['b', 'bias']

    # Only template-generated names, and a predicate rejecting everything.
    names = FeatureNames(unkn_template='x%d', n_features=6)
    kept, kept_idx = names.filtered(lambda name: False)
    assert kept_idx == []

    # Two-argument predicate: it also receives the matching value from ``x``.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.filtered(
        lambda name, value: 't' in name and value <= 1,
        x=[0, 1, 2],
    )
    assert kept_idx == [1]
    assert list(kept) == ['two']
def get_feature_names(clf, vec=None, bias_name='<BIAS>', feature_names=None):
    """
    Return a FeatureNames instance with the feature names for ``clf``,
    including the bias feature when ``clf`` has an intercept.

    * If ``feature_names`` is None and ``vec`` has a ``get_feature_names()``
      method, names are taken from ``vec``.
    * If ``feature_names`` is None and ``vec`` is None or doesn't have
      a ``get_feature_names()`` method, features are named x0, x1, x2, etc.
    * Otherwise ``feature_names`` (a FeatureNames instance or a sequence
      of names) is used, after checking that its size agrees with
      ``get_num_features(clf)``.

    Raises ValueError if the size of ``feature_names`` doesn't match
    the number of features of ``clf``.
    """
    if not has_intercept(clf):
        # No intercept means there is no bias feature to name.
        bias_name = None

    if feature_names is None:
        # Test against None explicitly: whether a vectorizer object is
        # "truthy" says nothing about whether it can provide names.
        if vec is not None and hasattr(vec, 'get_feature_names'):
            return FeatureNames(vec.get_feature_names(), bias_name=bias_name)
        return FeatureNames(n_features=get_num_features(clf),
                            unkn_template='x%d',
                            bias_name=bias_name)

    num_features = get_num_features(clf)
    if isinstance(feature_names, FeatureNames):
        if feature_names.n_features != num_features:
            raise ValueError("feature_names has a wrong n_features: "
                             "expected=%d, got=%d" % (
                                 num_features, feature_names.n_features))
        # Make a shallow copy setting proper bias_name
        return FeatureNames(feature_names.feature_names,
                            n_features=num_features,
                            bias_name=bias_name,
                            unkn_template=feature_names.unkn_template)

    if len(feature_names) != num_features:
        raise ValueError("feature_names has a wrong length: "
                         "expected=%d, got=%d" % (
                             num_features, len(feature_names)))
    return FeatureNames(feature_names, bias_name=bias_name)
def test_get_feature_names():
    """
    Check get_feature_names() on LogisticRegression models fitted on
    CountVectorizer output, with and without an intercept.
    """
    docs = ['hello world', 'hello', 'world']

    def _names(*args, **kwargs):
        # Compare as sets so the checks below don't depend on name order.
        return set(get_feature_names(*args, **kwargs))

    # NOTE(review): everything below, including the clf2 checks, is assumed
    # to run once per target vector - confirm against the original layout.
    for y in [[0, 1, 2], [0, 1, 0]]:  # multiclass, binary
        vec = CountVectorizer()
        X = vec.fit_transform(docs)

        clf = LogisticRegression()
        clf.fit(X, y)

        fnames = get_feature_names(clf, vec)
        assert isinstance(fnames, FeatureNames)
        assert repr(fnames) == '<FeatureNames: 2 features with bias>'
        # Names come from the vectorizer; the bias name can be overridden.
        assert _names(clf, vec) == {'hello', 'world', '<BIAS>'}
        assert _names(clf, vec, 'B') == {'hello', 'world', 'B'}
        # Without a vectorizer, generated names start at x0.
        assert _names(clf) == {'x0', 'x1', '<BIAS>'}
        # Explicit feature_names: list, numpy array or FeatureNames instance.
        assert _names(clf, feature_names=['a', 'b']) == {'a', 'b', '<BIAS>'}
        assert _names(clf, feature_names=['a', 'b'],
                      bias_name='bias') == {'a', 'b', 'bias'}
        assert _names(clf, feature_names=np.array(
            ['a', 'b'])) == {'a', 'b', '<BIAS>'}
        assert _names(clf, feature_names=FeatureNames(
            ['a', 'b'])) == {'a', 'b', '<BIAS>'}
        assert _names(clf, feature_names=FeatureNames(
            n_features=2, unkn_template='F%d')) == {'F0', 'F1', '<BIAS>'}

        # A size mismatch with the classifier (2 features) must be rejected.
        with pytest.raises(ValueError):
            get_feature_names(clf, feature_names=['a'])

        with pytest.raises(ValueError):
            get_feature_names(clf, feature_names=['a', 'b', 'c'])

        with pytest.raises(ValueError):
            get_feature_names(clf, feature_names=FeatureNames(['a', 'b', 'c']))

        # A model fitted without an intercept gets no bias feature.
        clf2 = LogisticRegression(fit_intercept=False)
        clf2.fit(X, y)
        assert _names(clf2, vec) == {'hello', 'world'}
        assert _names(clf2, feature_names=['hello', 'world']) == {'hello', 'world'}
def _invhashing_union_feature_names_scale(vec_union):
    # type: (FeatureUnion) -> Tuple[FeatureNames, np.ndarray]
    """
    Build combined feature names and a coefficient scale for a FeatureUnion.

    Every name is prefixed with ``<transformer name>__`` and placed at its
    column offset inside the union. The returned scale array holds
    ``column_signs_`` for the columns of InvertableHashingVectorizer
    transformers and NaN for all other columns.
    """
    names_by_column = {}  # type: Dict[int, Union[str, List]]
    unknown_template = None
    offset = 0  # first column of the transformer being processed
    sign_blocks = []
    for prefix, transformer in vec_union.transformer_list:
        if not isinstance(transformer, InvertableHashingVectorizer):
            # Regular transformer: one plain string name per column.
            plain_names = transformer.get_feature_names()
            for column, fname in enumerate(plain_names, offset):
                names_by_column[column] = '{}__{}'.format(prefix, fname)
            offset += len(plain_names)
            continue

        hashed_names = transformer.get_feature_names(always_signed=False)
        # The template of the last hashing transformer seen is the one used.
        unknown_template = hashed_names.unkn_template
        for column, collisions in hashed_names.feature_names.items():
            # Copy each entry so the transformer's own dicts stay untouched.
            names_by_column[column + offset] = [
                dict(entry, name='{}__{}'.format(prefix, entry['name']))
                for entry in collisions
            ]
        sign_blocks.append((offset, transformer.column_signs_))
        offset += hashed_names.n_features

    total = offset
    feature_names = FeatureNames(
        feature_names=names_by_column,
        n_features=total,
        unkn_template=unknown_template)
    coef_scale = np.full(total, np.nan)
    for start, signs in sign_blocks:
        coef_scale[start:start + len(signs)] = signs
    return feature_names, coef_scale
def test_init():
    """FeatureNames() rejects inconsistent arguments and accepts valid ones."""
    # (expected exception, positional args, keyword args)
    rejected = [
        (ValueError, (), {}),
        (ValueError, (), {'unkn_template': '%d'}),
        (ValueError, (), {'n_features': 10}),
        (ValueError, (['a'],), {'n_features': 10}),
        (TypeError, ({'a', 'b'},), {}),
        (ValueError, ({0: 'a', 1: 'b'},), {'n_features': 10}),
    ]
    for expected_error, args, kwargs in rejected:
        with pytest.raises(expected_error):
            FeatureNames(*args, **kwargs)

    # These argument combinations must construct without raising.
    accepted = [
        ((), {'unkn_template': '%d', 'n_features': 10}),
        ((['a', 'b'],), {}),
        (({0: 'a', 1: 'b'},), {}),
        (({0: 'a', 1: 'b'},), {'n_features': 10, 'unkn_template': 'x%d'}),
    ]
    for args, kwargs in accepted:
        FeatureNames(*args, **kwargs)
def get_feature_names(clf, vec=None, bias_name='<BIAS>', feature_names=None,
                      num_features=None, estimator_feature_names=None):
    # type: (Any, Any, Optional[str], Any, Optional[int], Any) -> FeatureNames
    """
    Return a FeatureNames instance that holds all feature names
    and a bias feature.

    When ``feature_names`` is None, names are chosen in this order:

    * ``vec.get_feature_names()`` if ``vec`` has such a method;
    * ``estimator_feature_names`` if it is not None;
    * generated names x0, x1, x2, etc.

    Otherwise ``feature_names`` (a FeatureNames instance or a sequence
    of names) is used after its size is checked against ``num_features``.
    A falsy ``num_features`` (None or 0) is replaced by
    ``get_num_features(clf)``.

    Raises ValueError if the size of ``feature_names`` doesn't match
    the expected number of features.
    """
    if not has_intercept(clf):
        # No intercept means there is no bias feature to name.
        bias_name = None

    if feature_names is None:
        # Test against None explicitly: whether a vectorizer object is
        # "truthy" says nothing about whether it can provide names.
        if vec is not None and hasattr(vec, 'get_feature_names'):
            return FeatureNames(vec.get_feature_names(), bias_name=bias_name)
        if estimator_feature_names is None:
            num_features = num_features or get_num_features(clf)
            return FeatureNames(n_features=num_features,
                                unkn_template='x%d',
                                bias_name=bias_name)
        return FeatureNames(estimator_feature_names, bias_name=bias_name)

    num_features = num_features or get_num_features(clf)
    if isinstance(feature_names, FeatureNames):
        if feature_names.n_features != num_features:
            raise ValueError("feature_names has a wrong n_features: "
                             "expected=%d, got=%d" % (
                                 num_features, feature_names.n_features))
        # Make a shallow copy setting proper bias_name
        return FeatureNames(feature_names.feature_names,
                            n_features=num_features,
                            bias_name=bias_name,
                            unkn_template=feature_names.unkn_template)

    if len(feature_names) != num_features:
        raise ValueError("feature_names has a wrong length: "
                         "expected=%d, got=%d" % (
                             num_features, len(feature_names)))
    return FeatureNames(feature_names, bias_name=bias_name)
def test_feature_names_handle_filter():
    """handle_filter() accepts a callable or a regex, but not both."""
    # Callable filter only.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.handle_filter(
        lambda name: 'two' in name, feature_re=None)
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Regular expression only.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.handle_filter(feature_filter=None, feature_re='two')
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Neither given: all names are kept and no indices are reported.
    names = FeatureNames(['one', 'two'])
    kept, kept_idx = names.handle_filter(None, None)
    assert kept_idx is None
    assert list(kept) == ['one', 'two']

    # Both given at once is an error.
    with pytest.raises(ValueError):
        FeatureNames(['one', 'two']).handle_filter(lambda name: True, '.*')
def explain_weights_sklearn_crfsuite(crf,
                                     top=20,
                                     target_names=None,
                                     targets=None,
                                     feature_re=None,
                                     feature_filter=None):
    """
    Build an Explanation of the weights of a sklearn_crfsuite.CRF model.

    The result holds, for every displayed label, the top state feature
    weights, plus the transition weights between the displayed labels.

    ``top``, ``target_names``, ``targets``, ``feature_re`` and
    ``feature_filter`` are described in :func:`eli5.explain_weights`.
    """
    all_names = np.array(crf.attributes_)
    # Indexed below as [label_id] and [:, feature columns].
    state_weights = crf_state_coef(crf).todense().A
    transition_weights = crf_transition_coef(crf)

    if feature_filter is None and feature_re is None:
        kept_names = all_names
    else:
        # Keep only the feature columns accepted by the filter / regex.
        filtered, kept_columns = FeatureNames(all_names).handle_filter(
            feature_filter, feature_re)
        kept_names = np.array(filtered.feature_names)
        state_weights = state_weights[:, kept_columns]

    if targets is None:
        targets = sorted_for_ner(crf.classes_)

    label_ids, labels = zip(
        *get_target_display_names(crf.classes_, target_names, targets))
    transition_weights = filter_transition_coefs(transition_weights, label_ids)

    target_explanations = []
    for label_id, label in zip(label_ids, labels):
        weights = get_top_features(kept_names, state_weights[label_id], top)
        target_explanations.append(
            TargetExplanation(target=label, feature_weights=weights))

    return Explanation(
        targets=target_explanations,
        transition_features=TransitionFeatureWeights(
            class_names=labels,
            coef=transition_weights,
        ),
        estimator=repr(crf),
        method='CRF',
    )
def test_feature_names_filter_by_re():
    """FeatureNames.filtered_by_re() keeps the names matching a regex."""
    # Dense list of names.
    names = FeatureNames(['one', 'two', 'twenty-two'])
    kept, kept_idx = names.filtered_by_re('two')
    assert kept_idx == [1, 2]
    assert list(kept) == ['two', 'twenty-two']

    # Sparse (dict) names with a bias and an anchored pattern; the bias
    # name and the template are expected to be carried over.
    names = FeatureNames(
        {1: 'two', 3: 'twenty-two', 5: 'two-thirds'},
        unkn_template='%d',
        n_features=6,
        bias_name='foo',
    )
    kept, kept_idx = names.filtered_by_re('^two')
    assert kept_idx == [1, 5]
    assert kept.bias_name == 'foo'
    assert kept.unkn_template == '%d'
    assert list(kept) == ['two', 'two-thirds', 'foo']

    # Only template-generated names: no indices are expected back.
    names = FeatureNames(unkn_template='x%d', n_features=6)
    kept, kept_idx = names.filtered_by_re('x')
    assert kept_idx == []
def get_feature_names(self, always_signed=True, always_positive=False):
    """
    Return a FeatureNames instance for the known columns.

    Each known column id maps to a list of ``{'name': ..., 'sign': ...}``
    dicts, one per term reported for that column. With ``always_positive``
    every sign is 1. Otherwise the reported signs are used; if
    ``always_signed`` is False they are negated for the columns where
    ``_invert_signs(signs)`` is true.
    """
    self.recalculate_attributes()
    # Parallel sequences: column ids, and the names / signs of their terms.
    column_ids, term_names, term_signs = self._get_collision_info()

    names_by_column = {}
    for col_id, names, signs in zip(column_ids, term_names, term_signs):
        if always_positive:
            entries = [{'name': name, 'sign': 1} for name in names]
        else:
            # _invert_signs is only consulted when signs may be flipped.
            flip = not always_signed and _invert_signs(signs)
            if flip:
                signs = [-sign for sign in signs]
            entries = [{'name': name, 'sign': sign}
                       for name, sign in zip(names, signs)]
        names_by_column[col_id] = entries

    return FeatureNames(
        names_by_column,
        n_features=self.n_features,
        unkn_template=self.unkn_template)
assert FN(['one', 'two', 'three'])[1:] == ['two', 'three'] assert FN({1: 'one'}, n_features=3, unkn_template='x%d')[:] \ == ['x0', 'one', 'x2'] assert FN({1: 'one'}, n_features=3, unkn_template='x%d', bias_name='bias')[-3:] \ == ['one', 'x2', 'bias'] assert FN(['one', 'two', 'three'], bias_name='bias')[-1:] == ['bias'] assert FN(np.array(['one', 'two', 'three']), bias_name='bias')[-1:] \ == ['bias'] assert FN(np.array(['one', 'two', 'three']), bias_name='bias')[-2:] \ == ['three', 'bias'] assert list(FN(np.array(['one', 'two', 'three']))[-2:]) == ['two', 'three'] @pytest.mark.parametrize(['feature_names'], [ [FeatureNames(['x1', 'x2', 'x3'])], [FeatureNames(['x1', 'x2', 'x3'], bias_name='<BIAS>')], [FeatureNames(np.array(['x1', 'x2', 'x3']))], [FeatureNames({ 0: 'x1', 1: 'x2' })], [FeatureNames(n_features=5, unkn_template='%d')], ]) def test_add_feature(feature_names): len_before = len(feature_names) storage = feature_names.feature_names new_feature = 'new' new_idx = feature_names.add_feature(new_feature) assert len(feature_names) == len_before + 1 assert feature_names[new_idx] == new_feature