Пример #1
0
def aux_feat_imp_SFI(feat_names, clf, X, cont, scoring, cv_gen):
    """
    Single Feature Importance (SFI): a cross-sectional, out-of-sample (OOS)
    predictive-importance method.

    Computes the OOS performance score of each feature in isolation.
    Substitution effects can lead us to discard important features that
    happen to be redundant — generally not a problem for prediction, but
    misleading when trying to understand, improve, or simplify a model.
    Unlike MDI and MDA, SFI has no substitution effects, since only one
    feature is considered at a time; it works with any classifier (not just
    tree-based ones) and is not limited to accuracy as the sole score.

    Parameters
    ----------
    feat_names : iterable
        Column labels of ``X`` to score one at a time.
    clf : classifier
        Estimator passed through to ``cv_score``.
    X : pd.DataFrame
        Feature matrix.
    cont : pd.DataFrame
        Must contain columns 'bin' (labels) and 'w' (sample weights).
    scoring : str
        Scoring method understood by ``cv_score``.
    cv_gen : cross-validation generator
        E.g. a PurgedKFold instance.

    Returns
    -------
    pd.DataFrame
        Indexed by feature name, with columns 'mean' (mean CV score of the
        single-feature model) and 'std' (standard error of that mean).
    """
    imp = pd.DataFrame(columns=['mean', 'std'])
    for feat_name in feat_names:
        # Score a model trained on this single feature alone.
        print("Testing feature: {}".format(feat_name))
        scores = cv_score(clf,
                          X=X[[feat_name]],
                          y=cont['bin'],
                          sample_weight=cont['w'],
                          scoring=scoring,
                          cv_gen=cv_gen)
        imp.loc[feat_name, 'mean'] = scores.mean()
        # BUG FIX: the standard error of the mean CV score is std / sqrt(n),
        # not std * sqrt(n) (cf. Lopez de Prado, AFML, Snippet 8.4, which
        # uses ``std() * shape[0] ** -0.5``). The original multiplied,
        # inflating the reported uncertainty by a factor of n.
        imp.loc[feat_name, 'std'] = scores.std() / np.sqrt(scores.shape[0])
        print("Finished feature: {}".format(feat_name))
    print("completed aux imp SFI")
    return imp
Пример #2
0
def feat_importance(X,
                    cont,
                    clf=None,
                    n_estimators=1000,
                    n_splits=10,
                    max_samples=1.,
                    num_threads=24,
                    pct_embargo=0.,
                    scoring='accuracy',
                    method='SFI',
                    min_w_leaf=0.,
                    **kwargs):
    """
    Fit a classifier and compute feature importance with purged cross-validation.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    cont : pd.DataFrame
        Must contain columns 'bin' (labels), 'w' (sample weights) and
        't1' (label end-times used for purging/embargo).
    clf : classifier, optional
        Estimator to use; when None, a bagged decision-tree ensemble is built.
    n_estimators : int
        Number of trees in the default bagging ensemble.
    n_splits : int
        Number of cross-validation folds.
    max_samples : float
        ``max_samples`` for the default BaggingClassifier.
    num_threads : int
        Thread count; >1 selects ``n_jobs=-1`` (all cores) for fitting and
        is also the parallelism level for the SFI feature loop.
    pct_embargo : float
        Embargo fraction for purged cross-validation.
    scoring : str
        Scoring method passed to ``cv_score``.
    method : str
        One of 'MDI', 'MDA', or 'SFI'.
    min_w_leaf : float
        ``min_weight_fraction_leaf`` for the default decision tree.
    **kwargs
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    (imp, oob, oos) : tuple
        ``imp`` — feature-importance DataFrame; ``oob`` — out-of-bag score
        (None when the classifier exposes no ``oob_score_``); ``oos`` —
        out-of-sample CV score(s).

    Raises
    ------
    ValueError
        If ``method`` is not one of 'MDI', 'MDA', 'SFI'.
    """
    n_jobs = (-1 if num_threads > 1 else 1)
    # Build the default classifier: bagged single-feature entropy trees,
    # which keeps MDI importances comparable across features.
    if clf is None:
        base_clf = DecisionTreeClassifier(criterion='entropy',
                                          max_features=1,
                                          class_weight='balanced',
                                          min_weight_fraction_leaf=min_w_leaf)
        clf = BaggingClassifier(base_estimator=base_clf,
                                n_estimators=n_estimators,
                                max_features=1.,
                                max_samples=max_samples,
                                oob_score=True,
                                n_jobs=n_jobs)
    fit_clf = clf.fit(X, cont['bin'], sample_weight=cont['w'].values)
    # Out-of-bag score is only available on bagging-style classifiers.
    oob = getattr(fit_clf, 'oob_score_', None)
    # The CV score uses true out-of-sample (purged/embargoed) training sets.
    if method == 'MDI':
        imp = feat_imp_MDI(fit_clf, feat_names=X.columns)
        oos = cv_score(clf,
                       X=X,
                       y=cont['bin'],
                       n_splits=n_splits,
                       sample_weight=cont['w'],
                       t1=cont['t1'],
                       pct_embargo=pct_embargo,
                       scoring=scoring).mean()
    elif method == 'MDA':
        imp, oos = feat_imp_MDA(clf,
                                X=X,
                                y=cont['bin'],
                                n_splits=n_splits,
                                sample_weight=cont['w'],
                                t1=cont['t1'],
                                pct_embargo=pct_embargo,
                                scoring=scoring)
    elif method == 'SFI':
        cv_gen = PurgedKFold(n_splits=n_splits,
                             t1=cont['t1'],
                             pct_embargo=pct_embargo)
        oos = cv_score(clf,
                       X=X,
                       y=cont['bin'],
                       sample_weight=cont['w'],
                       scoring=scoring,
                       cv_gen=cv_gen)
        # BUG FIX: was a hard-coded ``clf.n_jobs = 24``, ignoring the
        # parallelism already derived from ``num_threads`` above.
        clf.n_jobs = n_jobs
        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
                            num_threads,
                            clf=clf,
                            X=X,
                            cont=cont,
                            scoring=scoring,
                            cv_gen=cv_gen)
    else:
        # BUG FIX: previously an unknown method fell through every branch
        # and crashed at the return with UnboundLocalError on ``imp``.
        raise ValueError("Unknown feature-importance method: {!r} "
                         "(expected 'MDI', 'MDA', or 'SFI')".format(method))
    return imp, oob, oos