Example #1
    def test_plot_feature_importance(self):
        """
        Test plot_feature_importance function
        """

        sb_clf, cv_gen = self._prepare_clf_data_set(oob_score=True)
        oos_score = ml_cross_val_score(sb_clf,
                                       self.X_train,
                                       self.y_train_clf,
                                       cv_gen=cv_gen,
                                       sample_weight_score=None,
                                       scoring=accuracy_score).mean()

        sb_clf.fit(self.X_train, self.y_train_clf)

        mdi_feat_imp = mean_decrease_impurity(sb_clf, self.X_train.columns)
        plot_feature_importance(mdi_feat_imp,
                                oob_score=sb_clf.oob_score_,
                                oos_score=oos_score)
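        # Same plot again, but written to disk via save_fig/output_path (cleaned up below).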
        plot_feature_importance(mdi_feat_imp,
                                oob_score=sb_clf.oob_score_,
                                oos_score=oos_score,
                                save_fig=True,
                                output_path='test.png')

        os.remove('test.png')
Example #2
def feature_importance_sfi(clf, X, y, cv_gen, sample_weight=None, scoring='neg_log_loss'):
    """
    Snippet 8.4, page 118. Implementation of SFI

    This function generates Single Feature Importance based on OOS score (using cross-validation object)

    :param clf: (sklearn.ClassifierMixin): any sklearn classifier
    :param X: (pd.DataFrame): train set features
    :param y: (pd.DataFrame, np.array): train set labels
    :param cv_gen: (cross_validation.PurgedKFold): cross-validation object
    :param sample_weight: (np.array): sample weights, if None equal to ones
    :param scoring: (str): scoring function used to determine importance
    :return: (pd.DataFrame): mean and std feature importance
    """
    feature_names = X.columns
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))

    imp = pd.DataFrame(columns=['mean', 'std'])
    for feat in feature_names:
        feat_cross_val_scores = ml_cross_val_score(clf, X=X[[feat]], y=y, sample_weight=sample_weight,
                                                   scoring=scoring, cv_gen=cv_gen)
        imp.loc[feat, 'mean'] = feat_cross_val_scores.mean()
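        # std * n^-0.5 is the standard error of the mean CV score, not the raw std.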
        imp.loc[feat, 'std'] = feat_cross_val_scores.std() * feat_cross_val_scores.shape[0] ** -.5
    return imp
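
# A minimal usage sketch for feature_importance_sfi, not from the original source.
# Assumptions: np/pd are imported as usual, ml_cross_val_score from the same module
# is importable, the string scoring API ('neg_log_loss') of this snippet is in
# place, and a plain sklearn TimeSeriesSplit can stand in for PurgedKFold as
# cv_gen (a TimeSeriesSplit is passed the same way in the tests further below).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit

# Synthetic feature matrix and labels, for illustration only.
X_arr, y_arr = make_classification(n_samples=300, n_features=4, n_informative=2, random_state=0)
X = pd.DataFrame(X_arr, columns=['feat_0', 'feat_1', 'feat_2', 'feat_3'])
y = pd.Series(y_arr)

clf = RandomForestClassifier(n_estimators=50, random_state=0)
sfi_imp = feature_importance_sfi(clf, X, y, cv_gen=TimeSeriesSplit(n_splits=3))
print(sfi_imp.sort_values('mean', ascending=False))  # highest (least negative) score first
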
    def test_ml_cross_val_score_00_exception(self):
        """
        Test the ml_cross_val_score function with an artificial dataset. In this case we pass an unsupported
        scoring method, jaccard_score, and expect a ValueError.
        """
        info_sets, records, labels, sample_weights, decision_tree = self._test_ml_cross_val_score__data()
        cv_gen = PurgedKFold(samples_info_sets=info_sets,
                             n_splits=3,
                             pct_embargo=0.01)
        with self.assertRaises(ValueError):
            ml_cross_val_score(
                classifier=decision_tree,
                X=records,
                y=labels,
                sample_weight=sample_weights.values,
                scoring='jaccard_score',
                cv_gen=cv_gen,
            )
Example #4
def single_feature_importance(clf, X, y, cv_gen, sample_weight_train=None, sample_weight_score=None, scoring=log_loss):
    """
    Snippet 8.4, page 118. Implementation of SFI

    Substitution effects can lead us to discard important features that happen to be redundant. This is not generally a
    problem in the context of prediction, but it could lead us to wrong conclusions when we are trying to understand,
    improve, or simplify a model. For this reason, the following single feature importance method can be a good
    complement to MDI and MDA.

    Single feature importance (SFI) is a cross-section predictive-importance (out-of-sample) method. It computes the
    OOS performance score of each feature in isolation. A few considerations:

    * This method can be applied to any classifier, not only tree-based classifiers.
    * SFI is not limited to accuracy as the sole performance score.
    * Unlike MDI and MDA, no substitution effects take place, since only one feature is taken into consideration at a time.
    * Like MDA, it can conclude that all features are unimportant, because performance is evaluated via OOS CV.

    The main limitation of SFI is that a classifier with two features can perform better than the bagging of two
    single-feature classifiers. For example, (1) feature B may be useful only in combination with feature A;
    or (2) feature B may be useful in explaining the splits from feature A, even if feature B alone is inaccurate.
    In other words, joint effects and hierarchical importance are lost in SFI. One alternative would be to compute the
    OOS performance score from subsets of features, but that calculation will become intractable as more features are
    considered.

    :param clf: (sklearn.ClassifierMixin): Any sklearn classifier.
    :param X: (pd.DataFrame): Train set features.
    :param y: (pd.DataFrame, np.array): Train set labels.
    :param cv_gen: (cross_validation.PurgedKFold): Cross-validation object.
    :param sample_weight_train: A numpy array of sample weights used to train the model for each record in the dataset.
    :param sample_weight_score: A numpy array of sample weights used to evaluate the model quality.
    :param scoring: (function): Scoring function used to determine importance.
    :return: (pd.DataFrame): Mean and standard deviation of feature importance.
    """
    feature_names = X.columns
    if sample_weight_train is None:
        sample_weight_train = np.ones((X.shape[0],))

    if sample_weight_score is None:
        sample_weight_score = np.ones((X.shape[0],))

    imp = pd.DataFrame(columns=['mean', 'std'])
    for feat in feature_names:
        feat_cross_val_scores = ml_cross_val_score(clf, X=X[[feat]], y=y, sample_weight_train=sample_weight_train,
                                                   sample_weight_score=sample_weight_score,
                                                   scoring=scoring, cv_gen=cv_gen)
        imp.loc[feat, 'mean'] = feat_cross_val_scores.mean()
        # pylint: disable=unsubscriptable-object
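        # As above, scale by n^-0.5 to report the standard error of the mean CV score.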
        imp.loc[feat, 'std'] = feat_cross_val_scores.std() * feat_cross_val_scores.shape[0] ** -.5
    return imp
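
# A similar hedged sketch for this variant's signature: sample weights split into
# train/score arrays, and a scoring *function* (sklearn's log_loss, the default)
# rather than a string. Same assumptions as the Example #2 sketch: synthetic data,
# np/pd imported, and TimeSeriesSplit standing in for PurgedKFold.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import TimeSeriesSplit

X_arr, y_arr = make_classification(n_samples=300, n_features=4, random_state=1)
X = pd.DataFrame(X_arr, columns=['feat_0', 'feat_1', 'feat_2', 'feat_3'])
y = pd.Series(y_arr)

clf = RandomForestClassifier(n_estimators=50, random_state=1)
imp = single_feature_importance(clf, X, y,
                                cv_gen=TimeSeriesSplit(n_splits=3),
                                sample_weight_train=np.ones(len(X)),  # explicit, equal to the default
                                sample_weight_score=np.ones(len(X)),
                                scoring=log_loss)
print(imp.sort_values('mean', ascending=False))
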
    def test_ml_cross_val_score_04_sw(self):
        """
        Test the ml_cross_val_score function with an artificial dataset.
        """
        info_sets, records, labels, _, decision_tree = self._test_ml_cross_val_score__data()
        cv_gen = PurgedKFold(samples_info_sets=info_sets,
                             n_splits=3,
                             pct_embargo=0.01)
        scores = ml_cross_val_score(
            classifier=decision_tree,
            X=records,
            y=labels,
            sample_weight=None,
            scoring=accuracy_score,
            cv_gen=cv_gen,
        )
        self.log(f"score1= {scores}")

        should_be = np.array([0.5, 0.4984984984984985, 0.4984984984984985])
        self.assertTrue(np.array_equal(scores, should_be),
                        "score lists don't match")
    def test_ml_cross_val_score_03_other_cv_gen(self):
        """
        Test the ml_cross_val_score function with an artificial dataset.
        """
        _, records, labels, sample_weights, decision_tree = self._test_ml_cross_val_score__data()
        scores = ml_cross_val_score(
            classifier=decision_tree,
            X=records,
            y=labels,
            sample_weight=sample_weights.values,
            scoring=log_loss,
            cv_gen=TimeSeriesSplit(max_train_size=None, n_splits=3),
        )
        self.log(f"scores= {scores}")

        should_be = np.array(
            [-17.520701311460694, -18.25536255165772, -16.964650471071668])
        self.assertTrue(
            np.array_equal(scores, should_be),
            "score lists don't match")
Example #7
    def test_ml_cross_val_score_02_neg_log_loss(self):
        """
        Test the ml_cross_val_score function with an artificial dataset.
        """
        info_sets, records, labels, sample_weights, decision_tree = self._test_ml_cross_val_score__data()
        cv_gen = PurgedKFold(samples_info_sets=info_sets, n_splits=3, pct_embargo=0.01)
        scores = ml_cross_val_score(
            classifier=decision_tree,
            X=records,
            y=labels,
            sample_weight_train=sample_weights.values,
            sample_weight_score=None,
            scoring=log_loss,
            cv_gen=cv_gen,
        )
        self.log(f"scores= {scores}")

        should_be = np.array([-17.26939, -17.32125, -17.32125])
        self.assertTrue(
            np.allclose(scores, should_be),
            "score lists don't match"
        )
Example #8
    def test_ml_cross_val_score_01_accuracy(self):
        """
        Test the ml_cross_val_score function with an artificial dataset.
        """
        info_sets, records, labels, sample_weights, decision_tree = self._test_ml_cross_val_score__data()
        cv_gen = PurgedKFold(samples_info_sets=info_sets, n_splits=3, pct_embargo=0.01)
        scores = ml_cross_val_score(
            classifier=decision_tree,
            X=records,
            y=labels,
            sample_weight_train=sample_weights.values,
            sample_weight_score=sample_weights.values,
            scoring=accuracy_score,
            cv_gen=cv_gen,
        )
        self.log(f"score1= {scores}")

        should_be = np.array([0.5186980141893885, 0.4876916232189882, 0.4966185791847402])
        self.assertTrue(
            np.array_equal(scores, should_be),
            "score lists don't match"
        )
    def test_ml_cross_val_score_02_neg_log_loss(self):
        """
        Test the ml_cross_val_score function with an artificial dataset.
        """
        info_sets, records, labels, sample_weights, decision_tree = self._test_ml_cross_val_score__data()
        cv_gen = PurgedKFold(samples_info_sets=info_sets,
                             n_splits=3,
                             pct_embargo=0.01)
        scores = ml_cross_val_score(
            classifier=decision_tree,
            X=records,
            y=labels,
            sample_weight=sample_weights.values,
            scoring=log_loss,
            cv_gen=cv_gen,
        )
        self.log(f"scores= {scores}")

        should_be = np.array(
            [-16.623581666339184, -17.694504470879014, -17.386178334890698])
        self.assertTrue(np.array_equal(scores, should_be),
                        "score lists don't match")