Example #1
    def test_plot_feature_importance(self):
        """
        Test the plot_feature_importance function
        """

        sb_clf, cv_gen = self._prepare_clf_data_set(oob_score=True)
        oos_score = ml_cross_val_score(sb_clf,
                                       self.X_train,
                                       self.y_train_clf,
                                       cv_gen=cv_gen,
                                       sample_weight_score=None,
                                       scoring=accuracy_score).mean()

        sb_clf.fit(self.X_train, self.y_train_clf)

        mdi_feat_imp = mean_decrease_impurity(sb_clf, self.X_train.columns)
        plot_feature_importance(mdi_feat_imp,
                                oob_score=sb_clf.oob_score_,
                                oos_score=oos_score)
        plot_feature_importance(mdi_feat_imp,
                                oob_score=sb_clf.oob_score_,
                                oos_score=oos_score,
                                save_fig=True,
                                output_path='test.png')

        os.remove('test.png')
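
For reference, a minimal sketch of what a plot helper like plot_feature_importance typically does: horizontal bars of the 'mean' importance column with the 'std' column as error bars, the OOB/OOS scores in the title, and an optional save-to-file path. The helper below is hypothetical (hence the _sketch suffix), not the library's implementation; it only assumes the 'mean'/'std' DataFrame layout produced by mean_decrease_impurity.

import matplotlib.pyplot as plt

def plot_importance_sketch(imp_df, oob_score=None, oos_score=None,
                           save_fig=False, output_path=None):
    # Sort ascending so the most important feature lands at the top of the chart.
    imp_df = imp_df.sort_values('mean')
    fig, ax = plt.subplots(figsize=(10, max(2, imp_df.shape[0] / 5)))
    ax.barh(imp_df.index, imp_df['mean'], xerr=imp_df['std'], alpha=0.25)
    ax.set_title('OOB score: {}   OOS score: {}'.format(oob_score, oos_score))
    if save_fig:
        fig.savefig(output_path)
    else:
        plt.show()
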
Example #2
    def test_orthogonal_features(self):
        """
        Test orthogonal features: PCA feature extraction and MDI vs. PCA importance analysis
        """

        pca_features = get_orthogonal_features(self.X)

        # PCA features should have a mean of 0
        self.assertAlmostEqual(np.mean(pca_features[:, 2]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 5]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 6]), 0, delta=1e-7)

        # Check the std of particular PCA components
        self.assertAlmostEqual(np.std(pca_features[:, 1]), 1.3813, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 3]), 1.0255, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 4]), 1.0011, delta=0.2)

        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
        pca_corr_res = feature_pca_analysis(self.X, mdi_feat_imp)

        # Check correlation metrics results
        self.assertAlmostEqual(pca_corr_res['Weighted_Kendall_Rank'][0],
                               0.7424,
                               delta=1e-1)

        # Check that the requested number of PCA features is returned
        pca_ten_features = get_orthogonal_features(self.X, num_features=10)
        self.assertEqual(pca_ten_features.shape[1], 10)

        pca_five_features = get_orthogonal_features(self.X, num_features=5)
        self.assertEqual(pca_five_features.shape[1], 5)
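
The assertions above follow from the standard construction of orthogonal features: standardize X, then project it onto the principal components, so every output column has (numerically) zero mean and a standard deviation equal to the square root of its eigenvalue. A minimal sketch of that construction, plus the weighted Kendall rank comparison that feature_pca_analysis reports; both helpers are assumptions about the idea, not the library's code.

import numpy as np
from scipy.stats import weightedtau
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def orthogonal_features_sketch(x_df, num_features=None):
    # Standardize, then rotate onto the principal axes; limiting
    # num_features keeps only the leading components.
    x_std = StandardScaler().fit_transform(x_df)
    return PCA(n_components=num_features).fit_transform(x_std)

def pca_rank_corr_sketch(x_df, feat_imp):
    # Proxy PCA importance per original feature: absolute loadings
    # weighted by each component's explained variance.
    x_std = StandardScaler().fit_transform(x_df)
    pca = PCA().fit(x_std)
    pca_imp = np.abs(pca.components_.T) @ pca.explained_variance_
    # Weighted Kendall's tau stresses agreement among the top-ranked features.
    return weightedtau(feat_imp['mean'].loc[x_df.columns].values, pca_imp)[0]
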
Example #3
    def test_plot_feature_importance(self):
        """
        Test the plot_feature_importance function
        """
        oos_score = cross_val_score(self.bag_clf, self.X, self.y, cv=self.cv_gen, scoring='accuracy').mean()

        mdi_feat_imp = mean_decrease_impurity(self.bag_clf, self.X.columns)
        plot_feature_importance(mdi_feat_imp, oob_score=self.bag_clf.oob_score_, oos_score=oos_score)
        plot_feature_importance(mdi_feat_imp, oob_score=self.bag_clf.oob_score_, oos_score=oos_score,
                                save_fig=True, output_path='test.png')
        os.remove('test.png')
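
Note: oob_score_ only exists on a bagging ensemble that was constructed with oob_score=True, which is why the fixtures feeding these plot tests enable out-of-bag scoring up front; without it, the attribute access raises.
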
Example #4
    def test_orthogonal_features(self):
        """
        Test orthogonal features: PCA feature extraction and MDI vs. PCA importance analysis
        """

        # Init classifiers
        clf_base = RandomForestClassifier(n_estimators=1,
                                          criterion='entropy',
                                          bootstrap=False,
                                          class_weight='balanced_subsample')

        sb_clf = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf_base,
            max_features=1.0,
            n_estimators=100,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.price_bars_trim,
            oob_score=True,
            random_state=1)

        pca_features = get_orthogonal_features(self.X_train)

        # PCA features should have a mean of 0
        self.assertAlmostEqual(np.mean(pca_features[:, 2]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 5]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 6]), 0, delta=1e-7)

        # Check the std of particular PCA components
        self.assertAlmostEqual(np.std(pca_features[:, 1]), 1.499, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 3]), 1.047, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 4]), 0.948, delta=0.2)

        sb_clf.fit(self.X_train, self.y_train_clf)
        mdi_feat_imp = mean_decrease_impurity(sb_clf, self.X_train.columns)
        pca_corr_res = feature_pca_analysis(self.X_train, mdi_feat_imp)

        # Check correlation metrics results
        self.assertAlmostEqual(pca_corr_res['Weighted_Kendall_Rank'][0],
                               0.0677,
                               delta=1e-1)
    def test_orthogonal_features(self):
        """
        Test orthogonal features: PCA feature extraction and MDI vs. PCA importance analysis
        """

        pca_features = get_orthogonal_features(self.X)

        # PCA features should have a mean of 0
        self.assertAlmostEqual(np.mean(pca_features[:, 2]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 5]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 6]), 0, delta=1e-7)

        # Check the std of particular PCA components
        self.assertAlmostEqual(np.std(pca_features[:, 1]), 1.2503, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 3]), 1.0292, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 4]), 1.0134, delta=0.2)

        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
        pca_corr_res = feature_pca_analysis(self.X, mdi_feat_imp)

        # Check correlation metrics results
        self.assertAlmostEqual(pca_corr_res['Weighted_Kendall_Rank'][0],
                               -0.0724,
                               delta=1e-1)
    def test_feature_importance(self):
        """
        Test feature importance: MDI, MDA, and SFI
        """

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)

        # MDA feature importance
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        mda_feat_imp_f1 = mean_decrease_accuracy(self.bag_clf,
                                                 self.X,
                                                 self.y,
                                                 self.cv_gen,
                                                 scoring=f1_score)
        # SFI feature importance
        sfi_feat_imp_log_loss = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        sfi_feat_imp_f1 = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=f1_score)

        # MDI assertions
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        self.assertAlmostEqual(mdi_feat_imp.loc['I_1', 'mean'],
                               0.47075,
                               delta=0.01)
        self.assertAlmostEqual(mdi_feat_imp.loc['I_0', 'mean'],
                               0.09291,
                               delta=0.01)
        # Redundant feature
        self.assertAlmostEqual(mdi_feat_imp.loc['R_0', 'mean'],
                               0.07436,
                               delta=0.01)
        # Noisy feature
        self.assertAlmostEqual(mdi_feat_imp.loc['N_0', 'mean'],
                               0.01798,
                               delta=0.01)

        # MDA(log_loss) assertions
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['I_1', 'mean'],
                               0.59684,
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['R_0', 'mean'],
                               0.13177,
                               delta=0.1)

        # MDA(f1) assertions
        self.assertAlmostEqual(mda_feat_imp_f1.loc['I_1', 'mean'],
                               0.52268,
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_f1.loc['I_2', 'mean'],
                               0.29533,
                               delta=0.1)

        # SFI(log_loss) assertions
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['I_0', 'mean'],
                               -6.50385,
                               delta=0.1)
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['R_0', 'mean'],
                               -3.27282,
                               delta=0.1)

        # SFI(f1) assertions
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['I_0', 'mean'],
                               0.48530,
                               delta=0.1)
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['I_1', 'mean'],
                               0.78778,
                               delta=0.1)
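
The MDI numbers asserted in these tests come from averaging impurity-based importances across the trees of the ensemble and normalizing so the means sum to 1 (hence the mdi_feat_imp['mean'].sum() check). A minimal sketch of that recipe, in the spirit of the well-known MDI snippet from Advances in Financial Machine Learning; an assumption about the approach, not the library's exact code.

import numpy as np
import pandas as pd

def mdi_sketch(fitted_ensemble, feature_names):
    # One row of impurity importances per base estimator.
    imp = pd.DataFrame(
        [est.feature_importances_ for est in fitted_ensemble.estimators_],
        columns=feature_names)
    # Exact zeros mean the feature was never used by that estimator
    # (common with max_features=1), so treat them as missing.
    imp = imp.replace(0, np.nan)
    out = pd.concat({'mean': imp.mean(),
                     'std': imp.std() * imp.shape[0] ** -0.5}, axis=1)
    return out / out['mean'].sum()  # normalize: means now sum to 1
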
Example #7
    def test_feature_importance(self):
        """
        Test feature importance: MDI, MDA, and SFI
        """
        sb_clf, cv_gen = self._prepare_clf_data_set(oob_score=False)

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(sb_clf, self.X_train.columns)

        # MDA feature importance
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            sb_clf,
            self.X_train,
            self.y_train_clf,
            cv_gen,
            sample_weight_train=np.ones((self.X_train.shape[0], )),
            sample_weight_score=np.ones((self.X_train.shape[0], )))
        mda_feat_imp_f1 = mean_decrease_accuracy(sb_clf,
                                                 self.X_train,
                                                 self.y_train_clf,
                                                 cv_gen,
                                                 scoring=f1_score)
        # SFI feature importance
        # Take only 5 features for a faster test run
        sfi_feat_imp_log_loss = single_feature_importance(
            sb_clf,
            self.X_train[self.X_train.columns[:5]],
            self.y_train_clf,
            cv_gen=cv_gen,
            sample_weight_train=np.ones((self.X_train.shape[0], )))
        sfi_feat_imp_f1 = single_feature_importance(
            sb_clf,
            self.X_train[self.X_train.columns[:5]],
            self.y_train_clf,
            cv_gen=cv_gen,
            scoring=f1_score,
            sample_weight_score=np.ones((self.X_train.shape[0], )))

        # MDI assertions
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        self.assertAlmostEqual(mdi_feat_imp.loc['label_prob_0.1', 'mean'],
                               0.19598,
                               delta=0.01)
        self.assertAlmostEqual(mdi_feat_imp.loc['label_prob_0.2', 'mean'],
                               0.164,
                               delta=0.01)
        # Noisy feature
        self.assertAlmostEqual(mdi_feat_imp.loc['label_prob_0.1_sma_5',
                                                'mean'],
                               0.08805,
                               delta=0.01)

        # MDA(log_loss) assertions
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['label_prob_0.1',
                                                         'mean'],
                               0.23685,
                               delta=10)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['label_prob_0.2',
                                                         'mean'],
                               0.3222,
                               delta=10)

        # MDA(f1) assertions
        self.assertAlmostEqual(mda_feat_imp_f1.loc['label_prob_0.1', 'mean'],
                               0.25,
                               delta=3)
        self.assertAlmostEqual(mda_feat_imp_f1.loc['label_prob_0.2', 'mean'],
                               0.3,
                               delta=3)

        # SFI(log_loss) assertions
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['label_prob_0.1',
                                                         'mean'],
                               -2.14,
                               delta=1)
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['label_prob_0.2',
                                                         'mean'],
                               -2.15,
                               delta=1)

        # SFI(f1) assertions
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['label_prob_0.1', 'mean'],
                               0.81,
                               delta=1)
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['label_prob_0.2', 'mean'],
                               0.74,
                               delta=1)
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['label_prob_0.5_sma_2',
                                                   'mean'],
                               0.224,
                               delta=1)
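
MDA scores each CV fold once as a baseline and once per feature with that column permuted; the importance is the score decay attributable to the feature. A minimal sketch of the loop, using a plain KFold where the tests above use a purged, embargoed CV generator; the helper is illustrative, not the library's implementation.

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

def mda_sketch(clf, X, y, n_splits=4):
    fold_scores, col_scores = [], {col: [] for col in X.columns}
    for train, test in KFold(n_splits=n_splits).split(X):
        fit = clf.fit(X.iloc[train], y.iloc[train])
        prob = fit.predict_proba(X.iloc[test])
        # Negative log-loss as the baseline score for this fold.
        fold_scores.append(-log_loss(y.iloc[test], prob, labels=fit.classes_))
        for col in X.columns:
            x_test = X.iloc[test].copy()
            # Permute one column to break its link with the labels.
            x_test[col] = np.random.permutation(x_test[col].values)
            prob = fit.predict_proba(x_test)
            col_scores[col].append(-log_loss(y.iloc[test], prob, labels=fit.classes_))
    # Importance per feature = baseline score minus shuffled score, per fold.
    decay = pd.DataFrame(col_scores).rsub(pd.Series(fold_scores), axis=0)
    return pd.concat({'mean': decay.mean(),
                      'std': decay.std() * decay.shape[0] ** -0.5}, axis=1)
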
    def test_feature_importance(self):
        """
        Test feature importance: MDI, MDA, SFI, and clustered feature importance (CFI)
        """
        # Get the clustered subsets for CFI, with the number of clusters selected via the ONC algorithm
        clustered_subsets_linear = get_feature_clusters(
            self.X,
            dependence_metric='linear',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None)
        # Also verify that when the number of clusters equals the number of features,
        # the results match the plain MDI & MDA outputs
        feature_subset_single = [[x] for x in self.X.columns]

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
        # Clustered MDI feature importance
        clustered_mdi = mean_decrease_impurity(
            self.fit_clf,
            self.X.columns,
            clustered_subsets=clustered_subsets_linear)
        mdi_cfi_single = mean_decrease_impurity(
            self.fit_clf,
            self.X.columns,
            clustered_subsets=feature_subset_single)

        # MDA feature importance
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=log_loss)

        mda_feat_imp_f1 = mean_decrease_accuracy(self.bag_clf,
                                                 self.X,
                                                 self.y,
                                                 self.cv_gen,
                                                 scoring=f1_score)
        # Clustered MDA feature importance
        clustered_mda = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            clustered_subsets=clustered_subsets_linear)
        mda_cfi_single = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            clustered_subsets=feature_subset_single)

        # SFI feature importance
        sfi_feat_imp_log_loss = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        sfi_feat_imp_f1 = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=f1_score)

        # MDI assertions
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        self.assertAlmostEqual(mdi_feat_imp.loc['I_1', 'mean'],
                               0.48058,
                               delta=0.01)
        self.assertAlmostEqual(mdi_feat_imp.loc['I_0', 'mean'],
                               0.08214,
                               delta=0.01)
        # Redundant feature
        self.assertAlmostEqual(mdi_feat_imp.loc['R_0', 'mean'],
                               0.06511,
                               delta=0.01)
        # Noisy feature
        self.assertAlmostEqual(mdi_feat_imp.loc['N_0', 'mean'],
                               0.02229,
                               delta=0.01)

        # MDA(log_loss) assertions
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['I_1', 'mean'],
                               0.65522,
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['R_0', 'mean'],
                               0.00332,
                               delta=0.1)

        # MDA(f1) assertions
        self.assertAlmostEqual(mda_feat_imp_f1.loc['I_1', 'mean'],
                               0.47751,
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_f1.loc['I_2', 'mean'],
                               0.33617,
                               delta=0.1)

        # SFI(log_loss) assertions
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['I_0', 'mean'],
                               -6.39442,
                               delta=0.1)
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['R_0', 'mean'],
                               -5.04315,
                               delta=0.1)

        # SFI(f1) assertions
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['I_0', 'mean'],
                               0.48915,
                               delta=0.1)
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['I_1', 'mean'],
                               0.78443,
                               delta=0.1)

        # Clustered MDI assertions
        self.assertAlmostEqual(clustered_mdi.loc['R_0', 'mean'],
                               0.01912,
                               delta=0.1)
        self.assertAlmostEqual(clustered_mdi.loc['I_0', 'mean'],
                               0.06575,
                               delta=0.1)

        # Clustered MDA (log_loss) assertions
        self.assertAlmostEqual(clustered_mda.loc['I_0', 'mean'],
                               0.04154,
                               delta=0.1)
        self.assertAlmostEqual(clustered_mda.loc['R_0', 'mean'],
                               0.02940,
                               delta=0.1)

        # Check that CFI with one cluster per feature reproduces the plain MDI & MDA results
        self.assertAlmostEqual(mdi_feat_imp.loc['I_1', 'mean'],
                               mdi_cfi_single.loc['I_1', 'mean'],
                               delta=0.1)
        self.assertAlmostEqual(mdi_feat_imp.loc['R_0', 'mean'],
                               mdi_cfi_single.loc['R_0', 'mean'],
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['I_1', 'mean'],
                               mda_cfi_single.loc['I_1', 'mean'],
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['R_0', 'mean'],
                               mda_cfi_single.loc['R_0', 'mean'],
                               delta=0.1)
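
Unlike MDI and MDA, SFI evaluates each feature in isolation, so substitution effects between correlated features cannot mask an informative one; the clustered (CFI) variants tested above apply the same grouping idea to MDI/MDA by treating each cluster of columns as one unit. A minimal SFI sketch; as before, an assumed implementation rather than the library's.

import pandas as pd
from sklearn.model_selection import cross_val_score

def sfi_sketch(clf, X, y, cv_gen, scoring='accuracy'):
    # Cross-validate the model on one feature at a time and record
    # the mean score and its standard error per feature.
    rows = {}
    for col in X.columns:
        scores = cross_val_score(clf, X[[col]], y, cv=cv_gen, scoring=scoring)
        rows[col] = {'mean': scores.mean(),
                     'std': scores.std() * len(scores) ** -0.5}
    return pd.DataFrame.from_dict(rows, orient='index')
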