def test_plot_feature_importance(self):
    """
    Test plot_feature_importance function
    """
    clf, cv_gen = self._prepare_clf_data_set(oob_score=True)
    oos_score = ml_cross_val_score(clf, self.X_train, self.y_train_clf, cv_gen=cv_gen,
                                   sample_weight_score=None, scoring=accuracy_score).mean()
    clf.fit(self.X_train, self.y_train_clf)
    importance = mean_decrease_impurity(clf, self.X_train.columns)
    # Plot to screen, then save to a file and clean the file up afterwards
    plot_feature_importance(importance, oob_score=clf.oob_score_, oos_score=oos_score)
    plot_feature_importance(importance, oob_score=clf.oob_score_, oos_score=oos_score,
                            save_fig=True, output_path='test.png')
    os.remove('test.png')
def test_orthogonal_features(self):
    """
    Test orthogonal features: PCA features, importance vs PCA importance analysis
    """
    pca_features = get_orthogonal_features(self.X)

    # PCA features should have mean of 0
    for idx in (2, 5, 6):
        self.assertAlmostEqual(np.mean(pca_features[:, idx]), 0, delta=1e-7)

    # Check particular PCA values std
    for idx, expected_std in ((1, 1.3813), (3, 1.0255), (4, 1.0011)):
        self.assertAlmostEqual(np.std(pca_features[:, idx]), expected_std, delta=0.2)

    importance = mean_decrease_impurity(self.fit_clf, self.X.columns)
    correlation_df = feature_pca_analysis(self.X, importance)

    # Check correlation metrics results
    self.assertAlmostEqual(correlation_df['Weighted_Kendall_Rank'][0], 0.7424, delta=1e-1)

    # Check particular number of PCA features
    for num_features in (10, 5):
        reduced = get_orthogonal_features(self.X, num_features=num_features)
        self.assertEqual(reduced.shape[1], num_features)
def test_plot_feature_importance(self):
    """
    Test plot_feature_importance function
    """
    oos_score = cross_val_score(self.bag_clf, self.X, self.y, cv=self.cv_gen,
                                scoring='accuracy').mean()
    importance = mean_decrease_impurity(self.bag_clf, self.X.columns)
    # Plot to screen, then save to a file and clean the file up afterwards
    plot_feature_importance(importance, oob_score=self.bag_clf.oob_score_,
                            oos_score=oos_score)
    plot_feature_importance(importance, oob_score=self.bag_clf.oob_score_,
                            oos_score=oos_score, save_fig=True, output_path='test.png')
    os.remove('test.png')
def test_orthogonal_features(self):
    """
    Test orthogonal features: PCA features, importance vs PCA importance analysis
    """
    # Init classifiers
    base_estimator = RandomForestClassifier(n_estimators=1, criterion='entropy',
                                            bootstrap=False,
                                            class_weight='balanced_subsample')
    sb_clf = SequentiallyBootstrappedBaggingClassifier(
        base_estimator=base_estimator, max_features=1.0, n_estimators=100,
        samples_info_sets=self.samples_info_sets, price_bars=self.price_bars_trim,
        oob_score=True, random_state=1)

    pca_features = get_orthogonal_features(self.X_train)

    # PCA features should have mean of 0
    for idx in (2, 5, 6):
        self.assertAlmostEqual(np.mean(pca_features[:, idx]), 0, delta=1e-7)

    # Check particular PCA values std
    for idx, expected_std in ((1, 1.499), (3, 1.047), (4, 0.948)):
        self.assertAlmostEqual(np.std(pca_features[:, idx]), expected_std, delta=0.2)

    sb_clf.fit(self.X_train, self.y_train_clf)
    importance = mean_decrease_impurity(sb_clf, self.X_train.columns)
    correlation_df = feature_pca_analysis(self.X_train, importance)

    # Check correlation metrics results
    self.assertAlmostEqual(correlation_df['Weighted_Kendall_Rank'][0], 0.0677, delta=1e-1)
def test_orthogonal_features(self):
    """
    Test orthogonal features: PCA features, importance vs PCA importance analysis
    """
    pca_features = get_orthogonal_features(self.X)

    # PCA features should have mean of 0
    for idx in (2, 5, 6):
        self.assertAlmostEqual(np.mean(pca_features[:, idx]), 0, delta=1e-7)

    # Check particular PCA values std
    for idx, expected_std in ((1, 1.2503), (3, 1.0292), (4, 1.0134)):
        self.assertAlmostEqual(np.std(pca_features[:, idx]), expected_std, delta=0.2)

    importance = mean_decrease_impurity(self.fit_clf, self.X.columns)
    correlation_df = feature_pca_analysis(self.X, importance)

    # Check correlation metrics results
    self.assertAlmostEqual(correlation_df['Weighted_Kendall_Rank'][0], -0.0724, delta=1e-1)
def test_feature_importance(self):
    """
    Test features importance: MDI, MDA, SFI and plot function
    """
    # MDI feature importance
    mdi_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)

    # MDA feature importance
    mda_imp_log_loss = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        sample_weight_train=np.ones((self.X.shape[0], )),
        sample_weight_score=np.ones((self.X.shape[0], )), scoring=log_loss)
    mda_imp_f1 = mean_decrease_accuracy(self.bag_clf, self.X, self.y, self.cv_gen,
                                        scoring=f1_score)

    # SFI feature importance
    sfi_imp_log_loss = single_feature_importance(
        self.bag_clf, self.X, self.y, cv_gen=self.cv_gen,
        sample_weight_train=np.ones((self.X.shape[0], )), scoring=log_loss)
    sfi_imp_f1 = single_feature_importance(
        self.bag_clf, self.X, self.y, cv_gen=self.cv_gen,
        sample_weight_score=np.ones((self.X.shape[0], )), scoring=f1_score)

    # MDI assertions
    self.assertAlmostEqual(mdi_imp['mean'].sum(), 1, delta=0.001)
    # The most informative features
    self.assertAlmostEqual(mdi_imp.loc['I_1', 'mean'], 0.47075, delta=0.01)
    self.assertAlmostEqual(mdi_imp.loc['I_0', 'mean'], 0.09291, delta=0.01)
    # Redundant feature
    self.assertAlmostEqual(mdi_imp.loc['R_0', 'mean'], 0.07436, delta=0.01)
    # Noisy feature
    self.assertAlmostEqual(mdi_imp.loc['N_0', 'mean'], 0.01798, delta=0.01)

    # MDA(log_loss) assertions
    self.assertAlmostEqual(mda_imp_log_loss.loc['I_1', 'mean'], 0.59684, delta=0.1)
    self.assertAlmostEqual(mda_imp_log_loss.loc['R_0', 'mean'], 0.13177, delta=0.1)

    # MDA(f1) assertions
    self.assertAlmostEqual(mda_imp_f1.loc['I_1', 'mean'], 0.52268, delta=0.1)
    self.assertAlmostEqual(mda_imp_f1.loc['I_2', 'mean'], 0.29533, delta=0.1)

    # SFI(log_loss) assertions
    self.assertAlmostEqual(sfi_imp_log_loss.loc['I_0', 'mean'], -6.50385, delta=0.1)
    self.assertAlmostEqual(sfi_imp_log_loss.loc['R_0', 'mean'], -3.27282, delta=0.1)

    # SFI(accuracy) assertions
    self.assertAlmostEqual(sfi_imp_f1.loc['I_0', 'mean'], 0.48530, delta=0.1)
    self.assertAlmostEqual(sfi_imp_f1.loc['I_1', 'mean'], 0.78778, delta=0.1)
def test_feature_importance(self):
    """
    Test features importance: MDI, MDA, SFI and plot function
    """
    sb_clf, cv_gen = self._prepare_clf_data_set(oob_score=False)

    # MDI feature importance
    mdi_imp = mean_decrease_impurity(sb_clf, self.X_train.columns)

    # MDA feature importance
    mda_imp_log_loss = mean_decrease_accuracy(
        sb_clf, self.X_train, self.y_train_clf, cv_gen,
        sample_weight_train=np.ones((self.X_train.shape[0], )),
        sample_weight_score=np.ones((self.X_train.shape[0], )))
    mda_imp_f1 = mean_decrease_accuracy(sb_clf, self.X_train, self.y_train_clf, cv_gen,
                                        scoring=f1_score)

    # SFI feature importance
    # Take only 5 features for faster test run
    sfi_imp_log_loss = single_feature_importance(
        sb_clf, self.X_train[self.X_train.columns[:5]], self.y_train_clf, cv_gen=cv_gen,
        sample_weight_train=np.ones((self.X_train.shape[0], )))
    sfi_imp_f1 = single_feature_importance(
        sb_clf, self.X_train[self.X_train.columns[:5]], self.y_train_clf, cv_gen=cv_gen,
        scoring=f1_score, sample_weight_score=np.ones((self.X_train.shape[0], )))

    # MDI assertions
    self.assertAlmostEqual(mdi_imp['mean'].sum(), 1, delta=0.001)
    # The most informative features
    self.assertAlmostEqual(mdi_imp.loc['label_prob_0.1', 'mean'], 0.19598, delta=0.01)
    self.assertAlmostEqual(mdi_imp.loc['label_prob_0.2', 'mean'], 0.164, delta=0.01)
    # Noisy feature
    self.assertAlmostEqual(mdi_imp.loc['label_prob_0.1_sma_5', 'mean'], 0.08805, delta=0.01)

    # MDA(log_loss) assertions
    self.assertAlmostEqual(mda_imp_log_loss.loc['label_prob_0.1', 'mean'], 0.23685, delta=10)
    self.assertAlmostEqual(mda_imp_log_loss.loc['label_prob_0.2', 'mean'], 0.3222, delta=10)

    # MDA(f1) assertions
    self.assertAlmostEqual(mda_imp_f1.loc['label_prob_0.1', 'mean'], 0.25, delta=3)
    self.assertAlmostEqual(mda_imp_f1.loc['label_prob_0.2', 'mean'], 0.3, delta=3)

    # SFI(log_loss) assertions
    self.assertAlmostEqual(sfi_imp_log_loss.loc['label_prob_0.1', 'mean'], -2.14, delta=1)
    self.assertAlmostEqual(sfi_imp_log_loss.loc['label_prob_0.2', 'mean'], -2.15, delta=1)

    # SFI(accuracy) assertions
    self.assertAlmostEqual(sfi_imp_f1.loc['label_prob_0.1', 'mean'], 0.81, delta=1)
    self.assertAlmostEqual(sfi_imp_f1.loc['label_prob_0.2', 'mean'], 0.74, delta=1)
    self.assertAlmostEqual(sfi_imp_f1.loc['label_prob_0.5_sma_2', 'mean'], 0.224, delta=1)
def test_feature_importance(self):
    """
    Test features importance: MDI, MDA, SFI and plot function
    """
    # Getting the clustered subsets for CFI with number of clusters selection
    # using the ONC algorithm
    clustered_subsets_linear = get_feature_clusters(
        self.X, dependence_metric='linear', distance_metric=None,
        linkage_method=None, n_clusters=None)
    # Also to verify the theory that if the number of clusters equals the number
    # of features then the result will be the same as MDA
    singleton_subsets = [[column] for column in self.X.columns]

    # MDI feature importance
    mdi_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
    # Clustered MDI feature importance
    clustered_mdi = mean_decrease_impurity(
        self.fit_clf, self.X.columns, clustered_subsets=clustered_subsets_linear)
    mdi_cfi_single = mean_decrease_impurity(
        self.fit_clf, self.X.columns, clustered_subsets=singleton_subsets)

    # MDA feature importance
    mda_imp_log_loss = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        sample_weight_train=np.ones((self.X.shape[0], )),
        sample_weight_score=np.ones((self.X.shape[0], )), scoring=log_loss)
    mda_imp_f1 = mean_decrease_accuracy(self.bag_clf, self.X, self.y, self.cv_gen,
                                        scoring=f1_score)
    # Clustered MDA feature importance
    clustered_mda = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        clustered_subsets=clustered_subsets_linear)
    mda_cfi_single = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        clustered_subsets=singleton_subsets)

    # SFI feature importance
    sfi_imp_log_loss = single_feature_importance(
        self.bag_clf, self.X, self.y, cv_gen=self.cv_gen,
        sample_weight_train=np.ones((self.X.shape[0], )), scoring=log_loss)
    sfi_imp_f1 = single_feature_importance(
        self.bag_clf, self.X, self.y, cv_gen=self.cv_gen,
        sample_weight_score=np.ones((self.X.shape[0], )), scoring=f1_score)

    # MDI assertions
    self.assertAlmostEqual(mdi_imp['mean'].sum(), 1, delta=0.001)
    # The most informative features
    self.assertAlmostEqual(mdi_imp.loc['I_1', 'mean'], 0.48058, delta=0.01)
    self.assertAlmostEqual(mdi_imp.loc['I_0', 'mean'], 0.08214, delta=0.01)
    # Redundant feature
    self.assertAlmostEqual(mdi_imp.loc['R_0', 'mean'], 0.06511, delta=0.01)
    # Noisy feature
    self.assertAlmostEqual(mdi_imp.loc['N_0', 'mean'], 0.02229, delta=0.01)

    # MDA(log_loss) assertions
    self.assertAlmostEqual(mda_imp_log_loss.loc['I_1', 'mean'], 0.65522, delta=0.1)
    self.assertAlmostEqual(mda_imp_log_loss.loc['R_0', 'mean'], 0.00332, delta=0.1)

    # MDA(f1) assertions
    self.assertAlmostEqual(mda_imp_f1.loc['I_1', 'mean'], 0.47751, delta=0.1)
    self.assertAlmostEqual(mda_imp_f1.loc['I_2', 'mean'], 0.33617, delta=0.1)

    # SFI(log_loss) assertions
    self.assertAlmostEqual(sfi_imp_log_loss.loc['I_0', 'mean'], -6.39442, delta=0.1)
    self.assertAlmostEqual(sfi_imp_log_loss.loc['R_0', 'mean'], -5.04315, delta=0.1)

    # SFI(accuracy) assertions
    self.assertAlmostEqual(sfi_imp_f1.loc['I_0', 'mean'], 0.48915, delta=0.1)
    self.assertAlmostEqual(sfi_imp_f1.loc['I_1', 'mean'], 0.78443, delta=0.1)

    # Clustered MDI assertions
    self.assertAlmostEqual(clustered_mdi.loc['R_0', 'mean'], 0.01912, delta=0.1)
    self.assertAlmostEqual(clustered_mdi.loc['I_0', 'mean'], 0.06575, delta=0.1)

    # Clustered MDA (log_loss) assertions
    self.assertAlmostEqual(clustered_mda.loc['I_0', 'mean'], 0.04154, delta=0.1)
    self.assertAlmostEqual(clustered_mda.loc['R_0', 'mean'], 0.02940, delta=0.1)

    # Test if CFI with number of clusters equal to number of features matches
    # the normal MDI & MDA results
    self.assertAlmostEqual(mdi_imp.loc['I_1', 'mean'],
                           mdi_cfi_single.loc['I_1', 'mean'], delta=0.1)
    self.assertAlmostEqual(mdi_imp.loc['R_0', 'mean'],
                           mdi_cfi_single.loc['R_0', 'mean'], delta=0.1)
    self.assertAlmostEqual(mda_imp_log_loss.loc['I_1', 'mean'],
                           mda_cfi_single.loc['I_1', 'mean'], delta=0.1)
    self.assertAlmostEqual(mda_imp_log_loss.loc['R_0', 'mean'],
                           mda_cfi_single.loc['R_0', 'mean'], delta=0.1)