def test_sensitivity_based_feature_selection(self, clf):
    # sensitivity analyzer and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # number of features to remove
    Nremove = 2

    # because the clf is already trained when computing the sensitivity
    # map, prevent retraining for transfer error calculation
    # Use absolute of the svm weights as sensitivity
    fe = SensitivityBasedFeatureSelection(
        sens_ana,
        feature_selector=FixedNElementTailSelector(Nremove),
        enable_ca=["sensitivity", "selected_ids"])

    data = self.get_data()
    data_nfeatures = data.nfeatures

    fe.train(data)
    resds = fe(data)

    # fail if orig dataset was changed
    self.assertTrue(data.nfeatures == data_nfeatures)

    # check that exactly Nremove features got removed
    self.assertEqual(data.nfeatures, resds.nfeatures + Nremove,
                     msg="Expected exactly %d features to be removed" % Nremove)

    self.assertEqual(fe.ca.sensitivity.nfeatures, data_nfeatures,
                     msg="Sensitivity has to have # of features equal to original")
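# A minimal, self-contained sketch (illustrative, not part of the test
# suite; `normal_feature_dataset` stands in for `self.get_data()`) of the
# train/apply protocol exercised above: train() computes the sensitivity
# map and fixes the selected feature ids, and calling the trained object
# then slices any compatible dataset.
from mvpa2.featsel.base import SensitivityBasedFeatureSelection
from mvpa2.featsel.helpers import FixedNElementTailSelector
from mvpa2.measures.anova import OneWayAnova
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=10)
fs = SensitivityBasedFeatureSelection(
    OneWayAnova(),                  # per-feature F-scores as sensitivities
    FixedNElementTailSelector(2))   # default: discard the 2 lowest-scoring
fs.train(ds)
print(fs(ds).nfeatures)             # ds.nfeatures - 2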
def test_feature_selection_pipeline(self):
    sens_ana = SillySensitivityAnalyzer()

    data = self.get_data()
    data_nfeatures = data.nfeatures

    # test silly one first ;-)
    self.assertEqual(sens_ana(data).samples[0, 0],
                     -int(data_nfeatures / 2))

    # OLD: first remove 25% == 6, and then 4, total removing 10
    # NOW: test should be independent of the actual number of features
    feature_selections = [
        SensitivityBasedFeatureSelection(sens_ana,
                                         FractionTailSelector(0.25)),
        SensitivityBasedFeatureSelection(sens_ana,
                                         FixedNElementTailSelector(4))
    ]

    # create a FeatureSelection pipeline
    feat_sel_pipeline = ChainMapper(feature_selections)

    feat_sel_pipeline.train(data)
    resds = feat_sel_pipeline(data)

    self.assertEqual(len(feat_sel_pipeline), len(feature_selections),
                     msg="Test the property feature_selections")

    desired_nfeatures = int(np.ceil(data_nfeatures * 0.75))
    self.assertEqual([fe._oshape[0] for fe in feat_sel_pipeline],
                     [desired_nfeatures, desired_nfeatures - 4])
def test_feature_selection_classifier(self):
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.helpers import FixedNElementTailSelector

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # should give lowest weight to the feature with highest index
    sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))
    feat_sel_rev = SensitivityBasedFeatureSelection(
        sens_ana_rev, FixedNElementTailSelector(1))

    samples = np.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1],
                        [-1, 0, 1], [1, -1, 1]])

    testdata3 = dataset_wizard(samples=samples, targets=1)
    # dummy train data so proper mapper gets created
    traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                               targets=[1, 2])

    # targets
    res110 = [1, 1, 1, -1, -1]
    res011 = [-1, 1, -1, 1, -1]

    # first classifier -- 0th feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                        enable_ca=['feature_ids'])

    self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
    clf011.train(traindata)

    self.assertEqual(clf011.predict(testdata3.samples), res011)

    # just a silly check that we get estimates assigned in the ProxyClassifier
    self.assertTrue(len(clf011.ca.estimates) == len(res110),
                    msg="We need to pass values into ProxyClassifier")
    self.clf_sign.ca.reset_changed_temporarily()

    self.assertEqual(clf011.mapper._oshape, (2,),
                     "Feature selection classifier had to be trained on 2 features")

    # second classifier -- the last feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
    clf011.train(traindata)
    self.assertEqual(clf011.predict(testdata3.samples), res110)
def test_remove_invariant_as_a_mapper():
    from mvpa2.featsel.helpers import RangeElementSelector
    from mvpa2.featsel.base import StaticFeatureSelection, \
        SensitivityBasedFeatureSelection
    from mvpa2.testing.datasets import datasets
    from mvpa2.datasets.miscfx import remove_invariant_features

    mapper = SensitivityBasedFeatureSelection(
        lambda x: np.std(x, axis=0),
        RangeElementSelector(lower=0, inclusive=False),
        train_analyzer=False,
        auto_train=True)

    ds = datasets['uni2large'].copy()

    ds.a['mapper'] = StaticFeatureSelection(np.arange(ds.nfeatures))
    ds.fa['index'] = np.arange(ds.nfeatures)
    ds.samples[:, [1, 8]] = 10  # make features 1 and 8 invariant

    ds_out = mapper(ds)

    # validate that we get the same results as remove_invariant_features
    ds_rifs = remove_invariant_features(ds)
    assert_array_equal(ds_out.samples, ds_rifs.samples)
    assert_array_equal(ds_out.fa.index, ds_rifs.fa.index)

    assert_equal(ds_out.fa.index[1], 2)
    assert_equal(ds_out.fa.index[8], 10)
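# Hedged aside (names below are illustrative): as the test above shows,
# any callable mapping a dataset to one score per feature can stand in
# for a trained sensitivity analyzer once train_analyzer=False is
# passed -- e.g. keeping only high-variance features.
import numpy as np
from mvpa2.featsel.base import SensitivityBasedFeatureSelection
from mvpa2.featsel.helpers import FractionTailSelector
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=20)
var_sel = SensitivityBasedFeatureSelection(
    lambda x: np.var(x, axis=0),    # per-feature variance as "sensitivity"
    FractionTailSelector(0.5, mode='select', tail='upper'),
    train_analyzer=False, auto_train=True)
print(var_sel(ds).nfeatures)        # roughly half of ds.nfeatures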
def test_feature_selection_classifier_with_regression(self):
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.helpers import FixedNElementTailSelector
    if sample_clf_reg is None:
        # no regression classifier was found, so nothing to test
        return
    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()

    # corresponding feature selection
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))

    # now test with a regression-based classifier. The problem would be
    # determining predictions twice from the estimates and then setting
    # estimates from the results, so that the second time around the
    # estimates end up holding the predictions. The final outcome would
    # be that estimates are actually predictions...
    dat = dataset_wizard(samples=np.random.randn(4, 10),
                         targets=[-1, -1, 1, 1])
    clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
    clf_reg.train(dat)
    _ = clf_reg.predict(dat.samples)
    self.failIf((np.array(clf_reg.ca.estimates)
                 - clf_reg.ca.predictions).sum() == 0,
                msg="Values were set to the predictions in %s."
                    % sample_clf_reg)
def test_custom_combined_selectors(self):
    """Test combination of selectors in a single function
    """
    def custom_tail_selector(seq):
        seq1 = FractionTailSelector(0.01, mode='discard', tail='upper')(seq)
        seq2 = FractionTailSelector(0.05, mode='select', tail='upper')(seq)
        return list(set(seq1).intersection(seq2))

    seq = np.arange(100)
    seq_ = custom_tail_selector(seq)
    assert_array_equal(sorted(seq_), [95, 96, 97, 98])

    # verify that this function can be used in place of a selector
    fs = SensitivityBasedFeatureSelection(OneWayAnova(),
                                          custom_tail_selector)
    ds = datasets['3dsmall']
    fs.train(ds)  # XXX: why does it need to be trained here explicitly?
    ds_ = fs(ds)
    assert_equal(ds_.nfeatures, int(ds.nfeatures * 0.04))
def test_union_feature_selection(self):
    # two methods: 5% highest F-scores, non-zero SMLR weights
    fss = [SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FractionTailSelector(0.05, mode='select', tail='upper')),
           SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1, implementation="C"),
                           postproc=sumofabs_sample()),
               RangeElementSelector(mode='select'))]

    fs = CombinedFeatureSelection(fss, method='union')

    od_union = fs(self.dataset)

    self.assertTrue(fs.method == 'union')
    # check output dataset
    self.assertTrue(od_union.nfeatures <= self.dataset.nfeatures)

    # again for intersection
    fs = CombinedFeatureSelection(fss, method='intersection')
    od_intersect = fs(self.dataset)
    assert_true(od_intersect.nfeatures < od_union.nfeatures)
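# Hedged sketch (illustrative names, synthetic data) of the semantics
# checked above: CombinedFeatureSelection keeps a feature if any child
# selection retains it ('union'), or only if all of them do
# ('intersection'), so the intersection can never keep more features
# than the union.
from mvpa2.featsel.base import (CombinedFeatureSelection,
                                SensitivityBasedFeatureSelection)
from mvpa2.featsel.helpers import FractionTailSelector
from mvpa2.measures.anova import OneWayAnova
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=40)
fss = [SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(f, mode='select', tail='upper'))
       for f in (0.10, 0.25)]
od_union = CombinedFeatureSelection(fss, method='union')(ds)
od_inter = CombinedFeatureSelection(fss, method='intersection')(ds)
assert od_inter.nfeatures <= od_union.nfeatures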
def test_mapped_classifier_sensitivity_analyzer(self, clf):
    """Test sensitivity of the mapped classifier
    """
    # Assuming many defaults it is as simple as
    mclf = FeatureSelectionClassifier(
        clf,
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.5, mode='select', tail='upper')),
        enable_ca=['training_stats'])

    sana = mclf.get_sensitivity_analyzer(postproc=sumofabs_sample(),
                                         enable_ca=["sensitivities"])
    # and let's look at all sensitivities
    dataset = datasets['uni2small']
    sens = sana(dataset)
    # sensitivities get reverse-mapped into the original feature space,
    # hence dataset.nfeatures columns despite the 50% selection
    self.assertEqual(sens.shape, (1, dataset.nfeatures))
def setup_classifier(**kwargs):
    '''
    Set up the classifier, feature selection and cross-validation
    scheme from keyword arguments.
    '''
    for arg in kwargs:
        if arg == 'clf_type':
            clf_type = kwargs[arg]
        if arg == 'fsel':
            f_sel = kwargs[arg]
        if arg == 'cv_type':
            cv_approach = kwargs[arg]
        if arg == 'cv_folds':
            if int(kwargs[arg]) == 0:
                cv_type = float(kwargs[arg])
            else:
                cv_type = int(kwargs[arg])
        if arg == 'permutations':
            permutations = int(kwargs[arg])
        if arg == 'cv_attribute':
            attribute = kwargs[arg]

    cv_n = cv_type

    ################# Classifier #######################
    if clf_type == 'SVM':
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    elif clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'Fixed':
        logger.info('Fixed feature selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(100, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        logger.info('PCA feature reduction selected.')
        fsel = SKLTransformer(PCA(n_components=45))
        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets',
                                         limit={'partitions': 1},
                                         count=1)
        partitioner = NFoldPartitioner(cvtype=cv_n, attr=attribute)
        null_cv = CrossValidation(
            clf,
            ChainNode([partitioner, permutator],
                      space=partitioner.get_space()),
            errorfx=mean_mismatch_error)
        distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                               enable_ca=['dist_samples'])
        #postproc = mean_sample()
    else:
        distr_est = None
        #postproc = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    chain_splitter = ChainNode([
        splitter_used,
        Balancer(attr='targets', count=1, limit='partitions',
                 apply_selection=True)
    ], space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf, chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf, chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')

    return [fclf, cvte]
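# Hedged usage sketch for the helper above (the Dataset `ds` is assumed
# to carry 'targets' and 'chunks' attributes; argument values follow the
# string conventions the function expects -- note that f_sel is compared
# against the literal string 'True'):
#
#   fclf, cvte = setup_classifier(clf_type='SVM',
#                                 fsel='True',        # 5% ANOVA selection
#                                 cv_type='n_fold',
#                                 cv_folds=1,         # leave-one-chunk-out
#                                 permutations=0,     # no null distribution
#                                 cv_attribute='chunks')
#   results = cvte(ds)
#   print(cvte.ca.stats)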
                           tags=_lars_tags, descr='skl.LassoLarsIC()')
regrswh += [_lasso_lars_ic]
clfswh += [
    RegressionAsClassifier(_lasso_lars_ic, descr='skl.LassoLarsIC_C()')
]

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            SMLRWeights(SMLR(lm=1.0, implementation="C"),
                        postproc=maxofabs_sample()),
            RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
    data = np.concatenate(data)
    labels = np.concatenate(labels)
    return data, labels.astype(int)

rois = ['aSTG', 'HG', 'pSTG']

for sub_id in range(1, 21):
    data = []
    for roi in rois:
        data_path = os.path.join(data_dir, roi)
        tmp_data, label = load_data(data_path, sub_id)
        data.append(tmp_data)
    data = np.concatenate(data, axis=1)
    data = np.concatenate([data[i, :, :].T for i in range(len(data))])

    ds = Dataset(data)
    ds.sa['time_coords'] = np.linspace(0, len(ds) - 1, len(ds))
    events = [{'onset': i * 5, 'duration': 5,
               'targets': label[i], 'chunks': i + 1}
              for i in range(int(len(ds) / 5))]

    hrf_estimates = fit_event_hrf_model(
        ds, events, time_attr='time_coords',
        condition_attr=('targets', 'chunks'),
        design_kwargs=dict(drift_model='blank'),
        glmfit_kwargs=dict(model='ols'),
        return_model=True)

    # select the 5000 features with the highest F-scores
    fsel = SensitivityBasedFeatureSelection(
        OneWayAnova(),
        FixedNElementTailSelector(5000, mode='select', tail='upper'))
    fsel.train(hrf_estimates)
    ds_p = fsel(hrf_estimates)

    np.save('feat_sub{:03d}'.format(sub_id), ds_p.samples)