def test_anova(self, do_int):
    """Additional aspects of OneWayAnova"""
    oa = OneWayAnova()
    oa_custom = OneWayAnova(space='custom')

    ds = datasets['uni4large'].copy()
    if do_int:
        ds.samples = (ds.samples * 1000).astype(int)
    ds_samples_orig = ds.samples.copy()  # to verify that nothing was modified
    ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})

    r = oa(ds)
    assert_array_equal(ds.samples, ds_samples_orig)  # no inplace changes!
    self.assertRaises(KeyError, oa_custom, ds)
    r_custom = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r_custom.samples))

    # we should get the same results on subsequent runs
    r2 = oa(ds)
    r_custom2 = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r2.samples))
    self.assertTrue(np.allclose(r_custom.samples, r_custom2.samples))

    skip_if_no_external('scipy')
    from scipy.stats import f_oneway
    # compare against the scipy implementation: build one group of
    # samples per unique target
    groups = [ds[ds.targets == ut] for ut in ds.sa['targets'].unique]
    spf, spp = f_oneway(*groups)
    assert_array_almost_equal(r.samples[0], spf)
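# A minimal standalone sketch of the equivalence exercised above, assuming
# mvpa2 and scipy are installed: OneWayAnova's feature-wise F-scores should
# match scipy.stats.f_oneway applied per feature across the target groups.
# (The synthetic data and the Dataset import path are illustrative choices.)
import numpy as np
from scipy.stats import f_oneway
from mvpa2.datasets import Dataset
from mvpa2.measures.anova import OneWayAnova

samples = np.random.normal(size=(30, 4))
targets = np.repeat(['a', 'b', 'c'], 10)
ds = Dataset(samples, sa={'targets': targets})

f_mvpa = OneWayAnova()(ds).samples[0]
groups = [ds[ds.sa.targets == t].samples for t in ('a', 'b', 'c')]
f_scipy, _ = f_oneway(*groups)
assert np.allclose(f_mvpa, f_scipy)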
def test_null_dist_prob(self, null):
    """Testing null dist probability"""
    if not isinstance(null, NullDist):
        return
    ds = datasets['uni2small']

    null.fit(OneWayAnova(), ds)

    # check for reasonable output: p-values for non-bogus features should be
    # significant, while those for bogus (0) features should not
    prob = null.p([20, 0, 0, 0, 0, np.nan])
    # XXX this is labile! it also needs checking since the F-scores
    # of the MCNullDists using normal distribution are apparently not
    # distributed that way, hence the test often (if not always) fails.
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(np.abs(prob[0]) < 0.05,
                        msg="Expected small p, got %g" % prob[0])
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue((np.abs(prob[1:]) > 0.05).all(),
                        msg="Bogus features should have insignificant p."
                            " Got %s" % (np.abs(prob[1:]), ))

    # input has to have a matching shape
    if not isinstance(null, FixedNullDist):
        # Fixed dist is univariate ATM so it doesn't care
        # about dimensionality and gives 1 output value
        self.assertRaises(ValueError, null.p, [5, 3, 4])
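# A hedged sketch of Monte-Carlo null-distribution estimation for OneWayAnova,
# mirroring the fit()/p() usage above and the MCNullDist construction used
# later in this file; the synthetic dataset and the permutation count are
# illustrative assumptions.
from mvpa2.clfs.stats import MCNullDist
from mvpa2.generators.permutation import AttributePermutator
from mvpa2.measures.anova import OneWayAnova
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=6,
                            nonbogus_features=[0, 1], snr=2.)
# permute targets 100 times to build the empirical null
permutator = AttributePermutator('targets', count=100)
null = MCNullDist(permutator, tail='right')   # high F == significant
null.fit(OneWayAnova(), ds)
f = OneWayAnova()(ds)
p = null.p(f)   # feature-wise p-values of the observed F-scores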
def test_split_samples_probability_mapper(self):
    skip_if_no_external('scipy')
    nf = 10
    ns = 100
    nsubj = 5
    nchunks = 5
    data = np.random.normal(size=(ns, nf))
    # integer division (//) keeps the chunk/subject attributes integral
    ds = AttrDataset(data,
                     sa=dict(sidx=np.arange(ns),
                             targets=np.arange(ns) % nchunks,
                             chunks=np.arange(ns) * nchunks // ns,
                             subjects=np.arange(ns) //
                                 (ns // nsubj // nchunks) % nsubj),
                     fa=dict(fidx=np.arange(nf)))
    analyzer = OneWayAnova()
    element_selector = FractionTailSelector(.4, mode='select', tail='upper')
    common = True
    m = SplitSamplesProbabilityMapper(analyzer, 'subjects',
                                      probability_label='fprob',
                                      select_common_features=common,
                                      selector=element_selector)
    m.train(ds)
    y = m(ds)
    z = m(ds.samples)
    assert_array_equal(z, y.samples)
    # 40% of the 10 features survive the upper-tail selection -> 4 columns
    assert_equal(y.shape, (100, 4))
def test_features01():
    # TODO: might be worth creating an appropriate factory
    # helper in mappers/fx
    aov = OneWayAnova(
        postproc=FxMapper('features', lambda x: x / x.max(), attrfx=None))
    f = aov(datasets['uni2small'])
    ok_((f.samples != 1.0).any())
    ok_(f.samples.max() == 1.0)
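# What the FxMapper postproc above does, restated in plain numpy (a sketch):
# each row of F-scores is divided by its own maximum, so in every sample the
# top-scoring feature ends up at exactly 1.0 and the rest below it.
import numpy as np

f = np.array([[2.0, 8.0, 4.0]])
f_normed = f / f.max(axis=1, keepdims=True)
assert (f_normed == np.array([[0.25, 1.0, 0.5]])).all()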
def test_anova(self):
    """Additional aspects of OneWayAnova"""
    oa = OneWayAnova()
    oa_custom = OneWayAnova(space='custom')

    ds = datasets['uni4large']
    ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})

    r = oa(ds)
    self.assertRaises(KeyError, oa_custom, ds)
    r_custom = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r_custom.samples))

    # we should get the same results on subsequent runs
    r2 = oa(ds)
    r_custom2 = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r2.samples))
    self.assertTrue(np.allclose(r_custom.samples, r_custom2.samples))
def test_mapped_classifier_sensitivity_analyzer(self, clf):
    """Test sensitivity of the mapped classifier"""
    # Assuming many defaults it is as simple as
    mclf = FeatureSelectionClassifier(
        clf,
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.5, mode='select', tail='upper')),
        enable_ca=['training_stats'])

    sana = mclf.get_sensitivity_analyzer(postproc=sumofabs_sample(),
                                         enable_ca=["sensitivities"])
    # and let's look at all sensitivities
    dataset = datasets['uni2small']
    # and we get a sensitivity analyzer which works on splits
    sens = sana(dataset)
    self.assertEqual(sens.shape, (1, dataset.nfeatures))
def test_custom_combined_selectors(self):
    """Test combination of the selectors in a single function"""
    def custom_tail_selector(seq):
        # discard the upper 1% -> keeps ranks 0..98 of a 100-element input
        seq1 = FractionTailSelector(0.01, mode='discard', tail='upper')(seq)
        # select the upper 5% -> ranks 95..99
        seq2 = FractionTailSelector(0.05, mode='select', tail='upper')(seq)
        # intersection: ranks 95..98
        return list(set(seq1).intersection(seq2))

    seq = np.arange(100)
    seq_ = custom_tail_selector(seq)
    assert_array_equal(sorted(seq_), [95, 96, 97, 98])

    # verify that this function can be used in place of a selector object
    fs = SensitivityBasedFeatureSelection(OneWayAnova(),
                                          custom_tail_selector)
    ds = datasets['3dsmall']
    fs.train(ds)  # XXX: why does it need to be trained here explicitly?
    ds_ = fs(ds)
    assert_equal(ds_.nfeatures, int(ds.nfeatures * 0.04))
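# The same combined-tail logic in plain numpy, with the arithmetic spelled
# out (a hedged illustration; combined_tail_indices is not a PyMVPA helper):
# dropping the top 1% of 100 values keeps ranks 0..98, selecting the top 5%
# keeps ranks 95..99, and the intersection is ranks 95..98.
import numpy as np

def combined_tail_indices(values, discard_frac=0.01, select_frac=0.05):
    order = np.argsort(values)
    n = len(values)
    kept = set(order[:n - int(np.ceil(n * discard_frac))])  # after discarding
    top = set(order[-int(np.ceil(n * select_frac)):])       # upper-tail picks
    return sorted(kept & top)

assert combined_tail_indices(np.arange(100)) == [95, 96, 97, 98]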
def test_union_feature_selection(self):
    # two methods: 5% highest F-scores, non-zero SMLR weights
    fss = [SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FractionTailSelector(0.05, mode='select', tail='upper')),
           SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1, implementation="C"),
                           postproc=sumofabs_sample()),
               RangeElementSelector(mode='select'))]

    fs = CombinedFeatureSelection(fss, method='union')

    od_union = fs(self.dataset)

    self.assertTrue(fs.method == 'union')
    # check output dataset
    self.assertTrue(od_union.nfeatures <= self.dataset.nfeatures)

    # again for intersection
    fs = CombinedFeatureSelection(fss, method='intersection')
    od_intersect = fs(self.dataset)
    assert_true(od_intersect.nfeatures < od_union.nfeatures)
def test_anova(self):
    """Do some extended testing of OneWayAnova

    in particular -- compound estimation
    """
    m = OneWayAnova()  # default must not be compound ?
    mc = CompoundOneWayAnova()
    ds = datasets['uni2medium']

    # For 2 labels it must be identical for both and equal to
    # simple OneWayAnova
    a, ac = m(ds), mc(ds)

    self.assertTrue(a.shape == (1, ds.nfeatures))
    self.assertTrue(ac.shape == (len(ds.UT), ds.nfeatures))

    assert_array_equal(ac[0], ac[1])
    assert_array_equal(a, ac[1])

    # check for p-value attrs
    if externals.exists('scipy'):
        assert_true('fprob' in a.fa.keys())
        assert_equal(len(ac.fa), len(ac))

    ds = datasets['uni4large']
    ac = mc(ds)

    if cfg.getboolean('tests', 'labile', default='yes'):
        # All non-bogus features must score high for the corresponding label
        self.assertTrue(
            (ac.samples[np.arange(4),
                        np.array(ds.a.nonbogus_features)] >= 1).all())

    # All features should have slightly different CompoundAnova values
    # across labels. I really doubt that there will be a case when this
    # test would fail just due to being 'labile'
    self.assertTrue(np.max(np.std(ac, axis=1)) > 0,
                    msg='In compound anova, we should get different'
                        ' results for different labels. Got %s' % ac)
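# Hedged numeric check of the 2-label identity asserted above, using scipy as
# the reference: with only two classes, "label vs. rest" is the same split
# for either label, so both compound rows equal the plain one-way F-score.
import numpy as np
from scipy.stats import f_oneway

x = np.random.normal(size=40)
y = np.repeat([0, 1], 20)
f_plain, _ = f_oneway(x[y == 0], x[y == 1])
f_label0, _ = f_oneway(x[y == 0], x[y != 0])  # label 0 vs rest
f_label1, _ = f_oneway(x[y == 1], x[y != 1])  # label 1 vs rest
assert np.isclose(f_plain, f_label0) and np.isclose(f_plain, f_label1)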
def setup_classifier(**kwargs):
    '''Build a (possibly feature-selecting) classifier and a matching
    cross-validation object from keyword arguments.'''
    for arg in kwargs:
        if arg == 'clf_type':
            clf_type = kwargs[arg]
        if arg == 'fsel':
            f_sel = kwargs[arg]
        if arg == 'cv_type':
            cv_approach = kwargs[arg]
        if arg == 'cv_folds':
            if int(kwargs[arg]) == 0:
                cv_type = float(kwargs[arg])
            else:
                cv_type = int(kwargs[arg])
        if arg == 'permutations':
            permutations = int(kwargs[arg])
        if arg == 'cv_attribute':
            attribute = kwargs[arg]

    cv_n = cv_type

    ################# Classifier #######################
    if clf_type == 'SVM':
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    elif clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'Fixed':
        logger.info('Fixed Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(100, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        logger.info('PCA Feature Selection selected.')
        fsel = SKLTransformer(PCA(n_components=45))
        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets',
                                         limit={'partitions': 1},
                                         count=1)
        partitioner = NFoldPartitioner(cvtype=cv_n, attr=attribute)
        null_cv = CrossValidation(clf,
                                  ChainNode([partitioner, permutator],
                                            space=partitioner.get_space()),
                                  errorfx=mean_mismatch_error)
        distr_est = MCNullDist(repeater,
                               tail='left',
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        #postproc = mean_sample()
    else:
        distr_est = None
        #postproc = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    chain_splitter = ChainNode([
        splitter_used,
        Balancer(attr='targets', count=1, limit='partitions',
                 apply_selection=True)
    ], space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')

    return [fclf, cvte]
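# Hypothetical invocation of the helper above; the keyword names follow the
# kwargs parsed in setup_classifier, while the concrete values are only
# illustrative:
fclf, cvte = setup_classifier(clf_type='SVM',       # linear C-SVM
                              fsel='True',          # 5% ANOVA selection
                              cv_type='n_fold',     # cross-validation scheme
                              cv_folds=1,           # leave-one-<attr>-out
                              permutations=0,       # no MC null distribution
                              cv_attribute='chunks')
# results = cvte(ds)   # ds: any dataset with 'targets' and 'chunks' attributes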
from mvpa2.generators.splitters import Splitter
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.generators.resampling import Balancer

from mvpa2.misc.errorfx import mean_mismatch_error
from mvpa2.misc.transformers import Absolute, \
     DistPValue

from mvpa2.measures.base import Measure, \
     TransferMeasure, RepeatedMeasure, CrossValidation
from mvpa2.measures.anova import OneWayAnova, CompoundOneWayAnova
from mvpa2.measures.irelief import IterativeRelief, IterativeReliefOnline, \
     IterativeRelief_Devel, IterativeReliefOnline_Devel

_MEASURES_2_SWEEP = [OneWayAnova(),
                     CompoundOneWayAnova(postproc=sumofabs_sample()),
                     IterativeRelief(), IterativeReliefOnline(),
                     IterativeRelief_Devel(), IterativeReliefOnline_Devel()]

if externals.exists('scipy'):
    from mvpa2.measures.corrcoef import CorrCoef
    _MEASURES_2_SWEEP += [CorrCoef(),
                          # that one is good when small... handle later
                          #CorrCoef(pvalue=True)
                          ]

from mvpa2.featsel.base import SplitSamplesProbabilityMapper


class SensitivityAnalysersTests(unittest.TestCase):

    def setUp(self):
clfswh += kNN(k=5, voting='majority',
              descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            SMLRWeights(SMLR(lm=1.0, implementation="C"),
                        postproc=maxofabs_sample()),
            RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")


# GNB
clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
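# Entries registered in the warehouse above can be pulled back out by tag;
# a small hedged example in the query style the test suite uses elsewhere
# in this file:
from mvpa2.clfs.warehouse import clfswh

# every sensitivity-capable, non-meta classifier currently registered
sens_clfs = clfswh['has_sensitivity', '!meta']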
    data = np.concatenate(data)
    labels = np.concatenate(labels)

    return data, labels.astype(int)


rois = ['aSTG', 'HG', 'pSTG']

for sub_id in range(1, 21):
    data = []
    for roi in rois:
        data_path = os.path.join(data_dir, roi)
        tmp_data, label = load_data(data_path, sub_id)
        data.append(tmp_data)
    # stack the ROIs feature-wise, then unroll the per-trial time series
    # sample-wise
    data = np.concatenate(data, axis=1)
    data = np.concatenate([data[i, :, :].T for i in range(len(data))])

    ds = Dataset(data)
    ds.sa['time_coords'] = np.linspace(0, len(ds) - 1, len(ds))
    # one event per consecutive block of 5 time points
    events = [{'onset': i * 5, 'duration': 5,
               'targets': label[i], 'chunks': i + 1}
              for i in range(int(len(ds) / 5))]

    hrf_estimates = fit_event_hrf_model(
        ds, events,
        time_attr='time_coords',
        condition_attr=('targets', 'chunks'),
        design_kwargs=dict(drift_model='blank'),
        glmfit_kwargs=dict(model='ols'),
        return_model=True)

    # keep the 5000 features with the highest F-scores
    fsel = SensitivityBasedFeatureSelection(
        OneWayAnova(),
        FixedNElementTailSelector(5000, mode='select', tail='upper'))
    fsel.train(hrf_estimates)
    ds_p = fsel(hrf_estimates)

    np.save('feat_sub{:03d}'.format(sub_id), ds_p.samples)
class RFETests(unittest.TestCase):
    def get_data(self):
        return datasets['uni2medium']

    def test_best_detector(self):
        bd = BestDetector()

        # for empty history -- no best
        self.assertTrue(bd([]) == False)
        # we got the best if we have just 1
        self.assertTrue(bd([1]) == True)
        # we got the best if we have the last minimal
        self.assertTrue(bd([1, 0.9, 0.8]) == True)

        # test for alternative func
        bd = BestDetector(func=max)
        self.assertTrue(bd([0.8, 0.9, 1.0]) == True)
        self.assertTrue(bd([0.8, 0.9, 1.0] + [0.9] * 9) == False)
        self.assertTrue(bd([0.8, 0.9, 1.0] + [0.9] * 10) == False)

        # test to detect earliest and latest minimum
        bd = BestDetector(lastminimum=True)
        self.assertTrue(bd([3, 2, 1, 1, 1, 2, 1]) == True)
        bd = BestDetector()
        self.assertTrue(bd([3, 2, 1, 1, 1, 2, 1]) == False)

    def test_n_back_history_stop_crit(self):
        """Test stopping criterion"""
        stopcrit = NBackHistoryStopCrit()
        # for empty history -- no best yet, just go on
        self.assertTrue(stopcrit([]) == False)
        # should not stop if we got 10 more after minimal
        self.assertTrue(
            stopcrit([1, 0.9, 0.8] + [0.9] * (stopcrit.steps - 1)) == False)
        # should stop if we got 10 more after minimal
        self.assertTrue(
            stopcrit([1, 0.9, 0.8] + [0.9] * stopcrit.steps) == True)

        # test for alternative func
        stopcrit = NBackHistoryStopCrit(BestDetector(func=max))
        self.assertTrue(stopcrit([0.8, 0.9, 1.0] + [0.9] * 9) == False)
        self.assertTrue(stopcrit([0.8, 0.9, 1.0] + [0.9] * 10) == True)

        # test to detect earliest and latest minimum
        stopcrit = NBackHistoryStopCrit(BestDetector(lastminimum=True))
        self.assertTrue(stopcrit([3, 2, 1, 1, 1, 2, 1]) == False)
        stopcrit = NBackHistoryStopCrit(steps=4)
        self.assertTrue(stopcrit([3, 2, 1, 1, 1, 2, 1]) == True)

    def test_fixed_error_threshold_stop_crit(self):
        """Test stopping criterion"""
        stopcrit = FixedErrorThresholdStopCrit(0.5)

        self.assertTrue(stopcrit([]) == False)
        self.assertTrue(stopcrit([0.8, 0.9, 0.5]) == False)
        self.assertTrue(stopcrit([0.8, 0.9, 0.4]) == True)
        # only the last error has to be below the threshold to stop
        self.assertTrue(stopcrit([0.8, 0.4, 0.6]) == False)

    def test_n_steps_stop_crit(self):
        """Test stopping criterion"""
        stopcrit = NStepsStopCrit(2)

        self.assertTrue(stopcrit([]) == False)
        self.assertTrue(stopcrit([0.8, 0.9]) == True)
        self.assertTrue(stopcrit([0.8]) == False)

    def test_multi_stop_crit(self):
        """Test multiple stop criteria"""
        stopcrit = MultiStopCrit([FixedErrorThresholdStopCrit(0.5),
                                  NBackHistoryStopCrit(steps=4)])

        # default 'or' mode
        # nback triggers
        self.assertTrue(stopcrit([1, 0.9, 0.8] + [0.9] * 4) == True)
        # threshold triggers
        self.assertTrue(stopcrit([1, 0.9, 0.2]) == True)

        # alternative 'and' mode
        stopcrit = MultiStopCrit([FixedErrorThresholdStopCrit(0.5),
                                  NBackHistoryStopCrit(steps=4)],
                                 mode='and')
        # nback does not trigger
        self.assertTrue(stopcrit([1, 0.9, 0.8] + [0.9] * 4) == False)
        # threshold does not trigger
        self.assertTrue(stopcrit([1, 0.9, 0.2]) == False)
        # only when both are satisfied
        self.assertTrue(stopcrit([1, 0.9, 0.4] + [0.4] * 4) == True)

    def test_feature_selector(self):
        """Test feature selector"""
        # remove the 10% weakest
        selector = FractionTailSelector(0.1)
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        # == rank [4, 5, 6, 7, 0, 3, 2, 9, 1, 8]
        target10 = np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])
        target30 = np.array([0, 1, 2, 3, 7, 8, 9])

        self.assertRaises(UnknownStateError,
                          selector.ca.__getattribute__, 'ndiscarded')
        self.assertTrue((selector(data) == target10).all())
        selector.felements = 0.30      # discard 30%
        self.assertTrue(selector.felements == 0.3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)  # see that 3 were discarded

        selector = FixedNElementTailSelector(1)
        #                   0   1  2  3    4   5  6  7   8  9
        data = np.array([3.5, 10, 7, 5, -0.4,  0, 0, 2, 10, 9])
        self.assertTrue((selector(data) == target10).all())

        selector.nelements = 3
        self.assertTrue(selector.nelements == 3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)

        # test range selector
        # simple range 'above'
        self.assertTrue((RangeElementSelector(lower=0)(data) ==
                         np.array([0, 1, 2, 3, 7, 8, 9])).all())

        self.assertTrue((RangeElementSelector(lower=0,
                                              inclusive=True)(data) ==
                         np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])).all())

        self.assertTrue((RangeElementSelector(lower=0, mode='discard',
                                              inclusive=True)(data) ==
                         np.array([4])).all())

        # simple range 'below'
        self.assertTrue((RangeElementSelector(upper=2)(data) ==
                         np.array([4, 5, 6])).all())

        self.assertTrue((RangeElementSelector(upper=2,
                                              inclusive=True)(data) ==
                         np.array([4, 5, 6, 7])).all())

        self.assertTrue((RangeElementSelector(upper=2, mode='discard',
                                              inclusive=True)(data) ==
                         np.array([0, 1, 2, 3, 8, 9])).all())

        # ranges
        self.assertTrue((RangeElementSelector(lower=2, upper=9)(data) ==
                         np.array([0, 2, 3])).all())

        self.assertTrue((RangeElementSelector(lower=2, upper=9,
                                              inclusive=True)(data) ==
                         np.array([0, 2, 3, 7, 9])).all())

        # a degenerate range (upper < lower) with mode='discard' keeps what
        # the corresponding exclusive selection would keep
        self.assertTrue((RangeElementSelector(upper=2, lower=9, mode='discard',
                                              inclusive=True)(data) ==
                         RangeElementSelector(lower=2, upper=9,
                                              inclusive=False)(data)).all())

        # non-0 elements -- should be equivalent to np.nonzero()[0]
        self.assertTrue((RangeElementSelector()(data) ==
                         np.nonzero(data)[0]).all())

    # XXX put GPR back in after it gets fixed up
    @sweepargs(clf=clfswh['has_sensitivity', '!meta', '!gpr'])
    def test_sensitivity_based_feature_selection(self, clf):
        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

        # number of features to remove
        Nremove = 2

        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        fe = SensitivityBasedFeatureSelection(
            sens_ana,
            feature_selector=FixedNElementTailSelector(2),
            enable_ca=["sensitivity", "selected_ids"])

        data = self.get_data()

        data_nfeatures = data.nfeatures

        fe.train(data)
        resds = fe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that exactly Nremove features were removed
        self.assertEqual(data.nfeatures, resds.nfeatures + Nremove,
                         msg="Expected exactly %d features to be removed"
                             % Nremove)
        self.assertEqual(fe.ca.sensitivity.nfeatures, data_nfeatures,
                         msg="Sensitivity has to have # of features equal "
                             "to original")

    def test_feature_selection_pipeline(self):
        sens_ana = SillySensitivityAnalyzer()

        data = self.get_data()
        data_nfeatures = data.nfeatures

        # test the silly one first ;-)
        self.assertEqual(
            sens_ana(data).samples[0, 0], -int(data_nfeatures / 2))

        # OLD: first remove 25% == 6, and then 4, total removing 10
        # NOW: test should be independent of the actual number of features
        feature_selections = [
            SensitivityBasedFeatureSelection(sens_ana,
                                             FractionTailSelector(0.25)),
            SensitivityBasedFeatureSelection(sens_ana,
                                             FixedNElementTailSelector(4))
        ]

        # create a FeatureSelection pipeline
        feat_sel_pipeline = ChainMapper(feature_selections)

        feat_sel_pipeline.train(data)
        resds = feat_sel_pipeline(data)

        self.assertEqual(len(feat_sel_pipeline), len(feature_selections),
                         msg="Test the property feature_selections")

        desired_nfeatures = int(np.ceil(data_nfeatures * 0.75))
        self.assertEqual([fe._oshape[0] for fe in feat_sel_pipeline],
                         [desired_nfeatures, desired_nfeatures - 4])

    # TODO: should later on work for any clfs_with_sens
    @sweepargs(clf=clfswh['has_sensitivity', '!meta'][:1])
    @reseed_rng()
    def test_rfe(self, clf):
        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore a few recipes
        for rfe, data in [
            # because the clf is already trained when computing the
            # sensitivity map, prevent retraining for transfer error
            # calculation. Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the
            # stopping point, but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 # give the same full dataset to sens_ana and cvmeasure
                 Repeater(2),
                 fselector=FractionTailSelector(0.70,
                                                mode='select',
                                                tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                     postproc=ChainMapper([FxMapper('features', l2_normed),
                                           FxMapper('samples', np.abs),
                                           FxMapper('samples', np.mean)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 # we will use the same full cv-training dataset
                 Repeater(2),
                 fselector=FractionTailSelector(0.50,
                                                mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 # we just extract it from the existing confusion
                 train_pmeasure=False,
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the feature set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures ==
                                    data_nfeatures - e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had an actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) ==
                set(range(len(np.array(rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] ==
                            len(np.where(rfe.ca.history ==
                                         max(rfe.ca.history))[0]))

        # XXX add a test where sensitivity analyser and transfer error do not
        # use the same classifier

    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'),
                update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)
        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors))

        cv_storage = StoreResults()
        cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['confusion'])  # TODO -- it is stats
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        self.assertTrue(error < 0.2)

    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([
                        #FxMapper('features', l2_normed),
                        #FxMapper('samples', np.mean),
                        #FxMapper('samples', np.abs)
                        FxMapper('features',
                                 lambda x: np.argsort(np.abs(x))),
                        #maxofabs_sample()
                        mean_sample()
                    ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'),
                update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)
        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors))

        cv_storage = StoreResults()
        cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
        #print "ERROR: ", error
        #print cv.ca.stats

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        #print "non bogus features", dataset.a.nonbogus_features
        #print cv_storage.storage

        self.assertTrue(error < 0.2)

    ##REF: Name was automagically refactored
    def __test_matthias_question(self):
        rfe_clf = LinearCSVMC(C=1)

        rfesvm_split = SplitClassifier(rfe_clf)

        clf = \
            FeatureSelectionClassifier(
                clf=LinearCSVMC(C=1),
                feature_selection=RFE(
                    sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(
                        combiner=first_axis_mean,
                        transformer=np.abs),
                    transfer_error=ConfusionBasedError(
                        rfesvm_split,
                        confusion_state="confusion"),
                    stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                    feature_selector=FractionTailSelector(
                        0.2, mode='discard', tail='lower'),
                    update_sensitivity=True))

        no_permutations = 1000
        permutator = AttributePermutator('targets', count=no_permutations)

        cv = CrossValidation(clf, NFoldPartitioner(),
                             null_dist=MCNullDist(permutator, tail='left'),
                             enable_ca=['stats'])
        error = cv(datasets['uni2small'])
        self.assertTrue(error < 0.4)
        self.assertTrue(cv.ca.null_prob < 0.05)

    @reseed_rng()
    @labile(3, 1)
    # Let's test with clf sens analyzer AND OneWayAnova
    @sweepargs(fmeasure=(
        None,                 # use clf's sensitivity analyzer
        OneWayAnova(),        # ad-hoc feature-wise measure
        # targets_mutualinfo_kde(),  # FxMeasure
        targets_dcorrcoef(),  # FxMeasure wrapper
    ))
    def test_SplitRFE(self, fmeasure):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=11,
                                         snr=1., nonbogus_features=[1, 5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1

        # 3 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=3)

        rfeclf = MappedClassifier(
            clf,
            SplitRFE(clf,
                     partitioner,
                     fselector=FractionTailSelector(0.5,
                                                    mode='discard',
                                                    tail='lower'),
                     fmeasure=fmeasure,
                     # need to update only when using clf's sens anal
                     update_sensitivity=fmeasure is None))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus features should be chosen
        ok_(len(set(dataset.a.nonbogus_features).intersection(
                rfeclf.mapper.slicearg)) > 0)

        # check repr to have all the needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)

        if externals.exists('joblib'):
            rfeclf.mapper.nproc = -1
            # compare results against the one ran in parallel
            _slicearg = rfeclf.mapper.slicearg
            _predictions = predictions
            rfeclf.train(dataset)
            predictions = rfeclf(dataset).samples
            assert_array_equal(predictions, _predictions)
            assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

        # Test that we can collect stats from cas within cross-validation
        sensitivities = []
        nested_errors = []
        nested_nfeatures = []
        def store_me(data, node, result):
            sens = node.measure.get_sensitivity_analyzer(
                force_train=False)(data)
            sensitivities.append(sens)
            nested_errors.append(node.measure.mapper.ca.nested_errors)
            nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)

        cv = CrossValidation(rfeclf,
                             NFoldPartitioner(count=1),
                             callback=store_me,
                             enable_ca=['stats'])
        _ = cv(dataset)
        # just to make sure we collected them
        assert_equal(len(sensitivities), 1)
        assert_equal(len(nested_errors), 1)
        assert_equal(len(nested_nfeatures), 1)
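# A hedged, minimal illustration of the CrossValidation callback protocol used
# by store_me above and by the StoreResults helper earlier: the callable
# receives the dataset of the current fold, the measure node, and that fold's
# result. (log_fold and its bookkeeping are illustrative, not part of the
# test suite.)
fold_log = []

def log_fold(data, node, result):
    # one entry per fold: (number of samples seen, the fold's error estimate)
    fold_log.append((len(data), result.samples.squeeze()))

cv_logged = CrossValidation(rfeclf, NFoldPartitioner(count=1),
                            callback=log_fold)
_ = cv_logged(dataset)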