def test_split_featurewise_dataset_measure(self):
    """Check shapes produced by a repeated SMLR sensitivity analyzer.

    Two scenarios are exercised:

    1. The analyzer is repeated over ``NFoldPartitioner`` folds, each fold
       being reduced by a ``Splitter`` to the samples with
       ``partitions == 1``.
    2. The analyzer is repeated over the output of a ``Balancer``
       (``amount=0.25, count=2``), with the per-repetition datasets kept
       in the conditional attributes for inspection.
    """
    small = datasets['uni3small']
    analyzer = SMLR(fit_all_weights=True).get_sensitivity_analyzer()
    fold_source = ChainNode(
        [NFoldPartitioner(), Splitter('partitions', attr_values=[1])])
    sana = RepeatedMeasure(analyzer, fold_source)
    sens = sana(small)
    # a sensitivity for each chunk and each label combination
    n_expected = (len(small.sa['chunks'].unique)
                  * len(small.sa['targets'].unique))
    assert_equal(sens.shape, (n_expected, small.nfeatures))

    # A more involved example with 'boosting'
    medium = datasets['uni3medium']
    medium.init_origids('samples')
    analyzer = SMLR(fit_all_weights=True).get_sensitivity_analyzer()
    balancer = Balancer(amount=0.25, count=2, apply_selection=True)
    sana = RepeatedMeasure(
        analyzer, balancer,
        enable_ca=['datasets', 'repetition_results'])
    sens = sana(medium)

    n_expected = 2 * len(medium.sa['targets'].unique)
    assert_equal(sens.shape, (n_expected, medium.nfeatures))

    splits = sana.ca.datasets
    self.assertEqual(len(splits), 2)
    quarter = medium.nsamples // 4
    self.assertTrue(np.all([part.nsamples == quarter for part in splits]))

    # the two repetitions should have used different samples
    origids_differ = splits[0].sa.origids != splits[1].sa.origids
    self.assertTrue(np.any([origids_differ]))
    # and should have produced different sensitivities
    self.assertTrue(np.any(sens[0] != sens[3]))
def test_repeated_features(self):
    """Check RepeatedMeasure concatenating per-split results as features.

    A tiny measure that reports the number of features of the dataset it
    receives is run once per group produced by splitting on the
    ``nonbogus_targets`` feature attribute; the per-group counts are
    concatenated along the feature axis.
    """

    class CountFeatures(Measure):
        # no training needed; lets the measure be called right away
        is_trained = True

        def _call(self, ds):
            group_labels = list(ds.fa['nonbogus_targets'].unique)
            return Dataset([ds.nfeatures],
                           fa={'nonbogus_targets': group_labels})

    counter = CountFeatures()
    splitter = Splitter('fa.nonbogus_targets')
    nsplits = len(list(splitter.generate(self.dataset)))
    assert_equal(nsplits, 3)

    repeated = RepeatedMeasure(counter, splitter, concat_as='features')
    res = repeated(self.dataset)
    assert_equal(res.shape, (1, nsplits))

    # Due to https://github.com/numpy/numpy/issues/641 a list(set(...))
    # construct is used, so the order of nonbogus_targets.unique can vary
    # from run to run.  There is therefore no guarantee that 18 comes
    # first (a questionable assumption anyway), so the checks below do
    # not rely on any specific order.
    # And because of https://github.com/numpy/numpy/issues/3759 the bool
    # mask cannot simply be obtained via an `is None` on the array.
    is_none = [label is None for label in res.fa.nonbogus_targets]
    None_fa = np.array(is_none)
    assert_array_equal(res.samples[0, None_fa], [18])
    assert_array_equal(res.samples[0, ~None_fa], [1, 1])

    if sys.version_info < (3,):
        # with python2 the order seems to be consistent
        assert_array_equal(res.samples[0], [18, 1, 1])
def test_pseudo_cv_measure(self):
    """Compare a hand-built cross-validation with ``CrossValidation``.

    A ``TransferMeasure`` (SMLR classifier, split on ``partitions``,
    post-processed into a mean mismatch error on ``targets``) is repeated
    over ``NFoldPartitioner`` folds.  The result must have one row per
    chunk and one column, which is the same shape ``CrossValidation``
    yields for the same classifier and partitioner.
    """
    clf = SMLR()
    error_node = BinaryFxNode(mean_mismatch_error, 'targets')
    transfer = TransferMeasure(clf, Splitter('partitions'),
                               postproc=error_node)
    partitioner = NFoldPartitioner()
    manual_cv = RepeatedMeasure(transfer, partitioner)

    res = manual_cv(self.dataset)
    # one error per fold
    expected_shape = (len(self.dataset.sa['chunks'].unique), 1)
    assert_equal(res.shape, expected_shape)

    # the same can be done with CrossValidation
    cv = CrossValidation(
        clf, partitioner,
        enable_ca=['stats', 'training_stats', 'datasets'])
    res = cv(self.dataset)
    expected_shape = (len(self.dataset.sa['chunks'].unique), 1)
    assert_equal(res.shape, expected_shape)
def test_repeated_features(self):
    """Check RepeatedMeasure concatenating per-split results as features.

    A measure returning the number of features of the dataset it is given
    is run once per group obtained by splitting on the
    ``nonbogus_targets`` feature attribute, and the results are
    concatenated along the feature axis.
    """
    # NOTE: the leftover debugging ``print`` statements were removed.
    # They used Python 2 statement syntax (a SyntaxError on Python 3)
    # and only added noise to the test output.

    class CountFeatures(Measure):
        # no training needed; lets the measure be called right away
        is_trained = True

        def _call(self, ds):
            return ds.nfeatures

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)

    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))
    # Compare the per-group feature counts irrespective of the order in
    # which the groups were generated: one group of 18 features and two
    # groups of a single feature each are expected.
    assert_array_equal(sorted(res.samples[0]), [1, 1, 18])
def test_rfe_sensmap():
    """Smoke test for sensitivities of an RFE-based classifier.

    Originates from
    http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html

    Sensitivities and errors are first collected by partitioning and
    splitting manually.  The errors must match those of CrossValidation.
    The sensitivities are compared with those of a RepeatedMeasure and
    are currently asserted to differ (known failure), after which the
    test is reported as skipped.
    """
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample, maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import (
        FractionTailSelector, BestDetector, NBackHistoryStopCrit)
    from mvpa2.datasets import vstack
    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories are independent
    fds = normal_feature_dataset(
        nlabels=3,
        snr=1,  # 100,   # pure signal! ;)
        perlabel=9,
        nfeatures=6,
        nonbogus_features=range(3),
        nchunks=3)

    clfsvm = LinearCSVMC()

    # components of the recursive feature elimination
    rfe_sensana = clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())
    rfe_transfer_error = CrossValidation(
        clfsvm,
        NFoldPartitioner(),
        errorfx=mean_mismatch_error,
        postproc=mean_sample())
    rfe_repeater = Repeater(2)
    rfe_selector = FractionTailSelector(0.70, mode='select', tail='upper')
    rfe_stopcrit = NBackHistoryStopCrit(BestDetector(), 10)
    rfesvm = RFE(rfe_sensana,
                 rfe_transfer_error,
                 rfe_repeater,
                 fselector=rfe_selector,
                 stopping_criterion=rfe_stopcrit,
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)
    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and
    # classification
    senses = []
    errors = []
    for pset in NFoldPartitioner().generate(fds):
        # split partitioned dataset
        split = list(Splitter('partitions').generate(pset))
        training_part, testing_part = split[0], split[1]
        senses.append(sensanasvm(training_part))
        # computing the sensitivity should also have trained the
        # classifier, so we can ask it about the error
        predictions = fclfsvm.predict(testing_part)
        errors.append(mean_mismatch_error(predictions, testing_part.targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification
    # with CV
    cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                         errorfx=mean_mismatch_error)
    errors_cv = cv(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    # Known failure: the two sets of sensitivities are expected to
    # differ for now, hence the comparison itself must raise.
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")