def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets', count=num_perm,
                                     limit='chunks')
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(l_clf, Repeater(count=2),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'),
                           null_dist=MCNullDist(permutator, tail='left'))

    # check reasonable error range
    err = terr(train)
    self.assertTrue(np.mean(err) < 0.4)

    # Let's do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator, tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.assertTrue(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

    # we should be able to access the actual samples of the distribution
    # yoh: why is it 3D really?
    # mih: because these are the distribution samples for the ONE error
    #      collapsed into ONE value across all folds. It will also be
    #      3d if the return value of the measure isn't a scalar and it is
    #      not collapsed across folds. it simply corresponds to the shape
    #      of the output dataset of the respective measure (+1 axis)
    # Some permutations could have been skipped since the classifier failed
    # to train due to a degenerate situation etc, thus accounting for them
    self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                     num_perm - cvte.null_dist.ca.skipped)
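# A minimal standalone sketch of the same Monte-Carlo significance-testing
# pattern on synthetic data. It reuses only PyMVPA pieces already exercised
# in these tests; the snr/perlabel/count values and the helper name are
# illustrative, not prescriptive.
def _example_mc_null_dist_sketch():
    import numpy as np
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.measures.base import CrossValidation
    from mvpa2.mappers.fx import mean_sample

    ds = normal_feature_dataset(snr=2.0, perlabel=20, nchunks=4,
                                nonbogus_features=[0, 1], nfeatures=10)
    # permute targets within chunks to build the null distribution of errors
    permutator = AttributePermutator('targets', count=100, limit='chunks')
    cv = CrossValidation(GNB(), NFoldPartitioner(),
                         postproc=mean_sample(),
                         null_dist=MCNullDist(permutator, tail='left'))
    err = cv(ds)
    # p-value of the observed mean error under the permutation null
    print np.asscalar(cv.ca.null_prob)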
def test_nblocks(self):
    skip_if_no_external('pprocess')
    # just a basic test to see that we are getting the same
    # results with different nblocks
    ds = datasets['3dsmall'].copy(deep=True)[:, :13]
    ds.fa['voxel_indices'] = ds.fa.myspace

    cv = CrossValidation(GNB(), OddEvenPartitioner())
    res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
    res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
    assert_array_equal(res1, res2)
def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):
    conf = read_configuration(path, conf_file, type)

    for arg in kwargs:
        conf[arg] = kwargs[arg]
        if arg == 'radius':
            radius = kwargs[arg]

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []
    for ds in ds_merged:
        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')
        sl_map = sl(ds)

        # convert errors to accuracies
        sl_map.samples *= -1
        sl_map.samples += 1

        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)
        maps.append(nif)

    datetime = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = datetime + '_' + analysis + '_' + mask + '_' + task
    command = 'mkdir ' + os.path.join(path, '0_results', new_dir)
    os.system(command)

    parent_dir = os.path.join(path, '0_results', new_dir)

    # avoid shadowing the builtin `map`
    for s, map_ in zip(subjects, maps):
        name = s
        command = 'mkdir ' + os.path.join(parent_dir, name)
        os.system(command)

        results_dir = os.path.join(parent_dir, name)
        fname = name + '_radius_' + str(radius) + '_searchlight_map.nii.gz'
        map_.to_filename(os.path.join(results_dir, fname))

    return maps
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC       # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    ds = _get_superord_dataset()

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories, leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
    ], space='partitions')

    # and then run your normal analysis, where the clf uses space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # with sifting we should get only 2^3 = 8 splits
    assert(len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert(np.mean(accs_regular) > .8)
    assert(np.mean(accs_super) < .6)
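# A quick way to see what the sifted partitioner above actually generates --
# a sketch that reuses `npart` and `ds` as constructed in the test; the
# helper name is ours, not PyMVPA's.
def _example_inspect_sifted_partitions(npart, ds):
    for i, pds in enumerate(npart.generate(ds)):
        testing = pds[pds.sa.partitions == 2]
        # each surviving split should test on one subord per superord
        print i, list(testing.sa['superord'].unique), \
            list(testing.sa['subord'].unique)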
def test_cached_kernel_different_datasets(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    # Inspired by the problem Swaroop ran into
    k = LinearSGKernel(normalizer_cls=False)
    k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
    ck = CachedKernel(k_)

    clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
    clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

    cvte = CrossValidation(clf, NFoldPartitioner())
    cvte_ = CrossValidation(clf_, NFoldPartitioner())

    postproc = BinaryFxNode(mean_mismatch_error, 'targets')
    te = ProxyMeasure(clf, postproc=postproc)
    te_ = ProxyMeasure(clf_, postproc=postproc)

    for r in xrange(2):
        ds1 = datasets['uni2medium']
        errs1 = cvte(ds1)
        ck.compute(ds1)
        ok_(ck._recomputed)
        errs1_ = cvte_(ds1)
        # `not`, not `~`: bitwise negation of a bool is always truthy
        ok_(not ck._recomputed)
        assert_array_equal(errs1, errs1_)

        ds2 = datasets['uni3small']
        errs2 = cvte(ds2)
        ck.compute(ds2)
        ok_(ck._recomputed)
        errs2_ = cvte_(ds2)
        ok_(not ck._recomputed)
        assert_array_equal(errs2, errs2_)

        ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
        te.train(datasets['uni3small'][::2])
        terr = np.asscalar(te(datasets['uni3small'][ssel]))
        te_.train(datasets['uni3small'][::2])
        terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
        ok_(not ck._recomputed)
        ok_(terr == terr_)
def test_split_classifier(self):
    ds = self.data_bin_1
    clf = SplitClassifier(clf=SameSignClassifier(),
                          enable_ca=['stats', 'training_stats',
                                     'feature_ids'])
    clf.train(ds)                      # train the beast
    error = clf.ca.stats.error
    tr_error = clf.ca.training_stats.error

    clf2 = clf.clone()
    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds)
    cverror = cverror.samples.squeeze()
    tr_cverror = cv.ca.training_stats.error

    self.assertEqual(error, cverror,
                     msg="We should get the same error using split classifier as"
                         " using CrossValidation. Got %s and %s"
                         % (error, cverror))

    self.assertEqual(tr_error, tr_cverror,
                     msg="We should get the same training error using split classifier as"
                         " using CrossValidation. Got %s and %s"
                         % (tr_error, tr_cverror))

    self.assertEqual(clf.ca.stats.percent_correct, 100,
                     msg="Dummy clf should train perfectly")
    # CV and SplitClassifier should get the same confusion matrices
    assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(len(clf.clfs), len(ds.UC),
                     msg="Should have number of classifiers equal # of epochs")
    self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                     msg="Should classify correctly")

    # feature_ids must be a list of lists, and since no feature-selecting
    # classifier is used -- we expect all features to be utilized
    #  NOT ANYMORE -- for BoostedClassifier we now have the union of all
    #  used features across slave classifiers. That makes
    #  semantics clear. If you need to get deeper -- use the upcoming
    #  harvesting facility ;-)
    # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
    # self.assertTrue(np.array([len(ids) == ds.nfeatures
    #                           for ids in clf.feature_ids]).all())

    # Just check if we get it at all ;-)
    summary = clf.summary()
def test_cache_speedup(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
    sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

    cv_c = CrossValidation(ck, NFoldPartitioner())
    cv_s = CrossValidation(sk, NFoldPartitioner())

    #data = datasets['uni4large']
    P = 5000
    data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                  means=np.random.randn(2, P), nfeatures=P)

    t0 = time()
    ck.params.kernel.compute(data)
    cachetime = time() - t0

    t0 = time()
    cached_err = cv_c(data)
    ccv_time = time() - t0

    t0 = time()
    norm_err = cv_s(data)
    ncv_time = time() - t0

    assert_almost_equal(np.asanyarray(cached_err),
                        np.asanyarray(norm_err))
    ok_(cachetime < ncv_time)
    ok_(ccv_time < ncv_time)
    #print 'Regular CV time: %s seconds' % ncv_time
    #print 'Caching time: %s seconds' % cachetime
    #print 'Cached CV time: %s seconds' % ccv_time
    speedup = ncv_time / (ccv_time + cachetime)
    #print 'Speedup factor: %s' % speedup

    # Speedup ideally should be 10, though it's not purely linear
    self.failIf(speedup < 2, 'Problem caching data - too slow!')
def test_confusion_as_node():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0, perlabel=42, nchunks=3,
                                nonbogus_features=[0, 1], nfeatures=2)
    clf = GNB()
    cv = CrossValidation(clf, NFoldPartitioner(),
                         errorfx=None,
                         postproc=Confusion(labels=ds.UT),
                         enable_ca=['stats'])
    res = cv(ds)
    # needs to be identical to CA
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    skip_if_no_external('scipy')
    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(clf, NFoldPartitioner(),
                         errorfx=None,
                         postproc=ChainNode((Confusion(labels=ds.UT),
                                             BayesConfusionHypothesis())))
    res = cv(ds)
    # only two possible hypotheses with two classes
    assert_equals(len(res), 2)
    # the first hypothesis is that nothing can be discriminated
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and that hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert(np.e ** res.samples[0, 0] < np.e ** res.samples[1, 0])
def test_james_problem_multiclass(self):
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    #FxMapper('features', l2_normed),
                    #FxMapper('samples', np.mean),
                    #FxMapper('samples', np.abs)
                    FxMapper('features',
                             lambda x: np.argsort(np.abs(x))),
                    #maxofabs_sample()
                    mean_sample()
                ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage, enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))
def _test_gnb_overflow_haxby():      # pragma: no cover
    # example from https://github.com/PyMVPA/PyMVPA/issues/581
    # a heavier version of the above test
    import os
    import numpy as np
    from mvpa2.datasets.sources.native import load_tutorial_data
    from mvpa2.clfs.gnb import GNB
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import HalfPartitioner
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.detrend import poly_detrend
    from mvpa2.datasets.miscfx import remove_invariant_features
    from mvpa2.testing.datasets import *

    datapath = '/usr/share/data/pymvpa2-tutorial/'
    haxby = load_tutorial_data(
        datapath,
        roi='vt',
        add_fa={'vt_thr_glm': os.path.join(datapath, 'haxby2001', 'sub001',
                                           'masks', 'orig', 'vt.nii.gz')})
    # poly_detrend(haxby, polyord=1, chunks_attr='chunks')
    haxby = haxby[np.array([l in ['rest', 'scrambled']  # 'house', 'face']
                            for l in haxby.targets], dtype='bool')]
    #zscore(haxby, chunks_attr='chunks', param_est=('targets', ['rest']),
    #       dtype='float32')
    # haxby = haxby[haxby.sa.targets != 'rest']
    haxby = remove_invariant_features(haxby)

    clf = GNB(enable_ca='estimates', logprob=True, normalize=True)

    #clf.train(haxby)
    #clf.predict(haxby)
    # estimates are a bit "overfit" to judge when we train/predict
    # on the same data

    cv = CrossValidation(clf,
                         HalfPartitioner(attr='chunks'),
                         postproc=None,
                         enable_ca=['stats'])
    cv_results = cv(haxby)
    res1_est = clf.ca.estimates
    print "Estimates:\n", res1_est
    print "Exp(estimates):\n", np.round(np.exp(res1_est), 3)
    assert np.all(np.isfinite(res1_est))
def test_classifier_generalization(self, clf):
    """Simple test that classifiers can generalize ok on simple data
    """
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
    # check the default
    #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

    ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
    try:
        cve = te(ds).samples.squeeze()
    except Exception as e:
        self.fail("Failed with %s" % e)
def test_perturbation_sensitivity_analyzer(self):
    # compute N-1 cross-validation as datameasure
    cv = CrossValidation(sample_clf_lin, NFoldPartitioner())
    # do perturbation analysis using gaussian noise
    pa = NoisePerturbationSensitivity(cv, noise=np.random.normal)

    # run analysis (avoid shadowing the builtin `map`)
    sens_map = pa(self.dataset)

    # check for correct size of the map
    self.assertTrue(sens_map.nfeatures == self.dataset.nfeatures)

    # dataset is noise -> mean sensitivity should be zero
    self.assertTrue(-0.2 < np.mean(sens_map) < 0.2)
def test_values(self, clf):
    if isinstance(clf, MulticlassClassifier):
        # TODO: handle those values correctly
        return
    ds = datasets['uni2small']
    clf.ca.change_temporarily(enable_ca=['estimates'])
    cv = CrossValidation(clf, OddEvenPartitioner(),
                         enable_ca=['stats', 'training_stats'])
    _ = cv(ds)
    #print clf.descr, clf.values[0]
    # basic test -- do we get one set of values per each sample?
    self.assertEqual(len(clf.ca.estimates), ds.nsamples / 2)

    clf.ca.reset_changed_temporarily()
def test_custom_targets(self, lrn):
    """Simple test that a learner can cope with a custom sa, not 'targets'
    """

    # Since we are comparing performances of two learners, we need
    # to assure that if they depend on some random seed -- they
    # would use the same value.  Currently we have such stochastic
    # behavior in SMLR
    # yoh: we explicitly seed right before calling the CVs below, so
    #      this setting of .seed has no real effect/testing value
    if 'seed' in lrn.params:
        from mvpa2 import _random_seed
        lrn = lrn.clone()               # clone the beast
        lrn.params.seed = _random_seed  # reuse the same seed
    lrn_ = lrn.clone()
    lrn_.set_space('custom')

    te = CrossValidation(lrn, NFoldPartitioner())
    te_ = CrossValidation(lrn_, NFoldPartitioner())
    nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
    dsname = ('uni%dsmall' % nclasses,
              'sin_modulated')[int(lrn.__is_regression__)]
    ds = datasets[dsname]
    ds_ = ds.copy()
    ds_.sa['custom'] = ds_.sa['targets']
    ds_.sa.pop('targets')
    self.assertTrue('targets' in ds.sa,
                    msg="'targets' should remain in the original ds")

    try:
        mvpa2.seed()
        cve = te(ds)

        mvpa2.seed()
        cve_ = te_(ds_)
    except Exception as e:
        self.fail("Failed with %r" % e)
def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which standardizes all features
        within each sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None]) \
                / np.std(data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])

    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([
        ZScoreFeaturesMapper(auto_train=True),
        CrossValidation(sample_clf, NFoldPartitioner())
    ])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)
def test_auc(self, clf):
    """Test AUC computation
    """
    if isinstance(clf, MulticlassClassifier):
        raise SkipTest("TODO: handle values correctly in "
                       "MulticlassClassifier")
    clf.ca.change_temporarily(enable_ca=['estimates'])
    if 'qda' in clf.__tags__:
        # for reliable estimation of covariances, need a sufficient
        # sample size
        ds_size = 'large'
    else:
        ds_size = 'small'
    # uni2 dataset with reordered labels
    ds2 = datasets['uni2' + ds_size].copy()
    # revert labels
    ds2.sa['targets'].value = ds2.targets[::-1].copy()
    # same with uni3
    ds3 = datasets['uni3' + ds_size].copy()
    ul = ds3.sa['targets'].unique
    nl = ds3.targets.copy()
    for l in xrange(3):
        nl[ds3.targets == ul[l]] = ul[(l + 1) % 3]
    ds3.sa.targets = nl

    for ds in [datasets['uni2' + ds_size], ds2,
               datasets['uni3' + ds_size], ds3]:
        cv = CrossValidation(clf, OddEvenPartitioner(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        stats = cv.ca.stats.stats
        Nlabels = len(ds.uniquetargets)
        # so we are at least slightly above chance
        # But LARS manages to screw up there as well ATM from time to time,
        # so making its testing labile
        if (('lars' in clf.__tags__)
                and cfg.getboolean('tests', 'labile', default='yes')) \
                or (not 'lars' in clf.__tags__):
            self.assertTrue(stats['ACC'] > 1.2 / Nlabels)
        auc = stats['AUC']
        if (Nlabels == 2) or (Nlabels > 2 and auc[0] is not np.nan):
            mauc = np.min(stats['AUC'])
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(
                    mauc > 0.55,
                    msg='All AUCs must be above chance. Got minimal '
                        'AUC=%.2g among %s' % (mauc, stats['AUC']))
    clf.ca.reset_changed_temporarily()
def test_noise_classification(self):
    # get a dataset with a very high SNR
    data = get_mv_pattern(10)

    # do crossval with default errorfx and 'mean' combiner
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())

    # must return a scalar value
    result = cv(data)
    # must be perfect
    self.assertTrue((result.samples < 0.05).all())

    # do crossval with permuted regressors
    cv = CrossValidation(
        sample_clf_nl,
        ChainNode([NFoldPartitioner(),
                   AttributePermutator('targets', count=10)],
                  space='partitions'))
    results = cv(data)

    # must be at chance level
    pmean = np.array(results).mean()
    self.assertTrue(0.42 < pmean < 0.58)
def test_pseudo_cv_measure(self):
    clf = SMLR()
    enode = BinaryFxNode(mean_mismatch_error, 'targets')
    tm = TransferMeasure(clf, Splitter('partitions'), postproc=enode)
    cvgen = NFoldPartitioner()
    rm = RepeatedMeasure(tm, cvgen)
    res = rm(self.dataset)
    # one error per fold
    assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))

    # we can do the same with CrossValidation
    cv = CrossValidation(clf, cvgen,
                         enable_ca=['stats', 'training_stats', 'datasets'])
    res = cv(self.dataset)
    assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))
def test_vstack_and_origids_issue(self):
    # That is actually what Swaroop hit
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    # Inspired by the problem Swaroop ran into
    k = LinearSGKernel(normalizer_cls=False)
    k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
    ck = CachedKernel(k_)

    clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
    clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

    cvte = CrossValidation(clf, NFoldPartitioner())
    cvte_ = CrossValidation(clf_, NFoldPartitioner())

    ds = datasets['uni2large'].copy(deep=True)
    # `not`, not `~`: bitwise negation of a bool is always truthy
    ok_(not 'origids' in ds.sa)      # assure that there are none
    ck.compute(ds)                   # so we initialize origids
    ok_('origids' in ds.sa)
    ds2 = ds.copy(deep=True)
    ds2.samples = np.zeros(ds2.shape)
    from mvpa2.base.dataset import vstack
    ds_vstacked = vstack((ds2, ds))
    # should complain now since there would not be unique
    # samples' origids
    if __debug__:
        assert_raises(ValueError, ck.compute, ds_vstacked)

    ds_vstacked.init_origids('samples')   # reset origids
    ck.compute(ds_vstacked)

    errs = cvte(ds_vstacked)
    errs_ = cvte_(ds_vstacked)
    # The following test would have failed since origids
    # were just ints, and thus non-unique after vstack
    assert_array_equal(errs.samples, errs_.samples)
def test_james_problem(self):
    percent = 80
    dataset = datasets['uni2small']
    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['confusion'])  # TODO -- it is stats
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

    assert(len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
    assert(len(cv_storage.storage[0]) == 2)
    assert(len(cv_storage.storage[0][0]) == dataset.nfeatures)

    self.assertTrue(error < 0.2)
def test_regressions_classifiers(self, clf):
    """Simple tests on regressions being used as classifiers
    """
    # check if we get values set correctly
    clf.ca.change_temporarily(enable_ca=['estimates'])
    self.assertRaises(UnknownStateError,
                      clf.ca['estimates']._get)
    cv = CrossValidation(clf, NFoldPartitioner(),
                         enable_ca=['stats', 'training_stats'])
    ds = datasets['uni2small'].copy()
    # we want numeric labels to maintain the previous behavior, especially
    # since we deal with regressions here
    ds.sa.targets = AttributeMap().to_numeric(ds.targets)
    cverror = cv(ds)

    self.assertTrue(len(clf.ca.estimates) == ds[ds.chunks == 1].nsamples)
    clf.ca.reset_changed_temporarily()
def test_ifs(self, svm):

    # measure for feature selection criterion and performance assessment
    # use the SAME clf!
    errorfx = mean_mismatch_error
    fmeasure = CrossValidation(svm, NFoldPartitioner(),
                               postproc=mean_sample())
    pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

    ifs = IFS(fmeasure,
              pmeasure,
              Splitter('purpose', attr_values=['train', 'test']),
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              fselector=FixedNElementTailSelector(1, tail='lower',
                                                  mode='select'),
              )
    wdata = self.get_data()
    wdata.sa['purpose'] = np.repeat('train', len(wdata))
    tdata = self.get_data()
    tdata.sa['purpose'] = np.repeat('test', len(tdata))
    ds = vstack((wdata, tdata))
    orig_nfeatures = ds.nfeatures

    ifs.train(ds)
    resds = ifs(ds)

    # fail if orig datasets are changed
    self.assertTrue(ds.nfeatures == orig_nfeatures)

    # check that the feature set with the least error is selected
    self.assertTrue(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.assertTrue(resds.nfeatures == e.argmin() + 1)

    # repeat with a dataset where the selection order is known
    wsignal = datasets['dumb2'].copy()
    wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
    tsignal = datasets['dumb2'].copy()
    tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
    signal = vstack((wsignal, tsignal))
    ifs.train(signal)
    resds = ifs(signal)
    self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
def test_confusionmatrix_nulldist(self):
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import ConfusionMatrixError
    from mvpa2.misc.data_generators import normal_feature_dataset
    for snr in [0., 2.]:
        ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                    nonbogus_features=[0, 1], nfeatures=2)

        clf = GNB()
        num_perm = 50
        permutator = AttributePermutator('targets',
                                         limit='chunks',
                                         count=num_perm)
        cv = CrossValidation(
            clf, NFoldPartitioner(),
            errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
            postproc=mean_sample(),
            # tail='right' because we now look at accuracy, not error
            null_dist=MCNullDist(permutator,
                                 tail='right',
                                 enable_ca=['dist_samples']),
            enable_ca=['stats'])
        cmatrix = cv(ds)
        #print "Result:\n", cmatrix.samples
        cvnp = cv.ca.null_prob.samples
        #print cvnp
        # was assertTrue(cvnp.shape, (2, 2)), which never compared anything
        self.assertEqual(cvnp.shape, (2, 2))
        if cfg.getboolean('tests', 'labile', default='yes'):
            if snr == 0.:
                # all p should be high since there is no signal
                assert_array_less(0.05, cvnp)
            else:
                # diagonal p is low -- we have signal after all
                assert_array_less(np.diag(cvnp), 0.05)
                # off-diagonals have high p since for them we would
                # need to look at the other tail
                assert_array_less(0.9,
                                  cvnp[(np.array([0, 1]),
                                        np.array([1, 0]))])
def test_CDist_cval():
    if _ENFORCE_CA_ENABLED:
        # skip testing for now, since we are having an issue with
        # 'training_stats' while CA is enabled
        raise SkipTest("Skipping test to avoid issue with "
                       "'training_stats' while CA is enabled")
    targets = np.tile(list(range(3)), 2)
    chunks = np.repeat(np.array((0, 1)), 3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)

    cv = CrossValidation(CDist(),
                         generator=NFoldPartitioner(),
                         errorfx=None)
    res = cv(ds)
    # Testing to make sure both folds return the same results, as they should
    assert_array_almost_equal(
        res[res.sa.cvfolds == 0, ].samples.reshape(3, 3),
        res[res.sa.cvfolds == 1, ].samples.reshape(3, 3).T)
    # Testing to make sure the last dimension is always 1,
    # to make it work with Searchlights
    assert_equal(res.nfeatures, 1)
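# A self-contained variant of the CDist cross-validation above for reference
# (the test's `data` comes from module scope); the random samples and the
# helper name here are illustrative only.
def _example_cdist_cval_sketch():
    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.measures.rsa import CDist
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import NFoldPartitioner

    samples = np.random.normal(size=(6, 10))   # 2 chunks x 3 conditions
    ds = dataset_wizard(samples=samples,
                        targets=np.tile(range(3), 2),
                        chunks=np.repeat([0, 1], 3))
    cv = CrossValidation(CDist(), generator=NFoldPartitioner(), errorfx=None)
    # cross-validated dissimilarities: one column, both folds stacked,
    # distinguishable via res.sa.cvfolds
    return cv(ds)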
def test_gnbsearchlight_3partitions_and_splitter(self):
    ds = self.dataset[:, :20]
    # custom partitioner which provides 3 partitions
    part = CustomPartitioner([([2], [3], [1])])
    gnb_sl = sphere_gnbsearchlight(GNB(), part)
    res_gnb_sl = gnb_sl(ds)

    # compare results to the full-blown searchlight
    sl = sphere_searchlight(CrossValidation(GNB(), part))
    res_sl = sl(ds)

    assert_datasets_equal(res_gnb_sl, res_sl)

    # and theoretically for this simple single cross-validation we could
    # just use a Splitter
    splitter = Splitter('chunks', [2, 3])
    # we have to pass an explicit None since the generator argument
    # cannot simply be omitted here
    gnb_sl_ = sphere_gnbsearchlight(GNB(), None, splitter=splitter)
    res_gnb_sl_ = gnb_sl_(ds)
    assert_datasets_equal(res_gnb_sl, res_gnb_sl_)
def test_regression_with_additional_sa(self):
    regr = regrswh[:][0]
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    # Create a new sample attribute which will be used along with
    # every searchlight
    ds.sa['beh'] = np.random.normal(size=(ds.nsamples, 2))

    # and now for fun -- let's create custom linear regression
    # targets out of some random feature and beh linearly combined
    rfeature = np.random.randint(ds.nfeatures)
    ds.sa.targets = np.dot(
        np.hstack((ds.sa.beh,
                   ds.samples[:, rfeature:rfeature + 1])),
        np.array([0.3, 0.2, 0.3]))

    class CrossValidationWithBeh(CrossValidation):
        """An adapter for regular CV which hstacks sa.beh onto the
        searchlighting ds"""
        def _call(self, ds):
            return CrossValidation._call(
                self,
                Dataset(np.hstack((ds, ds.sa.beh)),
                        sa=ds.sa))
    cvbeh = CrossValidationWithBeh(regr, OddEvenPartitioner(),
                                   errorfx=corr_error)
    # regular cv
    cv = CrossValidation(regr, OddEvenPartitioner(),
                         errorfx=corr_error)

    slbeh = sphere_searchlight(cvbeh, radius=1)
    slmapbeh = slbeh(ds)
    sl = sphere_searchlight(cv, radius=1)
    slmap = sl(ds)

    assert_equal(slmap.shape, (2, ds.nfeatures))

    # The SL which had access to beh should for sure have got better
    # results, especially in the vicinity of the chosen feature...
    features = sl.queryengine.query_byid(rfeature)
    assert_array_lequal(slmapbeh.samples[:, features],
                        slmap.samples[:, features])
def test_simple_n_minus_one_cv(self):
    data = get_mv_pattern(3)
    data.init_origids('samples')

    self.assertTrue(data.nsamples == 120)
    self.assertTrue(data.nfeatures == 2)
    self.assertTrue(
        (data.sa.targets ==
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 0, 0, 0, 0, 0] * 6).all())
    self.assertTrue(
        (data.sa.chunks ==
         [k for k in range(1, 7) for i in range(20)]).all())
    assert_equal(len(np.unique(data.sa.origids)), data.nsamples)

    cv = CrossValidation(sample_clf_nl, NFoldPartitioner(),
                         enable_ca=['stats', 'training_stats'])
    # 'samples_error'])

    results = cv(data)
    self.assertTrue((results.samples < 0.2).all()
                    and (results.samples >= 0.0).all())
def test_chi_square_searchlight(self):
    # only do partial to save time

    # Can't yet do this since test_searchlight isn't yet "under nose"
    #skip_if_no_external('scipy')
    if not externals.exists('scipy'):
        return

    from mvpa2.misc.stats import chisquare

    cv = CrossValidation(sample_clf_lin, NFoldPartitioner(),
                         enable_ca=['stats'])

    def getconfusion(data):
        cv(data)
        return chisquare(cv.ca.stats.matrix)[0]

    sl = sphere_searchlight(getconfusion, radius=0, center_ids=[3, 50])

    # run searchlight
    results = sl(self.dataset)
    self.assertTrue(results.nfeatures == 2)
def test_cv_no_generator(self):
    ds = Dataset(np.arange(4),
                 sa={'partitions': [1, 1, 2, 2],
                     'targets': ['a', 'b', 'c', 'd']})

    class Measure(Classifier):
        def _train(self, ds_):
            assert_array_equal(ds_.samples, ds.samples[:2])
            assert_array_equal(ds_.sa.partitions, [1] * len(ds_))

        def _predict(self, ds_):
            # also called for estimating training error
            assert(ds_ is not ds)   # we pass a shallow copy
            assert(len(ds_) < len(ds))
            assert_equal(len(ds_.sa['partitions'].unique), 1)
            return ['c', 'd']

    measure = Measure()
    cv = CrossValidation(measure)
    res = cv(ds)
    assert_array_equal(res, [[0]])   # we did perfectly here ;)
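# The same no-generator pattern in plain use: if a dataset already carries a
# 'partitions' sample attribute, CrossValidation can run on it directly.
# A sketch on synthetic data; GNB is an arbitrary stand-in classifier and
# the helper name is ours.
def _example_cv_without_generator_sketch():
    import numpy as np
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.measures.base import CrossValidation

    ds = normal_feature_dataset(perlabel=10, nchunks=2, nfeatures=4)
    # 1 == training partition, 2 == testing partition
    ds.sa['partitions'] = np.where(ds.chunks == ds.chunks[0], 1, 2)
    return CrossValidation(GNB())(ds)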
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import OddEvenPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets
from mvpa2.mappers.fx import mean_sample
import numpy as np

"""For the sake of simplicity, let's use a small artificial dataset."""

# Let's just use our tiny 4D dataset from the testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on an fMRI dataset using `NiftiDataset` the
resulting searchlight map (`sl_map`) can be mapped back into the original
dataspace and viewed as a brain overlay. :ref:`Another example
<example_searchlight>` shows a typical application of this algorithm.
"""
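# Hedged sketch of the "map back into dataspace" step mentioned above, for
# the case where the input was an fMRI dataset carrying a NIfTI header (the
# tiny testing dataset used here has none, so this is illustrative only;
# the helper name and output filename are ours).
def _example_export_sl_map(ds, sl_map):
    from mvpa2.datasets.mri import map2nifti
    # reuse the source dataset's mapper/header to build a NIfTI image
    nimg = map2nifti(ds, sl_map)
    nimg.to_filename('sl_error_map.nii.gz')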
def test_partial_searchlight_with_confusion_matrix(self):
    ds = self.dataset
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.mappers.fx import mean_sample, sum_sample

    # compute N-1 cross-validation for each sphere
    cm = ConfusionMatrix(labels=ds.UT)
    cv = CrossValidation(
        sample_clf_lin, NFoldPartitioner(),
        # we have to assure that the matrix does not get flattened by
        # the first vstack in cv and then hstack in searchlight --
        # thus 2 leading dimensions
        # TODO: RF? make searchlight/crossval smarter?
        errorfx=lambda *a: cm(*a)[None, None, :])
    # construct diameter 2 (or just radius 1) searchlight
    sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50])

    # our regular searchlight -- to compare results
    cv_gross = CrossValidation(sample_clf_lin, NFoldPartitioner())
    sl_gross = sphere_searchlight(cv_gross, radius=1,
                                  center_ids=[3, 5, 50])

    # run searchlights
    res = sl(ds)
    res_gross = sl_gross(ds)

    # only two spheres, but errors for all CV-folds and the complete
    # confusion matrix
    assert_equal(res.shape, (len(ds.UC), 3, len(ds.UT), len(ds.UT)))
    assert_equal(res_gross.shape, (len(ds.UC), 3))

    # briefly inspect the confusion matrices
    mat = res.samples
    # since the input dataset is probably balanced (otherwise adjust
    # to be per label): sums within columns (thus axis=-2) should
    # be identical to the per-class/chunk number of samples
    samples_per_classchunk = len(ds) / (len(ds.UT) * len(ds.UC))
    ok_(np.all(np.sum(mat, axis=-2) == samples_per_classchunk))

    # and if we compute accuracies manually -- they should
    # correspond to the ones from sl_gross
    assert_array_almost_equal(
        res_gross.samples,
        # from accuracies to errors
        1 - (mat[..., 0, 0] + mat[..., 1, 1]).astype(float)
        / (2 * samples_per_classchunk))

    # and now for those who remained seated -- let's perform H0 MC
    # testing of this searchlight... just a silly one with a minimal
    # number of permutations
    no_permutations = 10
    permutator = AttributePermutator('targets', count=no_permutations)

    # once again -- need an explicit leading dimension to avoid
    # vstacking during cross-validation
    cv.postproc = lambda x: sum_sample()(x)[None, :]

    sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50],
                            null_dist=MCNullDist(permutator, tail='right',
                                                 enable_ca=['dist_samples']))
    res_perm = sl(ds)
    # XXX all of res_perm, sl.ca.null_prob and
    # sl.null_dist.ca.dist_samples carry a degenerate leading
    # dimension which was probably due to the new axis introduced
    # above within cv.postproc
    assert_equal(res_perm.shape, (1, 3, 2, 2))
    assert_equal(sl.null_dist.ca.dist_samples.shape,
                 res_perm.shape + (no_permutations,))
    assert_equal(sl.ca.null_prob.shape, res_perm.shape)
    # just to make sure ;)
    ok_(np.all(sl.ca.null_prob.samples >= 0))
    ok_(np.all(sl.ca.null_prob.samples <= 1))

    # we should have got sums of hits across the splits
    assert_array_equal(np.sum(mat, axis=0), res_perm.samples[0])
fsel = OneWayAnova()

import mvpa2.featsel as fs
fselector = fs.helpers.FixedNElementTailSelector(nVox, tail='upper',
                                                 mode='select', sort=False)
# fselector = fs.helpers.FractionTailSelector(0.05, mode='select',
#                                             tail='upper')
sbfs = fs.base.SensitivityBasedFeatureSelection(fsel, fselector,
                                                enable_ca=['sensitivities'])

from mvpa2.clfs.meta import FeatureSelectionClassifier, MappedClassifier
fclf = FeatureSelectionClassifier(clf, sbfs)

from mvpa2.measures.base import CrossValidation
from mvpa2.misc import errorfx
from mvpa2.generators.partition import NFoldPartitioner
cv = CrossValidation(fclf, NFoldPartitioner(attr='chunks'),
                     errorfx=errorfx.mean_match_accuracy)

import numpy as np
from mvpa2.misc.io.base import SampleAttributes
cv_attr = SampleAttributes(os.path.join(paths[3],
                                        (con + "_attribute_labels.txt")))

from mvpa2.measures import rsa
dsm = rsa.PDist(square=True)

# searchlight
# import searchlightutils as sl
# from mvpa2.measures.searchlight import sphere_searchlight
# cvSL = sphere_searchlight(cv, radius=r)
# lres = sl.run_cv_sl(cvSL, fds[lidx].copy(deep=False))

lresults = []
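# A self-contained sketch of the ANOVA-based feature-selection + CV pattern
# the fragment above sets up. Synthetic data; the 50-feature cutoff stands
# in for the fragment's undefined nVox, LinearCSVMC for its undefined clf,
# and the helper name is ours.
def _example_anova_fsel_cv_sketch():
    import numpy as np
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.measures.anova import OneWayAnova
    from mvpa2.featsel.helpers import FixedNElementTailSelector
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc import errorfx

    ds = normal_feature_dataset(snr=1.0, perlabel=30, nchunks=5,
                                nonbogus_features=[0, 1], nfeatures=200)
    # select the 50 highest-F features, then train the SVM on them
    fclf = FeatureSelectionClassifier(
        LinearCSVMC(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(50, tail='upper', mode='select')))
    cv = CrossValidation(fclf, NFoldPartitioner(attr='chunks'),
                         errorfx=errorfx.mean_match_accuracy)
    return np.mean(cv(ds))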