def test_permute_chunks():
    def is_sorted(x):
        return np.array_equal(np.sort(x), x)

    ds = give_data()
    # make target labels unique per sample -- labels then cannot be permuted
    # *within* a chunk, so assure=True with any other strategy would be an error
    ds.sa['targets'] = list(range(len(ds.sa.targets)))
    permutation = AttributePermutator(attr='targets',
                                      chunk_attr='chunks',
                                      strategy='chunks',
                                      assure=True)
    pds = permutation(ds)

    assert_false(is_sorted(pds.sa.targets))
    assert_true(np.array_equal(pds.samples, ds.samples))
    for chunk_id in np.unique(pds.sa.chunks):
        chunk_ds = pds[pds.sa.chunks == chunk_id]
        assert_true(is_sorted(chunk_ds.sa.targets))

    permutation = AttributePermutator(attr='targets', strategy='chunks')
    assert_raises(ValueError, permutation, ds)
def test_noise_classification(self):
    # get a dataset with a very high SNR
    data = get_mv_pattern(10)

    # do crossval with default errorfx and 'mean' combiner
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # must return a scalar value
    result = cv(data)
    # must be perfect
    self.assertTrue((result.samples < 0.05).all())

    # do crossval with permuted regressors
    cv = CrossValidation(
        sample_clf_nl,
        ChainNode([NFoldPartitioner(),
                   AttributePermutator('targets', count=10)],
                  space='partitions'))
    results = cv(data)

    # results must not be the same
    self.assertTrue(len(np.unique(results.samples)) > 1)

    # must be at chance level
    pmean = np.array(results).mean()
    self.assertTrue(pmean < 0.58 and pmean > 0.42)
def __test_matthias_question(self):
    rfe_clf = LinearCSVMC(C=1)

    rfesvm_split = SplitClassifier(rfe_clf)
    clf = FeatureSelectionClassifier(
        clf=LinearCSVMC(C=1),
        feature_selection=RFE(
            sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(
                combiner=first_axis_mean,
                transformer=np.abs),
            transfer_error=ConfusionBasedError(
                rfesvm_split,
                confusion_state="confusion"),
            stopping_criterion=FixedErrorThresholdStopCrit(0.20),
            feature_selector=FractionTailSelector(
                0.2, mode='discard', tail='lower'),
            update_sensitivity=True))

    no_permutations = 1000
    permutator = AttributePermutator('targets', count=no_permutations)
    cv = CrossValidation(clf, NFoldPartitioner(),
                         null_dist=MCNullDist(permutator, tail='left'),
                         enable_ca=['stats'])
    error = cv(datasets['uni2small'])
    self.assertTrue(error < 0.4)
    self.assertTrue(cv.ca.null_prob < 0.05)
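# A minimal sketch distilled from the examples above and below (not itself part
# of the original test suite): MCNullDist draws `count` permuted datasets from
# an AttributePermutator, re-runs the measure on each of them, and converts the
# un-permuted result into a p-value against that empirical null distribution.
import numpy as np
from mvpa2.clfs.gnb import GNB
from mvpa2.clfs.stats import MCNullDist
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.generators.permutation import AttributePermutator
from mvpa2.mappers.fx import mean_sample
from mvpa2.measures.base import CrossValidation
from mvpa2.testing.datasets import datasets

permutator = AttributePermutator('targets', count=100)
cv = CrossValidation(GNB(), NFoldPartitioner(),
                     postproc=mean_sample(),
                     null_dist=MCNullDist(permutator, tail='left'))
err = cv(datasets['uni2small'])
# small p: the observed error is unlikely under permuted (signal-free) targets
p = np.asscalar(cv.ca.null_prob)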
def _test_edmund_chong_20120907():  # pragma: no cover
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets

    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1,  # for real data > 1
                                      limit='partitions',
                                      apply_selection=True)],
                            space='partitions')

    clf = LinearCSVMC()  # choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator], space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])
def test_permute_superord():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    part = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
        AttributePermutator(['superord'], limit=['partitions', 'chunks']),
    ], space='partitions')

    for ds_perm in part.generate(ds):
        # it does permutation
        assert(np.sum(ds_perm.sa.superord != ds.sa.superord) != 0)
def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets',
                                     count=num_perm,
                                     limit='chunks')
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(l_clf,
                           Repeater(count=2),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'),
                           null_dist=MCNullDist(permutator,
                                                tail='left'))

    # check reasonable error range
    err = terr(train)
    self.assertTrue(np.mean(err) < 0.4)

    # Lets do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.assertTrue(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

    # we should be able to access the actual samples of the distribution
    # yoh: why it is 3D really?
    # mih: because these are the distribution samples for the ONE error
    #      collapsed into ONE value across all folds. It will also be
    #      3d if the return value of the measure isn't a scalar and it is
    #      not collapsed across folds. it simply corresponds to the shape
    #      of the output dataset of the respective measure (+1 axis)
    # Some permutations could have been skipped since classifier failed
    # to train due to degenerate situation etc, thus accounting for them
    self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                     num_perm - cvte.null_dist.ca.skipped)
def test_attrpermute():
    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_equal(len(pds), nruns)
    for p in pds:
        assert_false(np.all(p.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))
def test_confusionmatrix_nulldist(self):
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import ConfusionMatrixError
    from mvpa2.misc.data_generators import normal_feature_dataset
    for snr in [0., 2.]:
        ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                    nonbogus_features=[0, 1], nfeatures=2)

        clf = GNB()
        num_perm = 50
        permutator = AttributePermutator('targets',
                                         limit='chunks',
                                         count=num_perm)
        cv = CrossValidation(
            clf, NFoldPartitioner(),
            errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
            postproc=mean_sample(),
            null_dist=MCNullDist(permutator,
                                 tail='right',  # because we now look at accuracy not error
                                 enable_ca=['dist_samples']),
            enable_ca=['stats'])
        cmatrix = cv(ds)
        #print "Result:\n", cmatrix.samples
        cvnp = cv.ca.null_prob.samples
        #print cvnp
        self.assertEqual(cvnp.shape, (2, 2))
        if cfg.getboolean('tests', 'labile', default='yes'):
            if snr == 0.:
                # all p should be high since no signal
                assert_array_less(0.05, cvnp)
            else:
                # diagonal p is low -- we have signal after all
                assert_array_less(np.diag(cvnp), 0.05)
                # off diagonals are high p since for them we would
                # need to look at the other tail
                assert_array_less(0.9,
                                  cvnp[(np.array([0, 1]), np.array([1, 0]))])
def test_adhocsearchlight_perm_testing(self):
    # just a smoke test pretty much
    ds = datasets['3dmedium'].copy()
    #ds.samples += np.random.normal(size=ds.samples.shape)*10
    mvpa2.seed()
    ds.fa['voxel_indices'] = ds.fa.myspace

    from mvpa2.mappers.fx import mean_sample
    from mvpa2.clfs.stats import MCNullDist
    permutator = AttributePermutator('targets', count=8,
                                     limit='chunks')
    distr_est = MCNullDist(permutator, tail='left',
                           enable_ca=['dist_samples'])
    slargs = (kNN(1),
              NFoldPartitioner(0.5,
                               selection_strategy='random',
                               count=9))
    slkwargs = dict(radius=1, postproc=mean_sample())

    sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
    skip_if_no_external('scipy')  # needed for null_t
    sl = sphere_m1nnsearchlight(*slargs,
                                null_dist=distr_est,
                                enable_ca=['null_t'],
                                reuse_neighbors=True,
                                **slkwargs)
    mvpa2.seed()
    res_nodistr = sl_nodistr(ds)
    mvpa2.seed()
    res = sl(ds)
    # verify that we at least got the same main result
    # ah (yoh) -- null dist is estimated before the main
    # estimate so we can't guarantee correspondence :-/
    # assert_array_equal(res_nodistr, res)
    # only resemblance (TODO, may be we want to get/setstate
    # for rng before null_dist.fit?)

    # and dimensions correspond
    assert_array_equal(distr_est.ca.dist_samples.shape,
                       (1, ds.nfeatures, 8))
    assert_array_equal(sl.ca.null_t.samples.shape,
                       (1, ds.nfeatures))
def test_null_dist_prob_any(self):
    """Test 'any' tail statistics estimation"""
    skip_if_no_external('scipy')

    # test 'any' mode
    from mvpa2.measures.corrcoef import CorrCoef
    # we will reassign targets later on, so let's operate on a
    # copy
    ds = datasets['uni2medium'].copy()

    permutator = AttributePermutator('targets', count=20)
    null = MCNullDist(permutator, tail='any')

    assert_raises(ValueError, null.fit, CorrCoef(), ds)
    # cheat and map to numeric for this test
    ds.sa.targets = AttributeMap().to_numeric(ds.targets)
    null.fit(CorrCoef(), ds)

    # 100 and -100 should both have zero probability on their respective
    # tails
    pm100 = null.p([-100] + [0] * (ds.nfeatures - 1))
    p100 = null.p([100] + [0] * (ds.nfeatures - 1))
    assert_array_almost_equal(pm100, p100)

    # With 20 samples it isn't that easy to get a reliable sampling for
    # non-parametric, so we can allow somewhat low significance
    self.assertTrue(pm100[0] <= 0.1)
    self.assertTrue(p100[0] <= 0.1)

    self.assertTrue(np.all(pm100[1:] > 0.05))
    self.assertTrue(np.all(p100[1:] > 0.05))

    # same test with just scalar measure/feature
    null.fit(CorrCoef(), ds[:, 0])
    p_100 = null.p(100)
    self.failUnlessAlmostEqual(null.p(-100), p_100)
    self.failUnlessAlmostEqual(p100[0], p_100)
"""Unit tests for PyMVPA stats helpers""" from mvpa2.testing import * from mvpa2.testing.datasets import datasets from mvpa2 import cfg from mvpa2.base import externals from mvpa2.clfs.stats import MCNullDist, FixedNullDist, NullDist from mvpa2.generators.permutation import AttributePermutator from mvpa2.datasets import Dataset from mvpa2.measures.anova import OneWayAnova, CompoundOneWayAnova from mvpa2.misc.fx import double_gamma_hrf, single_gamma_hrf # Prepare few distributions to test #kwargs = {'permutations':10, 'tail':'any'} permutator = AttributePermutator('targets', count=30) nulldist_sweep = [ MCNullDist(permutator, tail='any'), MCNullDist(permutator, tail='right') ] if externals.exists('scipy'): from mvpa2.support.stats import scipy from scipy.stats import f_oneway from mvpa2.clfs.stats import rv_semifrozen nulldist_sweep += [ MCNullDist(permutator, scipy.stats.norm, tail='any'), MCNullDist(permutator, scipy.stats.norm, tail='right'), MCNullDist(permutator, rv_semifrozen(scipy.stats.norm, loc=0), tail='right'),
def test_gnbsearchlight_permutations():
    import mvpa2
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.base import Repeater
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    #import mvpa2.generators.permutation
    #reload(mvpa2.generators.permutation)
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.testing.datasets import datasets
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.testing.tools import assert_raises, ok_, assert_array_less

    # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM']
    # mvpa2.debug.metrics += ['pid']
    count = 10
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    slkwargs = dict(radius=3, space='voxel_indices', enable_ca=['roi_sizes'],
                    center_ids=[1, 10, 70, 100])

    mvpa2.seed(mvpa2._random_seed)
    clf = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')

    repeater = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)

    null_sl = sphere_gnbsearchlight(clf,
                                    ChainNode([splt, permutator],
                                              space=splt.get_space()),
                                    postproc=mean_sample(),
                                    errorfx=mean_mismatch_error,
                                    **slkwargs)

    distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                           enable_ca=['dist_samples'])
    sl = sphere_gnbsearchlight(clf, splt,
                               reuse_neighbors=True,
                               null_dist=distr_est,
                               postproc=mean_sample(),
                               errorfx=mean_mismatch_error,
                               **slkwargs)
    if __debug__:
        # assert is done only without -O mode
        assert_raises(NotImplementedError, sl, ds)

    # "ad-hoc searchlights can't handle yet varying targets across partitions"
    if False:
        # after above limitation is removed -- enable
        sl_map = sl(ds)
        sl_null_prob = sl.ca.null_prob.samples.copy()

    mvpa2.seed(mvpa2._random_seed)
    ### 'normal' Searchlight
    clf = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')
    repeater = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    # rng=np.random.RandomState(0))
    # to trigger failure since the same np.random state
    # would be reused across all pprocesses
    null_cv = CrossValidation(clf,
                              ChainNode([splt, permutator],
                                        space=splt.get_space()),
                              postproc=mean_sample())
    null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs)

    distr_est_normal = MCNullDist(repeater, tail='left',
                                  measure=null_sl_normal,
                                  enable_ca=['dist_samples'])
    cv = CrossValidation(clf, splt,
                         errorfx=mean_mismatch_error,
                         enable_ca=['stats'],
                         postproc=mean_sample())
    sl = sphere_searchlight(cv, nproc=nproc,
                            null_dist=distr_est_normal,
                            **slkwargs)
    sl_map_normal = sl(ds)
    sl_null_prob_normal = sl.ca.null_prob.samples.copy()

    # For every feature -- we should get some variance in estimates. In
    # case of failure they are all really close to each other (up to
    # numerical precision), so variance will be close to 0
    assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0],
                              axis=1), -1e-5)
    for s in distr_est_normal.ca.dist_samples.samples[0]:
        ok_(len(np.unique(s)) > 1)
def do_searchlight(glm_dataset, radius, output_basename, with_null_prob=False):
    clf = LinearCSVMC(space='condition')
    # clf = RbfCSVMC(C=5.0)
    splt = NFoldPartitioner()
    cv = CrossValidation(clf, splt,
                         errorfx=mean_match_accuracy,
                         enable_ca=['stats'],
                         postproc=mean_sample())
    distr_est = []
    if with_null_prob:
        permutator = AttributePermutator('condition', count=100,
                                         limit='chunks')
        distr_est = MCNullDist(permutator, tail='left',
                               enable_ca=['dist_samples'])
        """
        repeater = Repeater(count=100)
        permutator = AttributePermutator('condition',
                                         limit={'partitions': 1}, count=1)
        null_cv = CrossValidation(clf,
                                  ChainNode([splt, permutator],
                                            space=splt.get_space()),
                                  postproc=mean_sample())
        null_sl = sphere_searchlight(null_cv, radius=radius,
                                     space='voxel_indices',
                                     enable_ca=['roi_sizes'])
        distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                               enable_ca=['dist_samples'])
        """
        sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
                                null_dist=distr_est,
                                enable_ca=['roi_sizes', 'roi_feature_ids'])
    else:
        sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
                                enable_ca=['roi_sizes', 'roi_feature_ids'])
    #ds = glm_dataset.copy(deep=False,
    #                      sa=['condition', 'chunks'],
    #                      fa=['voxel_indices'],
    #                      a=['mapper'])
    #debug.active += ["SLC"]
    sl_map = sl(glm_dataset)
    errresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    errresults.to_filename('{}-acc.nii.gz'.format(output_basename))
    sl_map.samples *= -1
    sl_map.samples += 1
    niftiresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    niftiresults.to_filename('{}-err.nii.gz'.format(output_basename))
    #TODO: save p value map
    if with_null_prob:
        nullt_results = map2nifti(sl_map, data=sl.ca.null_t,
                                  imghdr=glm_dataset.a.imghdr)
        nullt_results.to_filename('{}-t.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=sl.ca.null_prob,
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-prob.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=distr_est.cdf(sl_map.samples),
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-cdf.nii.gz'.format(output_basename))
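# Hypothetical invocation of the helper above (the dataset variable and the
# output basename here are assumptions, not part of the original script):
# run a radius-3 searchlight on a GLM-estimate dataset and, with permutation
# testing enabled, write *-t, *-prob and *-cdf maps alongside *-acc/*-err.
#
#     do_searchlight(glm_dataset, 3, 'sub01_objectviewing',
#                    with_null_prob=True)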
def test_partial_searchlight_with_confusion_matrix(self):
    ds = self.dataset
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.mappers.fx import mean_sample, sum_sample

    # compute N-1 cross-validation for each sphere
    cm = ConfusionMatrix(labels=ds.UT)
    cv = CrossValidation(
        sample_clf_lin, NFoldPartitioner(),
        # we have to assure that matrix does not get flatted by
        # first vstack in cv and then hstack in searchlight --
        # thus 2 leading dimensions
        # TODO: RF? make searchlight/crossval smarter?
        errorfx=lambda *a: cm(*a)[None, None, :])
    # construct diameter 2 (or just radius 1) searchlight
    sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50])

    # our regular searchlight -- to compare results
    cv_gross = CrossValidation(sample_clf_lin, NFoldPartitioner())
    sl_gross = sphere_searchlight(cv_gross, radius=1, center_ids=[3, 5, 50])

    # run searchlights
    res = sl(ds)
    res_gross = sl_gross(ds)

    # only a few spheres, but error for all CV-folds and complete confusion matrix
    assert_equal(res.shape, (len(ds.UC), 3, len(ds.UT), len(ds.UT)))
    assert_equal(res_gross.shape, (len(ds.UC), 3))

    # briefly inspect the confusion matrices
    mat = res.samples
    # since input dataset is probably balanced (otherwise adjust
    # to be per label): sum within columns (thus axis=-2) should
    # be identical to per-class/chunk number of samples
    samples_per_classchunk = len(ds) / (len(ds.UT) * len(ds.UC))
    ok_(np.all(np.sum(mat, axis=-2) == samples_per_classchunk))
    # and if we compute accuracies manually -- they should
    # correspond to the one from sl_gross
    assert_array_almost_equal(
        res_gross.samples,
        # from accuracies to errors
        1 - (mat[..., 0, 0] + mat[..., 1, 1]).astype(float)
            / (2 * samples_per_classchunk))

    # and now for those who remained seated -- lets perform H0 MC
    # testing of this searchlight... just a silly one with minimal
    # number of permutations
    no_permutations = 10
    permutator = AttributePermutator('targets', count=no_permutations)

    # once again -- need explicit leading dimension to avoid
    # vstacking during cross-validation
    cv.postproc = lambda x: sum_sample()(x)[None, :]
    sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50],
                            null_dist=MCNullDist(permutator, tail='right',
                                                 enable_ca=['dist_samples']))
    res_perm = sl(ds)
    # XXX all of the res_perm, sl.ca.null_prob and
    #     sl.null_dist.ca.dist_samples carry a degenerate leading
    #     dimension which was probably due to introduced new axis
    #     above within cv.postproc
    assert_equal(res_perm.shape, (1, 3, 2, 2))
    assert_equal(sl.null_dist.ca.dist_samples.shape,
                 res_perm.shape + (no_permutations,))
    assert_equal(sl.ca.null_prob.shape, res_perm.shape)
    # just to make sure ;)
    ok_(np.all(sl.ca.null_prob.samples >= 0))
    ok_(np.all(sl.ca.null_prob.samples <= 1))

    # we should have got sums of hits across the splits
    assert_array_equal(np.sum(mat, axis=0), res_perm.samples[0])
def test_attrpermute():
    # Was about to use borrowkwargs but didn't work out. Test doesn't hurt
    doc = AttributePermutator.__init__.__doc__
    assert_in('limit : ', doc)
    assert_not_in('collection : ', doc)

    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2

    def assert_all_different_permutations(pds):
        assert_equal(len(pds), nruns)
        for i, p in enumerate(pds):
            assert_false(np.all(p.sa.ids == ds.sa.ids))
            for p_ in pds[i + 1:]:
                assert_false(np.all(p.sa.ids == p_.sa.ids))

    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_all_different_permutations(pds)

    # if we provide seeding, and generate, it should also return different datasets
    permutation = AttributePermutator(['targets', 'ids'],
                                      count=nruns, rng=1)
    pds1 = list(permutation.generate(ds))
    assert_all_different_permutations(pds1)

    # but if we regenerate -- should all be the same to before
    pds2 = list(permutation.generate(ds))
    assert_equal(len(pds1), len(pds2))
    for p1, p2 in zip(pds1, pds2):
        assert_datasets_equal(p1, p2)

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(zip(ds.targets), zip(pds.targets))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings
        assert_true(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(zip(ds.targets, ds.sa.odds),
                     zip(pds.targets, pds.sa.odds))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)),
                     set(zip(ptargets, podds)))
def get_crossvalidation_instance(learner, partitioner, errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    import copy  # needed below for deep-copying the generator chain
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount, attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(learner_space,
                                         limit={partitioner.get_space(): 1},
                                         count=1)
        # CV with null-distribution estimation that permutes the training data
        # for each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner, perm_gen_node, postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater, tail=prob_tail, measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner, gennode, errorfx=errorfx,
                         null_dist=distr_est, postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv
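# A usage sketch for the factory above (assumed, not from the original module):
# build a cross-validation with 100-permutation significance testing; the
# components passed in are the same ones used in the other examples above.
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.misc.errorfx import mean_mismatch_error

cv = get_crossvalidation_instance(LinearCSVMC(), NFoldPartitioner(),
                                  mean_mismatch_error,
                                  permutations=100, prob_tail='left')
# res = cv(ds)  # `ds` assumed; p-values land in cv.ca.null_prob and are
#               # passed along as feature attributes of the result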
def test_retrainables(self, clf):
    # XXX we agreed to not worry about this for the initial 0.6 release
    raise SkipTest
    # we need a copy since will tune its internals later on
    clf = clf.clone()
    clf.ca.change_temporarily(enable_ca=['estimates'],
                              # ensure that it does do predictions
                              # while training
                              disable_ca=['training_stats'])
    clf_re = clf.clone()
    # TODO: .retrainable must have a callback to call smth like
    # _set_retrainable
    clf_re._set_retrainable(True)

    # need to have high snr so we don't 'cope' with problematic
    # datasets since otherwise unittests would fail.
    dsargs = {'perlabel': 50, 'nlabels': 2, 'nfeatures': 5, 'nchunks': 1,
              'nonbogus_features': [2, 4], 'snr': 5.0}

    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # NB datasets will be changed by the end of testing, so if
    # are to change to use generic datasets - make sure to copy
    # them here
    ds = deepcopy(datasets['uni2large'])
    clf.untrain()
    clf_re.untrain()

    trerr = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    trerr_re = TransferMeasure(clf_re, Splitter('train'),
                               disable_ca=['training_stats'],
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

    # Just check for correctness of retraining
    err_1 = np.asscalar(trerr(ds))
    self.assertTrue(
        err_1 < 0.3,
        msg="We should test here on easy dataset. Got error of %s" % err_1)
    values_1 = clf.ca.estimates[:]
    # some times retraining gets into deeper optimization ;-)
    eps = 0.05
    corrcoef_eps = 0.85         # just to get no failures... usually > 0.95

    def batch_test(retrain=True, retest=True, closer=True):
        err = np.asscalar(trerr(ds))
        err_re = np.asscalar(trerr_re(ds))
        corr = np.corrcoef(clf.ca.estimates, clf_re.ca.estimates)[0, 1]
        corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
        if __debug__:
            debug('TEST', "Retraining stats: errors %g %g corr %g "
                          "with old error %g corr %g"
                          % (err, err_re, corr, err_1, corr_old))
        self.assertTrue(clf_re.ca.retrained == retrain,
                        ("Must fully train",
                         "Must retrain instead of full training")[retrain])
        self.assertTrue(clf_re.ca.repredicted == retest,
                        ("Must fully test",
                         "Must retest instead of full testing")[retest])
        self.assertTrue(
            corr > corrcoef_eps,
            msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
        if closer:
            self.assertTrue(
                corr >= corr_old,
                msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

    # Check sequential retraining/retesting
    for i in xrange(3):
        flag = bool(i != 0)
        # ok - on 1st call we should train/test, then retrain/retest
        # and we can't compare for closeness to old result since
        # we are working on the same data/classifier
        batch_test(retrain=flag, retest=flag, closer=False)

    # should retrain nicely if we change a parameter
    if 'C' in clf.params:
        clf.params.C *= 0.1
        clf_re.params.C *= 0.1
        batch_test()
    elif 'sigma_noise' in clf.params:
        clf.params.sigma_noise *= 100
        clf_re.params.sigma_noise *= 100
        batch_test()
    else:
        raise RuntimeError, \
              'Please implement testing while changing some of the ' \
              'params for clf %s' % clf

    # should retrain nicely if we change kernel parameter
    if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
        clf.kernel_params.gamma = 0.1
        clf_re.kernel_params.gamma = 0.1
        # retest is false since kernel got recomputed thus
        # can't expect to use the same kernel
        batch_test(retest=not ('gamma' in clf.kernel_params))

    # should retrain nicely if we change labels
    permute = AttributePermutator('targets', assure=True)
    oldlabels = dstrain.targets[:]
    dstrain = permute(dstrain)
    self.assertTrue(
        (oldlabels != dstrain.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # Change labels in testing
    oldlabels = dstest.targets[:]
    dstest = permute(dstest)
    self.assertTrue(
        (oldlabels != dstest.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # should re-train if we change data
    # reuse trained SVM and its 'final' optimization point
    if not clf.__class__.__name__ in ['GPR']:
        # on GPR everything depends on the data ;-)
        oldsamples = dstrain.samples.copy()
        dstrain.samples[:] += dstrain.samples * 0.05
        self.assertTrue((oldsamples != dstrain.samples).any())
        ds = vstack((dstrain, dstest))
        batch_test(retest=False)
    clf.ca.reset_changed_temporarily()

    # test retrain()
    # TODO XXX -- check validity
    clf_re.retrain(dstrain)
    self.assertTrue(clf_re.ca.retrained)
    clf_re.retrain(dstrain, labels=True)
    self.assertTrue(clf_re.ca.retrained)
    clf_re.retrain(dstrain, traindataset=True)
    self.assertTrue(clf_re.ca.retrained)

    # test repredict()
    clf_re.repredict(dstest.samples)
    self.assertTrue(clf_re.ca.repredicted)
    self.assertRaises(RuntimeError, clf_re.repredict,
                      dstest.samples, labels=True)
    """for now retesting with anything changed makes no sense"""
    clf_re._set_retrainable(False)