def test_partitionmapper():
    ds = give_data()
    oep = OddEvenPartitioner()
    parts = list(oep.generate(ds))
    assert_equal(len(parts), 2)
    for i, p in enumerate(parts):
        assert_array_equal(p.sa['partitions'].unique, [1, 2])
        assert_equal(p.a.partitions_set, i)
        assert_equal(len(p), len(ds))
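# A minimal standalone sketch of the partitioning semantics exercised above,
# assuming nothing beyond the public API already used in this file (the demo
# dataset parameters are made up for illustration): each generated fold
# carries sa.partitions, where 1 marks training and 2 marks testing samples,
# and the odd/even chunk assignment swaps between the two folds.
def _demo_oddeven_partitioner():
    from mvpa2.generators.partition import OddEvenPartitioner
    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(perlabel=4, nlabels=2, nchunks=4, nfeatures=2)
    for fold in OddEvenPartitioner().generate(ds):
        print fold.sa.partitions   # array of 1s (train) and 2s (test)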
def test_searchlight_errors_per_trial():
    # To make sure that searchlight can return error/accuracy per trial
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.partition import OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.testing.datasets import datasets
    from mvpa2.misc.errorfx import prediction_target_matches

    dataset = datasets['3dsmall'].copy()
    # randomly permute samples so we break any random correspondence
    # to strengthen tests below
    sample_idx = np.arange(len(dataset))
    dataset = dataset[np.random.permutation(sample_idx)]

    dataset.sa.targets = ['L%d' % l for l in dataset.sa.targets]
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    part = OddEvenPartitioner()
    # only do partial to save time
    cv = CrossValidation(sample_clf, part,
                         errorfx=None)  # prediction_target_matches
    # Just to compare error
    cv_error = CrossValidation(sample_clf, part)

    # Large searchlight radius so we get entire ROI, 2 centers just to make
    # sure that all stacking works correctly
    sl = sphere_searchlight(cv, radius=10, center_ids=[0, 1])
    results = sl(dataset)

    sl_gnb = sphere_gnbsearchlight(sample_clf, part, radius=10, errorfx=None,
                                   center_ids=[0, 1])
    results_gnbsl = sl_gnb(dataset)

    # inspect both results
    # verify that partitioning was done correctly
    partitions = list(part.generate(dataset))
    for res in (results, results_gnbsl):
        assert('targets' in res.sa.keys())  # should carry targets
        assert('cvfolds' in res.sa.keys())  # should carry cvfolds
        for ipart in xrange(len(partitions)):
            assert_array_equal(
                dataset[partitions[ipart].sa.partitions == 2].targets,
                res.sa.targets[res.sa.cvfolds == ipart])

    assert_datasets_equal(results, results_gnbsl)

    # one "accuracy" per each trial
    assert_equal(results.shape, (len(dataset), 2))
    # with accuracies the same in both searchlights, since the same features
    # were to be selected in both cases due to the large radii
    errors_dataset = cv(dataset)
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 0])
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 1])
    # and error matching (up to precision) the one we get when running with
    # the default error function
    assert_array_almost_equal(
        np.mean(results.targets[:, None] != results.samples, axis=0)[0],
        np.mean(cv_error(dataset)))
def test_regression_as_classifier(self, regr):
    """Basic tests of metaclass for using regressions as classifiers
    """
    for dsname in 'uni2small', 'uni4small':
        ds = datasets[dsname]

        clf = RegressionAsClassifier(regr, enable_ca=['distances'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])

        error = cv(ds).samples.squeeze()

        nlabels = len(ds.uniquetargets)
        if nlabels == 2 \
           and cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.3,
                            msg="Got error %.2f on %s dataset"
                                % (error, dsname))

        # Check if does not puke on repr and str
        self.assertTrue(str(clf) != "")
        self.assertTrue(repr(clf) != "")

        self.assertEqual(clf.ca.distances.shape,
                         (ds.nsamples / 2, nlabels))
def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets', count=num_perm,
                                     limit='chunks')
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(l_clf, Repeater(count=2),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'),
                           null_dist=MCNullDist(permutator, tail='left'))

    # check reasonable error range
    err = terr(train)
    self.assertTrue(np.mean(err) < 0.4)

    # Lets do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.assertTrue(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

    # we should be able to access the actual samples of the distribution
    # yoh: why it is 3D really?
    # mih: because these are the distribution samples for the ONE error
    #      collapsed into ONE value across all folds. It will also be
    #      3d if the return value of the measure isn't a scalar and it is
    #      not collapsed across folds. it simply corresponds to the shape
    #      of the output dataset of the respective measure (+1 axis)
    # Some permutations could have been skipped since classifier failed
    # to train due to degenerate situation etc, thus accounting for them
    self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                     num_perm - cvte.null_dist.ca.skipped)
def test_nblocks(self):
    skip_if_no_external('pprocess')
    # just a basic test to see that we are getting the same
    # results with different nblocks
    ds = datasets['3dsmall'].copy(deep=True)[:, :13]
    ds.fa['voxel_indices'] = ds.fa.myspace

    cv = CrossValidation(GNB(), OddEvenPartitioner())
    res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
    res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
    assert_array_equal(res1, res2)
def test_regression_with_additional_sa(self):
    regr = regrswh[:][0]
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    # Create a new sample attribute which will be used along with
    # every searchlight
    ds.sa['beh'] = np.random.normal(size=(ds.nsamples, 2))

    # and now for fun -- lets create custom linear regression
    # targets out of some random feature and beh linearly combined
    rfeature = np.random.randint(ds.nfeatures)
    ds.sa.targets = np.dot(
        np.hstack((ds.sa.beh, ds.samples[:, rfeature:rfeature + 1])),
        np.array([0.3, 0.2, 0.3]))

    class CrossValidationWithBeh(CrossValidation):
        """An adapter for regular CV which would hstack sa.beh to the
        searchlighting ds"""
        def _call(self, ds):
            return CrossValidation._call(
                self,
                Dataset(np.hstack((ds, ds.sa.beh)), sa=ds.sa))

    cvbeh = CrossValidationWithBeh(regr, OddEvenPartitioner(),
                                   errorfx=corr_error)
    # regular cv
    cv = CrossValidation(regr, OddEvenPartitioner(), errorfx=corr_error)

    slbeh = sphere_searchlight(cvbeh, radius=1)
    slmapbeh = slbeh(ds)
    sl = sphere_searchlight(cv, radius=1)
    slmap = sl(ds)

    assert_equal(slmap.shape, (2, ds.nfeatures))

    # SL which had access to beh should have got for sure better
    # results especially in the vicinity of the chosen feature...
    features = sl.queryengine.query_byid(rfeature)
    assert_array_lequal(slmapbeh.samples[:, features],
                        slmap.samples[:, features])
def test_values(self, clf):
    if isinstance(clf, MulticlassClassifier):
        # TODO: handle those values correctly
        return
    ds = datasets['uni2small']
    clf.ca.change_temporarily(enable_ca=['estimates'])
    cv = CrossValidation(clf, OddEvenPartitioner(),
                         enable_ca=['stats', 'training_stats'])
    _ = cv(ds)
    #print clf.descr, clf.values[0]
    # basic test that we get 1 set of values per sample
    self.assertEqual(len(clf.ca.estimates), ds.nsamples / 2)

    clf.ca.reset_changed_temporarily()
def test_auc(self, clf):
    """Test AUC computation
    """
    if isinstance(clf, MulticlassClassifier):
        raise SkipTest, \
              "TODO: handle values correctly in MulticlassClassifier"
    clf.ca.change_temporarily(enable_ca=['estimates'])
    if 'qda' in clf.__tags__:
        # for reliable estimation of covariances, need sufficient
        # sample size
        ds_size = 'large'
    else:
        ds_size = 'small'

    # uni2 dataset with reordered labels
    ds2 = datasets['uni2' + ds_size].copy()
    # revert labels
    ds2.sa['targets'].value = ds2.targets[::-1].copy()

    # same with uni3
    ds3 = datasets['uni3' + ds_size].copy()
    ul = ds3.sa['targets'].unique
    nl = ds3.targets.copy()
    for l in xrange(3):
        nl[ds3.targets == ul[l]] = ul[(l + 1) % 3]
    ds3.sa.targets = nl

    for ds in [datasets['uni2' + ds_size], ds2,
               datasets['uni3' + ds_size], ds3]:
        cv = CrossValidation(clf, OddEvenPartitioner(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        stats = cv.ca.stats.stats
        Nlabels = len(ds.uniquetargets)
        # so we at least do slightly above chance
        # But LARS manages to screw up there as well ATM from time to time,
        # so making its testing labile
        if (('lars' in clf.__tags__) and
            cfg.getboolean('tests', 'labile', default='yes')) \
           or (not 'lars' in clf.__tags__):
            self.assertTrue(stats['ACC'] > 1.2 / Nlabels)
        auc = stats['AUC']
        if (Nlabels == 2) or (Nlabels > 2 and auc[0] is not np.nan):
            mauc = np.min(stats['AUC'])
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(
                    mauc > 0.55,
                    msg='All AUCs must be above chance. Got minimal '
                        'AUC=%.2g among %s' % (mauc, stats['AUC']))
    clf.ca.reset_changed_temporarily()
def test_single_class(self, clf):
    """Test if binary and multiclass can handle single class
    training/testing
    """
    ds = datasets['uni2small']
    ds = ds[ds.sa.targets == 'L0']  # only 1 label
    assert(ds.sa['targets'].unique == ['L0'])

    ds_ = list(OddEvenPartitioner().generate(ds))[0]
    # Here is our "nice" 0.6 substitute for TransferError:
    trerr = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    try:
        err = np.asscalar(trerr(ds_))
    except Exception, e:
        self.fail(str(e))
def test_tree_classifier(self):
    """Basic tests for TreeClassifier
    """
    ds = datasets['uni4medium']
    # keep it simple for the beast -- take only informative features,
    # because classifiers for the tree are selected randomly, so
    # performance varies a lot and we just need to check on
    # correct operation
    ds = ds[:, ds.fa.nonbogus_targets != [None]]

    clfs = clfswh['binary']         # pool of classifiers
    # Lets permute so each time we try some different combination
    # of the classifiers, but exclude those operating on %s of
    # features since we might not have enough for that
    clfs = [clfs[i] for i in np.random.permutation(len(clfs))
            if not '%' in str(clfs[i])]
    # NB: It is necessary that the same classifier was not used at
    # different nodes, since it would be re-trained for a new set
    # of targets, thus leading to incorrect behavior/high error.
    #
    # Clone only those few leading ones which we will use
    # throughout the test
    clfs = [clf.clone() for clf in clfs[:4]]

    # Test conflicting definition
    tclf = TreeClassifier(clfs[0], {
        'L0+2': (('L0', 'L2'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.assertRaises(ValueError, tclf.train, ds)
    """Should raise exception since label 2 is in both"""

    # Test insufficient definition
    tclf = TreeClassifier(clfs[0], {
        'L0+5': (('L0', 'L5'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.assertRaises(ValueError, tclf.train, ds)
    """Should raise exception since no group for L1"""

    # proper definition now
    tclf = TreeClassifier(clfs[0], {
        'L0+1': (('L0', 'L1'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})

    # Lets test train/test cycle using CVTE
    cv = CrossValidation(tclf, OddEvenPartitioner(),
                         postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()
    try:
        rtclf = repr(tclf)
    except:
        self.fail(msg="Could not obtain repr for TreeClassifier")

    # Test accessibility of .clfs
    self.assertTrue(tclf.clfs['L0+1'] is clfs[1])
    self.assertTrue(tclf.clfs['L2+3'] is clfs[2])

    cvtrc = cv.ca.training_stats
    cvtc = cv.ca.stats
    if cfg.getboolean('tests', 'labile', default='yes'):
        # just a dummy check to make sure everything is working
        self.assertTrue(cvtrc != cvtc)
        self.assertTrue(cverror < 0.3,
                        msg="Got too high error = %s using %s"
                            % (cverror, tclf))

    # Test trailing nodes with no classifier
    # That is why we use a separate pool of classifiers here
    # (that is probably old/not-needed since switched to use clones)
    clfs_mc = clfswh['multiclass']        # pool of classifiers
    clfs_mc = [clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
               if not '%' in str(clfs_mc[i])]
    clfs_mc = [clf.clone() for clf in clfs_mc[:4]]  # and clones again

    tclf = TreeClassifier(clfs_mc[0], {
        'L0': (('L0',), None),
        'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])})
    cv = CrossValidation(tclf, OddEvenPartitioner(),
                         postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = np.asscalar(cv(ds))
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(cverror < 0.3,
                        msg="Got too high error = %s using %s"
                            % (cverror, tclf))
def test_multiclass_pairs_svm_searchlight():
    from mvpa2.measures.searchlight import sphere_searchlight
    import mvpa2.clfs.meta
    #reload(mvpa2.clfs.meta)
    from mvpa2.clfs.meta import MulticlassClassifier

    from mvpa2.datasets import Dataset
    from mvpa2.clfs.svm import LinearCSVMC
    #import mvpa2.testing.datasets
    #reload(mvpa2.testing.datasets)
    from mvpa2.testing.datasets import datasets
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation

    from mvpa2.testing import ok_, assert_equal, assert_array_equal
    from mvpa2.sandbox.multiclass import get_pairwise_accuracies

    # Some parameters used in the test below
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ntargets = 4                                 # number of targets
    npairs = ntargets * (ntargets - 1) / 2
    center_ids = [35, 55, 1]

    ds = datasets['3dsmall'].copy()

    # redefine C,T so we have a multiclass task
    nsamples = len(ds)
    ds.sa.targets = range(ntargets) * (nsamples // ntargets)
    ds.sa.chunks = np.arange(nsamples) // ntargets
    # and add some obvious signal where it is due
    ds.samples[:, 55] += 15 * ds.sa.targets      # for all 4 targets
    ds.samples[:, 35] += 15 * (ds.sa.targets % 2)  # so we have conflicting labels
    # while 35 would still be just for 2 categories which would conflict

    mclf = MulticlassClassifier(LinearCSVMC(),
                                pass_attr=['sa.chunks',
                                           'ca.raw_predictions_ds'],
                                enable_ca=['raw_predictions_ds'])

    label_pairs = mclf._get_binary_pairs(ds)

    def place_sa_as_samples(ds):
        # add a degenerate dimension for the hstacking in the searchlight
        ds.samples = ds.sa.raw_predictions_ds[:, None]
        ds.sa.pop('raw_predictions_ds')   # no need to drag the copy
        return ds

    mcv = CrossValidation(mclf, OddEvenPartitioner(), errorfx=None,
                          postproc=place_sa_as_samples)
    sl = sphere_searchlight(mcv, nproc=nproc, radius=2, space='myspace',
                            center_ids=center_ids)
    slmap = sl(ds)

    ok_('chunks' in slmap.sa)
    ok_('cvfolds' in slmap.sa)
    ok_('targets' in slmap.sa)

    # so for each SL we got all pairwise tests
    assert_equal(slmap.shape, (nsamples, len(center_ids), npairs))
    assert_array_equal(np.unique(slmap.sa.cvfolds), [0, 1])

    # Verify that we got right labels in each 'pair'
    # all searchlights should have the same set of labels for a given
    # pair of targets
    label_pairs_ = np.apply_along_axis(
        np.unique, 0,
        ## reshape slmap so we have only simple pairs in the columns
        np.reshape(slmap, (-1, npairs))).T

    # need to prep that list of pairs obtained from MulticlassClassifier
    # and since it is 1-vs-1, they all should be just pairs of lists of
    # 1 element so should work
    assert_equal(len(label_pairs_), npairs)
    assert_array_equal(np.squeeze(np.array(label_pairs)), label_pairs_)
    assert_equal(label_pairs_.shape, (npairs, 2))  # for this particular case

    out = get_pairwise_accuracies(slmap)
    out123 = get_pairwise_accuracies(slmap, select=[1, 2, 3])
    # so we got at least correct targets
    assert_array_equal(np.unique(out123.T), np.arange(1, 4))
    # test that we extracted correct accuracies
    # First 3 in out.T should have category 0, so skip them and compare
    # otherwise
    assert_array_equal(out.samples[3:], out123.samples)

    ok_(np.all(out.samples[:, 1] == 1.), "This was with super-strong result")
def test_voxel_selection(self):
    '''Compare surface and volume based searchlight'''
    '''
    Tests to see whether results are identical for surface-based
    searchlight (just one plane; Euclidean distance) and volume-based
    searchlight.

    Note that the current value is a float; if it were int, it would
    specify the number of voxels in each searchlight'''

    radius = 10.

    '''Define input filenames'''
    epi_fn = os.path.join(pymvpa_dataroot, 'bold.nii.gz')
    maskfn = os.path.join(pymvpa_dataroot, 'mask.nii.gz')

    '''
    Use the EPI datafile to define a surface.
    The surface has as many nodes as there are voxels
    and is parallel to the volume 'slice'
    '''
    vg = volgeom.from_any(maskfn, mask_volume=True)

    aff = vg.affine
    nx, ny, nz = vg.shape[:3]

    '''Plane goes in x and y direction, so we take these vectors
    from the affine transformation matrix of the volume'''
    plane = surf.generate_plane(aff[:3, 3], aff[:3, 0], aff[:3, 1],
                                nx, ny)

    '''
    Simulate pial and white matter as just above and below
    the central plane
    '''
    normal_vec = aff[:3, 2]
    outer = plane + normal_vec
    inner = plane + -normal_vec

    '''
    Combine volume and surface information
    '''
    vsm = volsurf.VolSurfMaximalMapping(vg, outer, inner)

    '''
    Run voxel selection with specified radius (in mm), using
    Euclidean distance measure
    '''
    surf_voxsel = surf_voxel_selection.voxel_selection(
        vsm, radius, distance_metric='e')

    '''Define the measure'''

    # run_slow=True would give an actual cross-validation with meaningful
    # accuracies. Because this is a unit-test only the number of voxels
    # in each searchlight is tested.
    run_slow = False

    if run_slow:
        meas = CrossValidation(GNB(), OddEvenPartitioner(),
                               errorfx=lambda p, t: np.mean(p == t))
        postproc = mean_sample
    else:
        meas = _Voxel_Count_Measure()
        postproc = lambda x: x

    '''
    Surface analysis: define the query engine, cross validation,
    and searchlight
    '''
    surf_qe = SurfaceVerticesQueryEngine(surf_voxsel)
    surf_sl = Searchlight(meas, queryengine=surf_qe, postproc=postproc)

    '''
    new (Sep 2012): also test 'simple' queryengine wrapper function
    '''
    surf_qe2 = disc_surface_queryengine(radius, maskfn, inner, outer,
                                        plane, volume_mask=True,
                                        distance_metric='euclidean')
    surf_sl2 = Searchlight(meas, queryengine=surf_qe2, postproc=postproc)

    '''
    Same for the volume analysis
    '''
    element_sizes = tuple(map(abs, (aff[0, 0], aff[1, 1], aff[2, 2])))
    sph = Sphere(radius, element_sizes=element_sizes)
    kwa = {'voxel_indices': sph}

    vol_qe = IndexQueryEngine(**kwa)
    vol_sl = Searchlight(meas, queryengine=vol_qe, postproc=postproc)

    '''The following steps are similar to start_easy.py'''
    attr = SampleAttributes(
        os.path.join(pymvpa_dataroot, 'attributes_literal.txt'))

    mask = surf_voxsel.get_mask()

    dataset = fmri_dataset(
        samples=os.path.join(pymvpa_dataroot, 'bold.nii.gz'),
        targets=attr.targets, chunks=attr.chunks, mask=mask)

    if run_slow:
        # do chunkswise linear detrending on dataset
        poly_detrend(dataset, polyord=1, chunks_attr='chunks')

        # zscore dataset relative to baseline ('rest') mean
        zscore(dataset, chunks_attr='chunks',
               param_est=('targets', ['rest']))

        # select class face and house for this demo analysis
        # would work with full datasets (just a little slower)
        dataset = dataset[np.array([l in ['face', 'house']
                                    for l in dataset.sa.targets],
                                   dtype='bool')]

    '''Apply searchlight to datasets'''
    surf_dset = surf_sl(dataset)
    surf_dset2 = surf_sl2(dataset)
    vol_dset = vol_sl(dataset)

    surf_data = surf_dset.samples
    surf_data2 = surf_dset2.samples
    vol_data = vol_dset.samples

    assert_array_equal(surf_data, surf_data2)
    assert_array_equal(surf_data, vol_data)
clfswh += \
    FeatureSelectionClassifier(
        linearSVMC.clone(),
        SensitivityBasedFeatureSelection(
            SMLRWeights(SMLR(lm=0.1, implementation="C"),
                        postproc=maxofabs_sample()),
            RangeElementSelector(mode='select')),
        descr="LinSVM on SMLR(lm=0.1) non-0")

_rfeclf = linearSVMC.clone()
clfswh += \
    FeatureSelectionClassifier(
        _rfeclf,
        SplitRFE(_rfeclf,
                 OddEvenPartitioner(),
                 fselector=FractionTailSelector(
                     0.2, mode='discard', tail='lower')),
        descr="LinSVM with nested-CV RFE")

_clfs_splitrfe_svm_anova = \
    FeatureSelectionClassifier(
        linearSVMC.clone(),
        SplitRFE(linearSVMC.clone(),
                 OddEvenPartitioner(),
                 fselector=FractionTailSelector(
                     0.2, mode='discard', tail='lower'),
                 fmeasure=OneWayAnova()),
        descr="LinSVM with nested-CV Anova RFE")
clfswh += _clfs_splitrfe_svm_anova
def get_dsm_roi_secondorder_xval2(ds, rois, zscore_ds=True,
                                  part=OddEvenPartitioner(),
                                  cond_chunk='condition'):
    """
    Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the second level, thus the resulting dsms are not
    symmetrical.

    Arguments
    ---------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        is the dset already zscored?
    part: partitioner
    cond_chunk: str
        across which sample attribute to perform mean group sample

    Returns
    -------
    dataset containing second level dsm
    """
    #ds = h5load(fns.betafn(subnr))
    #ds = ds[:, mask_]
    #ds = ds[ds.sa.condition != 'self']
    if zscore_ds:
        zscore(ds, chunks_attr='chunks')
    # set up oddeven partition
    #part = OddEvenPartitioner()

    rdms = []
    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    for ds_ in part.generate(ds):
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)

        assert(ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms for each fold
        names = []
        centers = []
        dissims_1 = []
        dissims_2 = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim1_roi = pdist(sample1_roi, 'correlation')
            dissim2_roi = pdist(sample2_roi, 'correlation')

            dissims_1.append(dissim1_roi)
            dissims_2.append(dissim2_roi)

        dss1 = np.array(dissims_1)
        dss2 = np.array(dissims_2)

        # now compute second-order rdm correlating across folds
        dissim_2ndorder = 1. - corrcoefxy(dss1.T, dss2.T)
        dissim_2ndorder = dataset_wizard(dissim_2ndorder, targets=names)
        dissim_2ndorder.sa['centers'] = centers
        # also add fa information about roi
        dissim_2ndorder.fa['roi'] = names
        dissims_folds.append(dissim_2ndorder)

    # average
    dissims = dissims_folds[0]
    for d in dissims_folds[1:]:
        dissims.samples += d.samples
    dissims.samples /= len(dissims_folds)
    return dissims
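# A minimal usage sketch for the helper above, assuming this module's own
# imports (zscore, mean_group_sample, pdist, corrcoefxy, dataset_wizard) are
# available; the synthetic dataset and the ROI layout (label -> (center id,
# list of feature ids)) are made up purely for illustration.
def _demo_get_dsm_roi_secondorder_xval2():
    from mvpa2.misc.data_generators import normal_feature_dataset

    ds_demo = normal_feature_dataset(perlabel=8, nlabels=4, nchunks=4,
                                     nfeatures=20)
    # the helper averages samples within levels of this attribute
    ds_demo.sa['condition'] = ds_demo.sa.targets
    rois_demo = {'roiA': (1, [0, 1, 2, 3]),
                 'roiB': (11, [10, 11, 12, 13])}
    dsm2 = get_dsm_roi_secondorder_xval2(ds_demo, rois_demo,
                                         cond_chunk='condition')
    # one row/column per ROI; not symmetrical since cross-validated at the
    # second level
    print dsm2.targets
    print dsm2.samples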
def get_dsm_roi_xval1(ds, rois, zscore_ds=True, part=OddEvenPartitioner(),
                      cond_chunk='condition'):
    """
    Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the first level, thus the resulting dsms are
    symmetrical.

    Arguments
    ---------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        is the dset already zscored?
    part: partitioner
    cond_chunk: str
        across which sample attribute to perform mean group sample

    Returns
    -------
    dataset containing second level dsm
    """
    #ds = h5load(fns.betafn(subnr))
    #ds = ds[:, mask_]
    #ds = ds[ds.sa.condition != 'self']
    if zscore_ds:
        zscore(ds, chunks_attr='chunks')
    # set up oddeven partition
    #part = OddEvenPartitioner()

    rdms = []
    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    for ds_ in part.generate(ds):
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)

        assert(ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms cross-validated across folds
        names = []
        centers = []
        dissims = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim_roi = 1. - corrcoefxy(sample1_roi.T, sample2_roi.T)
            nsamples = ds_1.nsamples
            assert(dissim_roi.shape == (nsamples, nsamples))

            # now the RDM is not symmetrical anymore
            dissims.append(dissim_roi.flatten())

        dissims_folds.append(np.array(dissims))

    # average across folds
    dissims_folds = np.array(dissims_folds).mean(axis=0)
    assert(dissims_folds.shape == (len(names), nsamples ** 2))

    # now compute second level (distances)
    distance_roi = dist.pdist(dissims_folds, metric='correlation')

    dissims_folds = dataset_wizard(dist.squareform(distance_roi),
                                   targets=names)
    dissims_folds.fa['roi'] = names
    dissims_folds.sa['centers'] = centers

    return dissims_folds
def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf,
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
            # because the clf is already trained when computing the
            # sensitivity map, prevent retraining for transfer error
            # calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the
            # stopping point, but use full training data to derive
            # sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 # give the same full dataset to sens_ana and cvmeasure
                 Repeater(2),
                 fselector=FractionTailSelector(
                     0.70, mode='select', tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                     postproc=ChainMapper([
                         FxMapper('features', l2_normed),
                         FxMapper('samples', np.mean),
                         FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split,
                                     confusion_state='stats'),
                 # we will use the same full cv-training dataset
                 Repeater(2),
                 fselector=FractionTailSelector(
                     0.50, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 # we just extract it from existing confusion
                 train_pmeasure=False,
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(
                    resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history
                                        == max(rfe.ca.history))[0]))
def get_dsm_roi_xval1_firstlev(ds, rois, zscore_ds=True,
                               part=OddEvenPartitioner(),
                               cond_chunk='condition', fisher=False):
    """
    Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the first level and returns only the first level,
    without distances between ROIs.

    Arguments
    ---------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        is the dset already zscored?
    part: partitioner
    cond_chunk: str
        across which sample attribute to perform mean group sample
    fisher: bool
        whether to fisher-transform the correlations before averaging
        across folds

    Returns
    -------
    dataset containing first level dsm of shape (nrois, ncond**2)
    """
    #ds = h5load(fns.betafn(subnr))
    #ds = ds[:, mask_]
    #ds = ds[ds.sa.condition != 'self']
    # set up oddeven partition
    #part = OddEvenPartitioner()

    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    folds = 1
    for ds_ in part.generate(ds):
        print("Running fold {0}".format(folds))
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)

        if ds_1.nsamples >= 4 and zscore_ds:
            zscore(ds_1, chunks_attr='chunks')
            zscore(ds_2, chunks_attr='chunks')

        assert(ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms cross-validated across folds
        names = []
        centers = []
        dissims = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim_roi = corrcoefxy(sample1_roi.T, sample2_roi.T,
                                    fisher=fisher)
            nsamples = ds_1.nsamples
            assert(dissim_roi.shape == (nsamples, nsamples))

            # now the RDM is not symmetrical anymore
            dissims.append(dissim_roi.flatten())

        dissims_folds.append(np.array(dissims))
        folds += 1

    # average across folds
    dissims_folds = np.array(dissims_folds).mean(axis=0)
    assert(dissims_folds.shape == (len(names), nsamples ** 2))

    if fisher:
        dissims_folds = np.tanh(dissims_folds)

    dissims_folds = dataset_wizard(dissims_folds, targets=names)
    dissims_folds.sa['centers'] = centers

    return dissims_folds
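# The fisher=True path above is the usual Fisher-z averaging of correlations
# across folds. A self-contained numpy illustration of that scheme, assuming
# corrcoefxy(..., fisher=True) returns arctanh-transformed correlations
# (an assumption about that helper):
import numpy as np

_r_folds = np.array([0.3, 0.5])                   # per-fold correlations
_r_naive = _r_folds.mean()                        # plain average: 0.4
_r_fisher = np.tanh(np.arctanh(_r_folds).mean())  # z-average, then back: ~0.405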
import numpy as np

from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import OddEvenPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets
from mvpa2.mappers.fx import mean_sample

"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the
resulting searchlight map (`sl_map`) can be mapped back into the original
dataspace and viewed as a brain overlay. :ref:`Another example
<example_searchlight>` shows a typical application of this algorithm.
"""
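"""
A minimal sketch of that mapping-back step, assuming a dataset loaded with
`fmri_dataset`; the filenames below are placeholders, not files shipped with
this example.
"""

# from mvpa2.datasets.mri import fmri_dataset, map2nifti
# fds = fmri_dataset('bold.nii.gz', mask='mask.nii.gz',
#                    targets=..., chunks=...)
# map2nifti(fds, sl(fds)).to_filename('sl_map.nii.gz')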
def generate_testing_datasets(specs):
    # Lets permute upon each invocation of test, so we could possibly
    # trigger some funny cases
    nonbogus_pool = np.random.permutation([0, 1, 3, 5])

    datasets = {}

    # use a partitioner to flag odd/even samples as training and test
    ttp = OddEvenPartitioner(space='train', count=1)

    for kind, spec in specs.iteritems():
        # set of univariate datasets
        for nlabels in [2, 3, 4]:
            basename = 'uni%d%s' % (nlabels, kind)
            nonbogus_features = nonbogus_pool[:nlabels]

            dataset = normal_feature_dataset(
                nlabels=nlabels,
                nonbogus_features=nonbogus_features,
                **spec)

            # full dataset
            datasets[basename] = list(ttp.generate(dataset))[0]

        # sample 3D
        total = 2 * spec['perlabel']
        nchunks = spec['nchunks']
        data = np.random.standard_normal((total, 3, 6, 6))
        labels = np.concatenate((np.repeat(0, spec['perlabel']),
                                 np.repeat(1, spec['perlabel'])))
        data[:, 1, 0, 0] += 2 * labels           # add some signal
        chunks = np.asarray(range(nchunks) * (total // nchunks))
        mask = np.ones((3, 6, 6), dtype='bool')
        mask[0, 0, 0] = 0
        mask[1, 3, 2] = 0
        ds = Dataset.from_wizard(samples=data, targets=labels,
                                 chunks=chunks, mask=mask,
                                 space='myspace')
        # and to stress tests on manipulating sa/fa possibly containing
        # attributes of dtype object
        ds.sa['test_object'] = [['a'], [1, 2]] * (ds.nsamples // 2)
        datasets['3d%s' % kind] = ds

    # some additional datasets
    datasets['dumb2'] = dumb_feature_binary_dataset()
    datasets['dumb'] = dumb_feature_dataset()
    # dataset with few invariant features
    _dsinv = dumb_feature_dataset()
    _dsinv.samples = np.hstack((_dsinv.samples,
                                np.zeros((_dsinv.nsamples, 1)),
                                np.ones((_dsinv.nsamples, 1))))
    datasets['dumbinv'] = _dsinv

    # Datasets for regressions testing
    datasets['sin_modulated'] = list(
        ttp.generate(multiple_chunks(sin_modulated, 4, 30, 1)))[0]
    # use the same full for training
    datasets['sin_modulated_train'] = datasets['sin_modulated']
    datasets['sin_modulated_test'] = sin_modulated(30, 1, flat=True)

    # simple signal for linear regressors
    datasets['chirp_linear'] = multiple_chunks(chirp_linear,
                                               6, 50, 10, 2, 0.3, 0.1)
    datasets['chirp_linear_test'] = chirp_linear(20, 5, 2, 0.4, 0.1)

    datasets['wr1996'] = multiple_chunks(wr1996, 4, 50)
    datasets['wr1996_test'] = wr1996(50)

    datasets['hollow'] = Dataset(HollowSamples((40, 20)),
                                 sa={'targets': np.tile(['one', 'two'], 20)})

    return datasets
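# A minimal sketch of how the battery above can be consumed; the spec values
# here are assumptions for illustration, not the project's canonical
# parameters (a spec only needs to satisfy normal_feature_dataset and provide
# 'perlabel' and 'nchunks' for the 3D branch).
def _demo_generate_testing_datasets():
    specs = {'small': dict(perlabel=10, nchunks=2, nfeatures=20, snr=3.0)}
    dsets = generate_testing_datasets(specs)
    # e.g. 'uni2small' .. 'uni4small' plus '3dsmall' and the fixed extras
    print sorted(dsets.keys())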
def test_regressions(self, regr):
    """Simple tests on regressions
    """
    if not externals.exists('scipy'):
        raise SkipTest
    else:
        from mvpa2.misc.errorfx import corr_error
    ds = datasets['chirp_linear']
    # we want numeric labels to maintain the previous behavior, especially
    # since we deal with regressions here
    ds.sa.targets = AttributeMap().to_numeric(ds.targets)

    cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
                          errorfx=corr_error,
                          enable_ca=['training_stats', 'stats'])
    # check the default
    #self.assertTrue(cve.transerror.errorfx is corr_error)

    corr = np.asscalar(cve(ds).samples)

    # Our CorrErrorFx should never return NaN
    self.assertTrue(not np.isnan(corr))
    self.assertTrue(corr == cve.ca.stats.stats['CCe'])

    splitregr = SplitClassifier(
        regr, partitioner=OddEvenPartitioner(),
        enable_ca=['training_stats', 'stats'])
    splitregr.train(ds)
    split_corr = splitregr.ca.stats.stats['CCe']
    split_corr_tr = splitregr.ca.training_stats.stats['CCe']

    for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
        #TODO: test confusion statistics
        # Part of it for now -- CCe
        for conf in confusion.summaries:
            stats = conf.stats
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(stats['CCe'] < 0.5)
            self.assertEqual(stats['CCe'], stats['Summary CCe'])

        s0 = confusion.as_string(short=True)
        s1 = confusion.as_string(short=False)

        for s in [s0, s1]:
            self.assertTrue(len(s) > 10,
                            msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a "
                                "simple dataset. Got correlation error "
                                "of %s " % error)

        # Test access to summary statistics
        # YOH: lets start making testing more reliable.
        #      p-value for such accident to have is verrrry tiny,
        #      so if regression works -- it better has at least 0.5 ;)
        # YOH: not now -- issues with libsvr in SG and linear kernel
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(confusion.stats['CCe'] < 0.5)

    # just to check if it works fine
    split_predictions = splitregr.predict(ds.samples)