def test_splitclf_sensitivities():
    datasets = [normal_feature_dataset(perlabel=100, nlabels=2,
                                       nfeatures=4,
                                       nonbogus_features=[0, i + 1],
                                       snr=1, nchunks=2)
                for i in xrange(2)]
    sclf = SplitClassifier(SMLR(), NFoldPartitioner())
    analyzer = sclf.get_sensitivity_analyzer()
    senses1 = analyzer(datasets[0])
    senses2 = analyzer(datasets[1])

    for senses in senses1, senses2:
        # This should be False when comparing two folds
        assert_false(np.allclose(senses.samples[0], senses.samples[2]))
        assert_false(np.allclose(senses.samples[1], senses.samples[3]))

    # Moreover, with new data we should have got different results
    # (i.e. it must have retrained correctly)
    for s1, s2 in zip(senses1, senses2):
        assert_false(np.allclose(s1, s2))

    # and we should have "selected" the "correct" voxels
    for i, senses in enumerate((senses1, senses2)):
        assert_equal(set(np.argsort(np.max(np.abs(senses), axis=0))[-2:]),
                     set((0, i + 1)))
def test_ds_shallowcopy():
    # lets use some instance of somewhat evolved dataset
    ds = normal_feature_dataset()
    ds.samples = ds.samples.view(myarray)

    # SHALLOW copy the beast
    ds_ = copy.copy(ds)
    # verify that we have the same data
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    # array subclass survives
    ok_(isinstance(ds_.samples, myarray))

    # modify and see that we actually DO change the data in both
    ds_.samples[0, 0] = 1234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.targets[0] = 'ab'
    ds_.sa.chunks[0] = 234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)
    ok_(ds.sa.targets[0] == 'ab')
    ok_(ds.sa.chunks[0] == 234)
def test_ds_deepcopy():
    # lets use some instance of somewhat evolved dataset
    ds = normal_feature_dataset()
    ds.samples = ds.samples.view(myarray)

    # Clone the beast
    ds_ = ds.copy()
    # array subclass survives
    ok_(isinstance(ds_.samples, myarray))

    # verify that we have the same data
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    # modify and see if we don't change data in the original one
    ds_.samples[0, 0] = 1234
    ok_(np.any(ds.samples != ds_.samples))
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.targets = np.hstack(([123], ds_.targets[1:]))
    ok_(np.any(ds.samples != ds_.samples))
    ok_(np.any(ds.targets != ds_.targets))
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.chunks = np.hstack(([1234], ds_.chunks[1:]))
    ok_(np.any(ds.samples != ds_.samples))
    ok_(np.any(ds.targets != ds_.targets))
    ok_(np.any(ds.chunks != ds_.chunks))
def test_factorialpartitioner_big():
    # just to see that we can cope with relatively large datasets/numbers
    ds = normal_feature_dataset(nlabels=6,
                                perlabel=66,
                                nfeatures=2,
                                nchunks=11)

    # and now let's do factorial partitioner
    def partition(ds_=ds, **kwargs):
        partitioner = FactorialPartitioner(
            partitioner=NFoldPartitioner(attr='targets'),
            attr='chunks',
            **kwargs)
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # prohibitively large
    # print len(partition(ds))
    t0 = time()
    assert_equal(len(partition(ds, count=2, selection_strategy='first')), 2)
    # These time limits are really a stretch: on any reasonable box that is
    # not too busy this should finish in a fraction of a second, but they
    # allow us to catch a "naive" implementation
    assert(time() - t0 < 3)

    assert_equal(len(partition(ds, count=2, selection_strategy='random')), 2)
    assert(time() - t0 < 3)
def test_gnb_sensitivities():
    gnb = GNB(common_variance=True)
    ds = normal_feature_dataset(perlabel=4,
                                nlabels=3,
                                nfeatures=5,
                                nchunks=4,
                                snr=10,
                                nonbogus_features=[0, 1, 2])

    s = gnb.get_sensitivity_analyzer()(ds)
    assert_in('targets', s.sa)
    assert_equal(s.shape,
                 ((len(ds.uniquetargets) * (len(ds.uniquetargets) - 1)) / 2,
                  ds.nfeatures))

    # test zero variance case
    # set variance of a feature to zero by making it constant
    ds.samples[:, 3] = 0.3
    s_zerovar = gnb.get_sensitivity_analyzer()
    sens = s_zerovar(ds)
    assert_true(all(sens.samples[:, 3] == 0))

    # test whether tagging and untagging works
    assert 'has_sensitivity' in gnb.__tags__
    gnb.untrain()
    assert 'has_sensitivity' not in gnb.__tags__

    # test whether content of sensitivities makes rough sense,
    # e.g.: sensitivity of the first feature should be larger than that of
    # the bogus last feature
    assert_true(all(abs(sens.samples[i, 0]) > abs(sens.samples[i, 4])
                    for i in range(np.shape(sens.samples)[0])))
def test_smlr_sensitivities(clf):
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # use SMLR on binary problem, but not fitting all weights
    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_train=False)(None)
    assert_equal(sens.shape, (len(data.UT) - 1, data.nfeatures))
def test_mdpflowmapper():
    flow = mdp.nodes.PCANode() + mdp.nodes.SFANode()
    fm = MDPFlowMapper(flow)
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    fm.train(ds)
    assert_false(fm.flow[0].is_training())
    assert_false(fm.flow[1].is_training())

    fds = fm.forward(ds)
    assert_true(isinstance(fds, Dataset))
    assert_equal(fds.samples.shape, ds.samples.shape)
def test_glmnet_c_sensitivities():
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # use GLMNET on binary problem
    clf = GLMNET_C()
    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_train=False)(None)

    #failUnless(sens.shape == (data.nfeatures,))
    assert_equal(sens.shape, (len(data.UT), data.nfeatures))
def test_imshow():
    from mvpa2.viz import matshow
    from mvpa2.misc.data_generators import normal_feature_dataset
    from matplotlib.colorbar import Colorbar
    ds = normal_feature_dataset(10, 2, 18, 5)
    im = matshow(ds)
    # old mpl returns a tuple of Colorbar which is anyways available as its .ax
    if isinstance(im.colorbar, tuple):
        assert_is_instance(im.colorbar[0], Colorbar)
        assert_true(im.colorbar[1] is im.colorbar[0].ax)
    else:
        # new mpls do it without unnecessary duplication
        assert_is_instance(im.colorbar, Colorbar)
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC            # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,   # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]   # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
    ], space='partitions')

    # and then do your normal cross-validation where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert(len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert(np.mean(accs_regular) > .8)
    assert(np.mean(accs_super) < .6)
def test_confusionmatrix_nulldist(self):
    from mvpa2.clfs.gnb import GNB

    class ConfusionMatrixError(object):
        """Custom error "function"
        """
        def __init__(self, labels=None):
            self.labels = labels

        def __call__(self, predictions, targets):
            cm = ConfusionMatrix(labels=list(self.labels),
                                 targets=targets, predictions=predictions)
            #print cm.matrix
            # We have to add a degenerate leading dimension
            # so we could separate them into separate 'samples'
            return cm.matrix[None, :]

    from mvpa2.misc.data_generators import normal_feature_dataset
    for snr in [0., 2.]:
        ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                    nonbogus_features=[0, 1], nfeatures=2)

        clf = GNB()
        num_perm = 50
        permutator = AttributePermutator('targets',
                                         limit='chunks',
                                         count=num_perm)
        cv = CrossValidation(
            clf, NFoldPartitioner(),
            errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
            postproc=mean_sample(),
            null_dist=MCNullDist(permutator,
                                 tail='right',  # because we now look at accuracy not error
                                 enable_ca=['dist_samples']),
            enable_ca=['stats'])
        cmatrix = cv(ds)
        #print "Result:\n", cmatrix.samples
        cvnp = cv.ca.null_prob.samples
        #print cvnp
        self.assertTrue(cvnp.shape, (2, 2))
        if cfg.getboolean('tests', 'labile', default='yes'):
            if snr == 0.:
                # all p should be high since no signal
                assert_array_less(0.05, cvnp)
            else:
                # diagonal p is low -- we have signal after all
                assert_array_less(np.diag(cvnp), 0.05)
                # off-diagonals are high p since for them we would
                # need to look at the other tail
                assert_array_less(0.9, cvnp[(np.array([0, 1]), np.array([1, 0]))])
def test_mdpnodemapper():
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    node = mdp.nodes.PCANode()
    mm = MDPNodeMapper(node, nodeargs={'stoptrain': ((), {'debug': True})})

    mm.train(ds)
    fds = mm.forward(ds)
    if externals.versions['mdp'] >= '2.5':
        assert_true(hasattr(mm.node, 'cov_mtx'))

    assert_true(isinstance(fds, Dataset))
    assert_equal(fds.samples.shape, ds.samples.shape)

    # set projection onto first 2 components
    mm.nodeargs['exec'] = ((), {'n': 2})
    # should be different from above
    lfds = mm.forward(ds.samples)
    # output shape changes although the node still claims otherwise
    assert_equal(mm.node.output_dim, 4)
    assert_equal(lfds.shape[0], fds.samples.shape[0])
    assert_equal(lfds.shape[1], 2)
    assert_array_equal(lfds, fds.samples[:, :2])

    # reverse
    rfds = mm.reverse(fds)
    # even smaller size works
    rlfds = mm.reverse(lfds)
    assert_equal(rfds.samples.shape, ds.samples.shape)

    # retraining has to work on a new dataset too, since we copy the node
    # internally
    dsbig = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=10)
    mm.train(dsbig)
def test_mdpflow_additional_arguments_nones():
    skip_if_no_external('mdp', min_version='2.5')
    # we have no IdentityNode yet... is there an analog?
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    flow = mdp.nodes.PCANode() + mdp.nodes.IdentityNode() + mdp.nodes.FDANode()
    # this is what it would look like in MDP itself
    #flow.train([[ds.samples],
    #            [[ds.samples, ds.sa.targets]]])
    assert_raises(ValueError, MDPFlowMapper, flow, node_arguments=[[], []])
    fm = MDPFlowMapper(flow, node_arguments=(None, None, [ds.sa.targets]))
    fm.train(ds)
    fds = fm.forward(ds)
    assert_equal(ds.samples.shape, fds.samples.shape)
    rds = fm.reverse(fds)
    assert_array_almost_equal(ds.samples, rds.samples)
def test_hist():
    from mvpa2.viz import hist
    from mvpa2.misc.data_generators import normal_feature_dataset
    from matplotlib.axes import Subplot
    ds = normal_feature_dataset(10, 3, 10, 5)
    plots = hist(ds, ygroup_attr='targets', xgroup_attr='chunks',
                 noticks=None, xlim=(-.5, .5), normed=True)
    assert_equal(len(plots), 15)
    for sp in plots:
        assert_is_instance(sp, Subplot)
    # simple case
    plots = hist(ds)
    assert_equal(len(plots), 1)
    assert_is_instance(plots[0], Subplot)
    # make sure it works with plain arrays too
    plots = hist(ds.samples)
    assert_equal(len(plots), 1)
    assert_is_instance(plots[0], Subplot)
def _get_superord_dataset():
    """A little helper to simulate a dataset with super/subord targets structure
    """
    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,   # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]   # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0
    return ds
def test_SplitRFE(self):
    # just a smoke test ATM
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.misc.data_generators import normal_feature_dataset
    #import mvpa2.featsel.rfe
    #reload(mvpa2.featsel.rfe)
    from mvpa2.featsel.rfe import RFE, SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    from mvpa2.testing import ok_, assert_equal

    clf = LinearCSVMC(C=1)
    dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=30,
                                     snr=1., nonbogus_features=[1, 5])
    # flip one of the meaningful features around to see
    # if we are still getting proper selection
    dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
    # 4 partitions should be enough for testing
    partitioner = NFoldPartitioner(count=4)

    rfeclf = MappedClassifier(
        clf, SplitRFE(clf,
                      partitioner,
                      fselector=FractionTailSelector(
                          0.2, mode='discard', tail='lower')))
    r0 = repr(rfeclf)

    ok_(rfeclf.mapper.nfeatures_min == 0)
    rfeclf.train(dataset)
    ok_(rfeclf.mapper.nfeatures_min > 0)
    predictions = rfeclf(dataset).samples

    # at least 1 of the nonbogus-features should be chosen
    ok_(len(set(dataset.a.nonbogus_features).intersection(
        rfeclf.mapper.slicearg)) > 0)

    # check repr to have all needed pieces
    r = repr(rfeclf)
    s = str(rfeclf)
    ok_(('partitioner=NFoldP' in r) or
        ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
    ok_('lrn=' in r)
    ok_(not 'slicearg=' in r)
    assert_equal(r, r0)
def test_exclude_targets_combinations():
    partitioner = ChainNode(
        [NFoldPartitioner(),
         ExcludeTargetsCombinationsPartitioner(k=2,
                                               targets_attr="targets",
                                               space="partitions")],
        space="partitions")
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.0, nlabels=4, perlabel=3, nchunks=3,
                                nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter("partitions")
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)            # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
def test_hypal_michael_caused_problem(self):
    from mvpa2.misc import data_generators
    from mvpa2.mappers.zscore import zscore
    # Fake data
    ds = data_generators.normal_feature_dataset(nfeatures=20)
    ds_all = [data_generators.random_affine_transformation(ds)
              for i in range(3)]
    _ = [zscore(sd, chunks_attr=None) for sd in ds_all]
    # Making random data per subject for testing with bias added to first subject
    ds_test = [np.random.rand(1, ds.nfeatures) for i in range(len(ds_all))]
    ds_test[0] += np.arange(1, ds.nfeatures + 1) * 100
    assert(np.corrcoef(ds_test[2], ds_test[1])[0, 1] < 0.99)  # that would have been ridiculous if it was
    # Test with varying alpha to make sure we do not have that issue now
    for alpha in (0, 0.01, 0.5, 0.99, 1.0):
        hyper09 = Hyperalignment(alpha=alpha)
        mappers = hyper09([sd for sd in ds_all])
        ds_test_a = [m.forward(sd) for m, sd in zip(mappers, ds_test)]
        ds_test_a = [mappers[0].reverse(sd) for sd in ds_test_a]
        corr = np.corrcoef(ds_test_a[2], ds_test_a[1])[0, 1]
        assert(corr < 0.99)
def test_nodeargs():
    skip_if_no_external('mdp', min_version='2.4')
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    for svd_val in [True, False]:
        pcm = PCAMapper(alg='PCA', svd=svd_val)
        assert_equal(pcm.node.svd, svd_val)
        pcm.train(ds)
        assert_equal(pcm.node.svd, svd_val)
    for output_dim in [0.5, 0.95, 0.99, 10, 50, 100]:
        pcm = PCAMapper(alg='PCA', output_dim=output_dim)
        for i in range(2):  # so we also test on the trained one
            if isinstance(output_dim, float):
                assert_equal(pcm.node.desired_variance, output_dim)
            else:
                assert_equal(pcm.node.output_dim, output_dim)
            pcm.train(ds)
        if isinstance(output_dim, float):
            assert_not_equal(pcm.node.output_dim, output_dim)
            # some dimensions are chosen
            assert_true(pcm.node.output_dim > 0)
def test_cache_speedup(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
    sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

    cv_c = CrossValidation(ck, NFoldPartitioner())
    cv_s = CrossValidation(sk, NFoldPartitioner())

    #data = datasets['uni4large']
    P = 5000
    data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                  means=np.random.randn(2, P), nfeatures=P)

    t0 = time()
    ck.params.kernel.compute(data)
    cachetime = time() - t0

    t0 = time()
    cached_err = cv_c(data)
    ccv_time = time() - t0

    t0 = time()
    norm_err = cv_s(data)
    ncv_time = time() - t0

    assert_almost_equal(np.asanyarray(cached_err),
                        np.asanyarray(norm_err))
    ok_(cachetime < ncv_time)
    ok_(ccv_time < ncv_time)
    #print 'Regular CV time: %s seconds'%ncv_time
    #print 'Caching time: %s seconds'%cachetime
    #print 'Cached CV time: %s seconds'%ccv_time
    speedup = ncv_time / (ccv_time + cachetime)
    #print 'Speedup factor: %s'%speedup

    # Speedup ideally should be 10, though it's not purely linear
    self.failIf(speedup < 2, 'Problem caching data - too slow!')
def test_hypal_michael_caused_problem(self):
    from mvpa2.misc import data_generators
    from mvpa2.mappers.zscore import zscore
    # Fake data
    ds = data_generators.normal_feature_dataset(nfeatures=20)
    ds_all = [data_generators.random_affine_transformation(ds)
              for i in range(3)]
    _ = [zscore(sd, chunks_attr=None) for sd in ds_all]
    # Making random data per subject for testing with bias added to first subject
    ds_test = [np.random.rand(1, ds.nfeatures) for i in range(len(ds_all))]
    ds_test[0] += np.arange(1, ds.nfeatures + 1) * 100
    assert(np.corrcoef(ds_test[2], ds_test[1])[0, 1] < 0.99)  # that would have been ridiculous if it was
    # Test with varying alpha to make sure we do not have that issue now
    for alpha in (0, 0.01, 0.5, 0.99, 1.0):
        hyper09 = Hyperalignment(alpha=alpha)
        mappers = hyper09([sd for sd in ds_all])
        ds_test_a = [m.forward(sd) for m, sd in zip(mappers, ds_test)]
        ds_test_a = [mappers[0].reverse(sd) for sd in ds_test_a]
        corr = np.corrcoef(ds_test_a[2], ds_test_a[1])[0, 1]
        assert(corr < 0.99)
def test_confusion_as_node():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0, perlabel=42, nchunks=3,
                                nonbogus_features=[0, 1], nfeatures=2)
    clf = GNB()
    cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=Confusion(labels=ds.UT),
        enable_ca=['stats'])
    res = cv(ds)
    # needs to be identical to CA
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=ChainNode((Confusion(labels=ds.UT),
                            BayesConfusionHypothesis())))
    res = cv(ds)
    # only two possible hypotheses with two classes
    assert_equals(len(res), 2)
    # the first hypothesis is that we can't discriminate anything
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and that hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert(np.e ** res.samples[0, 0] < np.e ** res.samples[1, 0])
def test_gnb_sensitivities(logprob):
    gnb = GNB(common_variance=True, logprob=logprob)
    ds = normal_feature_dataset(perlabel=4,
                                nlabels=3,
                                nfeatures=5,
                                nchunks=4,
                                snr=20,
                                nonbogus_features=[0, 1, 2])

    s = gnb.get_sensitivity_analyzer()(ds)
    assert_in('targets', s.sa)
    assert_equal(s.shape,
                 ((len(ds.uniquetargets) * (len(ds.uniquetargets) - 1)) / 2,
                  ds.nfeatures))

    # test zero variance case
    # set variance of a feature to zero by making it constant
    ds.samples[:, 3] = 0.3
    s_zerovar = gnb.get_sensitivity_analyzer()
    sens = s_zerovar(ds)
    assert_equal(sens.T.dtype, 'O')  # we store pairs
    assert_equal(sens.T[0], ('L0', 'L1'))
    assert_true(all(sens.samples[:, 3] == 0))

    gnb.untrain()

    # test whether content of sensitivities makes rough sense.
    # The first feature has information only about L0, so it would be of
    # no use for L1 -vs- L2 classification; we will go through each pair
    # and make sure that signs etc. are all correct for each pair.
    # This in principle should be a generic test for multiclass sensitivities
    abssens = abs(sens.samples)
    for (t1, t2), t1t2sens in zip(sens.T, sens.samples):
        # go from literal 'L1' to 1, 'L0' to 0 -- corresponds to the feature index
        i1 = int(t1[1])
        i2 = int(t2[1])
        assert t1t2sens[i1] < 0
        assert t1t2sens[i2] > 0
        assert t1t2sens[i2] > t1t2sens[4]
def test_gnb_sensitivities():
    gnb = GNB(common_variance=True)
    ds = normal_feature_dataset(perlabel=4,
                                nlabels=3,
                                nfeatures=5,
                                nchunks=4,
                                snr=20,
                                nonbogus_features=[0, 1, 2])

    s = gnb.get_sensitivity_analyzer()(ds)
    assert_in('targets', s.sa)
    assert_equal(s.shape,
                 ((len(ds.uniquetargets) * (len(ds.uniquetargets) - 1)) / 2,
                  ds.nfeatures))

    # test zero variance case
    # set variance of a feature to zero by making it constant
    ds.samples[:, 3] = 0.3
    s_zerovar = gnb.get_sensitivity_analyzer()
    sens = s_zerovar(ds)
    assert_equal(sens.T.dtype, 'O')  # we store pairs
    assert_equal(sens.T[0], ('L0', 'L1'))
    assert_true(all(sens.samples[:, 3] == 0))

    gnb.untrain()

    # test whether content of sensitivities makes rough sense.
    # The first feature has information only about L0, so it would be of
    # no use for L1 -vs- L2 classification; we will go through each pair
    # and make sure that signs etc. are all correct for each pair.
    # This in principle should be a generic test for multiclass sensitivities
    abssens = abs(sens.samples)
    for (t1, t2), t1t2sens in zip(sens.T, sens.samples):
        # go from literal 'L1' to 1, 'L0' to 0 -- corresponds to the feature index
        i1 = int(t1[1])
        i2 = int(t2[1])
        assert t1t2sens[i1] < 0
        assert t1t2sens[i2] > 0
        assert t1t2sens[i2] > t1t2sens[4]
def test_confusionmatrix_nulldist(self):
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import ConfusionMatrixError
    from mvpa2.misc.data_generators import normal_feature_dataset
    for snr in [0., 2.]:
        ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                    nonbogus_features=[0, 1], nfeatures=2)

        clf = GNB()
        num_perm = 50
        permutator = AttributePermutator('targets',
                                         limit='chunks',
                                         count=num_perm)
        cv = CrossValidation(
            clf, NFoldPartitioner(),
            errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
            postproc=mean_sample(),
            null_dist=MCNullDist(permutator,
                                 tail='right',  # because we now look at accuracy not error
                                 enable_ca=['dist_samples']),
            enable_ca=['stats'])
        cmatrix = cv(ds)
        #print "Result:\n", cmatrix.samples
        cvnp = cv.ca.null_prob.samples
        #print cvnp
        self.assertTrue(cvnp.shape, (2, 2))
        if cfg.getboolean('tests', 'labile', default='yes'):
            if snr == 0.:
                # all p should be high since no signal
                assert_array_less(0.05, cvnp)
            else:
                # diagonal p is low -- we have signal after all
                assert_array_less(np.diag(cvnp), 0.05)
                # off-diagonals are high p since for them we would
                # need to look at the other tail
                assert_array_less(0.9, cvnp[(np.array([0, 1]), np.array([1, 0]))])
def test_binds(self):
    ds = normal_feature_dataset()
    ds_data = ds.samples.copy()
    ds_chunks = ds.chunks.copy()
    self.assertTrue(np.all(ds.samples == ds_data))  # sanity check

    funcs = ['coarsen_chunks']
    for f in funcs:
        eval('ds.%s()' % f)
        self.assertTrue(np.any(ds.samples != ds_data)
                        or np.any(ds.chunks != ds_chunks),
                        msg="We should have modified original dataset with %s" % f)
        ds.samples = ds_data.copy()
        ds.sa['chunks'].value = ds_chunks.copy()

    # and some which should just return results
    for f in ['aggregate_features', 'remove_invariant_features',
              'get_samples_per_chunk_target']:
        res = eval('ds.%s()' % f)
        self.assertTrue(res is not None,
                        msg='We should have got result from function %s' % f)
        self.assertTrue(np.all(ds.samples == ds_data),
                        msg="Function %s should have not modified original dataset" % f)
def test_binds(self):
    ds = normal_feature_dataset()
    ds_data = ds.samples.copy()
    ds_chunks = ds.chunks.copy()
    self.failUnless(np.all(ds.samples == ds_data))  # sanity check

    funcs = ["coarsen_chunks"]
    for f in funcs:
        eval("ds.%s()" % f)
        self.failUnless(
            np.any(ds.samples != ds_data) or np.any(ds.chunks != ds_chunks),
            msg="We should have modified original dataset with %s" % f,
        )
        ds.samples = ds_data.copy()
        ds.sa["chunks"].value = ds_chunks.copy()

    # and some which should just return results
    for f in ["aggregate_features", "remove_invariant_features",
              "get_samples_per_chunk_target"]:
        res = eval("ds.%s()" % f)
        self.failUnless(res is not None,
                        msg="We should have got result from function %s" % f)
        self.failUnless(
            np.all(ds.samples == ds_data),
            msg="Function %s should have not modified original dataset" % f,
        )
def setUp(self):
    self.dataset = normal_feature_dataset(perlabel=100, nlabels=2,
                                          nfeatures=10,
                                          nonbogus_features=[0, 1],
                                          snr=0.3, nchunks=2)
def test_confusion_as_node():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0, perlabel=42, nchunks=3,
                                nonbogus_features=[0, 1], nfeatures=2)
    clf = GNB()
    cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=Confusion(labels=ds.UT),
        enable_ca=['stats'])
    res = cv(ds)
    # needs to be identical to CA
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=ChainNode([Confusion(labels=ds.UT),
                            BayesConfusionHypothesis()]))
    res = cv(ds)
    # only two possible hypotheses with two classes
    assert_equals(len(res), 2)
    # the first hypothesis is that we can't discriminate anything
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and that hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert(np.e ** res.samples[0, 0] < np.e ** res.samples[1, 0])

    # Let's see how well it would work within the searchlight when we also
    # would like to store the hypotheses per each voxel
    # Somewhat an ad-hoc solution for the answer posted on the ML
    #
    # run 1d searchlight of radius 0; for that just provide a .fa with coordinates
    ds.fa['voxel_indices'] = [[0], [1]]
    # and a custom Node which would collect .sa.hypothesis to place together along
    # with the posterior probabilities
    from mvpa2.base.node import Node
    from mvpa2.measures.searchlight import sphere_searchlight

    class KeepBothPosteriorAndHypothesis(Node):
        def _call(self, ds):
            out = np.zeros(1, dtype=object)
            out[0] = (ds.samples, ds.sa.hypothesis)
            return out
    cv.postproc.append(KeepBothPosteriorAndHypothesis())

    sl = sphere_searchlight(cv, radius=0, nproc=1)
    res = sl(ds)

    assert_equal(res.shape, (1, 2))
    assert_equal(len(res.samples[0, 0]), 2)
    assert_equal(res.samples[0, 0][0].shape, (2, 2))  # posteriors per 1st SL
    assert_equal(len(res.samples[0, 0][1]), 2)        # 2 of hypotheses
def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        (RFE(sens_ana,
             pmeasure,
             Splitter('train'),
             fselector=FixedNElementTailSelector(1),
             train_pmeasure=False),
         self.get_data()),
        # use cross-validation within training to get error for the stopping point
        # but use full training data to derive sensitivity
        (RFE(sens_ana,
             cvmeasure,
             Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
             fselector=FractionTailSelector(
                 0.70,
                 mode='select', tail='upper'),
             train_pmeasure=True),
         normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5)),
        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        (RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([FxMapper('features', l2_normed),
                                      FxMapper('samples', np.mean),
                                      FxMapper('samples', np.abs)])),
             ConfusionBasedError(rfesvm_split, confusion_state='stats'),
             Repeater(2),            # we will use the same full cv-training dataset
             fselector=FractionTailSelector(
                 0.50,
                 mode='select', tail='upper'),
             stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
             train_pmeasure=False,   # we just extract it from existing confusion
             update_sensitivity=True),
         normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5))
        ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history == max(rfe.ca.history))[0]))
import mvpa2
import pylab as pl
import numpy as np

from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.mappers.zscore import zscore

"""
Generate a binary dataset without any signal (snr=0).
"""

mvpa2.seed(1)
ds_noise = normal_feature_dataset(perlabel=100, nlabels=2, nfeatures=2, snr=0,
                                  nonbogus_features=[0, 1])

# signal levels
sigs = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]

"""
To mimic the behavior of a hard-margin SVM whenever the classes become
separable, which is easier to comprehend, we intentionally set a very high C
value.
"""

clf = LinearCSVMC(C=1000, enable_ca=['training_stats'])
cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
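"""
A minimal usage sketch (an assumption, not part of the original example): run
the cross-validation defined above on the pure-noise dataset and inspect the
collected confusion statistics, using the same ``.ca`` access pattern that
appears in the other snippets here.
"""

err_noise = cve(ds_noise)       # per-fold error on data without any signal
print(np.mean(err_noise))       # should hover around chance level (~0.5)
print(cve.ca.stats.matrix)      # confusion matrix collected via enable_ca='stats'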
def test_SplitRFE(self, fmeasure):
    # just a smoke test ATM
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.misc.data_generators import normal_feature_dataset
    #import mvpa2.featsel.rfe
    #reload(mvpa2.featsel.rfe)
    from mvpa2.featsel.rfe import RFE, SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    from mvpa2.testing import ok_, assert_equal

    clf = LinearCSVMC(C=1)
    dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=11,
                                     snr=1., nonbogus_features=[1, 5])
    # flip one of the meaningful features around to see
    # if we are still getting proper selection
    dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
    # 3 partitions should be enough for testing
    partitioner = NFoldPartitioner(count=3)

    rfeclf = MappedClassifier(
        clf, SplitRFE(clf,
                      partitioner,
                      fselector=FractionTailSelector(
                          0.5, mode='discard', tail='lower'),
                      fmeasure=fmeasure,
                      # need to update only when using clf's sens anal
                      update_sensitivity=fmeasure is None))
    r0 = repr(rfeclf)

    ok_(rfeclf.mapper.nfeatures_min == 0)
    rfeclf.train(dataset)
    ok_(rfeclf.mapper.nfeatures_min > 0)
    predictions = rfeclf(dataset).samples

    # at least 1 of the nonbogus-features should be chosen
    ok_(len(set(dataset.a.nonbogus_features).intersection(
        rfeclf.mapper.slicearg)) > 0)

    # check repr to have all needed pieces
    r = repr(rfeclf)
    s = str(rfeclf)
    ok_(('partitioner=NFoldP' in r) or
        ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
    ok_('lrn=' in r)
    ok_(not 'slicearg=' in r)
    assert_equal(r, r0)

    if externals.exists('joblib'):
        rfeclf.mapper.nproc = -1
        # compare results against the one ran in parallel
        _slicearg = rfeclf.mapper.slicearg
        _predictions = predictions
        rfeclf.train(dataset)
        predictions = rfeclf(dataset).samples
        assert_array_equal(predictions, _predictions)
        assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

    # Test that we can collect stats from cas within cross-validation
    sensitivities = []
    nested_errors = []
    nested_nfeatures = []

    def store_me(data, node, result):
        sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
        sensitivities.append(sens)
        nested_errors.append(node.measure.mapper.ca.nested_errors)
        nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)

    cv = CrossValidation(rfeclf,
                         NFoldPartitioner(count=1),
                         callback=store_me,
                         enable_ca=['stats'])
    _ = cv(dataset)
    # just to make sure we collected them
    assert_equal(len(sensitivities), 1)
    assert_equal(len(nested_errors), 1)
    assert_equal(len(nested_nfeatures), 1)
def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        (RFE(sens_ana,
             pmeasure,
             Splitter('train'),
             fselector=FixedNElementTailSelector(1),
             train_pmeasure=False),
         self.get_data()),
        # use cross-validation within training to get error for the stopping point
        # but use full training data to derive sensitivity
        (RFE(sens_ana,
             cvmeasure,
             Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
             fselector=FractionTailSelector(
                 0.70,
                 mode='select', tail='upper'),
             train_pmeasure=True),
         normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5)),
        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        (RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([FxMapper('features', l2_normed),
                                      FxMapper('samples', np.mean),
                                      FxMapper('samples', np.abs)])),
             ConfusionBasedError(rfesvm_split, confusion_state='stats'),
             Repeater(2),            # we will use the same full cv-training dataset
             fselector=FractionTailSelector(
                 0.50,
                 mode='select', tail='upper'),
             stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
             train_pmeasure=False,   # we just extract it from existing confusion
             update_sensitivity=True),
         normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5))
        ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history == max(rfe.ca.history))[0]))
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- categories grouped into super-ordinate ones,
    # and actually without any 'superordinate' effect since the subordinate
    # categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1,  # 100,  # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0]))
        # and it should also train the classifier, so we can ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={'subord': [0, 0, 1, 2],
                                          'superord': [1, 1, 2, 2]})

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
    ], space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart),
                       np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold),
                       np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [p.sa.partitions
                               for p in factpart.generate(ds_unbalanced)]

    partitions_unbalanced = [np.array([2, 2, 2, 1]),
                             np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(partitions_factpart, partitions_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1, 2] * 2})
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_dummy)]
    assert_array_equal(partitions_factpart,
                       [[2, 2, 1, 1],
                        [2, 1, 1, 2],
                        [1, 2, 2, 1],
                        [1, 1, 2, 2]])
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={'subord': [0, 0, 1, 2],
                                          'superord': [1, 1, 2, 2]})

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
    ], space='partitions')

    def partition(partitioner, ds_=ds):
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'), attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)
    assert_array_equal(np.sort(p_npart), np.sort(p_fpart))

    fpart2 = FactorialPartitioner(count=2,
                                  selection_strategy='first',
                                  **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but if with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random', count=2,
                                   **factkw)
    # Let's generate a number of random selections:
    rand2_partitions = [partition(fpartr2) for i in xrange(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # majority of them must be different
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    p_unbalanced = [np.array([2, 2, 2, 1]),
                    np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1, 2] * 2})
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(p_fpart,
                       [[2, 2, 1, 1],
                        [2, 1, 1, 2],
                        [1, 2, 2, 1],
                        [1, 1, 2, 2]])
""" import mvpa2 import pylab as pl import numpy as np from mvpa2.misc.data_generators import normal_feature_dataset from mvpa2.clfs.svm import LinearCSVMC from mvpa2.generators.partition import NFoldPartitioner from mvpa2.measures.base import CrossValidation from mvpa2.mappers.zscore import zscore """ Generate a binary dataset without any signal (snr=0). """ mvpa2.seed(1); ds_noise = normal_feature_dataset(perlabel=100, nlabels=2, nfeatures=2, snr=0, nonbogus_features=[0,1]) # signal levels sigs = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0] """ To mimic behavior of hard-margin SVM whenever classes become separable, which is easier to comprehend, we are intentionally setting very high C value. """ clf = LinearCSVMC(C=1000, enable_ca=['training_stats']) cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats') sana = clf.get_sensitivity_analyzer(postproc=None)