def test_nfold_random_counted_selection_partitioner(self):
    # Let's get somewhat extensive but complete and see if
    # everything is legit. 0.5 must correspond to 50%, in our case
    # 5 out of 10 unique chunks
    split_partitions = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(0.5).generate(self.data)]
    # 252 is # of combinations of 5 from 10
    assert_equal(len(split_partitions), 252)
    # verify that all of them are unique
    assert_equal(len(set(split_partitions)), 252)

    # now let's limit our query
    kwargs = dict(count=10, selection_strategy='random')
    split10_partitions = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(5, **kwargs).generate(self.data)]
    split10_partitions_ = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(0.5, **kwargs).generate(self.data)]
    # to make sure that I deal with sets of tuples correctly:
    assert_equal(len(set(split10_partitions)), 10)
    assert_equal(len(split10_partitions), 10)
    assert_equal(len(split10_partitions_), 10)

    # and they must differ (the same ones are possible but very unlikely)
    assert_not_equal(split10_partitions, split10_partitions_)

    # but every one of them must be within the known exhaustive set
    assert_equal(set(split_partitions).intersection(split10_partitions),
                 set(split10_partitions))
    assert_equal(set(split_partitions).intersection(split10_partitions_),
                 set(split10_partitions_))
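# Illustrative sketch (not part of the test suite): the counts asserted above
# follow from plain combinatorics, so they can be cross-checked without mvpa2.
# NFoldPartitioner(0.5) over 10 unique chunks takes 5-chunk test sets, i.e.
# C(10, 5) = 252 partitionings; count=10 with selection_strategy='random' is
# assumed here to behave like a random draw from that exhaustive set.
import random
from itertools import combinations

all_test_sets = list(combinations(range(10), 5))
assert len(all_test_sets) == 252
sampled = random.sample(all_test_sets, 10)   # analogous to count=10, 'random'
assert set(sampled).issubset(all_test_sets)  # every draw is in the full set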
def test_custom_targets(self, lrn):
    """Simple test that a learner can cope with a custom sa instead of 'targets'
    """
    # Since we are comparing performances of two learners, we need
    # to assure that if they depend on some random seed -- they
    # would use the same value.  Currently we have such stochastic
    # behavior in SMLR
    if 'seed' in lrn.params:
        from mvpa2 import _random_seed
        lrn = lrn.clone()               # clone the beast
        lrn.params.seed = _random_seed  # reuse the same seed
    lrn_ = lrn.clone()
    lrn_.set_space('custom')

    te = CrossValidation(lrn, NFoldPartitioner())
    te_ = CrossValidation(lrn_, NFoldPartitioner())
    nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
    dsname = ('uni%dsmall' % nclasses,
              'sin_modulated')[int(lrn.__is_regression__)]
    ds = datasets[dsname]
    ds_ = ds.copy()
    ds_.sa['custom'] = ds_.sa['targets']
    ds_.sa.pop('targets')
    self.assertTrue('targets' in ds.sa,
                    msg="'targets' should remain in original ds")

    try:
        cve = te(ds)
        cve_ = te_(ds_)
    except Exception as e:
        self.fail("Failed with %r" % e)
def test_noise_classification(self):
    # get a dataset with a very high SNR
    data = get_mv_pattern(10)
    # do crossval with default errorfx and 'mean' combiner
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # must return a scalar value
    result = cv(data)
    # must be perfect
    self.assertTrue((result.samples < 0.05).all())

    # do crossval with permuted regressors
    cv = CrossValidation(
        sample_clf_nl,
        ChainNode([NFoldPartitioner(),
                   AttributePermutator('targets', count=10)],
                  space='partitions'))
    results = cv(data)

    # results must not be the same
    self.assertTrue(len(np.unique(results.samples)) > 1)

    # must be at chance level
    pmean = np.array(results).mean()
    self.assertTrue(0.42 < pmean < 0.58)
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks':  [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']})

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but would care only
    # about those partitions where we have a balanced number of 'c' and 'p'
    # entries
    assert_raises(ValueError,
                  lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
                  ds)

    par = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2),
                ('targets', dict(uvalues=['c', 'p'], balanced=True))])
    ])
    dss = list(par.generate(ds))
    # print [x[x.sa.partitions == 2].sa.targets for x in dss]
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
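# Illustrative sketch (not part of the test suite): why the Sifter keeps 9 of
# the 15 possible 4-chunk test sets. Chunks 0-2 carry target 'c' and chunks
# 3-5 carry 'p', so a balanced 4-chunk test set must take exactly 2 chunks
# from each group: C(3, 2) * C(3, 2) = 9.
from itertools import combinations

targets = {0: 'c', 1: 'c', 2: 'c', 3: 'p', 4: 'p', 5: 'p'}
test_sets = list(combinations(range(6), 4))
assert len(test_sets) == 15
balanced = [s for s in test_sets
            if sum(targets[c] == 'c' for c in s) == 2]
assert len(balanced) == 9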
def blocked_detection_n_equals_1(mech_vec_list, mech_nm_list):
    data, _ = mar.create_blocked_dataset_semantic_classes(mech_vec_list,
                                                          mech_nm_list,
                                                          append_robot=False)

    nfs = NFoldPartitioner(cvtype=1, attr='targets')  # 1-fold
    spl = splitters.Splitter(attr='partitions')
    splits = [list(spl.generate(x)) for x in nfs.generate(data)]

    ## splitter = NFoldSplitter(cvtype=1)
    ## label_splitter = NFoldSplitter(cvtype=1, attr='labels')
    mean_thresh_known_mech_dict = {}
    for l_wdata, l_vdata in splits:
        mean_thresh_known_mech_list = []
        Ms = mar.compute_Ms(data, l_vdata.targets[0], plot=True)
        break  # only the first split is used; l_vdata leaks out of the loop

    mechs = l_vdata.uniquechunks
    for m in mechs:
        n_std = 0.
        all_trials = l_vdata.samples[np.where(l_vdata.chunks == m)]
        le = all_trials.shape[1]
        for i in range(all_trials.shape[0]):
            one_trial = all_trials[i, :].reshape(1, le)
            mn_list, std_list = mar.estimate_theta(one_trial, Ms, plot=False)
            mn_arr, std_arr = np.array(mn_list), np.array(std_list)
            n_std = max(n_std, np.max(np.abs(all_trials - mn_arr) / std_arr))

        # store on a per mechanism granularity
        mean_thresh_known_mech_dict[m] = (Ms, n_std)
        print('n_std for %s : %s' % (m, n_std))
        print('max error force for %s : %s' % (m, np.max(n_std * std_arr[2:])))
def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()  # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which standardizes all features
        within each sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None]) \
                   / np.std(data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])
    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([ZScoreFeaturesMapper(auto_train=True),
                              CrossValidation(sample_clf, NFoldPartitioner())])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr="partitions")
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # a check function appropriate for the given numpy version
    _a = np.arange(5)
    __a = _a[:4][:3]
    if __a.base is _a:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif __a.base.base is _a:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))
    spl = Splitter(attr="partitions", noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr="partitions")
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training slicing only for the first and last splits
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(s[1].samples))
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={"chunks": np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr="partitions")
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples, step_ds.samples))
        assert_true(is_the_same_base(s[1].samples, step_ds.samples))
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # a check function appropriate for the given numpy version
    _a = np.arange(5)
    __a = _a[:4][:3]
    if __a.base is _a:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif __a.base.base is _a:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))
    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training slicing only for the first and last splits
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(s[1].samples.base.base is self.data.samples)
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is step_ds.samples)
        assert_true(s[1].samples.base.base is step_ds.samples)
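# Illustrative sketch (not part of the test suite) of the numpy behavior the
# version check above guards against: before 1.7.0b1 a view of a view kept a
# chain of .base references, while newer numpy collapses the chain so the
# slice's .base points directly at the original array.
import numpy as np

a = np.arange(5)
b = a[:4][:3]  # a view of a view
# Either the collapsed (modern) or the chained (pre-1.7.0b1) form must hold:
assert b.base is a or b.base.base is a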
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC    # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,   # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
    ], space='partitions')

    # and then do your normal CV, where the clf's space is 'superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert(len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert(np.mean(accs_regular) > .8)
    assert(np.mean(accs_super) < .6)
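# Illustrative sketch (not part of the test suite): the 8 splits asserted
# above come from picking exactly one of the two subordinate categories per
# superordinate category for testing, i.e. 2 ** 3 combinations. The category
# names below are hypothetical stand-ins for the generated labels.
from itertools import product

subord_per_superord = {'super0': ['c0', 'c3'],
                       'super1': ['c1', 'c4'],
                       'super2': ['c2', 'c5']}  # hypothetical names
test_choices = list(product(*subord_per_superord.values()))
assert len(test_choices) == 2 ** 3 == 8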
def test_gnbsearchlight_matchaccuracy(self): # was not able to deal with custom errorfx collapsing samples # after 55e147e0bd30fbf4edede3faef3a15c6c65b33ea ds = datasets['3dmedium'].copy() ds.fa['voxel_indices'] = ds.fa.myspace sl_err = sphere_gnbsearchlight(GNB(), NFoldPartitioner(cvtype=1), radius=0) sl_acc = sphere_gnbsearchlight(GNB(), NFoldPartitioner(cvtype=1), radius=0, errorfx=mean_match_accuracy) assert_array_almost_equal(sl_err(ds), 1.0 - sl_acc(ds).samples)
def test_simplest_cv_pat_gen(self):
    # create the generator
    nfs = NFoldPartitioner(cvtype=1)
    spl = Splitter(attr='partitions')
    # now get the xval pattern sets (one-fold CV)
    xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

    self.assertTrue(len(xvpat) == 10)

    for i, p in enumerate(xvpat):
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 90)
        self.assertTrue(p[1].nsamples == 10)
        self.assertTrue(p[1].chunks[0] == i)
def test_simplest_cv_pat_gen(self):
    # create the generator
    nfs = NFoldPartitioner(cvtype=1)
    spl = Splitter(attr='partitions')
    # now get the xval pattern sets (one-fold CV)
    xvpat = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]

    self.failUnless( len(xvpat) == 10 )

    for i, p in enumerate(xvpat):
        self.failUnless( len(p) == 2 )
        self.failUnless( p[0].nsamples == 90 )
        self.failUnless( p[1].nsamples == 10 )
        self.failUnless( p[1].chunks[0] == i )
def test_exclude_targets_combinations_subjectchunks(): partitioner = ChainNode([NFoldPartitioner(attr='subjects'), ExcludeTargetsCombinationsPartitioner( k=1, targets_attr='chunks', space='partitions')], space='partitions') # targets do not need even to be defined! ds = Dataset(np.arange(18).reshape(9, 2), sa={'chunks': np.arange(9) // 3, 'subjects': np.arange(9) % 3}) dss = list(partitioner.generate(ds)) assert_equal(len(dss), 9) testing_subjs, testing_chunks = [], [] for ds_ in dss: testing_partition = ds_.sa.partitions == 2 training_partition = ds_.sa.partitions == 1 # must be scalars -- so implicit test here # if not -- would be error testing_subj = np.asscalar(np.unique(ds_.sa.subjects[testing_partition])) testing_subjs.append(testing_subj) testing_chunk = np.asscalar(np.unique(ds_.sa.chunks[testing_partition])) testing_chunks.append(testing_chunk) # and those must not appear for training ok_(not testing_subj in ds_.sa.subjects[training_partition]) ok_(not testing_chunk in ds_.sa.chunks[training_partition]) # and we should have gone through all chunks/subjs pairs testing_pairs = set(zip(testing_subjs, testing_chunks)) assert_equal(len(testing_pairs), 9) # yoh: equivalent to set(itertools.product(range(3), range(3)))) # but .product is N/A for python2.5 assert_equal(testing_pairs, set(zip(*np.where(np.ones((3,3))))))
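# Illustrative sketch (not part of the test suite): the chained partitioner
# above ends up testing every (subject, chunk) pair exactly once -- the
# Cartesian product of 3 subjects and 3 chunks, which the test builds
# manually via zip/np.where because itertools.product was N/A on python2.5.
from itertools import product

pairs = set(product(range(3), range(3)))
assert len(pairs) == 9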
def test_classifier_generalization(self, clf): """Simple test if classifiers can generalize ok on simple data """ te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample()) # check the default #self.assertTrue(te.transerror.errorfx is mean_mismatch_error) nclasses = 2 * (1 + int('multiclass' in clf.__tags__)) ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))] try: cve = te(ds).samples.squeeze() except Exception as e: self.fail("Failed with %s" % e) if cfg.getboolean('tests', 'labile', default='yes'): if nclasses > 2 and \ ((clf.descr is not None and 'on 5%(' in clf.descr) or 'regression_based' in clf.__tags__): # skip those since they are barely applicable/testable here raise SkipTest("Skip testing of cve on %s" % clf) self.assertTrue( cve < 0.25, # TODO: use multinom distribution msg="Got transfer error %g on %s with %d labels" % (cve, ds, len(ds.UT)))
def test_splitclf_sensitivities(): datasets = [ normal_feature_dataset(perlabel=100, nlabels=2, nfeatures=4, nonbogus_features=[0, i + 1], snr=1, nchunks=2) for i in xrange(2) ] sclf = SplitClassifier(SMLR(), NFoldPartitioner()) analyzer = sclf.get_sensitivity_analyzer() senses1 = analyzer(datasets[0]) senses2 = analyzer(datasets[1]) for senses in senses1, senses2: # This should be False when comparing two folds assert_false(np.allclose(senses.samples[0], senses.samples[2])) assert_false(np.allclose(senses.samples[1], senses.samples[3])) # Moreover with new data we should have got different results # (i.e. it must retrained correctly) for s1, s2 in zip(senses1, senses2): assert_false(np.allclose(s1, s2)) # and we should have "selected" "correct" voxels for i, senses in enumerate((senses1, senses2)): assert_equal(set(np.argsort(np.max(np.abs(senses), axis=0))[-2:]), set((0, i + 1)))
def test_split_clf_on_chainpartitioner(self): # pretty much a smoke test for #156 ds = datasets['uni2small'] part = ChainNode([ NFoldPartitioner(cvtype=1), Balancer(attr='targets', count=2, limit='partitions', apply_selection=True) ]) partitions = list(part.generate(ds)) sclf = SplitClassifier(sample_clf_lin, part, enable_ca=['stats', 'splits']) sclf.train(ds) pred = sclf.predict(ds) assert_equal(len(pred), len(ds)) # rudimentary check assert_equal(len(sclf.ca.splits), len(partitions)) assert_equal(len(sclf.clfs), len(partitions)) # now let's do sensitivity analyzer just in case sclf.untrain() sensana = sclf.get_sensitivity_analyzer() sens = sensana(ds) # basic check that sensitivities varied across splits from mvpa2.mappers.fx import FxMapper sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens) assert_true(np.any(sens_stds != 0))
def test_split_classifier_extended(self, clf_): clf2 = clf_.clone() ds = datasets['uni2%s' % self._get_clf_ds(clf2)] clf = SplitClassifier( clf=clf_, #SameSignClassifier(), enable_ca=['stats', 'feature_ids']) clf.train(ds) # train the beast error = clf.ca.stats.error cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(), enable_ca=['stats', 'training_stats']) cverror = cv(ds).samples.squeeze() if not 'non-deterministic' in clf.__tags__: self.assertTrue( abs(error - cverror) < 0.01, msg="We should get the same error using split classifier as" " using CrossValidation. Got %s and %s" % (error, cverror)) if cfg.getboolean('tests', 'labile', default='yes'): self.assertTrue(error < 0.25, msg="clf should generalize more or less fine. " "Got error %s" % error) self.assertEqual(len(clf.ca.stats.sets), len(ds.UC), msg="Should have 1 confusion per each split") self.assertEqual( len(clf.clfs), len(ds.UC), msg="Should have number of classifiers equal # of epochs")
def _test_mcasey20120222(): # pragma: no cover # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html # This one is conditioned on allowing # of samples to be changed # by the mapper provided to MappedClassifier. See # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples import numpy as np from mvpa2.datasets.base import dataset_wizard from mvpa2.generators.partition import NFoldPartitioner from mvpa2.mappers.base import ChainMapper from mvpa2.mappers.svd import SVDMapper from mvpa2.mappers.fx import mean_group_sample from mvpa2.clfs.svm import LinearCSVMC from mvpa2.clfs.meta import MappedClassifier from mvpa2.measures.base import CrossValidation mapper = ChainMapper([mean_group_sample(['targets','chunks']), SVDMapper()]) clf = MappedClassifier(LinearCSVMC(), mapper) cvte = CrossValidation(clf, NFoldPartitioner(), enable_ca=['repetition_results', 'stats']) ds = dataset_wizard( samples=np.arange(32).reshape((8, -1)), targets=[1, 1, 2, 2, 1, 1, 2, 2], chunks=[1, 1, 1, 1, 2, 2, 2, 2]) errors = cvte(ds)
def test_cached_qe_gnbsearchlight(self): ds1 = datasets['3dsmall'].copy(deep=True) qe = IndexQueryEngine(myspace=Sphere(2)) cached_qe = CachedQueryEngine(qe) gnb_sl = GNBSearchlight(GNB(), NFoldPartitioner(), qe=cached_qe) res = gnb_sl(ds1) assert_false(cached_qe.ids is None)
def test_multiclass_classifier_pass_ds_attributes(): # TODO: replicate/extend basic testing of pass_attr # in some more "basic" test_* clf = LinearCSVMC(C=1) ds = datasets['uni3small'].copy() ds.sa['ids'] = np.arange(len(ds)) mclf = MulticlassClassifier( clf, pass_attr=[ 'ids', 'sa.chunks', 'a.bogus_features', # 'ca.raw_estimates' # this one is binary_clf x samples list ATM # that is why raw_predictions_ds was born 'ca.raw_predictions_ds', 'ca.estimates', # this one is ok 'ca.predictions', ], enable_ca=['all']) mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None) res = mcv(ds) assert_array_equal(sorted(res.sa.ids), ds.sa.ids) assert_array_equal(res.chunks, ds.chunks[res.sa.ids]) assert_array_equal(res.sa.predictions, res.samples[:, 0]) assert_array_equal(res.sa.cvfolds, np.repeat(range(len(ds.UC)), len(ds) / len(ds.UC)))
def test_exclude_targets_combinations(): partitioner = ChainNode([ NFoldPartitioner(), ExcludeTargetsCombinationsPartitioner( k=2, targets_attr='targets', space='partitions') ], space='partitions') from mvpa2.misc.data_generators import normal_feature_dataset ds = normal_feature_dataset(snr=0., nlabels=4, perlabel=3, nchunks=3, nonbogus_features=[0, 1, 2, 3], nfeatures=4) partitions = list(partitioner.generate(ds)) assert_equal(len(partitions), 3 * 6) splitter = Splitter('partitions') combs = [] comb_chunks = [] for p in partitions: trds, teds = list(splitter.generate(p))[:2] comb = tuple(np.unique(teds.targets)) combs.append(comb) comb_chunks.append(comb + tuple(np.unique(teds.chunks))) assert_equal(len(set(combs)), 6) # just 6 possible combinations of 2 out of 4 assert_equal(len(set(comb_chunks)), 3 * 6) # all unique
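# Illustrative sketch (not part of the test suite): the 3 * 6 partitions
# asserted above are one NFold fold per chunk (3) crossed with every way to
# exclude k=2 of the 4 target classes from training: C(4, 2) = 6. The label
# names below are hypothetical.
from itertools import combinations

target_pairs = list(combinations(['L0', 'L1', 'L2', 'L3'], 2))
assert len(target_pairs) == 6
assert 3 * len(target_pairs) == 18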
def test_split_classifier(self): ds = self.data_bin_1 clf = SplitClassifier( clf=SameSignClassifier(), enable_ca=['stats', 'training_stats', 'feature_ids']) clf.train(ds) # train the beast error = clf.ca.stats.error tr_error = clf.ca.training_stats.error clf2 = clf.clone() cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(), enable_ca=['stats', 'training_stats']) cverror = cv(ds) cverror = cverror.samples.squeeze() tr_cverror = cv.ca.training_stats.error self.assertEqual( error, cverror, msg="We should get the same error using split classifier as" " using CrossValidation. Got %s and %s" % (error, cverror)) self.assertEqual( tr_error, tr_cverror, msg="We should get the same training error using split classifier as" " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror)) self.assertEqual(clf.ca.stats.percent_correct, 100, msg="Dummy clf should train perfectly") # CV and SplitClassifier should get the same confusion matrices assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix) self.assertEqual(len(clf.ca.stats.sets), len(ds.UC), msg="Should have 1 confusion per each split") self.assertEqual( len(clf.clfs), len(ds.UC), msg="Should have number of classifiers equal # of epochs") self.assertEqual(clf.predict(ds.samples), list(ds.targets), msg="Should classify correctly") # feature_ids must be list of lists, and since it is not # feature-selecting classifier used - we expect all features # to be utilized # NOT ANYMORE -- for BoostedClassifier we have now union of all # used features across slave classifiers. That makes # semantics clear. If you need to get deeper -- use upcoming # harvesting facility ;-) # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks)) # self.assertTrue(np.array([len(ids)==ds.nfeatures # for ids in clf.feature_ids]).all()) # Just check if we get it at all ;-) summary = clf.summary()
def test_permute_superord(): from mvpa2.base.node import ChainNode from mvpa2.generators.partition import NFoldPartitioner from mvpa2.generators.base import Sifter from mvpa2.generators.permutation import AttributePermutator ds = _get_superord_dataset() # mvpa2.seed(1) part = ChainNode( [ ## so we split based on superord NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'), ## so it should select only those splits where we took 1 from ## each of the superord categories leaving things in balance Sifter([('partitions', 2), ('superord', { 'uvalues': ds.sa['superord'].unique, 'balanced': True })]), AttributePermutator(['superord'], limit=['partitions', 'chunks']), ], space='partitions') for ds_perm in part.generate(ds): # it does permutation assert (np.sum(ds_perm.sa.superord != ds.sa.superord) != 0)
def test_unpartitioned_cv(self):
    data = get_mv_pattern(10)
    # only one big chunk
    data.sa.chunks[:] = 1
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # must fail, because the data cannot be split into training and testing
    assert_raises(ValueError, cv, data)
def test_split_featurewise_dataset_measure(self):
    ds = datasets['uni3small']
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        ChainNode([NFoldPartitioner(),
                   Splitter('partitions', attr_values=[1])]))

    sens = sana(ds)
    # a sensitivity for each chunk and each label combination
    assert_equal(sens.shape,
                 (len(ds.sa['chunks'].unique) * len(ds.sa['targets'].unique),
                  ds.nfeatures))

    # Let's try a more complex example with 'boosting'
    ds = datasets['uni3medium']
    ds.init_origids('samples')
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        Balancer(amount=0.25, count=2, apply_selection=True),
        enable_ca=['datasets', 'repetition_results'])
    sens = sana(ds)

    assert_equal(sens.shape, (2 * len(ds.sa['targets'].unique),
                              ds.nfeatures))
    splits = sana.ca.datasets
    self.assertEqual(len(splits), 2)
    self.assertTrue(np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
    # should have used different samples
    self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
    # and should have got different sensitivities
    self.assertTrue(np.any(sens[0] != sens[3]))
def _test_edmund_chong_20120907(): # pragma: no cover # commented out to avoid syntax warnings while compiling # from mvpa2.suite import * from mvpa2.testing.datasets import datasets repeater = Repeater(count=20) partitioner = ChainNode([NFoldPartitioner(cvtype=1), Balancer(attr='targets', count=1, # for real data > 1 limit='partitions', apply_selection=True )], space='partitions') clf = LinearCSVMC() #choice of classifier permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1) null_cv = CrossValidation( clf, ChainNode([partitioner, permutator], space=partitioner.get_space()), errorfx=mean_mismatch_error) distr_est = MCNullDist(repeater, tail='left', measure=null_cv, enable_ca=['dist_samples']) cvte = CrossValidation(clf, partitioner, errorfx=mean_mismatch_error, null_dist=distr_est, enable_ca=['stats']) errors = cvte(datasets['uni2small'])
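# Illustrative sketch (not part of the test suite): the MCNullDist logic used
# above in miniature -- build a null distribution of CV errors by permuting
# targets, then locate the observed error in its left tail. The numbers are
# hypothetical stand-ins, not outputs of the disabled test.
import numpy as np

rng = np.random.RandomState(0)
observed_error = 0.21                    # hypothetical CV error
null_errors = rng.normal(0.5, 0.05, 20)  # stand-in for 20 permutation errors
p_left = np.mean(null_errors <= observed_error)  # left-tail p-value
assert 0.0 <= p_left <= 1.0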
def test_multiclass_without_combiner(): # The goal is to obtain all pairwise results as the resultant dataset # avoiding even calling any combiner clf = LinearCSVMC(C=1) ds = datasets['uni3small'].copy() ds.sa['ids'] = np.arange(len(ds)) mclf = MulticlassClassifier(clf, combiner=None) # without combining results at all mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None) res = mcv(ds) assert_equal(len(res), len(ds)) assert_equal(res.nfeatures, 3) # 3 pairs for 3 classes assert_array_equal(res.UT, ds.UT) assert_array_equal(np.unique(np.array(res.fa.targets.tolist())), ds.UT) # TODO -- check that we have all the pairs? assert_array_equal(res.sa['cvfolds'].unique, np.arange(len(ds.UC))) if mcv.ca.is_enabled('training_stats'): # we must have received a dictionary per each pair training_stats = mcv.ca.training_stats assert_equal(set(training_stats.keys()), set([('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')])) for pair, cm in training_stats.iteritems(): assert_array_equal(cm.labels, ds.UT) # we should have no predictions for absent label assert_array_equal(cm.matrix[~np.in1d(ds.UT, pair)], 0) # while altogether all samples were processed once assert_array_equal(cm.stats['P'], len(ds)) # and number of sets should be equal number of chunks here assert_equal(len(cm.sets), len(ds.UC))
def test_multiclass_classifier_cv(clf, ds): # Extending test_clf.py:ClassifiersTests.test_multiclass_classifier # Compare performance with our MaximalVote to the one done natively # by e.g. LIBSVM clf = clf.clone() clf.params.C = 1 # so it doesn't auto-adjust mclf = MulticlassClassifier(clf=clf.clone()) part = NFoldPartitioner() cv = CrossValidation(clf, part, enable_ca=['stats', 'training_stats']) mcv = CrossValidation(mclf, part, enable_ca=['stats', 'training_stats']) er = cv(ds) mer = mcv(ds) # errors should be the same assert_array_equal(er, mer) assert_equal(str(cv.ca.training_stats), str(mcv.ca.training_stats)) # if it was a binary task, cv.ca.stats would also have AUC column # while mcv would not :-/ TODO if len(ds.UT) == 2: # so just compare the matrix and ACC assert_array_equal(cv.ca.stats.matrix, mcv.ca.stats.matrix) assert_equal(cv.ca.stats.stats['ACC'], mcv.ca.stats.stats['ACC']) else: assert_equal(str(cv.ca.stats), str(mcv.ca.stats))
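# Illustrative sketch (not part of the test suite): the MaximalVote idea
# behind MulticlassClassifier -- each pairwise (one-vs-one) classifier casts
# one vote per sample and the most-voted label wins. Votes are hypothetical.
import numpy as np

pairwise_votes = np.array([['L0', 'L0', 'L1'],   # clf L0-vs-L1
                           ['L0', 'L2', 'L2'],   # clf L0-vs-L2
                           ['L1', 'L2', 'L1']])  # clf L1-vs-L2
winners = [max(set(col), key=list(col).count) for col in pairwise_votes.T]
assert winners == ['L0', 'L2', 'L1']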
def __test_matthias_question(self): rfe_clf = LinearCSVMC(C=1) rfesvm_split = SplitClassifier(rfe_clf) clf = \ FeatureSelectionClassifier( clf = LinearCSVMC(C=1), feature_selection = RFE( sensitivity_analyzer = rfesvm_split.get_sensitivity_analyzer( combiner=first_axis_mean, transformer=np.abs), transfer_error=ConfusionBasedError( rfesvm_split, confusion_state="confusion"), stopping_criterion=FixedErrorThresholdStopCrit(0.20), feature_selector=FractionTailSelector( 0.2, mode='discard', tail='lower'), update_sensitivity=True)) no_permutations = 1000 permutator = AttributePermutator('targets', count=no_permutations) cv = CrossValidation(clf, NFoldPartitioner(), null_dist=MCNullDist(permutator, tail='left'), enable_ca=['stats']) error = cv(datasets['uni2small']) self.assertTrue(error < 0.4) self.assertTrue(cv.ca.null_prob < 0.05)
def test_counted_splitting(self):
    spl = Splitter(attr='partitions')
    # count > #chunks, should result in 10 splits
    nchunks = len(self.data.sa['chunks'].unique)
    for strategy in Partitioner._STRATEGIES:
        for count, target in [(nchunks * 2, nchunks),
                              (nchunks, nchunks),
                              (nchunks - 1, nchunks - 1),
                              (3, 3),
                              (0, 0),
                              (1, 1)]:
            nfs = NFoldPartitioner(cvtype=1, count=count,
                                   selection_strategy=strategy)
            splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
            self.assertTrue(len(splits) == target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.assertEqual(nsplits_cfg, target)

            # Check if "lastsplit" dsattr was assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.assertTrue(ds_.a.lastpartitionset)
                # test all now; the bare comparison here was a no-op before --
                # assert that only the last split is flagged
                for isplit, split in enumerate(splits):
                    for ds_ in split:
                        self.assertEqual(ds_.a.lastpartitionset,
                                         isplit == nsplits - 1)

            # Check results of different strategies
            if strategy == 'first':
                self.assertEqual(chosenchunks, range(target))
            elif strategy == 'equidistant':
                if target == 3:
                    self.assertEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.assertTrue(len(set(chosenchunks)) == len(chosenchunks))
                self.assertTrue(target == len(chosenchunks))
            else:
                raise RuntimeError("Add unittest for strategy %s" % strategy)
def test_counted_splitting(self):
    spl = Splitter(attr='partitions')
    # count > #chunks, should result in 10 splits
    nchunks = len(self.data.sa['chunks'].unique)
    for strategy in Partitioner._STRATEGIES:
        for count, target in [ (nchunks*2, nchunks),
                               (nchunks, nchunks),
                               (nchunks-1, nchunks-1),
                               (3, 3),
                               (0, 0),
                               (1, 1) ]:
            nfs = NFoldPartitioner(cvtype=1, count=count,
                                   selection_strategy=strategy)
            splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
            self.failUnless(len(splits) == target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.failUnlessEqual(nsplits_cfg, target)

            # Check if "lastsplit" dsattr was assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.failUnless(ds_.a.lastpartitionset)
                # test all now; the bare comparison here was a no-op before --
                # assert that only the last split is flagged
                for isplit, split in enumerate(splits):
                    for ds_ in split:
                        self.failUnlessEqual(ds_.a.lastpartitionset,
                                             isplit == nsplits-1)

            # Check results of different strategies
            if strategy == 'first':
                self.failUnlessEqual(chosenchunks, range(target))
            elif strategy == 'equidistant':
                if target == 3:
                    self.failUnlessEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.failUnless(len(set(chosenchunks)) == len(chosenchunks))
                self.failUnless(target == len(chosenchunks))
            else:
                raise RuntimeError("Add unittest for strategy %s" % strategy)
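# Illustrative sketch (not part of the test suite): one formula consistent
# with the 'equidistant' expectation above -- picking `count` chunks evenly
# spaced over the 10 available ones yields [0, 3, 7] for count=3. This is an
# assumption about the strategy, not a copy of the library's internals.
import numpy as np

nchunks, count = 10, 3
picked = np.round(np.arange(0, nchunks, nchunks / float(count))).astype(int)
assert picked.tolist() == [0, 3, 7]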
def test_partial_searchlight_with_full_report(self):
    ds = self.dataset.copy()
    center_ids = np.zeros(ds.nfeatures, dtype='bool')
    center_ids[[3, 50]] = True
    ds.fa['center_ids'] = center_ids
    # compute N-1 cross-validation for each sphere
    cv = CrossValidation(GNB(), NFoldPartitioner())
    # construct diameter 1 (or just radius 0) searchlight
    # one time give center ids as a list, the other one takes it from the
    # dataset itself
    sls = (
        sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
        sphere_searchlight(None, radius=0, center_ids=[3, 50]),
        sphere_searchlight(cv, radius=0, center_ids='center_ids'),
    )
    for sl in sls:
        # assure that we could set cv post constructor
        if sl.datameasure is None:
            sl.datameasure = cv
        # run searchlight
        results = sl(ds)
        # only two spheres but error for all CV-folds
        self.assertEqual(results.shape, (len(self.dataset.UC), 2))
        # Test if results hold if we "set" a "new" datameasure
        sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
        results2 = sl(ds)
        assert_array_almost_equal(results, results2)

    # test if we graciously puke if center_ids are out of bounds
    dataset0 = ds[:, :50]  # so we have no 50th feature
    self.assertRaises(IndexError, sls[0], dataset0)
    # but it should be fine on the one that gets the ids from the dataset
    # itself
    results = sl(dataset0)
    assert_equal(results.nfeatures, 1)

    # check whether roi_seeds are correct
    sl = sphere_searchlight(lambda x: np.vstack((x.fa.roi_seed, x.samples)),
                            radius=1, add_center_fa=True, center_ids=[12])
    res = sl(ds)
    assert_array_equal(res.samples[1:, res.samples[0].astype('bool')].squeeze(),
                       ds.samples[:, 12])
def test_incorrect_parameter_error(self):
    # Just a sample class
    from mvpa2.generators.partition import NFoldPartitioner
    try:
        spl = NFoldPartitioner(1, incorrect=None)
        raise AssertionError("Must have failed with an exception here "
                             "due to incorrect parameter")
    except Exception as e:
        estr = str(e)
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is self.data.samples)
        assert_true(s[1].samples.base.base is self.data.samples)
    spl = Splitter(attr='partitions', noslicing=True)
    splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
    for s in splits:
        # no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
    for i, s in enumerate(splits):
        # training slicing only for the first and last splits
        if i == 0 or i == len(splits) - 1:
            assert_true(s[0].samples.base.base is self.data.samples)
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(s[1].samples.base.base is self.data.samples)
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [ list(spl.generate(p)) for p in oes.generate(step_ds) ]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is step_ds.samples)
        assert_true(s[1].samples.base.base is step_ds.samples)
def _test_gideon_weird_case(self):
    """'The utter collapse' -- communicated by Peter J. Kohler

    Desire to collapse all samples per each category in training
    and testing sets, thus resulting only in a single
    sample/category per training and per testing.

    As it is now, CrossValidation on MappedClassifier would not work.

    Observations: the chance distribution obviously gets wide, but
    also gets skewed to anti-learning on nfolds like 4.
    """
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.knn import kNN
    clf = kNN()
    ds = datasets['uni2large'].copy()
    ds = ds[ds.sa.chunks < 9]
    accs = []
    for i in xrange(10):            # # of random samples
        ds.samples = np.random.randn(*ds.shape)
        if False:
            # this would have been a native way IF we allowed
            # change of number of samples
            clf2 = MappedClassifier(clf=kNN(),  # clf,
                                    mapper=mean_group_sample(['targets', 'partitions']))
            cv = CrossValidation(clf2, NFoldPartitioner(4), postproc=None,
                                 enable_ca=['stats'])
            print(cv(ds))
        else:
            from mvpa2.clfs.transerror import ConfusionMatrix
            partitioner = NFoldPartitioner(6)
            meaner = mean_group_sample(['targets', 'partitions'])
            cm = ConfusionMatrix()
            te = TransferMeasure(clf, Splitter('partitions'),
                                 postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'),
                                 enable_ca=['stats'])
            for part in partitioner.generate(ds):
                ds_meaned = meaner(part)
                error = np.asscalar(te(ds_meaned))
                cm += te.ca.stats
            print('%d %s' % (i, cm.stats['ACC']))
            accs.append(cm.stats['ACC'])
def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs): conf = read_configuration(path, conf_file, type) for arg in kwargs: conf[arg] = kwargs[arg] if arg == 'radius': radius = kwargs[arg] debug.active += ["SLC"] ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs) clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities']) cv = CrossValidation(clf, NFoldPartitioner(attr='task')) maps = [] for ds in ds_merged: ds.targets[ds.targets == 'point'] = 'face' ds.targets[ds.targets == 'saccade'] = 'place' sl = sphere_searchlight(cv, radius, space='voxel_indices') sl_map = sl(ds) sl_map.samples *= -1 sl_map.samples += 1 nif = map2nifti(sl_map, imghdr=ds.a.imghdr) maps.append(nif) datetime = get_time() analysis = 'cross_searchlight' mask = conf['mask_area'] task = type new_dir = datetime + '_' + analysis + '_' + mask + '_' + task command = 'mkdir ' + os.path.join(path, '0_results', new_dir) os.system(command) parent_dir = os.path.join(path, '0_results', new_dir) for s, map in zip(subjects, maps): name = s command = 'mkdir ' + os.path.join(parent_dir, name) os.system(command) results_dir = os.path.join(parent_dir, name) fname = name + '_radius_' + str(radius) + '_searchlight_map.nii.gz' map.to_filename(os.path.join(results_dir, fname)) return maps
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,   # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5,
    )
    ds.sa["subord"] = ds.sa.targets.copy()
    ds.sa["superord"] = ["super%d" % (int(i[1]) % 3,) for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa["superord"] = ["super1" for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    # ds_unbalanced = ds.copy()
    # nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    # mask_superord = ds_unbalanced.sa.superord == 'super1'
    # uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    # ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={"subord": [0, 0, 1, 2], "superord": [1, 1, 2, 2]})

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa["superord"].unique), attr="subord"),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([("partitions", 2), ("superord", {"uvalues": ds.sa["superord"].unique, "balanced": True})]),
        ],
        space="partitions",
    )

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr="subord"), attr="superord")

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr="subord")
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    # NB: the message must match the library's warning verbatim
    warning_msg = (
        "One or more superordinate attributes do not have the same "
        "number of subordinate attributes. This could yield to "
        "unbalanced partitions."
) with assert_warnings([(RuntimeWarning, warning_msg)]): partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_unbalanced)] partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])] superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])] subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])] for out_part, true_part, super_out, sub_out in zip( partitions_factpart, partitions_unbalanced, superord_unbalanced, subord_unbalanced ): assert_array_equal(out_part, true_part) assert_array_equal( (ds_unbalanced[out_part == 1].sa.superord.tolist(), ds_unbalanced[out_part == 2].sa.superord.tolist()), super_out, ) assert_array_equal( (ds_unbalanced[out_part == 1].sa.subord.tolist(), ds_unbalanced[out_part == 2].sa.subord.tolist()), sub_out ) # now let's test on a dummy dataset ds_dummy = Dataset(range(4), sa={"subord": range(4), "superord": [1, 2] * 2}) partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_dummy)] assert_array_equal(partitions_factpart, [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
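# Illustrative sketch (not part of the test suite): on the dummy dataset the
# FactorialPartitioner result is the product of leave-one-subord-out choices
# within each superord group -- 2 subords per superord, so 2 * 2 = 4
# partitions, matching the four partition vectors asserted above.
from itertools import product

subord_by_superord = {1: [0, 2], 2: [1, 3]}  # sample indices per superord
test_sets = list(product(*subord_by_superord.values()))
assert len(test_sets) == 4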
def test_gnbsearchlight_permutations(): import mvpa2 from mvpa2.base.node import ChainNode from mvpa2.clfs.gnb import GNB from mvpa2.generators.base import Repeater from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner #import mvpa2.generators.permutation #reload(mvpa2.generators.permutation) from mvpa2.generators.permutation import AttributePermutator from mvpa2.testing.datasets import datasets from mvpa2.measures.base import CrossValidation from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight from mvpa2.measures.searchlight import sphere_searchlight from mvpa2.mappers.fx import mean_sample from mvpa2.misc.errorfx import mean_mismatch_error from mvpa2.clfs.stats import MCNullDist from mvpa2.testing.tools import assert_raises, ok_, assert_array_less # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM'] # mvpa2.debug.metrics += ['pid'] count = 10 nproc = 1 + int(mvpa2.externals.exists('pprocess')) ds = datasets['3dsmall'].copy() ds.fa['voxel_indices'] = ds.fa.myspace slkwargs = dict(radius=3, space='voxel_indices', enable_ca=['roi_sizes'], center_ids=[1, 10, 70, 100]) mvpa2.seed(mvpa2._random_seed) clf = GNB() splt = NFoldPartitioner(cvtype=2, attr='chunks') repeater = Repeater(count=count) permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1) null_sl = sphere_gnbsearchlight(clf, ChainNode([splt, permutator], space=splt.get_space()), postproc=mean_sample(), errorfx=mean_mismatch_error, **slkwargs) distr_est = MCNullDist(repeater, tail='left', measure=null_sl, enable_ca=['dist_samples']) sl = sphere_gnbsearchlight(clf, splt, reuse_neighbors=True, null_dist=distr_est, postproc=mean_sample(), errorfx=mean_mismatch_error, **slkwargs) if __debug__: # assert is done only without -O mode assert_raises(NotImplementedError, sl, ds) # "ad-hoc searchlights can't handle yet varying targets across partitions" if False: # after above limitation is removed -- enable sl_map = sl(ds) sl_null_prob = sl.ca.null_prob.samples.copy() mvpa2.seed(mvpa2._random_seed) ### 'normal' Searchlight clf = GNB() splt = NFoldPartitioner(cvtype=2, attr='chunks') repeater = Repeater(count=count) permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1) # rng=np.random.RandomState(0)) # to trigger failure since the same np.random state # would be reused across all pprocesses null_cv = CrossValidation(clf, ChainNode([splt, permutator], space=splt.get_space()), postproc=mean_sample()) null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs) distr_est_normal = MCNullDist(repeater, tail='left', measure=null_sl_normal, enable_ca=['dist_samples']) cv = CrossValidation(clf, splt, errorfx=mean_mismatch_error, enable_ca=['stats'], postproc=mean_sample() ) sl = sphere_searchlight(cv, nproc=nproc, null_dist=distr_est_normal, **slkwargs) sl_map_normal = sl(ds) sl_null_prob_normal = sl.ca.null_prob.samples.copy() # For every feature -- we should get some variance in estimates In # case of failure they are all really close to each other (up to # numerical precision), so variance will be close to 0 assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0], axis=1), -1e-5) for s in distr_est_normal.ca.dist_samples.samples[0]: ok_(len(np.unique(s)) > 1)
def test_analyzer_with_split_classifier(self, clfds):
    """Test analyzers in split classifier
    """
    clf, ds = clfds             # unroll the tuple
    # We need to skip some LARSes here
    _sclf = str(clf)
    if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
        # ADD KnownToFail thingie from NiPy
        return

    # To avoid wasting too much time testing, let's limit to 3 splits
    nsplits = 3
    partitioner = NFoldPartitioner(count=nsplits)
    mclf = SplitClassifier(clf=clf,
                           partitioner=partitioner,
                           enable_ca=['training_stats',
                                      'stats'])
    sana = mclf.get_sensitivity_analyzer(# postproc=absolute_features(),
                                         pass_attr=['fa.nonbogus_targets'],
                                         enable_ca=["sensitivities"])

    ulabels = ds.uniquetargets
    nlabels = len(ulabels)
    # Can't rely on splitcfg since count-limit is done in __call__
    assert(nsplits == len(list(partitioner.generate(ds))))
    sens = sana(ds)
    assert('nonbogus_targets' in sens.fa)  # were they passed?
    # TODO: those few do not expose biases
    if not len(set(clf.__tags__).intersection(('lars', 'glmnet', 'gpr'))):
        assert('biases' in sens.sa)
        # print sens.sa.biases
    # It should return either ...
    #  nlabels * nsplits
    req_nsamples = [nlabels * nsplits]
    if nlabels == 2:
        # A single sensitivity in case of binary
        req_nsamples += [nsplits]
    else:
        # and for pairs in case of multiclass
        req_nsamples += [(nlabels * (nlabels - 1) / 2) * nsplits]
        # and for 1-vs-1 embedded within Multiclass operating on
        # pairs (e.g. SMLR)
        req_nsamples += [req_nsamples[-1] * 2]

        # Also for regression_based -- they can do multiclass
        # but only 1 sensitivity is provided
        if 'regression_based' in clf.__tags__:
            req_nsamples += [nsplits]

    # # of features should correspond
    self.assertEqual(sens.shape[1], ds.nfeatures)
    # # of samples/sensitivities should also be reasonable
    self.assertTrue(sens.shape[0] in req_nsamples)

    # Check if labels are present
    self.assertTrue('splits' in sens.sa)
    self.assertTrue('targets' in sens.sa)
    # should be 1D -- otherwise dtype object
    self.assertTrue(sens.sa.targets.ndim == 1)

    sens_ulabels = sens.sa['targets'].unique
    # Some labels might be pairs (tuples) so ndarray would be of
    # dtype object and we would need to get them all
    if sens_ulabels.dtype is np.dtype('object'):
        sens_ulabels = np.unique(
            reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

    assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

    errors = [x.percent_correct
              for x in sana.clf.ca.stats.matrices]

    # let's go through all sensitivities and see if we selected the right
    # features
    #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
    if '5%' in clf.descr \
           or (nlabels > 2 and 'regression_based' in clf.__tags__):
        # Some meta classifiers (5% of ANOVA) are too harsh ;-)
        # if we get less than 2 features with non-zero sensitivities we
        # cannot really test
        # Also -- regression based classifiers performance for multiclass
        # is expected to suck in general
        return

    if cfg.getboolean('tests', 'labile', default='yes'):
        for conf_matrix in [sana.clf.ca.training_stats] \
                          + sana.clf.ca.stats.matrices:
            self.assertTrue(
                conf_matrix.percent_correct >= 70,
                msg="We must have trained on each one more or " \
                    "less correctly. 
Got %f%% correct on %d labels" % (conf_matrix.percent_correct, nlabels)) # Since now we have per split and possibly per label -- lets just find # mean per each feature per label across splits sensm = FxMapper('samples', lambda x: np.sum(x), uattrs=['targets']).forward(sens) sensgm = maxofabs_sample().forward(sensm) # global max of abs of means assert_equal(sensgm.shape[0], 1) assert_equal(sensgm.shape[1], ds.nfeatures) selected = FixedNElementTailSelector( len(ds.a.bogus_features))(sensgm.samples[0]) if cfg.getboolean('tests', 'labile', default='yes'): self.assertEqual( set(selected), set(ds.a.nonbogus_features), msg="At the end we should have selected the right features. " "Chose %s whenever nonbogus are %s" % (selected, ds.a.nonbogus_features)) # Now test each one per label # TODO: collect all failures and spit them out at once -- # that would make it easy to see if the sensitivity # just has incorrect order of labels assigned for sens1 in sensm: labels1 = sens1.targets # labels (1) for this sensitivity lndim = labels1.ndim label = labels1[0] # current label # XXX whole lndim comparison should be gone after # things get fixed and we arrive here with a tuple! if lndim == 1: # just a single label self.assertTrue(label in ulabels) ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0] # should have just 1 feature for the label self.assertEqual(len(ilabel_all), 1) ilabel = ilabel_all[0] maxsensi = np.argmax(sens1) # index of max sensitivity self.assertEqual(maxsensi, ilabel, "Maximal sensitivity for %s was found in %i whenever" " original feature was %i for nonbogus features %s" % (labels1, maxsensi, ilabel, ds.a.nonbogus_features)) elif lndim == 2 and labels1.shape[1] == 2: # pair of labels # we should have highest (in abs) coefficients in # those two labels maxsensi2 = np.argsort(np.abs(sens1))[0][-2:] ilabel2 = [np.where(ds.fa.nonbogus_targets == l)[0][0] for l in label] self.assertEqual( set(maxsensi2), set(ilabel2), "Maximal sensitivity for %s was found in %s whenever" " original features were %s for nonbogus features %s" % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features)) """ # Now test for the sign of each one in pair ;) in # all binary problems L1 (-1) -> L2(+1), then # weights for L2 should be positive. to test for # L1 -- invert the sign # We already know (if we haven't failed in previous test), # that those 2 were the strongest -- so check only signs """ self.assertTrue( sens1.samples[0, ilabel2[0]] < 0, "With %i classes in pair %s got feature %i for %r >= 0" % (nlabels, label, ilabel2[0], label[0])) self.assertTrue(sens1.samples[0, ilabel2[1]] > 0, "With %i classes in pair %s got feature %i for %r <= 0" % (nlabels, label, ilabel2[1], label[1])) else: # yoh could be wrong at this assumption... time will show self.fail("Got unknown number labels per sensitivity: %s." " Should be either a single label or a pair" % labels1)
def generate_roc_curve(mech_vec_list, mech_nm_list,
                       semantic_range=np.arange(0.2, 2.7, 0.3),
                       mech_range=np.arange(0.2, 6.5, 0.7),
                       n_prev_trials=1, prev_c='r',
                       plot_prev=True, sem_c='b', sem_m='+',
                       plot_semantic=True,
                       semantic_label='operating 1st time and \n known mechanism class'):

    t_nm_list, t_mech_vec_list = [], []
    for i, nm in enumerate(mech_nm_list):
        ## print 'nm:', nm
        if 'known' in nm:
            continue
        t_nm_list.append(nm)
        t_mech_vec_list.append(mech_vec_list[i])

    data, _ = mar.create_blocked_dataset_semantic_classes(t_mech_vec_list, t_nm_list, append_robot=False)

    ## label_splitter = NFoldSplitter(cvtype=1, attr='labels')
    thresh_dict = ut.load_pickle('blocked_thresh_dict.pkl')  # human + robot data
    mean_charlie_dict = thresh_dict['mean_charlie']
    mean_known_mech_dict = thresh_dict['mean_known_mech']

    #---------------- semantic class prior -------------
    if plot_semantic:
        fp_l_l = []
        mn_l_l = []
        err_l_l = []
        mech_fp_l_l = []
        mech_mn_l_l = []
        mech_err_l_l = []

        nfs = NFoldPartitioner(cvtype=1, attr='targets')  # 1-fold
        label_splitter = splitters.Splitter(attr='partitions')
        splits = [list(label_splitter.generate(x)) for x in nfs.generate(data)]

        # Grouping by labels
        for l_wdata, l_vdata in splits:  # label_splitter(data):

            print('Number of data: %d' % len(l_vdata.chunks))

            # Why zero??? Do we want specific chunk? -> changed into 10
            lab = l_vdata.targets[0]    # all same label
            chunk = l_vdata.chunks[0]   # chunk should be independent!!
            trials = l_vdata.samples

            if lab == 'Refrigerator':
                lab = 'Fridge'

            ## tot_mean = None
            ## tot_std = None
            ## for chunk in l_vdata.chunks:
            ##     _, mean, std = mean_charlie_dict[chunk]  # mean except the specified chunk in same class
            ##     if tot_mean is None:
            ##         tot_mean = mean
            ##         tot_std = std
            ##     else:
            ##         tot_mean += mean
            ##         tot_std += std
            ##     print chunk, mean[0], tot_mean[0]

            ## mean = tot_mean/float(len(l_vdata.chunks))
            ## std = tot_std/float(len(l_vdata.chunks))
            ## print mean[0], tot_mean[0], float(len(l_vdata.chunks))
            ## sys.exit()

            # Select evaluation chunk for the ROC ?
            ## _, mean, std = mean_charlie_dict[lab]
            _, mean, std = mean_charlie_dict[chunk]

            # cutting into the same length
            min_len = min(len(mean), trials.shape[1])
            trials = trials[:, :min_len]
            mean = mean[:min_len]
            std = std[:min_len]  # ???

            mn_list = []
            fp_list, err_list = [], []
            for n in semantic_range:
                err = (mean + n * std) - trials
                #false_pos = np.sum(np.any(err < 0, 1))
                #tot = trials.shape[0]
                false_pos = np.sum(err < 0)  # Count false cases
                tot = trials.shape[0] * trials.shape[1]
                fp_list.append(false_pos / (tot * 0.01))
                err = err[np.where(err > 0)]
                err_list.append(err.flatten())
                mn_list.append(np.mean(err))
            err_l_l.append(err_list)
            fp_l_l.append(fp_list)
            mn_l_l.append(mn_list)

        ll = [[] for i in err_l_l[0]]  # why 0?
        for i, e in enumerate(err_l_l):   # labels
            for j, l in enumerate(ll):    # multiplier range
                l.append(e[j])

        std_list = []
        for l in ll:
            std_list.append(np.std(np.concatenate(l).flatten()))

        mn_list = np.mean(np.row_stack(mn_l_l), 0).tolist()  # means into a row
        fp_list = np.mean(np.row_stack(fp_l_l), 0).tolist()

        #pp.errorbar(fp_list, mn_list, std_list)
        ## mn_list = np.array(mn_l_l).flatten()
        ## fp_list = np.array(fp_l_l).flatten()
        pp.plot(fp_list, mn_list, '--' + sem_m + sem_c,
                label=semantic_label, mec=sem_c, ms=8, mew=2)
        #pp.plot(fp_list, mn_list, '-ob', label='with prior')

    #---------------- mechanism knowledge prior -------------
    if plot_prev:
        t_nm_list, t_mech_vec_list = [], []
        for i, nm in enumerate(mech_nm_list):
            ## print 'nm:', nm
            if 'known' in nm:
                t_nm_list.append(nm)
                t_mech_vec_list.append(mech_vec_list[i])
        if t_nm_list == []:
            t_mech_vec_list = mech_vec_list
            t_nm_list = mech_nm_list

        data, _ = mar.create_blocked_dataset_semantic_classes(t_mech_vec_list, t_nm_list, append_robot=False)

        ## chunk_splitter = NFoldSplitter(cvtype=1, attr='chunks')
        nfs = NFoldPartitioner(cvtype=1, attr='chunks')  # 1-fold
        chunk_splitter = splitters.Splitter(attr='partitions')
        # NOTE: this used to reuse label_splitter from the branch above; both
        # are Splitter(attr='partitions'), but chunk_splitter is the one meant
        splits = [list(chunk_splitter.generate(x)) for x in nfs.generate(data)]

        err_mean_list = []
        err_std_list = []
        fp_list = []
        for n in mech_range:
            false_pos = 0
            n_trials = 0
            err_list = []
            for _, l_vdata in splits:  # chunk_splitter(data):
                lab = l_vdata.targets[0]
                trials = l_vdata.samples
                m = l_vdata.chunks[0]
                #one_trial = trials[0].reshape(1, len(trials[0]))
                one_trial = trials[0:n_prev_trials]

                ## print n, ": ", lab, chunk
                Ms, n_std = mean_known_mech_dict[m]
                mn_list, std_list = mar.estimate_theta(one_trial, Ms, plot=False, add_var=0.0)
                mn_mech_arr = np.array(mn_list)
                std_mech_arr = np.array(std_list)

                # trials = trials[:,:len(mn_mech_arr)]
                min_len = min(len(mn_mech_arr), trials.shape[1])
                trials = trials[:, :min_len]
                mn_mech_arr = mn_mech_arr[:min_len]
                std_mech_arr = std_mech_arr[:min_len]

                for t in trials:
                    err = (mn_mech_arr + n * std_mech_arr) - t
                    #false_pos += np.any(err < 0)
                    #n_trials += 1
                    false_pos += np.sum(err < 0)
                    n_trials += len(err)
                    err = err[np.where(err > 0)]
                    err_list.append(err)

            e_all = np.concatenate(err_list)
            err_mean_list.append(np.mean(e_all))
            err_std_list.append(np.std(e_all))
            fp_list.append(false_pos / (n_trials * 0.01))

        #pp.plot(fp_list, err_mean_list, '-o'+prev_c, label='knowledge of mechanism and \n opened earlier %d times'%n_prev_trials)
        pp.plot(fp_list, err_mean_list, '-o' + prev_c, mec=prev_c, ms=5,
                label='operating 2nd time and \n known mechanism identity')
        #pp.plot(fp_list, err_mean_list, '-or', label='with prior')

    pp.xlabel('False positive rate (percentage)', fontsize=22)
    pp.ylabel('Mean excess force (Newtons)', fontsize=22)
    pp.xlim(-0.5, 45)
    mpu.legend()
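# Illustrative sketch (not part of the function above): the false-positive
# rate used in both branches, as the percentage of all (trial, time) samples
# whose force exceeded the mean + n * std envelope. The array is hypothetical.
import numpy as np

envelope_excess = np.array([[0.5, -0.1, 0.3],
                            [0.2, 0.4, -0.6]])  # hypothetical err = envelope - trial
false_pos = np.sum(envelope_excess < 0)         # samples above the envelope
fpr_percent = false_pos / (envelope_excess.size * 0.01)
assert round(fpr_percent, 2) == round(100.0 * 2 / 6, 2)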
def test_gideon_weird_case(self):
    """Test if MappedClassifier could handle a mapper altering number of samples

    'The utter collapse' -- communicated by Peter J. Kohler

    Desire to collapse all samples per each category in training
    and testing sets, thus resulting only in a single
    sample/category per training and per testing.

    It is a peculiar scenario which pinpoints the problem that so
    far mappers assumed not to change number of samples
    """
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.knn import kNN
    from mvpa2.mappers.base import ChainMapper
    ds = datasets['uni2large'].copy()
    #ds = ds[ds.sa.chunks < 9]
    accs = []
    k = 1                           # for kNN
    nf = 1                          # for NFoldPartitioner
    for i in xrange(1):             # # of random runs
        ds.samples = np.random.randn(*ds.shape)
        #
        # There are 3 ways to accomplish needed goal
        #

        # 0. Hard way: overcome the problem by manually
        #    pre-splitting/meaning in a loop
        from mvpa2.clfs.transerror import ConfusionMatrix
        partitioner = NFoldPartitioner(nf)
        meaner = mean_group_sample(['targets', 'partitions'])
        cm = ConfusionMatrix()
        te = TransferMeasure(kNN(k), Splitter('partitions'),
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets'),
                             enable_ca=['stats'])
        errors = []
        for part in partitioner.generate(ds):
            ds_meaned = meaner(part)
            errors.append(np.asscalar(te(ds_meaned)))
            cm += te.ca.stats
        #print i, cm.stats['ACC']
        accs.append(cm.stats['ACC'])

        if False:  # not yet working -- see _tent/allow_ch_nsamples
            # branch for attempt to make it work
            # 1. This is a "native way" IF we allow change of number
            #    of samples via _call to be done by MappedClassifier
            #    while operating solely on the mapped dataset
            clf2 = MappedClassifier(clf=kNN(k),  # clf,
                                    mapper=mean_group_sample(['targets', 'partitions']))
            cv = CrossValidation(clf2, NFoldPartitioner(nf), postproc=None,
                                 enable_ca=['stats'])
            # meaning all should be ok since we should have balanced
            # sets across all chunks here
            errors_native = cv(ds)

            self.assertEqual(np.max(np.abs(errors_native.samples[:, 0]
                                           - errors)),
                             0)

        # 2. Work without fixes to MappedClassifier allowing
        #    change of # of samples
        #
        # CrossValidation will operate on a chain mapper which
        # would perform necessary meaning first before dealing with
        # kNN cons: .stats would not be exposed since ChainMapper
        # doesn't expose them from ChainMapper (yet)
        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
            raise SkipTest("Known to fail while trying to enable "
                           "training_stats for the ChainMapper")
        cv2 = CrossValidation(ChainMapper([mean_group_sample(['targets', 'partitions']),
                                           kNN(k)],
                                          space='targets'),
                              NFoldPartitioner(nf),
                              postproc=None)
        errors_native2 = cv2(ds)

        self.assertEqual(np.max(np.abs(errors_native2.samples[:, 0]
                                       - errors)),
                         0)

        # All of the ways should provide the same results
        #print i, np.max(np.abs(errors_native.samples[:,0] - errors)), \
        #      np.max(np.abs(errors_native2.samples[:,0] - errors))

    if False:
        # just to investigate the distribution if we have enough iterations
        import pylab as pl
        uaccs = np.unique(accs)
        step = np.asscalar(np.unique(np.round(uaccs[1:] - uaccs[:-1], 4)))
        bins = np.linspace(0., 1., np.round(1. / step + 1))
        xx = pl.hist(accs, bins=bins, align='left')
        pl.xlim((0. - step / 2, 1. + step / 2))
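# Illustrative sketch (not part of the test suite): what mean_group_sample
# does conceptually -- collapse all samples sharing a group key (here a
# single attribute, standing in for ('targets', 'partitions')) into their
# per-group mean, leaving one sample per group.
import numpy as np

samples = np.arange(8.0).reshape(4, 2)
groups = np.array(['a', 'a', 'b', 'b'])
collapsed = np.vstack([samples[groups == g].mean(axis=0)
                       for g in np.unique(groups)])
assert collapsed.shape == (2, 2)  # one (mean) sample per group
assert np.allclose(collapsed[0], samples[:2].mean(axis=0))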
def setup_classifier(**kwargs):
    '''
    Set up the classifier, optional feature selection, permutation-based
    null distribution, and cross-validation scheme from keyword arguments.
    '''
    for arg in kwargs:
        if arg == 'clf_type':
            clf_type = kwargs[arg]
        if arg == 'fsel':
            f_sel = kwargs[arg]
        if arg == 'cv_type':
            cv_approach = kwargs[arg]
        if arg == 'cv_folds':
            if np.int(kwargs[arg]) == 0:
                cv_type = np.float(kwargs[arg])
            else:
                cv_type = np.int(kwargs[arg])
        if arg == 'permutations':
            permutations = np.int(kwargs[arg])
        if arg == 'cv_attribute':
            attribute = kwargs[arg]

    cv_n = cv_type

    ################# Classifier #######################
    if clf_type == 'SVM':
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    elif clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'Fixed':
        logger.info('Fixed Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(100, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        logger.info('PCA Feature Selection selected.')
        fsel = SKLTransformer(PCA(n_components=45))
        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets',
                                         limit={'partitions': 1},
                                         count=1)
        partitioner = NFoldPartitioner(cvtype=cv_n, attr=attribute)
        null_cv = CrossValidation(
            clf,
            ChainNode([partitioner, permutator],
                      space=partitioner.get_space()),
            errorfx=mean_mismatch_error)
        distr_est = MCNullDist(repeater, tail='left',
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        #postproc = mean_sample()
    else:
        distr_est = None
        #postproc = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    chain_splitter = ChainNode([
        splitter_used,
        Balancer(attr='targets', count=1, limit='partitions',
                 apply_selection=True)
    ], space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf, chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf, chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')
    return [fclf, cvte]
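# Hypothetical usage of setup_classifier() above (argument values are
# assumptions, not from the source): every keyword the function reads should
# be supplied, since missing ones (e.g. 'fsel') would leave locals undefined.
fclf, cvte = setup_classifier(clf_type='SVM',
                              fsel='False',          # no feature selection
                              cv_type='n_fold',      # read into cv_approach
                              cv_folds=1,            # read into cv_type/cv_n
                              permutations=0,        # no null distribution
                              cv_attribute='chunks')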