Пример #1
0
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets',
                                         count=num_perm,
                                         limit='chunks')
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferMeasure(l_clf,
                               Repeater(count=2),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'),
                               null_dist=MCNullDist(permutator, tail='left'))

        # check reasonable error range
        err = terr(train)
        self.assertTrue(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf,
                               OddEvenPartitioner(),
                               null_dist=MCNullDist(permutator,
                                                    tail='left',
                                                    enable_ca=['dist_samples'
                                                               ]),
                               postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.assertTrue(
                np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)
Пример #2
0
 def test_nblocks(self):
     skip_if_no_external('pprocess')
     # just a basic test to see that we are getting the same
     # results with different nblocks
     ds = datasets['3dsmall'].copy(deep=True)[:, :13]
     ds.fa['voxel_indices'] = ds.fa.myspace
     cv = CrossValidation(GNB(), OddEvenPartitioner())
     res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
     res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
     assert_array_equal(res1, res2)
Пример #3
0
def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):

    conf = read_configuration(path, conf_file, type)

    for arg in kwargs:
        conf[arg] = kwargs[arg]
        if arg == 'radius':
            radius = kwargs[arg]

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []

    for ds in ds_merged:

        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')

        sl_map = sl(ds)

        sl_map.samples *= -1
        sl_map.samples += 1

        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)

        maps.append(nif)

    datetime = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = datetime + '_' + analysis + '_' + mask + '_' + task
    command = 'mkdir ' + os.path.join(path, '0_results', new_dir)
    os.system(command)

    parent_dir = os.path.join(path, '0_results', new_dir)

    for s, map in zip(subjects, maps):
        name = s
        command = 'mkdir ' + os.path.join(parent_dir, name)
        os.system(command)

        results_dir = os.path.join(parent_dir, name)
        fname = name + '_radius_' + str(radius) + '_searchlight_map.nii.gz'
        map.to_filename(os.path.join(results_dir, fname))

    return maps
Пример #4
0
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    ds = _get_superord_dataset()

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # and then do your normal where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf,
                                   NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf,
                                 npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)
Пример #5
0
    def test_cached_kernel_different_datasets(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k  = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidation(clf, NFoldPartitioner())
        cvte_ = CrossValidation(clf_, NFoldPartitioner())

        postproc=BinaryFxNode(mean_mismatch_error, 'targets')
        te = ProxyMeasure(clf, postproc=postproc)
        te_ = ProxyMeasure(clf_, postproc=postproc)

        for r in xrange(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            ok_(~ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(~ck._recomputed)
            assert_array_equal(errs2, errs2_)

            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            te.train(datasets['uni3small'][::2])
            terr = np.asscalar(te(datasets['uni3small'][ssel]))
            te_.train(datasets['uni3small'][::2])
            terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
            ok_(~ck._recomputed)
            ok_(terr == terr_)
Пример #6
0
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(clf=SameSignClassifier(),
                enable_ca=['stats', 'training_stats',
                               'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(error, cverror,
                msg="We should get the same error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (error, cverror))

        self.assertEqual(tr_error, tr_cverror,
                msg="We should get the same training error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix,
                           cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.assertEqual(len(clf.clfs), len(ds.UC),
                             msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                             msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Пример #7
0
    def test_cache_speedup(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidation(ck, NFoldPartitioner())
        cv_s = CrossValidation(sk, NFoldPartitioner())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2,
                                      perlabel=200,
                                      nchunks=10,
                                      means=np.random.randn(2, P),
                                      nfeatures=P)

        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds'%ncv_time
        #print 'Caching time: %s seconds'%cachetime
        #print 'Cached CV time: %s seconds'%ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s'%speedup

        # Speedup ideally should be 10, though it's not purely linear
        self.failIf(speedup < 2, 'Problem caching data - too slow!')
Пример #8
0
def test_confusion_as_node():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0,
                                perlabel=42,
                                nchunks=3,
                                nonbogus_features=[0, 1],
                                nfeatures=2)
    clf = GNB()
    cv = CrossValidation(clf,
                         NFoldPartitioner(),
                         errorfx=None,
                         postproc=Confusion(labels=ds.UT),
                         enable_ca=['stats'])
    res = cv(ds)
    # needs to be identical to CA
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(clf,
                         NFoldPartitioner(),
                         errorfx=None,
                         postproc=ChainNode((Confusion(labels=ds.UT),
                                             BayesConfusionHypothesis())))
    res = cv(ds)
    # only two possible hypothesis with two classes
    assert_equals(len(res), 2)
    # the first hypothesis is the can't discriminate anything
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and the hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert (np.e**res.samples[0, 0] < np.e**res.samples[1, 0])
Пример #9
0
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
                ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
Пример #10
0
def _test_gnb_overflow_haxby():  # pragma: no cover
    # example from https://github.com/PyMVPA/PyMVPA/issues/581
    # a heavier version of the above test
    import os
    import numpy as np

    from mvpa2.datasets.sources.native import load_tutorial_data
    from mvpa2.clfs.gnb import GNB
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import HalfPartitioner
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.detrend import poly_detrend
    from mvpa2.datasets.miscfx import remove_invariant_features
    from mvpa2.testing.datasets import *

    datapath = '/usr/share/data/pymvpa2-tutorial/'
    haxby = load_tutorial_data(datapath,
                               roi='vt',
                               add_fa={
                                   'vt_thr_glm':
                                   os.path.join(datapath, 'haxby2001',
                                                'sub001', 'masks', 'orig',
                                                'vt.nii.gz')
                               })
    # poly_detrend(haxby, polyord=1, chunks_attr='chunks')
    haxby = haxby[np.array(
        [
            l in ['rest', 'scrambled']  # ''house', 'face']
            for l in haxby.targets
        ],
        dtype='bool')]
    #zscore(haxby, chunks_attr='chunks', param_est=('targets', ['rest']),
    #       dtype='float32')
    # haxby = haxby[haxby.sa.targets != 'rest']
    haxby = remove_invariant_features(haxby)

    clf = GNB(enable_ca='estimates', logprob=True, normalize=True)

    #clf.train(haxby)
    #clf.predict(haxby)
    # estimates a bit "overfit" to judge in the train/predict on the same data

    cv = CrossValidation(clf,
                         HalfPartitioner(attr='chunks'),
                         postproc=None,
                         enable_ca=['stats'])

    cv_results = cv(haxby)
    res1_est = clf.ca.estimates
    print "Estimates:\n", res1_est
    print "Exp(estimates):\n", np.round(np.exp(res1_est), 3)
    assert np.all(np.isfinite(res1_est))
Пример #11
0
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
    def test_perturbation_sensitivity_analyzer(self):
        # compute N-1 cross-validation as datameasure
        cv = CrossValidation(sample_clf_lin, NFoldPartitioner())
        # do perturbation analysis using gaussian noise
        pa = NoisePerturbationSensitivity(cv, noise=np.random.normal)

        # run analysis
        map = pa(self.dataset)

        # check for correct size of map
        self.assertTrue(map.nfeatures == self.dataset.nfeatures)

        # dataset is noise -> mean sensitivity should be zero
        self.assertTrue(-0.2 < np.mean(map) < 0.2)
Пример #13
0
    def test_values(self, clf):
        if isinstance(clf, MulticlassClassifier):
            # TODO: handle those values correctly
            return
        ds = datasets['uni2small']
        clf.ca.change_temporarily(enable_ca = ['estimates'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
            enable_ca=['stats', 'training_stats'])
        _ = cv(ds)
        #print clf.descr, clf.values[0]
        # basic test either we get 1 set of values per each sample
        self.assertEqual(len(clf.ca.estimates), ds.nsamples/2)

        clf.ca.reset_changed_temporarily()
Пример #14
0
    def test_custom_targets(self, lrn):
        """Simple test if a learner could cope with custom sa not targets
        """

        # Since we are comparing performances of two learners, we need
        # to assure that if they depend on some random seed -- they
        # would use the same value.  Currently we have such stochastic
        # behavior in SMLR
        # yoh: we explicitly seed right before calling a CVs below so
        #      this setting of .seed is of no real effect/testing
        if 'seed' in lrn.params:
            from mvpa2 import _random_seed
            lrn = lrn.clone()  # clone the beast
            lrn.params.seed = _random_seed  # reuse the same seed
        lrn_ = lrn.clone()
        lrn_.set_space('custom')

        te = CrossValidation(lrn, NFoldPartitioner())
        te_ = CrossValidation(lrn_, NFoldPartitioner())
        nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
        dsname = ('uni%dsmall' % nclasses,
                  'sin_modulated')[int(lrn.__is_regression__)]
        ds = datasets[dsname]
        ds_ = ds.copy()
        ds_.sa['custom'] = ds_.sa['targets']
        ds_.sa.pop('targets')
        self.assertTrue('targets' in ds.sa,
                        msg="'targets' should remain in original ds")

        try:
            mvpa2.seed()
            cve = te(ds)

            mvpa2.seed()
            cve_ = te_(ds_)
        except Exception, e:
            self.fail("Failed with %r" % e)
Пример #15
0
def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()  # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which would take care about standardizing
        all features within each sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None]) / np.std(
                data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])
    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([
        ZScoreFeaturesMapper(auto_train=True),
        CrossValidation(sample_clf, NFoldPartitioner())
    ])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)
Пример #16
0
 def test_auc(self, clf):
     """Test AUC computation
     """
     if isinstance(clf, MulticlassClassifier):
         raise SkipTest, \
               "TODO: handle values correctly in MulticlassClassifier"
     clf.ca.change_temporarily(enable_ca=['estimates'])
     if 'qda' in clf.__tags__:
         # for reliable estimation of covariances, need sufficient
         # sample size
         ds_size = 'large'
     else:
         ds_size = 'small'
     # uni2 dataset with reordered labels
     ds2 = datasets['uni2' + ds_size].copy()
     # revert labels
     ds2.sa['targets'].value = ds2.targets[::-1].copy()
     # same with uni3
     ds3 = datasets['uni3' + ds_size].copy()
     ul = ds3.sa['targets'].unique
     nl = ds3.targets.copy()
     for l in xrange(3):
         nl[ds3.targets == ul[l]] = ul[(l + 1) % 3]
     ds3.sa.targets = nl
     for ds in [
             datasets['uni2' + ds_size], ds2, datasets['uni3' + ds_size],
             ds3
     ]:
         cv = CrossValidation(clf,
                              OddEvenPartitioner(),
                              enable_ca=['stats', 'training_stats'])
         cverror = cv(ds)
         stats = cv.ca.stats.stats
         Nlabels = len(ds.uniquetargets)
         # so we at least do slightly above chance
         # But LARS manages to screw up there as well ATM from time to time, so making
         # its testing labile
         if (('lars' in clf.__tags__) and cfg.getboolean('tests', 'labile', default='yes')) \
             or (not 'lars' in clf.__tags__):
             self.assertTrue(stats['ACC'] > 1.2 / Nlabels)
         auc = stats['AUC']
         if (Nlabels == 2) or (Nlabels > 2 and auc[0] is not np.nan):
             mauc = np.min(stats['AUC'])
             if cfg.getboolean('tests', 'labile', default='yes'):
                 self.assertTrue(
                     mauc > 0.55,
                     msg='All AUCs must be above chance. Got minimal '
                     'AUC=%.2g among %s' % (mauc, stats['AUC']))
     clf.ca.reset_changed_temporarily()
Пример #17
0
    def test_noise_classification(self):
        # get a dataset with a very high SNR
        data = get_mv_pattern(10)

        # do crossval with default errorfx and 'mean' combiner
        cv = CrossValidation(sample_clf_nl, NFoldPartitioner())

        # must return a scalar value
        result = cv(data)
        # must be perfect
        self.assertTrue((result.samples < 0.05).all())

        # do crossval with permuted regressors
        cv = CrossValidation(
            sample_clf_nl,
            ChainNode(
                [NFoldPartitioner(),
                 AttributePermutator('targets', count=10)],
                space='partitions'))
        results = cv(data)

        # must be at chance level
        pmean = np.array(results).mean()
        self.assertTrue(pmean < 0.58 and pmean > 0.42)
Пример #18
0
    def test_pseudo_cv_measure(self):
        clf = SMLR()
        enode = BinaryFxNode(mean_mismatch_error, 'targets')
        tm = TransferMeasure(clf, Splitter('partitions'), postproc=enode)
        cvgen = NFoldPartitioner()
        rm = RepeatedMeasure(tm, cvgen)
        res = rm(self.dataset)
        # one error per fold
        assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))

        # we can do the same with Crossvalidation
        cv = CrossValidation(clf, cvgen, enable_ca=['stats', 'training_stats',
                                                    'datasets'])
        res = cv(self.dataset)
        assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))
Пример #19
0
    def test_vstack_and_origids_issue(self):
        # That is actually what swaroop hit
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k  = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidation(clf, NFoldPartitioner())
        cvte_ = CrossValidation(clf_, NFoldPartitioner())

        ds = datasets['uni2large'].copy(deep=True)
        ok_(~('orig_ids' in ds.sa))     # assure that there are None
        ck.compute(ds)                  # so we initialize origids
        ok_('origids' in ds.sa)
        ds2 = ds.copy(deep=True)
        ds2.samples = np.zeros(ds2.shape)
        from mvpa2.base.dataset import vstack
        ds_vstacked = vstack((ds2, ds))
        # should complaint now since there would not be unique
        # samples' origids
        if __debug__:
            assert_raises(ValueError, ck.compute, ds_vstacked)

        ds_vstacked.init_origids('samples')      # reset origids
        ck.compute(ds_vstacked)

        errs = cvte(ds_vstacked)
        errs_ = cvte_(ds_vstacked)
        # Following test would have failed since origids
        # were just ints, and then non-unique after vstack
        assert_array_equal(errs.samples, errs_.samples)
Пример #20
0
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['confusion'])  # TODO -- it is stats
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        self.assertTrue(error < 0.2)
Пример #21
0
    def test_regressions_classifiers(self, clf):
        """Simple tests on regressions being used as classifiers
        """
        # check if we get values set correctly
        clf.ca.change_temporarily(enable_ca=['estimates'])
        self.assertRaises(UnknownStateError, clf.ca['estimates']._get)
        cv = CrossValidation(clf, NFoldPartitioner(),
            enable_ca=['stats', 'training_stats'])
        ds = datasets['uni2small'].copy()
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)
        cverror = cv(ds)

        self.assertTrue(len(clf.ca.estimates) == ds[ds.chunks == 1].nsamples)
        clf.ca.reset_changed_temporarily()
Пример #22
0
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assesment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm,
                                   NFoldPartitioner(),
                                   postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  fselector=\
                    # go for lower tail selection as data_measure will return
                    # errors -> low is good


                    FixedNElementTailSelector(1, tail='lower', mode='select'),
                  )
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertTrue(ds.nfeatures == orig_nfeatures)

        # check that the features set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertTrue(resds.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
Пример #23
0
    def test_confusionmatrix_nulldist(self):
        from mvpa2.clfs.gnb import GNB
        from mvpa2.clfs.transerror import ConfusionMatrixError
        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [
                0.,
                2.,
        ]:
            ds = normal_feature_dataset(snr=snr,
                                        perlabel=42,
                                        nchunks=3,
                                        nonbogus_features=[0, 1],
                                        nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf,
                NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(
                    permutator,
                    tail='right',  # because we now look at accuracy not error
                    enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            #print "Result:\n", cmatrix.samples
            cvnp = cv.ca.null_prob.samples
            #print cvnp
            self.assertTrue(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(
                        0.9, cvnp[(np.array([0, 1]), np.array([1, 0]))])
Пример #24
0
def test_CDist_cval():
    if _ENFORCE_CA_ENABLED:
        # skip testing for now, since we are having issue with 'training_stats'
        raise SkipTest(
            "Skipping test to avoid issue with 'training_stats while CA enabled"
        )

    targets = np.tile(list(range(3)), 2)
    chunks = np.repeat(np.array((0, 1)), 3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)

    cv = CrossValidation(CDist(), generator=NFoldPartitioner(), errorfx=None)
    res = cv(ds)
    # Testing to make sure the both folds return same results, as they should
    assert_array_almost_equal(
        res[res.sa.cvfolds == 0, ].samples.reshape(3, 3),
        res[res.sa.cvfolds == 1, ].samples.reshape(3, 3).T)
    # Testing to make sure the last dimension is always 1 to make it work with Searchlights
    assert_equal(res.nfeatures, 1)
Пример #25
0
    def test_gnbsearchlight_3partitions_and_splitter(self):
        ds = self.dataset[:, :20]
        # custom partitioner which provides 3 partitions
        part = CustomPartitioner([([2], [3], [1])])
        gnb_sl = sphere_gnbsearchlight(GNB(), part)
        res_gnb_sl = gnb_sl(ds)

        # compare results to full blown searchlight
        sl = sphere_searchlight(CrossValidation(GNB(), part))
        res_sl = sl(ds)

        assert_datasets_equal(res_gnb_sl, res_sl)

        # and theoretically for this simple single cross-validation we could
        # just use Splitter
        splitter = Splitter('chunks', [2, 3])
        # we have to put explicit None since can't become a kwarg in 1 day any
        # longer here
        gnb_sl_ = sphere_gnbsearchlight(GNB(), None, splitter=splitter)
        res_gnb_sl_ = gnb_sl_(ds)
        assert_datasets_equal(res_gnb_sl, res_gnb_sl_)
Пример #26
0
    def test_regression_with_additional_sa(self):
        regr = regrswh[:][0]
        ds = datasets['3dsmall'].copy()
        ds.fa['voxel_indices'] = ds.fa.myspace

        # Create a new sample attribute which will be used along with
        # every searchlight
        ds.sa['beh'] = np.random.normal(size=(ds.nsamples, 2))

        # and now for fun -- lets create custom linar regression
        # targets out of some random feature and beh linearly combined
        rfeature = np.random.randint(ds.nfeatures)
        ds.sa.targets = np.dot(
            np.hstack((ds.sa.beh, ds.samples[:, rfeature:rfeature + 1])),
            np.array([0.3, 0.2, 0.3]))

        class CrossValidationWithBeh(CrossValidation):
            """An adapter for regular CV which would hstack
               sa.beh to the searchlighting ds"""
            def _call(self, ds):
                return CrossValidation._call(
                    self, Dataset(np.hstack((ds, ds.sa.beh)), sa=ds.sa))

        cvbeh = CrossValidationWithBeh(regr,
                                       OddEvenPartitioner(),
                                       errorfx=corr_error)
        # regular cv
        cv = CrossValidation(regr, OddEvenPartitioner(), errorfx=corr_error)

        slbeh = sphere_searchlight(cvbeh, radius=1)
        slmapbeh = slbeh(ds)
        sl = sphere_searchlight(cv, radius=1)
        slmap = sl(ds)

        assert_equal(slmap.shape, (2, ds.nfeatures))
        # SL which had access to beh should have got for sure better
        # results especially in the vicinity of the chosen feature...
        features = sl.queryengine.query_byid(rfeature)
        assert_array_lequal(slmapbeh.samples[:, features],
                            slmap.samples[:, features])
Пример #27
0
    def test_simple_n_minus_one_cv(self):
        data = get_mv_pattern(3)
        data.init_origids('samples')

        self.assertTrue(data.nsamples == 120)
        self.assertTrue(data.nfeatures == 2)
        self.assertTrue(
            (data.sa.targets == \
                [0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0] * 6).all())
        self.assertTrue(
            (data.sa.chunks == \
                [k for k in range(1, 7) for i in range(20)]).all())
        assert_equal(len(np.unique(data.sa.origids)), data.nsamples)

        cv = CrossValidation(sample_clf_nl,
                             NFoldPartitioner(),
                             enable_ca=['stats', 'training_stats'])
        #                               'samples_error'])

        results = cv(data)
        self.assertTrue((results.samples < 0.2).all()
                        and (results.samples >= 0.0).all())
Пример #28
0
    def test_chi_square_searchlight(self):
        # only do partial to save time

        # Can't yet do this since test_searchlight isn't yet "under nose"
        #skip_if_no_external('scipy')
        if not externals.exists('scipy'):
            return

        from mvpa2.misc.stats import chisquare

        cv = CrossValidation(sample_clf_lin,
                             NFoldPartitioner(),
                             enable_ca=['stats'])

        def getconfusion(data):
            cv(data)
            return chisquare(cv.ca.stats.matrix)[0]

        sl = sphere_searchlight(getconfusion, radius=0, center_ids=[3, 50])

        # run searchlight
        results = sl(self.dataset)
        self.assertTrue(results.nfeatures == 2)
Пример #29
0
    def test_cv_no_generator(self):
        ds = Dataset(np.arange(4),
                     sa={
                         'partitions': [1, 1, 2, 2],
                         'targets': ['a', 'b', 'c', 'd']
                     })

        class Measure(Classifier):
            def _train(self, ds_):
                assert_array_equal(ds_.samples, ds.samples[:2])
                assert_array_equal(ds_.sa.partitions, [1] * len(ds_))

            def _predict(self, ds_):
                # also called for estimating training error
                assert (ds_ is not ds)  # we pass a shallow copy
                assert (len(ds_) < len(ds))
                assert_equal(len(ds_.sa['partitions'].unique), 1)

                return ['c', 'd']

        measure = Measure()
        cv = CrossValidation(measure)
        res = cv(ds)
        assert_array_equal(res, [[0]])  # we did perfect here ;)
Пример #30
0
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets
from mvpa2.mappers.fx import mean_sample

"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.
Пример #31
0
 def _call(self, ds):
     return CrossValidation._call(
         self,
         Dataset(np.hstack((ds, ds.sa.beh)),
                 sa=ds.sa))
Пример #32
0
    def test_partial_searchlight_with_confusion_matrix(self):
        ds = self.dataset
        from mvpa2.clfs.stats import MCNullDist
        from mvpa2.mappers.fx import mean_sample, sum_sample

        # compute N-1 cross-validation for each sphere
        cm = ConfusionMatrix(labels=ds.UT)
        cv = CrossValidation(
            sample_clf_lin, NFoldPartitioner(),
            # we have to assure that matrix does not get flatted by
            # first vstack in cv and then hstack in searchlight --
            # thus 2 leading dimensions
            # TODO: RF? make searchlight/crossval smarter?
            errorfx=lambda *a: cm(*a)[None, None, :])
        # contruct diameter 2 (or just radius 1) searchlight
        sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50])

        # our regular searchlight -- to compare results
        cv_gross = CrossValidation(sample_clf_lin, NFoldPartitioner())
        sl_gross = sphere_searchlight(cv_gross, radius=1, center_ids=[3, 5, 50])

        # run searchlights
        res = sl(ds)
        res_gross = sl_gross(ds)

        # only two spheres but error for all CV-folds and complete confusion matrix
        assert_equal(res.shape, (len(ds.UC), 3, len(ds.UT), len(ds.UT)))
        assert_equal(res_gross.shape, (len(ds.UC), 3))

        # briefly inspect the confusion matrices
        mat = res.samples
        # since input dataset is probably balanced (otherwise adjust
        # to be per label): sum within columns (thus axis=-2) should
        # be identical to per-class/chunk number of samples
        samples_per_classchunk = len(ds) / (len(ds.UT) * len(ds.UC))
        ok_(np.all(np.sum(mat, axis= -2) == samples_per_classchunk))
        # and if we compute accuracies manually -- they should
        # correspond to the one from sl_gross
        assert_array_almost_equal(res_gross.samples,
                           # from accuracies to errors
                           1 - (mat[..., 0, 0] + mat[..., 1, 1]).astype(float)
                           / (2 * samples_per_classchunk))

        # and now for those who remained sited -- lets perform H0 MC
        # testing of this searchlight... just a silly one with minimal
        # number of permutations
        no_permutations = 10
        permutator = AttributePermutator('targets', count=no_permutations)

        # once again -- need explicit leading dimension to avoid
        # vstacking during cross-validation
        cv.postproc = lambda x: sum_sample()(x)[None, :]

        sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50],
                                null_dist=MCNullDist(permutator, tail='right',
                                                     enable_ca=['dist_samples']))
        res_perm = sl(ds)
        # XXX all of the res_perm, sl.ca.null_prob and
        #     sl.null_dist.ca.dist_samples carry a degenerate leading
        #     dimension which was probably due to introduced new axis
        #     above within cv.postproc
        assert_equal(res_perm.shape, (1, 3, 2, 2))
        assert_equal(sl.null_dist.ca.dist_samples.shape,
                     res_perm.shape + (no_permutations,))
        assert_equal(sl.ca.null_prob.shape, res_perm.shape)
        # just to make sure ;)
        ok_(np.all(sl.ca.null_prob.samples >= 0))
        ok_(np.all(sl.ca.null_prob.samples <= 1))

        # we should have got sums of hits across the splits
        assert_array_equal(np.sum(mat, axis=0), res_perm.samples[0])
Пример #33
0
fsel = OneWayAnova()
import mvpa2.featsel as fs
fselector = fs.helpers.FixedNElementTailSelector(nVox, tail='upper',
                                                 mode='select', sort=False)
# fselector = fs.helpers.FractionTailSelector(0.05, mode='select', tail='upper')
sbfs = fs.base.SensitivityBasedFeatureSelection(fsel, fselector,
                                                enable_ca=['sensitivities'])
from mvpa2.clfs.meta import FeatureSelectionClassifier, MappedClassifier
fclf = FeatureSelectionClassifier(clf, sbfs)

from mvpa2.measures.base import CrossValidation
from mvpa2.misc import errorfx
from mvpa2.generators.partition import NFoldPartitioner

cv = CrossValidation(fclf,
                     NFoldPartitioner(attr='chunks'),
                     errorfx=errorfx.mean_match_accuracy)

import numpy as np
from mvpa2.misc.io.base import SampleAttributes
cv_attr = SampleAttributes(os.path.join(paths[3], (con + "_attribute_labels.txt")))

from mvpa2.measures import rsa
dsm = rsa.PDist(square=True)
# searchlight
# import searchlightutils as sl
# from mvpa2.measures.searchlight import sphere_searchlight
# cvSL = sphere_searchlight(cv, radius=r)
# lres = sl.run_cv_sl(cvSL, fds[lidx].copy(deep=False))

lresults = []