Example #1
def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets
    from numpy.testing import assert_array_equal

    import numpy as np

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which would take care about standardizing
        all features within each sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None])/np.std(data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])
    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([ZScoreFeaturesMapper(auto_train=True),
                              CrossValidation(sample_clf, NFoldPartitioner())])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)
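
A quick standalone check (a minimal sketch in pure numpy, independent of PyMVPA) of the per-sample standardization that ZScoreFeaturesMapper implements:

import numpy as np

data = np.random.randn(4, 10)
zscored = (data - np.mean(data, axis=1)[:, None]) / np.std(data, axis=1)[:, None]
# every sample (row) ends up with zero mean and unit variance
assert np.allclose(zscored.mean(axis=1), 0)
assert np.allclose(zscored.std(axis=1), 1)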
Example #2
    def test_noise_classification(self):
        # get a dataset with a very high SNR
        data = get_mv_pattern(10)

        # do crossval with default errorfx and 'mean' combiner
        cv = CrossValidation(sample_clf_nl, NFoldPartitioner())

        # must return a scalar value
        result = cv(data)
        # must be perfect
        self.assertTrue((result.samples < 0.05).all())

        # do crossval with permuted regressors
        cv = CrossValidation(
            sample_clf_nl,
            ChainNode(
                [NFoldPartitioner(),
                 AttributePermutator('targets', count=10)],
                space='partitions'))
        results = cv(data)

        # results must not be the same
        self.assertTrue(len(np.unique(results.samples)) > 1)

        # must be at chance level
        pmean = np.array(results).mean()
        self.assertTrue(0.42 < pmean < 0.58)
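
As a side note, here is a minimal sketch (reusing only the Dataset and AttributePermutator APIs seen elsewhere in these examples) of what AttributePermutator does on its own: it yields copies of the dataset with the named attribute reshuffled while the samples stay untouched:

import numpy as np
from mvpa2.datasets.base import Dataset
from mvpa2.generators.permutation import AttributePermutator

ds = Dataset(np.arange(12).reshape(6, 2),
             sa={'targets': ['a', 'a', 'a', 'b', 'b', 'b']})
for ds_perm in AttributePermutator('targets', count=3).generate(ds):
    assert (ds_perm.samples == ds.samples).all()  # samples untouched
    # targets are a permutation of the originals
    assert sorted(ds_perm.sa.targets) == sorted(ds.sa.targets)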
Example #3
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={
                     'chunks': [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']
                 })

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but will care only
    # about those partitions where we have a balanced number of 'c' and 'p'
    # entries
    assert_raises(
        ValueError,
        lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)), ds)

    par = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2),
                ('targets', dict(uvalues=['c', 'p'], balanced=True))])
    ])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
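    # 9 = C(3,2) * C(3,2): a balanced testing partition takes exactly
    # 2 of the 3 'c' chunks and 2 of the 3 'p' chunks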
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets  present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
Example #4
    def test_nfold_random_counted_selection_partitioner(self):
        # Let's get a somewhat extensive but complete case and see if
        # everything is legit. 0.5 must correspond to 50%, in our case
        # 5 out of 10 unique chunks
        split_partitions = [
            tuple(x.sa.partitions)
            for x in NFoldPartitioner(0.5).generate(self.data)]
        # 252 is # of combinations of 5 from 10
        assert_equal(len(split_partitions), 252)

        # verify that all of them are unique
        assert_equal(len(set(split_partitions)), 252)

        # now let's limit our query
        kwargs = dict(count=10, selection_strategy='random')
        split10_partitions = [
            tuple(x.sa.partitions)
            for x in NFoldPartitioner(5, **kwargs).generate(self.data)]
        split10_partitions_ = [
            tuple(x.sa.partitions)
            for x in NFoldPartitioner(0.5, **kwargs).generate(self.data)]
        # to make sure that I deal with sets of tuples correctly:
        assert_equal(len(set(split10_partitions)), 10)
        assert_equal(len(split10_partitions), 10)
        assert_equal(len(split10_partitions_), 10)
        # and they must differ (same ones are possible but very very unlikely)
        assert_not_equal(split10_partitions, split10_partitions_)
        # but every one of them must be within known exhaustive set
        assert_equal(set(split_partitions).intersection(split10_partitions),
                     set(split10_partitions))
        assert_equal(set(split_partitions).intersection(split10_partitions_),
                     set(split10_partitions_))
Example #5
    def test_custom_targets(self, lrn):
        """Simple test if a learner could cope with custom sa not targets
        """

        # Since we are comparing performances of two learners, we need
        # to assure that if they depend on some random seed -- they
        # would use the same value.  Currently we have such stochastic
        # behavior in SMLR
        if 'seed' in lrn.params:
            from mvpa2 import _random_seed
            lrn = lrn.clone()  # clone the beast
            lrn.params.seed = _random_seed  # reuse the same seed
        lrn_ = lrn.clone()
        lrn_.set_space('custom')

        te = CrossValidation(lrn, NFoldPartitioner())
        te_ = CrossValidation(lrn_, NFoldPartitioner())
        nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
        dsname = ('uni%dsmall' % nclasses,
                  'sin_modulated')[int(lrn.__is_regression__)]
        ds = datasets[dsname]
        ds_ = ds.copy()
        ds_.sa['custom'] = ds_.sa['targets']
        ds_.sa.pop('targets')
        self.assertTrue('targets' in ds.sa,
                        msg="'targets' should remain in original ds")

        try:
            cve = te(ds)
            cve_ = te_(ds_)
        except Exception as e:
            self.fail("Failed with %r" % e)
Example #6
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    import numpy as np

    # Let's simulate the beast -- 6 categories total, grouped into 3
    # superordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    npart = ChainNode(
        [
            ## split out 3 subord categories at a time (3 = # of superord ones)
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # and then do your normal CrossValidation, where the clf operates in space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf,
                                   NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf,
                                 npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits: of the C(6,3)=20 ways
    # to pick 3 of the 6 subord categories, only those taking exactly one
    # from each superord category survive
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)
Example #7
    def test_gnbsearchlight_matchaccuracy(self):
        # was not able to deal with custom errorfx collapsing samples
        # after 55e147e0bd30fbf4edede3faef3a15c6c65b33ea
        ds = datasets['3dmedium'].copy()
        ds.fa['voxel_indices'] = ds.fa.myspace
        sl_err = sphere_gnbsearchlight(GNB(),
                                       NFoldPartitioner(cvtype=1),
                                       radius=0)
        sl_acc = sphere_gnbsearchlight(GNB(),
                                       NFoldPartitioner(cvtype=1),
                                       radius=0,
                                       errorfx=mean_match_accuracy)
        assert_array_almost_equal(sl_err(ds), 1.0 - sl_acc(ds).samples)
Example #8
    def test_split_clf_on_chainpartitioner(self):
        # pretty much a smoke test for #156
        ds = datasets['uni2small']
        part = ChainNode([
            NFoldPartitioner(cvtype=1),
            Balancer(attr='targets',
                     count=2,
                     limit='partitions',
                     apply_selection=True)
        ])
        partitions = list(part.generate(ds))
        sclf = SplitClassifier(sample_clf_lin,
                               part,
                               enable_ca=['stats', 'splits'])
        sclf.train(ds)
        pred = sclf.predict(ds)
        assert_equal(len(pred), len(ds))  # rudimentary check
        assert_equal(len(sclf.ca.splits), len(partitions))
        assert_equal(len(sclf.clfs), len(partitions))

        # now let's run the sensitivity analyzer just in case
        sclf.untrain()
        sensana = sclf.get_sensitivity_analyzer()
        sens = sensana(ds)
        # basic check that sensitivities varied across splits
        from mvpa2.mappers.fx import FxMapper
        sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
        assert_true(np.any(sens_stds != 0))
Example #9
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(
            clf=clf_,  #SameSignClassifier(),
            enable_ca=['stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if 'non-deterministic' not in clf.__tags__:
            self.assertTrue(
                abs(error - cverror) < 0.01,
                msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion matrix per split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Number of classifiers should equal the number of splits")
Example #10
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            enable_ca=['stats', 'training_stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

        self.assertEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                         100,
                         msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion matrix per split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Number of classifiers should equal the number of splits")
        self.assertEqual(clf.predict(ds.samples),
                         list(ds.targets),
                         msg="Should classify correctly")

        # feature_ids must be a list of lists, and since no
        # feature-selecting classifier is used -- we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Example #11
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception as e:
            self.fail("Failed with %s" % e)

        if cfg.getboolean('tests', 'labile', default='yes'):
            if nclasses > 2 and \
                   ((clf.descr is not None and 'on 5%(' in clf.descr)
                    or 'regression_based' in clf.__tags__):
                # skip those since they are barely applicable/testable here
                raise SkipTest("Skip testing of cve on %s" % clf)

            self.assertTrue(
                cve < 0.25,  # TODO: use multinom distribution
                msg="Got transfer error %g on %s with %d labels" %
                (cve, ds, len(ds.UT)))
Example #12
def test_exclude_targets_combinations_subjectchunks():
    partitioner = ChainNode([NFoldPartitioner(attr='subjects'),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=1,
                                 targets_attr='chunks',
                                 space='partitions')],
                            space='partitions')
    # targets do not even need to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so there is an implicit test here:
        # if not, np.asscalar would raise an error
        testing_subj = np.asscalar(np.unique(ds_.sa.subjects[testing_partition]))
        testing_subjs.append(testing_subj)
        testing_chunk = np.asscalar(np.unique(ds_.sa.chunks[testing_partition]))
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(testing_subj not in ds_.sa.subjects[training_partition])
        ok_(testing_chunk not in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3,3))))))
Example #13
def test_permute_superord():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    import numpy as np

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    part = ChainNode(
        [
            ## split out 3 subord categories at a time (3 = # of superord ones)
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
            AttributePermutator(['superord'], limit=['partitions', 'chunks']),
        ],
        space='partitions')

    for ds_perm in part.generate(ds):
        # it does permutation
        assert (np.sum(ds_perm.sa.superord != ds.sa.superord) != 0)
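
_get_superord_dataset() is a helper defined in the enclosing test module; a minimal sketch of it, mirroring the inline dataset construction of the sifter superord use case in Example #6, might look like:

def _get_superord_dataset():
    # 6 subordinate categories grouped into 3 superordinate ones,
    # deliberately without any 'superordinate' effect
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(nlabels=6, snr=100, perlabel=30,
                                nfeatures=6, nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    ds.targets[:] = 0  # make sure nothing relies on the original targets
    return ds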
Example #14
def test_splitclf_sensitivities():
    datasets = [
        normal_feature_dataset(perlabel=100,
                               nlabels=2,
                               nfeatures=4,
                               nonbogus_features=[0, i + 1],
                               snr=1,
                               nchunks=2) for i in range(2)
    ]

    sclf = SplitClassifier(SMLR(), NFoldPartitioner())
    analyzer = sclf.get_sensitivity_analyzer()

    senses1 = analyzer(datasets[0])
    senses2 = analyzer(datasets[1])

    for senses in senses1, senses2:
        # sensitivities must differ across the two folds
        assert_false(np.allclose(senses.samples[0], senses.samples[2]))
        assert_false(np.allclose(senses.samples[1], senses.samples[3]))
    # Moreover with new data we should have got different results
    # (i.e. it must have retrained correctly)
    for s1, s2 in zip(senses1, senses2):
        assert_false(np.allclose(s1, s2))

    # and we should have "selected" "correct" voxels
    for i, senses in enumerate((senses1, senses2)):
        assert_equal(set(np.argsort(np.max(np.abs(senses), axis=0))[-2:]),
                     set((0, i + 1)))
Example #15
    def test_unpartitioned_cv(self):
        data = get_mv_pattern(10)
        # only one big chunk
        data.sa.chunks[:] = 1
        cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
        # must fail, because the data cannot be split into training and testing
        assert_raises(ValueError, cv, data)
Example #16
def _test_mcasey20120222():  # pragma: no cover
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    mapper = ChainMapper([mean_group_sample(['targets','chunks']),
                          SVDMapper()])
    clf = MappedClassifier(LinearCSVMC(), mapper)
    cvte = CrossValidation(clf, NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])

    ds = dataset_wizard(
        samples=np.arange(32).reshape((8, -1)),
        targets=[1, 1, 2, 2, 1, 1, 2, 2],
        chunks=[1, 1, 1, 1, 2, 2, 2, 2])

    errors = cvte(ds)
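
Why the mapper changes the number of samples (the condition mentioned in the comments above): mean_group_sample(['targets', 'chunks']) averages all samples sharing a (targets, chunks) combination, so the 8-sample dataset collapses to 4 samples before the SVD and classification. A one-line check, reusing ds and the imports from this example:

assert mean_group_sample(['targets', 'chunks'])(ds).nsamples == 4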
Example #17
    def test_split_featurewise_dataset_measure(self):
        ds = datasets['uni3small']
        sana = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            ChainNode(
                [NFoldPartitioner(),
                 Splitter('partitions', attr_values=[1])]))

        sens = sana(ds)
        # a sensitivity for each chunk and each label combination
        assert_equal(sens.shape, (len(ds.sa['chunks'].unique) *
                                  len(ds.sa['targets'].unique), ds.nfeatures))

        # Let's try a more complex example with 'boosting'
        ds = datasets['uni3medium']
        ds.init_origids('samples')
        sana = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            Balancer(amount=0.25, count=2, apply_selection=True),
            enable_ca=['datasets', 'repetition_results'])
        sens = sana(ds)

        assert_equal(sens.shape,
                     (2 * len(ds.sa['targets'].unique), ds.nfeatures))
        splits = sana.ca.datasets
        self.assertEqual(len(splits), 2)
        self.assertTrue(
            np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
        # should have used different samples
        self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
        # and should have got different sensitivities
        self.assertTrue(np.any(sens[0] != sens[3]))
Example #18
def _test_edmund_chong_20120907():  # pragma: no cover
    # star import commented out to avoid syntax warnings while compiling:
    # from mvpa2.suite import *
    # explicit imports instead (module paths assumed per the mvpa2 layout)
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.generators.base import Repeater
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.generators.resampling import Balancer
    from mvpa2.measures.base import CrossValidation
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.testing.datasets import datasets

    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1, # for real data > 1
                                      limit='partitions',
                                      apply_selection=True
                                      )],
                            space='partitions')

    clf = LinearCSVMC()  # choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator], space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])
Example #19
def test_multiclass_without_combiner():
    # The goal is to obtain all pairwise results as the resultant dataset
    # avoiding even calling any combiner
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(clf, combiner=None)

    # without combining results at all
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_equal(len(res), len(ds))
    assert_equal(res.nfeatures, 3)  # 3 pairs for 3 classes
    assert_array_equal(res.UT, ds.UT)
    assert_array_equal(np.unique(np.array(res.fa.targets.tolist())), ds.UT)
    # TODO -- check that we have all the pairs?
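    # a hedged sketch of that check: every pairwise task should appear
    # exactly once among the feature attributes
    assert_equal(set(tuple(pair) for pair in res.fa.targets),
                 set([('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')]))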
    assert_array_equal(res.sa['cvfolds'].unique, np.arange(len(ds.UC)))
    if mcv.ca.is_enabled('training_stats'):
        # we must have received a confusion entry for each label pair
        training_stats = mcv.ca.training_stats
        assert_equal(set(training_stats.keys()),
                     set([('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')]))
        for pair, cm in training_stats.items():
            assert_array_equal(cm.labels, ds.UT)
            # we should have no predictions for absent label
            assert_array_equal(cm.matrix[~np.in1d(ds.UT, pair)], 0)
            # while altogether all samples were processed once
            assert_array_equal(cm.stats['P'], len(ds))
            # and number of sets should be equal number of chunks here
            assert_equal(len(cm.sets), len(ds.UC))
Example #20
def test_multiclass_classifier_cv(clf, ds):
    # Extending test_clf.py:ClassifiersTests.test_multiclass_classifier
    # Compare performance with our MaximalVote to the one done natively
    # by e.g. LIBSVM
    clf = clf.clone()
    clf.params.C = 1  # so it doesn't auto-adjust
    mclf = MulticlassClassifier(clf=clf.clone())
    part = NFoldPartitioner()
    cv = CrossValidation(clf, part, enable_ca=['stats', 'training_stats'])
    mcv = CrossValidation(mclf, part, enable_ca=['stats', 'training_stats'])

    er = cv(ds)
    mer = mcv(ds)

    # errors should be the same
    assert_array_equal(er, mer)
    assert_equal(str(cv.ca.training_stats), str(mcv.ca.training_stats))
    # if it was a binary task, cv.ca.stats would also have AUC column
    # while mcv would not  :-/  TODO
    if len(ds.UT) == 2:
        # so just compare the matrix and ACC
        assert_array_equal(cv.ca.stats.matrix, mcv.ca.stats.matrix)
        assert_equal(cv.ca.stats.stats['ACC'], mcv.ca.stats.stats['ACC'])
    else:
        assert_equal(str(cv.ca.stats), str(mcv.ca.stats))
Example #21
def test_multiclass_classifier_pass_ds_attributes():
    # TODO: replicate/extend basic testing of pass_attr
    #       in some more "basic" test_*
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(
        clf,
        pass_attr=[
            'ids',
            'sa.chunks',
            'a.bogus_features',
            # 'ca.raw_estimates' # this one is binary_clf x samples list ATM
            # that is why raw_predictions_ds was born
            'ca.raw_predictions_ds',
            'ca.estimates',  # this one is ok
            'ca.predictions',
        ],
        enable_ca=['all'])
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_array_equal(sorted(res.sa.ids), ds.sa.ids)
    assert_array_equal(res.chunks, ds.chunks[res.sa.ids])
    assert_array_equal(res.sa.predictions, res.samples[:, 0])
    assert_array_equal(res.sa.cvfolds,
                       np.repeat(range(len(ds.UC)),
                                 len(ds) // len(ds.UC)))
Example #22
    def __test_matthias_question(self):
        rfe_clf = LinearCSVMC(C=1)

        rfesvm_split = SplitClassifier(rfe_clf)
        clf = FeatureSelectionClassifier(
            clf=LinearCSVMC(C=1),
            feature_selection=RFE(
                sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(
                    combiner=first_axis_mean,
                    transformer=np.abs),
                transfer_error=ConfusionBasedError(
                    rfesvm_split,
                    confusion_state="confusion"),
                stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                feature_selector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                update_sensitivity=True))

        no_permutations = 1000
        permutator = AttributePermutator('targets', count=no_permutations)
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             null_dist=MCNullDist(permutator, tail='left'),
                             enable_ca=['stats'])
        error = cv(datasets['uni2small'])
        self.assertTrue(error < 0.4)
        self.assertTrue(cv.ca.null_prob < 0.05)
Example #23
def test_exclude_targets_combinations():
    partitioner = ChainNode(
        [NFoldPartitioner(),
         ExcludeTargetsCombinationsPartitioner(
             k=2, targets_attr='targets', space='partitions')],
        space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
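    # 3 * 6: 3 NFold folds times C(4,2)=6 ways to exclude a pair of targets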
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
Example #24
    def test_cached_qe_gnbsearchlight(self):
        ds1 = datasets['3dsmall'].copy(deep=True)
        qe = IndexQueryEngine(myspace=Sphere(2))
        cached_qe = CachedQueryEngine(qe)
        gnb_sl = GNBSearchlight(GNB(), NFoldPartitioner(), qe=cached_qe)
        res = gnb_sl(ds1)
        assert_false(cached_qe.ids is None)
Example #25
    def test_slicing(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(hs.generate(self.data))
        for s in splits:
            # the partitioned dataset shares the data
            assert_true(s.samples.base is self.data.samples)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        # with numpy 1.7.0b1 "chaining" was deprecated, so let's create a
        # check function appropriate for the given numpy version
        _a = np.arange(5)
        __a = _a[:4][:3]
        if __a.base is _a:
            # 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base is base
        elif __a.base.base is _a:
            # prior 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base.base is base
        else:
            raise RuntimeError("Unknown handling of .base by numpy")

        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples))
            assert_true(is_the_same_base(s[1].samples))
        spl = Splitter(attr='partitions', noslicing=True)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        for s in splits:
            # we get no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        nfs = NFoldPartitioner()
        spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
        for i, s in enumerate(splits):
            # training only first and last split
            if i == 0 or i == len(splits) - 1:
                assert_true(is_the_same_base(s[0].samples))
            else:
                assert_true(s[0].samples.base is None)
            # we get slicing all the time
            assert_true(s[1].samples.base.base is self.data.samples)
        step_ds = Dataset(np.random.randn(20, 2),
                          sa={'chunks': np.tile([0, 1], 10)})
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(oes.generate(step_ds))
        for s in splits:
            # the partitioned dataset shares the data
            assert_true(s.samples.base is step_ds.samples)
        splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time
            assert_true(s[0].samples.base.base is step_ds.samples)
            assert_true(s[1].samples.base.base is step_ds.samples)
Example #26
    def test_partial_searchlight_with_full_report(self):
        ds = self.dataset.copy()
        center_ids = np.zeros(ds.nfeatures, dtype='bool')
        center_ids[[3, 50]] = True
        ds.fa['center_ids'] = center_ids
        # compute N-1 cross-validation for each sphere
        cv = CrossValidation(GNB(), NFoldPartitioner())
        # construct a diameter 1 (i.e. just radius 0) searchlight
        # one time give center ids as a list, the other one takes it from the
        # dataset itself
        sls = (
            sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
            sphere_searchlight(None, radius=0, center_ids=[3, 50]),
            sphere_searchlight(cv, radius=0, center_ids='center_ids'),
        )
        for sl in sls:
            # assure that we could set cv post constructor
            if sl.datameasure is None:
                sl.datameasure = cv
            # run searchlight
            results = sl(ds)
            # only two spheres but error for all CV-folds
            self.assertEqual(results.shape, (len(self.dataset.UC), 2))
            # Test if results hold if we "set" a "new" datameasure
            sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
            results2 = sl(ds)
            assert_array_almost_equal(results, results2)

        # test if we gracefully puke if center_ids are out of bounds
        dataset0 = ds[:, :50]  # so we have no 50th feature
        self.assertRaises(IndexError, sls[0], dataset0)
        # but it should be fine on the one that gets the ids from the dataset
        # itself
        results = sl(dataset0)
        assert_equal(results.nfeatures, 1)
        # check whether roi_seeds are correct
        sl = sphere_searchlight(lambda x: np.vstack(
            (x.fa.roi_seed, x.samples)),
                                radius=1,
                                add_center_fa=True,
                                center_ids=[12])
        res = sl(ds)
        assert_array_equal(
            res.samples[1:, res.samples[0].astype('bool')].squeeze(),
            ds.samples[:, 12])
Example #27
    def test_incorrect_parameter_error(self):
        # Just a sample class
        from mvpa2.generators.partition import NFoldPartitioner
        try:
            spl = NFoldPartitioner(1, incorrect=None)
            raise AssertionError("Must have failed with an exception here "
                                 "due to incorrect parameter")
        except Exception as e:
            estr = str(e)
Example #28
def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):

    conf = read_configuration(path, conf_file, type)

    for arg in kwargs:
        conf[arg] = kwargs[arg]

    # radius must be given via kwargs or the configuration file,
    # otherwise it would be undefined below
    radius = kwargs.get('radius', conf.get('radius'))

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []

    for ds in ds_merged:

        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')

        sl_map = sl(ds)

        sl_map.samples *= -1
        sl_map.samples += 1

        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)

        maps.append(nif)

    datetime = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = datetime + '_' + analysis + '_' + mask + '_' + task
    parent_dir = os.path.join(path, '0_results', new_dir)
    os.makedirs(parent_dir)

    for s, nifti_map in zip(subjects, maps):  # avoid shadowing builtin map()
        results_dir = os.path.join(parent_dir, s)
        os.makedirs(results_dir)

        fname = s + '_radius_' + str(radius) + '_searchlight_map.nii.gz'
        nifti_map.to_filename(os.path.join(results_dir, fname))

    return maps
Example #29
    def test_cached_kernel_different_datasets(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k  = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidation(clf, NFoldPartitioner())
        cvte_ = CrossValidation(clf_, NFoldPartitioner())

        postproc = BinaryFxNode(mean_mismatch_error, 'targets')
        te = ProxyMeasure(clf, postproc=postproc)
        te_ = ProxyMeasure(clf_, postproc=postproc)

        for r in range(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            ok_(~ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(~ck._recomputed)
            assert_array_equal(errs2, errs2_)

            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            te.train(datasets['uni3small'][::2])
            terr = np.asscalar(te(datasets['uni3small'][ssel]))
            te_.train(datasets['uni3small'][::2])
            terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
            ok_(~ck._recomputed)
            ok_(terr == terr_)
Example #30
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    import numpy as np

    ds = _get_superord_dataset()

    npart = ChainNode(
        [
            ## split out 3 subord categories at a time (3 = # of superord ones)
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # and then do your normal CrossValidation, where the clf operates in space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf,
                                   NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf,
                                 npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits: of the C(6,3)=20 ways
    # to pick 3 of the 6 subord categories, only those taking exactly one
    # from each superord category survive
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)