Example #1
def test_partitionmapper():
    ds = give_data()
    oep = OddEvenPartitioner()
    parts = list(oep.generate(ds))
    assert_equal(len(parts), 2)
    for i, p in enumerate(parts):
        assert_array_equal(p.sa['partitions'].unique, [1, 2])
        assert_equal(p.a.partitions_set, i)
        assert_equal(len(p), len(ds))
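
A minimal sketch (toy dataset assumed) of what the partitioner yields: two datasets, each keeping all samples, whose sa.partitions attribute assigns the odd-chunk and even-chunk halves to training (1) and testing (2), with the roles swapped between the two generated datasets.

import numpy as np
from mvpa2.datasets import dataset_wizard
from mvpa2.generators.partition import OddEvenPartitioner

ds_toy = dataset_wizard(np.random.randn(8, 4),
                        targets=[0, 1] * 4,
                        chunks=[0, 0, 1, 1, 2, 2, 3, 3])
for part in OddEvenPartitioner().generate(ds_toy):
    # only sa.partitions differs between the two generated datasets
    print(part.sa.partitions)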
Example #2
def test_searchlight_errors_per_trial():
    # To make sure that searchlight can return error/accuracy per trial
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.partition import OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.testing.datasets import datasets
    from mvpa2.misc.errorfx import prediction_target_matches

    dataset = datasets['3dsmall'].copy()
    # randomly permute samples so we break any random correspondence
    # to strengthen tests below
    sample_idx = np.arange(len(dataset))
    dataset = dataset[np.random.permutation(sample_idx)]

    dataset.sa.targets = ['L%d' % l for l in dataset.sa.targets]
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    part = OddEvenPartitioner()
    # only do partial to save time
    # errorfx=None -> return raw predictions per trial
    # (prediction_target_matches would return per-trial hits instead)
    cv = CrossValidation(sample_clf, part, errorfx=None)
    # Just to compare error
    cv_error = CrossValidation(sample_clf, part)

    # Large searchlight radius so we get entire ROI, 2 centers just to make sure
    # that all stacking works correctly
    sl = sphere_searchlight(cv, radius=10, center_ids=[0, 1])
    results = sl(dataset)

    sl_gnb = sphere_gnbsearchlight(sample_clf, part, radius=10, errorfx=None,
                                   center_ids=[0, 1])
    results_gnbsl = sl_gnb(dataset)

    # inspect both results
    # verify that partitioning was done correctly
    partitions = list(part.generate(dataset))
    for res in (results, results_gnbsl):
        assert('targets' in res.sa.keys())  # should carry targets
        assert('cvfolds' in res.sa.keys())  # should carry cvfolds
        for ipart in xrange(len(partitions)):
            assert_array_equal(dataset[partitions[ipart].sa.partitions == 2].targets,
                               res.sa.targets[res.sa.cvfolds == ipart])

    assert_datasets_equal(results, results_gnbsl)

    # one "accuracy" per trial
    assert_equal(results.shape, (len(dataset), 2))
    # with accuracies the same in both searchlights since the same
    # features were selected in both cases due to the large radii
    errors_dataset = cv(dataset)
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 0])
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 1])
    # and the error matching (up to precision) the one obtained with the
    # default error function
    assert_array_almost_equal(np.mean(results.targets[:, None] != results.samples, axis=0)[0],
                              np.mean(cv_error(dataset)))
Example #3
    def test_regression_as_classifier(self, regr):
        """Basic tests of the meta-classifier for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidation(clf,
                                 OddEvenPartitioner(),
                                 postproc=mean_sample(),
                                 enable_ca=['stats', 'training_stats'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.3,
                                msg="Got error %.2f on %s dataset" %
                                (error, dsname))

            # Check that it does not puke on repr and str
            self.assertTrue(str(clf) != "")
            self.assertTrue(repr(clf) != "")

            self.assertEqual(clf.ca.distances.shape,
                             (ds.nsamples / 2, nlabels))
Example #4
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets',
                                         count=num_perm,
                                         limit='chunks')
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use mean_mismatch_error
        # as the error function and lower is better
        terr = TransferMeasure(l_clf,
                               Repeater(count=2),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'),
                               null_dist=MCNullDist(permutator, tail='left'))

        # check reasonable error range
        err = terr(train)
        self.assertTrue(np.mean(err) < 0.4)

        # Let's do the same for CVTE
        cvte = CrossValidation(l_clf,
                               OddEvenPartitioner(),
                               null_dist=MCNullDist(permutator,
                                                    tail='left',
                                                    enable_ca=['dist_samples'
                                                               ]),
                               postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.assertTrue(
                np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)
Example #5
    def test_nblocks(self):
        skip_if_no_external('pprocess')
        # just a basic test to see that we are getting the same
        # results with different nblocks
        ds = datasets['3dsmall'].copy(deep=True)[:, :13]
        ds.fa['voxel_indices'] = ds.fa.myspace
        cv = CrossValidation(GNB(), OddEvenPartitioner())
        res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
        res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
        assert_array_equal(res1, res2)
Example #6
    def test_regression_with_additional_sa(self):
        regr = regrswh[:][0]
        ds = datasets['3dsmall'].copy()
        ds.fa['voxel_indices'] = ds.fa.myspace

        # Create a new sample attribute which will be used along with
        # every searchlight
        ds.sa['beh'] = np.random.normal(size=(ds.nsamples, 2))

        # and now for fun -- let's create custom linear regression
        # targets out of some random feature and beh linearly combined
        rfeature = np.random.randint(ds.nfeatures)
        ds.sa.targets = np.dot(
            np.hstack((ds.sa.beh, ds.samples[:, rfeature:rfeature + 1])),
            np.array([0.3, 0.2, 0.3]))

        class CrossValidationWithBeh(CrossValidation):
            """An adapter for regular CV which would hstack
               sa.beh to the searchlighting ds"""
            def _call(self, ds):
                return CrossValidation._call(
                    self, Dataset(np.hstack((ds, ds.sa.beh)), sa=ds.sa))

        cvbeh = CrossValidationWithBeh(regr,
                                       OddEvenPartitioner(),
                                       errorfx=corr_error)
        # regular cv
        cv = CrossValidation(regr, OddEvenPartitioner(), errorfx=corr_error)

        slbeh = sphere_searchlight(cvbeh, radius=1)
        slmapbeh = slbeh(ds)
        sl = sphere_searchlight(cv, radius=1)
        slmap = sl(ds)

        assert_equal(slmap.shape, (2, ds.nfeatures))
        # SL which had access to beh should surely have gotten better
        # results, especially in the vicinity of the chosen feature...
        features = sl.queryengine.query_byid(rfeature)
        assert_array_lequal(slmapbeh.samples[:, features],
                            slmap.samples[:, features])
Example #7
    def test_values(self, clf):
        if isinstance(clf, MulticlassClassifier):
            # TODO: handle those values correctly
            return
        ds = datasets['uni2small']
        clf.ca.change_temporarily(enable_ca=['estimates'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
            enable_ca=['stats', 'training_stats'])
        _ = cv(ds)
        # basic test that we get one set of estimates per testing sample
        self.assertEqual(len(clf.ca.estimates), ds.nsamples/2)

        clf.ca.reset_changed_temporarily()
Example #8
    def test_auc(self, clf):
        """Test AUC computation
        """
        if isinstance(clf, MulticlassClassifier):
            raise SkipTest(
                "TODO: handle values correctly in MulticlassClassifier")
        clf.ca.change_temporarily(enable_ca=['estimates'])
        if 'qda' in clf.__tags__:
            # for reliable estimation of covariances, need sufficient
            # sample size
            ds_size = 'large'
        else:
            ds_size = 'small'
        # uni2 dataset with reordered labels
        ds2 = datasets['uni2' + ds_size].copy()
        # revert labels
        ds2.sa['targets'].value = ds2.targets[::-1].copy()
        # same with uni3
        ds3 = datasets['uni3' + ds_size].copy()
        ul = ds3.sa['targets'].unique
        nl = ds3.targets.copy()
        for l in xrange(3):
            nl[ds3.targets == ul[l]] = ul[(l + 1) % 3]
        ds3.sa.targets = nl
        for ds in [datasets['uni2' + ds_size], ds2,
                   datasets['uni3' + ds_size], ds3]:
            cv = CrossValidation(clf,
                                 OddEvenPartitioner(),
                                 enable_ca=['stats', 'training_stats'])
            cverror = cv(ds)
            stats = cv.ca.stats.stats
            Nlabels = len(ds.uniquetargets)
            # so we at least do slightly above chance
            # But LARS manages to screw up there as well ATM from time
            # to time, so making its testing labile
            if (('lars' in clf.__tags__)
                    and cfg.getboolean('tests', 'labile', default='yes')) \
                    or (not 'lars' in clf.__tags__):
                self.assertTrue(stats['ACC'] > 1.2 / Nlabels)
            auc = stats['AUC']
            if (Nlabels == 2) or (Nlabels > 2 and auc[0] is not np.nan):
                mauc = np.min(stats['AUC'])
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.assertTrue(
                        mauc > 0.55,
                        msg='All AUCs must be above chance. Got minimal '
                        'AUC=%.2g among %s' % (mauc, stats['AUC']))
        clf.ca.reset_changed_temporarily()
Example #9
    def test_single_class(self, clf):
        """Test if binary and multiclass can handle single class training/testing
        """
        ds = datasets['uni2small']
        ds = ds[ds.sa.targets == 'L0']  #  only 1 label
        assert(ds.sa['targets'].unique == ['L0'])

        ds_ = list(OddEvenPartitioner().generate(ds))[0]
        # Here is our "nice" 0.6 substitute for TransferError:
        trerr = TransferMeasure(clf, Splitter('train'),
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        try:
            err = np.asscalar(trerr(ds_))
        except Exception, e:
            self.fail(str(e))
Example #10
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4medium']
        # simplify things for the beast -- take only informative features,
        # because classifiers for the tree are selected randomly, so
        # performance varies a lot and we just need to check for
        # correct operation
        ds = ds[:, ds.fa.nonbogus_targets != [None]]

        clfs = clfswh['binary']  # pool of classifiers
        # Let's permute so each time we try some different combination
        # of the classifiers, but exclude those operating on a % of
        # features since we might not have enough for that
        clfs = [
            clfs[i] for i in np.random.permutation(len(clfs))
            if not '%' in str(clfs[i])
        ]

        # NB: It is necessary that the same classifier was not used at
        # different nodes, since it would be re-trained for a new set
        # of targets, thus leading to incorrect behavior/high error.
        #
        # Clone only those few leading ones which we will use
        # throughout the test
        clfs = [clf.clone() for clf in clfs[:4]]

        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2': (('L0', 'L2'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.assertRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5': (('L0', 'L5'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.assertRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1': (('L0', 'L1'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })

        # Lets test train/test cycle using CVTE
        cv = CrossValidation(tclf,
                             OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except Exception:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.assertTrue(tclf.clfs['L0+1'] is clfs[1])
        self.assertTrue(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_stats
        cvtc = cv.ca.stats

        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.assertTrue(cvtrc != cvtc)
            self.assertTrue(cverror < 0.3,
                            msg="Got too high error = %s using %s" %
                            (cverror, tclf))

        # Test trailing nodes with no classifier

        # That is why we use separate pool of classifiers here
        # (that is probably old/not-needed since switched to use clones)
        clfs_mc = clfswh['multiclass']  # pool of classifiers
        clfs_mc = [
            clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
            if not '%' in str(clfs_mc[i])
        ]
        clfs_mc = [clf.clone() for clf in clfs_mc[:4]]  # and clones again

        tclf = TreeClassifier(clfs_mc[0], {
            'L0': (('L0', ), None),
            'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])
        })

        cv = CrossValidation(tclf,
                             OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = np.asscalar(cv(ds))
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(cverror < 0.3,
                            msg="Got too high error = %s using %s" %
                            (cverror, tclf))
Example #11
def generate_testing_datasets(specs):
    # Let's permute upon each invocation of the test, so we could
    # possibly trigger some funny cases
    nonbogus_pool = np.random.permutation([0, 1, 3, 5])

    datasets = {}

    # use a partitioner to flag odd/even samples as training and test
    ttp = OddEvenPartitioner(space='train', count=1)

    for kind, spec in specs.iteritems():
        # set of univariate datasets
        for nlabels in [ 2, 3, 4 ]:
            basename = 'uni%d%s' % (nlabels, kind)
            nonbogus_features = nonbogus_pool[:nlabels]

            dataset = normal_feature_dataset(
                nlabels=nlabels,
                nonbogus_features=nonbogus_features,
                **spec)

            # full dataset
            datasets[basename] = list(ttp.generate(dataset))[0]

        # sample 3D
        total = 2*spec['perlabel']
        nchunks = spec['nchunks']
        data = np.random.standard_normal(( total, 3, 6, 6 ))
        labels = np.concatenate( ( np.repeat( 0, spec['perlabel'] ),
                                  np.repeat( 1, spec['perlabel'] ) ) )
        data[:, 1, 0, 0] += 2*labels           # add some signal
        chunks = np.asarray(range(nchunks)*(total/nchunks))
        mask = np.ones((3, 6, 6), dtype='bool')
        mask[0, 0, 0] = 0
        mask[1, 3, 2] = 0
        ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks,
                                 mask=mask, space='myspace')
        # and to stress tests on manipulating sa/fa possibly containing
        # attributes of dtype object
        ds.sa['test_object'] = [['a'], [1, 2]] * (ds.nsamples/2)
        datasets['3d%s' % kind] = ds


    # some additional datasets
    datasets['dumb2'] = dumb_feature_binary_dataset()
    datasets['dumb'] = dumb_feature_dataset()
    # dataset with few invariant features
    _dsinv = dumb_feature_dataset()
    _dsinv.samples = np.hstack((_dsinv.samples,
                               np.zeros((_dsinv.nsamples, 1)),
                               np.ones((_dsinv.nsamples, 1))))
    datasets['dumbinv'] = _dsinv

    # Datasets for regressions testing
    datasets['sin_modulated'] = list(ttp.generate(multiple_chunks(sin_modulated, 4, 30, 1)))[0]
    # use the same full for training
    datasets['sin_modulated_train'] = datasets['sin_modulated']
    datasets['sin_modulated_test'] = sin_modulated(30, 1, flat=True)

    # simple signal for linear regressors
    datasets['chirp_linear'] = multiple_chunks(chirp_linear, 6, 50, 10, 2, 0.3, 0.1)
    datasets['chirp_linear_test'] = chirp_linear(20, 5, 2, 0.4, 0.1)

    datasets['wr1996'] = multiple_chunks(wr1996, 4, 50)
    datasets['wr1996_test'] = wr1996(50)

    datasets['hollow'] = Dataset(HollowSamples((40,20)),
                                 sa={'targets': np.tile(['one', 'two'], 20)})

    return datasets
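
A hedged illustration of the space='train', count=1 idiom above (any dataset with a chunks attribute would do): the split labels land in sa.train rather than the default sa.partitions, and count=1 limits generation to a single partitioning.

ttp = OddEvenPartitioner(space='train', count=1)
flagged = list(ttp.generate(dataset))[0]
# values 1 and 2 flag the odd-chunk and even-chunk halves
print(flagged.sa.train)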
Example #12
def test_multiclass_pairs_svm_searchlight():
    from mvpa2.measures.searchlight import sphere_searchlight
    import mvpa2.clfs.meta
    from mvpa2.clfs.meta import MulticlassClassifier

    from mvpa2.datasets import Dataset
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.testing.datasets import datasets
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation

    from mvpa2.testing import ok_, assert_equal, assert_array_equal
    from mvpa2.sandbox.multiclass import get_pairwise_accuracies

    # Some parameters used in the test below
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ntargets = 4                                # number of targets
    npairs = ntargets*(ntargets-1)/2
    center_ids = [35, 55, 1]
    ds = datasets['3dsmall'].copy()

    # redefine C,T so we have a multiclass task
    nsamples = len(ds)
    ds.sa.targets = range(ntargets) * (nsamples//ntargets)
    ds.sa.chunks = np.arange(nsamples) // ntargets
    # and add some obvious signal where it is due
    ds.samples[:, 55] += 15*ds.sa.targets   # for all 4 targets
    ds.samples[:, 35] += 15*(ds.sa.targets % 2) # so we have conflicting labels
    # while 35 would still be just for 2 categories which would conflict

    mclf = MulticlassClassifier(LinearCSVMC(),
                                pass_attr=['sa.chunks', 'ca.raw_predictions_ds'],
                                enable_ca=['raw_predictions_ds'])

    label_pairs = mclf._get_binary_pairs(ds)

    def place_sa_as_samples(ds):
        # add a degenerate dimension for the hstacking in the searchlight
        ds.samples = ds.sa.raw_predictions_ds[:, None]
        ds.sa.pop('raw_predictions_ds')   # no need to drag the copy
        return ds

    mcv = CrossValidation(mclf, OddEvenPartitioner(), errorfx=None,
                          postproc=place_sa_as_samples)
    sl = sphere_searchlight(mcv, nproc=nproc, radius=2, space='myspace',
                            center_ids=center_ids)
    slmap = sl(ds)


    ok_('chunks' in slmap.sa)
    ok_('cvfolds' in slmap.sa)
    ok_('targets' in slmap.sa)
    # so for each SL we got all pairwise tests
    assert_equal(slmap.shape, (nsamples, len(center_ids), npairs))
    assert_array_equal(np.unique(slmap.sa.cvfolds), [0, 1])

    # Verify that we got right labels in each 'pair'
    # all searchlights should have the same set of labels for a given
    # pair of targets
    label_pairs_ = np.apply_along_axis(
        np.unique, 0,
        ## reshape slmap so we have only simple pairs in the columns
        np.reshape(slmap, (-1, npairs))).T

    # need to prep that list of pairs obtained from MulticlassClassifier;
    # since it is 1-vs-1, they all should be just pairs of single-element
    # lists, so this should work
    assert_equal(len(label_pairs_), npairs)
    assert_array_equal(np.squeeze(np.array(label_pairs)), label_pairs_)
    assert_equal(label_pairs_.shape, (npairs, 2))   # for this particular case


    out    = get_pairwise_accuracies(slmap)
    out123 = get_pairwise_accuracies(slmap, select=[1, 2, 3])

    assert_array_equal(np.unique(out123.T), np.arange(1, 4))   # so we got at least correct targets
    # test that we extracted correct accuracies
    # First 3 in out.T should have category 0, so skip them and compare otherwise
    assert_array_equal(out.samples[3:], out123.samples)

    ok_(np.all(out.samples[:, 1] == 1.), "This was with super-strong result")
Example #13
    def test_voxel_selection(self):
        '''Compare surface and volume based searchlight'''
        '''
        Tests to see whether results are identical for surface-based
        searchlight (just one plane; Euclidean distance) and volume-based
        searchlight.

        Note that the current value is a float; if it were int, it would
        specify the number of voxels in each searchlight'''

        radius = 10.
        '''Define input filenames'''
        epi_fn = os.path.join(pymvpa_dataroot, 'bold.nii.gz')
        maskfn = os.path.join(pymvpa_dataroot, 'mask.nii.gz')
        '''
        Use the EPI datafile to define a surface.
        The surface has as many nodes as there are voxels
        and is parallel to the volume 'slice'
        '''
        vg = volgeom.from_any(maskfn, mask_volume=True)

        aff = vg.affine
        nx, ny, nz = vg.shape[:3]
        '''Plane goes in x and y direction, so we take these vectors
        from the affine transformation matrix of the volume'''
        plane = surf.generate_plane(aff[:3, 3], aff[:3, 0], aff[:3, 1], nx, ny)
        '''
        Simulate pial and white matter as just above and below
        the central plane
        '''
        normal_vec = aff[:3, 2]
        outer = plane + normal_vec
        inner = plane + -normal_vec
        '''
        Combine volume and surface information
        '''
        vsm = volsurf.VolSurfMaximalMapping(vg, outer, inner)
        '''
        Run voxel selection with specified radius (in mm), using
        Euclidean distance measure
        '''
        surf_voxsel = surf_voxel_selection.voxel_selection(vsm,
                                                           radius,
                                                           distance_metric='e')
        '''Define the measure'''

        # run_slow=True would give an actual cross-validation with meaningful
        # accuracies. Because this is a unit-test only the number of voxels
        # in each searchlight is tested.
        run_slow = False

        if run_slow:
            meas = CrossValidation(GNB(),
                                   OddEvenPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
            postproc = mean_sample()
        else:
            meas = _Voxel_Count_Measure()
            postproc = lambda x: x
        '''
        Surface analysis: define the query engine, cross validation,
        and searchlight
        '''
        surf_qe = SurfaceVerticesQueryEngine(surf_voxsel)
        surf_sl = Searchlight(meas, queryengine=surf_qe, postproc=postproc)
        '''
        new (Sep 2012): also test 'simple' queryengine wrapper function
        '''

        surf_qe2 = disc_surface_queryengine(radius,
                                            maskfn,
                                            inner,
                                            outer,
                                            plane,
                                            volume_mask=True,
                                            distance_metric='euclidean')
        surf_sl2 = Searchlight(meas, queryengine=surf_qe2, postproc=postproc)
        '''
        Same for the volume analysis
        '''
        element_sizes = tuple(map(abs, (aff[0, 0], aff[1, 1], aff[2, 2])))
        sph = Sphere(radius, element_sizes=element_sizes)
        kwa = {'voxel_indices': sph}

        vol_qe = IndexQueryEngine(**kwa)
        vol_sl = Searchlight(meas, queryengine=vol_qe, postproc=postproc)
        '''The following steps are similar to start_easy.py'''
        attr = SampleAttributes(
            os.path.join(pymvpa_dataroot, 'attributes_literal.txt'))

        mask = surf_voxsel.get_mask()

        dataset = fmri_dataset(samples=os.path.join(pymvpa_dataroot,
                                                    'bold.nii.gz'),
                               targets=attr.targets,
                               chunks=attr.chunks,
                               mask=mask)

        if run_slow:
            # do chunkswise linear detrending on dataset

            poly_detrend(dataset, polyord=1, chunks_attr='chunks')

            # zscore dataset relative to baseline ('rest') mean
            zscore(dataset,
                   chunks_attr='chunks',
                   param_est=('targets', ['rest']))

        # select class face and house for this demo analysis
        # would work with full datasets (just a little slower)
        dataset = dataset[np.array(
            [l in ['face', 'house'] for l in dataset.sa.targets],
            dtype='bool')]
        '''Apply searchlight to datasets'''
        surf_dset = surf_sl(dataset)
        surf_dset2 = surf_sl2(dataset)
        vol_dset = vol_sl(dataset)

        surf_data = surf_dset.samples
        surf_data2 = surf_dset2.samples
        vol_data = vol_dset.samples

        assert_array_equal(surf_data, surf_data2)
        assert_array_equal(surf_data, vol_data)
Example #14
    clfswh += \
         FeatureSelectionClassifier(
             linearSVMC.clone(),
             SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=0.1, implementation="C"),
                            postproc=maxofabs_sample()),
                RangeElementSelector(mode='select')),
             descr="LinSVM on SMLR(lm=0.1) non-0")

    _rfeclf = linearSVMC.clone()
    clfswh += \
         FeatureSelectionClassifier(
             _rfeclf,
             SplitRFE(
                 _rfeclf,
                 OddEvenPartitioner(),
                 fselector=FractionTailSelector(
                     0.2, mode='discard', tail='lower')),
             descr="LinSVM with nested-CV RFE")

    _clfs_splitrfe_svm_anova = \
         FeatureSelectionClassifier(
             linearSVMC.clone(),
             SplitRFE(
                 linearSVMC.clone(),
                 OddEvenPartitioner(),
                 fselector=FractionTailSelector(
                     0.2, mode='discard', tail='lower'),
                 fmeasure=OneWayAnova()),
             descr="LinSVM with nested-CV Anova RFE")
    clfswh += _clfs_splitrfe_svm_anova
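
A hypothetical usage sketch for the composed classifier above (the testing dataset and surrounding imports are assumed): it trains and predicts like any plain classifier, with the nested-CV RFE running transparently inside each training.

cv = CrossValidation(_clfs_splitrfe_svm_anova, NFoldPartitioner(),
                     postproc=mean_sample())
err = np.asscalar(cv(datasets['uni2small']))  # mean misclassification error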
Example #15
def get_dsm_roi_secondorder_xval2(ds,
                                  rois,
                                  zscore_ds=True,
                                  part=OddEvenPartitioner(),
                                  cond_chunk='condition'):
    """Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the second level, thus the resulting DSMs are
    not symmetrical.

    Parameters
    ----------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        whether to z-score the dataset (within chunks) before computing DSMs
    part: partitioner
    cond_chunk: str
        sample attribute across which to compute mean group samples

    Returns
    -------
    dataset containing the second-level DSM
    """
    if zscore_ds:
        zscore(ds, chunks_attr='chunks')

    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    for ds_ in part.generate(ds):
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)
        assert (ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms for each fold
        names = []
        centers = []
        dissims_1 = []
        dissims_2 = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim1_roi = pdist(sample1_roi, 'correlation')
            dissim2_roi = pdist(sample2_roi, 'correlation')

            dissims_1.append(dissim1_roi)
            dissims_2.append(dissim2_roi)

        dss1 = np.array(dissims_1)
        dss2 = np.array(dissims_2)

        # now compute second-order rdm correlating across folds
        dissim_2ndorder = 1. - corrcoefxy(dss1.T, dss2.T)
        dissim_2ndorder = dataset_wizard(dissim_2ndorder, targets=names)
        dissim_2ndorder.sa['centers'] = centers
        # also add fa information about roi
        dissim_2ndorder.fa['roi'] = names
        dissims_folds.append(dissim_2ndorder)

    # average
    dissims = dissims_folds[0]
    for d in dissims_folds[1:]:
        dissims.samples += d.samples
    dissims.samples /= len(dissims_folds)
    return dissims
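
A hypothetical call (the dataset ds and all ROI feature ids are made up) showing the expected shape and attributes of the result:

rois = {'V1': (101, [100, 101, 102, 110]),
        'FFA': (205, [200, 205, 207]),
        'PPA': (310, [305, 310, 311])}
dsm = get_dsm_roi_secondorder_xval2(ds, rois)
print(dsm.shape)        # (nrois, nrois): second-order DSM, not symmetrical
print(dsm.sa.targets)   # ROI names along rows
print(dsm.fa.roi)       # ROI names along columns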
Example #16
def get_dsm_roi_xval1(ds,
                      rois,
                      zscore_ds=True,
                      part=OddEvenPartitioner(),
                      cond_chunk='condition'):
    """Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the first level, thus the resulting DSMs are
    symmetrical.

    Parameters
    ----------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        whether to z-score the dataset (within chunks) before computing DSMs
    part: partitioner
    cond_chunk: str
        sample attribute across which to compute mean group samples

    Returns
    -------
    dataset containing the second-level DSM
    """
    if zscore_ds:
        zscore(ds, chunks_attr='chunks')

    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    for ds_ in part.generate(ds):
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)
        assert (ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms cross-validated across folds
        names = []
        centers = []
        dissims = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim_roi = 1. - corrcoefxy(sample1_roi.T, sample2_roi.T)
            nsamples = ds_1.nsamples
            assert (dissim_roi.shape == (nsamples, nsamples))

            dissims.append(
                dissim_roi.flatten())  # now the RDM is not symmetrical anymore
        dissims_folds.append(np.array(dissims))

    # average across folds
    dissims_folds = np.array(dissims_folds).mean(axis=0)
    assert (dissims_folds.shape == (len(names), nsamples**2))

    # now compute second level (distances)
    distance_roi = dist.pdist(dissims_folds, metric='correlation')

    dissims_folds = dataset_wizard(dist.squareform(distance_roi),
                                   targets=names)
    dissims_folds.fa['roi'] = names
    dissims_folds.sa['centers'] = centers

    return dissims_folds
Example #17
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore a few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    # give the same full dataset to sens_ana and cvmeasure
                    Repeater(2),
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    # we will use the same full cv-training dataset
                    Repeater(2),
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    # we just extract it from the existing confusion
                    train_pmeasure=False,
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
Example #18
def get_dsm_roi_xval1_firstlev(ds,
                               rois,
                               zscore_ds=True,
                               part=OddEvenPartitioner(),
                               cond_chunk='condition',
                               fisher=False):
    """ Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the first level and returns only the first level,
    without distances between ROIs

    Parameters
    ----------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        whether to z-score the per-fold datasets (within chunks)
    part: partitioner
    cond_chunk: str
        sample attribute across which to compute mean group samples
    fisher: bool
        whether to Fisher-transform the correlations before averaging across
        folds

    Returns
    -------
    dataset containing the first-level DSM of shape (nrois, ncond**2)
    """

    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    folds = 1
    for ds_ in part.generate(ds):
        print("Running fold {0}".format(folds))
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)
        if ds_1.nsamples >= 4 and zscore_ds:
            zscore(ds_1, chunks_attr='chunks')
            zscore(ds_2, chunks_attr='chunks')
        assert (ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms cross-validated across folds
        names = []
        centers = []
        dissims = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim_roi = corrcoefxy(sample1_roi.T,
                                    sample2_roi.T,
                                    fisher=fisher)
            nsamples = ds_1.nsamples
            assert (dissim_roi.shape == (nsamples, nsamples))

            dissims.append(
                dissim_roi.flatten())  # now the RDM is not symmetrical anymore
        dissims_folds.append(np.array(dissims))
        folds += 1

    # average across folds
    dissims_folds = np.array(dissims_folds).mean(axis=0)
    assert (dissims_folds.shape == (len(names), nsamples**2))

    if fisher:
        # map the averaged Fisher z-values back to correlations
        dissims_folds = np.tanh(dissims_folds)

    dissims_folds = dataset_wizard(dissims_folds, targets=names)
    dissims_folds.sa['centers'] = centers

    return dissims_folds
Example #19
import numpy as np

from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import OddEvenPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets
from mvpa2.mappers.fx import mean_sample

"""For the sake of simplicity, let's use a small artificial dataset."""

# Let's just use our tiny 4D dataset from the testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on an fMRI dataset using `NiftiDataset`, the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.
"""
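
"""
A hedged sketch of that mapping-back step (it assumes the dataset was loaded
with fmri_dataset so a reverse mapper is attached; the output filename is
made up):
"""

from mvpa2.datasets.mri import map2nifti

nii = map2nifti(dataset, sl_map)  # project the searchlight map into volume space
nii.to_filename('sl_map.nii.gz')  # save as NIfTI for overlay viewing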
Example #20
def generate_testing_datasets(specs):
    # Let's permute upon each invocation of the test, so we could
    # possibly trigger some funny cases
    nonbogus_pool = np.random.permutation([0, 1, 3, 5])

    datasets = {}

    # use a partitioner to flag odd/even samples as training and test
    ttp = OddEvenPartitioner(space='train', count=1)

    for kind, spec in specs.iteritems():
        # set of univariate datasets
        for nlabels in [2, 3, 4]:
            basename = 'uni%d%s' % (nlabels, kind)
            nonbogus_features = nonbogus_pool[:nlabels]

            dataset = normal_feature_dataset(
                nlabels=nlabels, nonbogus_features=nonbogus_features, **spec)

            # full dataset
            datasets[basename] = list(ttp.generate(dataset))[0]

        # sample 3D
        total = 2 * spec['perlabel']
        nchunks = spec['nchunks']
        data = np.random.standard_normal((total, 3, 6, 6))
        labels = np.concatenate(
            (np.repeat(0, spec['perlabel']), np.repeat(1, spec['perlabel'])))
        data[:, 1, 0, 0] += 2 * labels  # add some signal
        chunks = np.asarray(range(nchunks) * (total // nchunks))
        mask = np.ones((3, 6, 6), dtype='bool')
        mask[0, 0, 0] = 0
        mask[1, 3, 2] = 0
        ds = Dataset.from_wizard(samples=data,
                                 targets=labels,
                                 chunks=chunks,
                                 mask=mask,
                                 space='myspace')
        # and to stress tests on manipulating sa/fa possibly containing
        # attributes of dtype object
        ds.sa['test_object'] = [['a'], [1, 2]] * (ds.nsamples // 2)
        datasets['3d%s' % kind] = ds

    # some additional datasets
    datasets['dumb2'] = dumb_feature_binary_dataset()
    datasets['dumb'] = dumb_feature_dataset()
    # dataset with few invariant features
    _dsinv = dumb_feature_dataset()
    _dsinv.samples = np.hstack((_dsinv.samples, np.zeros(
        (_dsinv.nsamples, 1)), np.ones((_dsinv.nsamples, 1))))
    datasets['dumbinv'] = _dsinv

    # Datasets for regressions testing
    datasets['sin_modulated'] = list(
        ttp.generate(multiple_chunks(sin_modulated, 4, 30, 1)))[0]
    # use the same full for training
    datasets['sin_modulated_train'] = datasets['sin_modulated']
    datasets['sin_modulated_test'] = sin_modulated(30, 1, flat=True)

    # simple signal for linear regressors
    datasets['chirp_linear'] = multiple_chunks(chirp_linear, 6, 50, 10, 2, 0.3,
                                               0.1)
    datasets['chirp_linear_test'] = chirp_linear(20, 5, 2, 0.4, 0.1)

    datasets['wr1996'] = multiple_chunks(wr1996, 4, 50)
    datasets['wr1996_test'] = wr1996(50)

    datasets['hollow'] = Dataset(HollowSamples((40, 20)),
                                 sa={'targets': np.tile(['one', 'two'], 20)})

    return datasets
Example #21
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        if not externals.exists('scipy'):
            raise SkipTest
        else:
            from mvpa2.misc.errorfx import corr_error
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
            errorfx=corr_error, enable_ca=['training_stats', 'stats'])
        # check the default
        #self.assertTrue(cve.transerror.errorfx is corr_error)

        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.assertTrue(not np.isnan(corr))
        self.assertTrue(corr == cve.ca.stats.stats['CCe'])

        splitregr = SplitClassifier(
            regr, partitioner=OddEvenPartitioner(),
            enable_ca=['training_stats', 'stats'])
        splitregr.train(ds)
        split_corr = splitregr.ca.stats.stats['CCe']
        split_corr_tr = splitregr.ca.training_stats.stats['CCe']

        for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.assertTrue(stats['CCe'] < 0.5)
                self.assertEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.assertTrue(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: let's start making testing more reliable.
            #      The p-value for such an accident is verrrry tiny,
            #      so if the regression works -- CCe had better stay
            #      below 0.5 ;) otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)