Example #1
    def test_custom_split(self):
        #simulate half splitter
        hs = CustomPartitioner([(None,[0,1,2,3,4]),(None,[5,6,7,8,9])])
        spl = Splitter(attr='partitions')
        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
        self.failUnless(len(splits) == 2)

        for i,p in enumerate(splits):
            self.failUnless( len(p) == 2 )
            self.failUnless( p[0].nsamples == 50 )
            self.failUnless( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])


        # check fully customized split with working and validation set specified
        cs = CustomPartitioner([([0,3,4],[5,9])])
        # we want to discard the unselected partition of the data, hence attr_values
        # these two splitters should do exactly the same thing
        splitters = (Splitter(attr='partitions', attr_values=[1,2]),
                     Splitter(attr='partitions', ignore_values=(0,)))
        for spl in splitters:
            splits = [ list(spl.generate(p)) for p in cs.generate(self.data) ]
            self.failUnless(len(splits) == 1)

            for i,p in enumerate(splits):
                self.failUnless( len(p) == 2 )
                self.failUnless( p[0].nsamples == 30 )
                self.failUnless( p[1].nsamples == 20 )

            self.failUnless((splits[0][1].sa['chunks'].unique == [5, 9]).all())
            self.failUnless((splits[0][0].sa['chunks'].unique == [0, 3, 4]).all())
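
A minimal, self-contained sketch of the partition-then-split idiom used throughout these examples: a partitioner tags samples with a 'partitions' attribute, and a Splitter then materializes one dataset per partition value. Module paths assume the PyMVPA 2.x layout; the dataset is synthetic:

    from mvpa2.generators.partition import HalfPartitioner
    from mvpa2.generators.splitters import Splitter
    from mvpa2.misc.data_generators import normal_feature_dataset

    # synthetic dataset: 2 labels, 10 chunks
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nchunks=10)
    hs = HalfPartitioner()             # marks samples with a 'partitions' attribute
    spl = Splitter(attr='partitions')  # yields one dataset per partition value
    splits = [list(spl.generate(p)) for p in hs.generate(ds)]
    assert len(splits) == 2            # two half-splits, each a pair of datasets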
Example #2
    def test_odd_even_split(self):
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

        self.assertTrue(len(splits) == 2)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

        # check if it works on pure odd and even chunk ids
        moresplits = [
            list(spl.generate(p)) for p in oes.generate(splits[0][0])
        ]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)
Example #3
    def test_repeated_features(self):
        class CountFeatures(Measure):
            is_trained = True

            def _call(self, ds):
                return Dataset([ds.nfeatures],
                               fa={
                                   'nonbogus_targets':
                                   list(ds.fa['nonbogus_targets'].unique)
                               })

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        # due to https://github.com/numpy/numpy/issues/641 we are
        # using a list(set(...)) construct and the order of
        # nonbogus_targets.unique can vary from run to run, thus there
        # is no guarantee that we would get 18 first, which would be a
        # questionable assumption anyway, thus we perform checks
        # which do not require any specific order.
        # And yet due to another issue
        # https://github.com/numpy/numpy/issues/3759
        # we can't just use `== None` for the bool mask
        None_fa = np.array([x is None for x in res.fa.nonbogus_targets])
        assert_array_equal(res.samples[0, None_fa], [18])
        assert_array_equal(res.samples[0, ~None_fa], [1, 1])

        if sys.version_info[0] < 3:
            # with python2 order seems to be consistent
            assert_array_equal(res.samples[0], [18, 1, 1])
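
The numpy caveat referenced in the comments above can be reproduced in isolation. A small demonstration of why the explicit per-element identity check is the portable way to build the mask (the behavior of comparing an object array against None with `==` changed across numpy versions, per issue #3759):

    import numpy as np

    fa = np.array([None, 18, 18], dtype=object)
    # portable: an identity check per element always yields a proper bool mask
    mask = np.array([x is None for x in fa])
    print(mask)   # [ True False False]
    # by contrast, `fa == None` returned a bare scalar in older numpy,
    # so it could not be relied upon as an elementwise mask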
Example #4
def test_exclude_targets_combinations():
    partitioner = ChainNode([
        NFoldPartitioner(),
        ExcludeTargetsCombinationsPartitioner(
            k=2, targets_attr='targets', space='partitions')
    ],
                            space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)),
                 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
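
The expected partition count follows from simple combinatorics: with k=2 excluded targets out of nlabels=4 there are C(4, 2) = 6 combinations, times 3 chunk folds. A quick standard-library check:

    from itertools import combinations
    # 6 ways to choose 2 targets out of 4 for exclusion; x 3 folds = 18 partitions
    assert len(list(combinations(range(4), 2))) == 6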
Example #5
    def test_repeated_features(self):
        class CountFeatures(Measure):
            is_trained = True
            def _call(self, ds):
                return Dataset([ds.nfeatures],
                                fa={'nonbogus_targets': list(ds.fa['nonbogus_targets'].unique)})

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        # due to https://github.com/numpy/numpy/issues/641 we are
        # using a list(set(...)) construct and the order of
        # nonbogus_targets.unique can vary from run to run, thus there
        # is no guarantee that we would get 18 first, which would be a
        # questionable assumption anyway, thus we perform checks
        # which do not require any specific order.
        # And yet due to another issue
        # https://github.com/numpy/numpy/issues/3759
        # we can't just use `== None` for the bool mask
        None_fa = np.array([x is None for x in res.fa.nonbogus_targets])
        assert_array_equal(res.samples[0, None_fa], [18])
        assert_array_equal(res.samples[0, ~None_fa], [1, 1])

        if sys.version_info[0] < 3:
            # with python2 order seems to be consistent
            assert_array_equal(res.samples[0], [18, 1, 1])
Example #6
    def test_label_splitter(self):
        oes = OddEvenPartitioner(attr='targets')
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

        assert_array_equal(splits[0][0].sa['targets'].unique, [0, 2])
        assert_array_equal(splits[0][1].sa['targets'].unique, [1, 3])
        assert_array_equal(splits[1][0].sa['targets'].unique, [1, 3])
        assert_array_equal(splits[1][1].sa['targets'].unique, [0, 2])
Example #7
    def test_label_splitter(self):
        oes = OddEvenPartitioner(attr='targets')
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in oes.generate(self.data) ]

        assert_array_equal(splits[0][0].sa['targets'].unique, [0,2])
        assert_array_equal(splits[0][1].sa['targets'].unique, [1,3])
        assert_array_equal(splits[1][0].sa['targets'].unique, [1,3])
        assert_array_equal(splits[1][1].sa['targets'].unique, [0,2])
Example #8
 def test_clf_transfer_measure(self):
     # and now on a classifier
     clf = SMLR()
     enode = BinaryFxNode(mean_mismatch_error, 'targets')
     tm = TransferMeasure(clf, Splitter('chunks', count=2),
                          enable_ca=['stats'])
     res = tm(self.dataset)
     manual_error = np.mean(res.samples.squeeze() != res.sa.targets)
     postproc_error = enode(res)
     tm_err = TransferMeasure(clf, Splitter('chunks', count=2),
                              postproc=enode)
     auto_error = tm_err(self.dataset)
     ok_(manual_error == postproc_error.samples[0, 0])
Example #9
    def test_slicing(self):
        hs = HalfPartitioner()
        spl = Splitter(attr="partitions")
        splits = list(hs.generate(self.data))
        for s in splits:
            # partitioned dataset shares the data
            assert_true(s.samples.base is self.data.samples)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        # with numpy 1.7.0b1 "chaining" was deprecated, so let's create
        # a check function appropriate for the given numpy version
        _a = np.arange(5)
        __a = _a[:4][:3]
        if __a.base is _a:
            # 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base is base

        elif __a.base.base is _a:
            # prior 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base.base is base

        else:
            raise RuntimeError("Uknown handling of .base by numpy")

        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples))
            assert_true(is_the_same_base(s[1].samples))
        spl = Splitter(attr="partitions", noslicing=True)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        for s in splits:
            # no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        nfs = NFoldPartitioner()
        spl = Splitter(attr="partitions")
        splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
        for i, s in enumerate(splits):
            # training part is sliced only for the first and last splits
            if i == 0 or i == len(splits) - 1:
                assert_true(is_the_same_base(s[0].samples))
            else:
                assert_true(s[0].samples.base is None)
            # we get slicing all the time
            assert_true(is_the_same_base(s[1].samples))
        step_ds = Dataset(np.random.randn(20, 2), sa={"chunks": np.tile([0, 1], 10)})
        oes = OddEvenPartitioner()
        spl = Splitter(attr="partitions")
        splits = list(oes.generate(step_ds))
        for s in splits:
            # partitioned dataset shares the data
            assert_true(s.samples.base is step_ds.samples)
        splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples, step_ds.samples))
            assert_true(is_the_same_base(s[1].samples, step_ds.samples))
Example #10
    def test_simplest_cv_pat_gen(self):
        # create the generator
        nfs = NFoldPartitioner(cvtype=1)
        spl = Splitter(attr='partitions')
        # now get the xval pattern sets (one-fold CV)
        xvpat = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]

        self.failUnless( len(xvpat) == 10 )

        for i,p in enumerate(xvpat):
            self.failUnless( len(p) == 2 )
            self.failUnless( p[0].nsamples == 90 )
            self.failUnless( p[1].nsamples == 10 )
            self.failUnless( p[1].chunks[0] == i )
Example #11
    def test_simplest_cv_pat_gen(self):
        # create the generator
        nfs = NFoldPartitioner(cvtype=1)
        spl = Splitter(attr='partitions')
        # now get the xval pattern sets (one-fold CV)
        xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

        self.assertTrue(len(xvpat) == 10)

        for i, p in enumerate(xvpat):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 90)
            self.assertTrue(p[1].nsamples == 10)
            self.assertTrue(p[1].chunks[0] == i)
Example #12
    def test_repeated_features(self):
        print(self.dataset)
        print(self.dataset.fa.nonbogus_targets)
        class CountFeatures(Measure):
            is_trained = True
            def _call(self, ds):
                return ds.nfeatures

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        assert_array_equal(res.samples[0], [18,1,1])
Example #13
    def test_split_featurewise_dataset_measure(self):
        ds = datasets['uni3small']
        sana = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            ChainNode(
                [NFoldPartitioner(),
                 Splitter('partitions', attr_values=[1])]))

        sens = sana(ds)
        # a sensitivity for each chunk and each label combination
        assert_equal(sens.shape, (len(ds.sa['chunks'].unique) *
                                  len(ds.sa['targets'].unique), ds.nfeatures))

        # Let's try a more complex example with 'boosting'
        ds = datasets['uni3medium']
        ds.init_origids('samples')
        sana = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            Balancer(amount=0.25, count=2, apply_selection=True),
            enable_ca=['datasets', 'repetition_results'])
        sens = sana(ds)

        assert_equal(sens.shape,
                     (2 * len(ds.sa['targets'].unique), ds.nfeatures))
        splits = sana.ca.datasets
        self.assertEqual(len(splits), 2)
        self.assertTrue(
            np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
        # should have used different samples
        self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
        # and should have got different sensitivities
        self.assertTrue(np.any(sens[0] != sens[3]))
Example #14
    def test_repeated_features(self):
        print(self.dataset)
        print(self.dataset.fa.nonbogus_targets)
        class CountFeatures(Measure):
            is_trained = True
            def _call(self, ds):
                return ds.nfeatures

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        assert_array_equal(res.samples[0], [18,1,1])
Example #15
    def test_cv_no_generator_custom_splitter(self):
        ds = Dataset(np.arange(4),
                     sa={
                         'category': ['to', 'to', 'from', 'from'],
                         'targets': ['a', 'b', 'c', 'd']
                     })

        class Measure(Classifier):
            def _train(self, ds_):
                assert_array_equal(ds_.samples, ds.samples[2:])
                assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

            def _predict(self, ds_):
                assert (ds_ is not ds)  # we pass a shallow copy
                # could be called to predict training or testing data
                if np.all(ds_.sa.targets != ['c', 'd']):
                    assert_array_equal(ds_.samples, ds.samples[:2])
                    assert_array_equal(ds_.sa.category, ['to'] * len(ds_))
                else:
                    assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

                return ['c', 'd']

        measure = Measure()
        cv = CrossValidation(measure,
                             splitter=Splitter('category', ['from', 'to']))
        res = cv(ds)
        assert_array_equal(res, [[1]])  # failed perfectly ;-)
Example #16
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium']
        train = train[train.sa.train == 1]
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium']
        test3 = test3[test3.sa.train == 1]
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferMeasure(l_clf,
                               Splitter('train', attr_values=[1, 1]),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

        self.assertRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        te = np.asscalar(te)
        self.assertTrue(
            abs(e - te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        if 'multiclass' in l_clf.__tags__:
            self.assertFalse(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
Example #17
    def test_counted_splitting(self):
        spl = Splitter(attr='partitions')
        # count > #chunks, should result in 10 splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [ (nchunks*2, nchunks),
                                   (nchunks, nchunks),
                                   (nchunks-1, nchunks-1),
                                   (3, 3),
                                   (0, 0),
                                   (1, 1)
                                   ]:
                nfs = NFoldPartitioner(cvtype=1, count=count,
                                       selection_strategy=strategy)
                splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
                self.failUnless(len(splits) == target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.failUnlessEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.failUnless(ds_.a.lastpartitionset)
                    # test all now
                    for isplit,split in enumerate(splits):
                        for ds_ in split:
                            self.failUnless(
                                ds_.a.lastpartitionset == (isplit == nsplits-1))

                # Check results of different strategies
                if strategy == 'first':
                    self.failUnlessEqual(chosenchunks, range(target))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.failUnlessEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.failUnless(len(set(chosenchunks)) == len(chosenchunks))
                    self.failUnless(target == len(chosenchunks))
                else:
                    raise RuntimeError("Add unittest for strategy %s"
                                       % strategy)
Example #18
    def test_counted_splitting(self):
        spl = Splitter(attr='partitions')
        # count > #chunks, should result in 10 splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [(nchunks * 2, nchunks), (nchunks, nchunks),
                                  (nchunks - 1, nchunks - 1), (3, 3), (0, 0),
                                  (1, 1)]:
                nfs = NFoldPartitioner(cvtype=1,
                                       count=count,
                                       selection_strategy=strategy)
                splits = [
                    list(spl.generate(p)) for p in nfs.generate(self.data)
                ]
                self.assertTrue(len(splits) == target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.assertEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.assertTrue(ds_.a.lastpartitionset)
                    # test all now
                    for isplit, split in enumerate(splits):
                        for ds_ in split:
                            self.assertTrue(
                                ds_.a.lastpartitionset == (isplit == nsplits - 1))

                # Check results of different strategies
                if strategy == 'first':
                    self.assertEqual(chosenchunks, list(range(target)))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.assertEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.assertTrue(
                        len(set(chosenchunks)) == len(chosenchunks))
                    self.assertTrue(target == len(chosenchunks))
                else:
                    raise RuntimeError("Add unittest for strategy %s"
                                       % strategy)
Example #19
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets',
                    attr_values=[0, 1, 1, 2, 3, 3, 3],
                    count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits x 2 roi splits x 10 chunk splits = 80
    assert_equal(len(splits), 80)
Example #20
    def _train(self, dataset):
        pmeasure = ProxyMeasure(
            self.lrn,
            postproc=BinaryFxNode(self.errorfx, self.lrn.space),
            skip_train=True  # do not train here since fmeasure will do the training
        )

        # First we need to replicate our RFE construct but this time
        # with pmeasure for the classifier
        rfe = RFE(
            self.fmeasure,
            pmeasure,
            Splitter('partitions'),
            fselector=self.fselector,
            bestdetector=None,
            train_pmeasure=False,
            stopping_criterion=None,  # full "track"
            update_sensitivity=self.update_sensitivity,
            enable_ca=['errors', 'nfeatures'])

        errors, nfeatures = [], []

        if __debug__:
            debug("RFEC", "Stage 1: initial nested CV/RFE for %s", (dataset, ))

        for partition in self.partitioner.generate(dataset):
            rfe.train(partition)
            errors.append(rfe.ca.errors)
            nfeatures.append(rfe.ca.nfeatures)

        # mean errors across splits and find optimal number
        errors_mean = np.mean(errors, axis=0)
        nfeatures_mean = np.mean(nfeatures, axis=0)
        # we will take the "mean location" of the min to stay
        # within the most 'stable' choice

        mins_idx = np.where(errors_mean == np.min(errors_mean))[0]
        min_idx = mins_idx[int(len(mins_idx) / 2)]
        min_error = errors_mean[min_idx]
        assert (min_error == np.min(errors_mean))
        nfeatures_min = nfeatures_mean[min_idx]

        if __debug__:
            debug(
                "RFEC", "Choosing among %d choices to have %d features with "
                "mean error=%.2g (initial mean error %.2g)",
                (len(mins_idx), nfeatures_min, min_error, errors_mean[0]))

        self.nfeatures_min = nfeatures_min

        if __debug__:
            debug(
                "RFEC", "Stage 2: running RFE on full training dataset to "
                "distil best %d features" % nfeatures_min)

        super(SplitRFE, self)._train(dataset)
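
The "mean location" of the min used in Stage 1 above picks the middle index of the plateau of minimal mean errors, rather than its first occurrence. A tiny numeric illustration with hypothetical error values:

    import numpy as np

    errors_mean = np.array([0.30, 0.10, 0.10, 0.10, 0.25])
    mins_idx = np.where(errors_mean == np.min(errors_mean))[0]  # [1, 2, 3]
    min_idx = mins_idx[int(len(mins_idx) / 2)]                  # 2, middle of the plateau
    assert errors_mean[min_idx] == np.min(errors_mean)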
Example #21
    def test_svms(self, clf):
        knows_probabilities = \
            'probabilities' in clf.ca.keys() and clf.params.probability
        enable_ca = ['estimates']
        if knows_probabilities:
            enable_ca += ['probabilities']

        clf.ca.change_temporarily(enable_ca=enable_ca)
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual(len(clf.ca.probabilities), len(testdata.samples))
        clf.ca.reset_changed_temporarily()
Example #22
    def test_svms(self, clf):
        knows_probabilities = \
            'probabilities' in clf.ca.keys() and clf.params.probability
        enable_ca = ['estimates']
        if knows_probabilities:
            enable_ca += ['probabilities']

        clf.ca.change_temporarily(enable_ca=enable_ca)
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual(len(clf.ca.probabilities),
                             len(testdata.samples))
        clf.ca.reset_changed_temporarily()
Example #23
def test_exclude_targets_combinations():
    partitioner = ChainNode(
        [NFoldPartitioner(), ExcludeTargetsCombinationsPartitioner(k=2, targets_attr="targets", space="partitions")],
        space="partitions",
    )
    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(snr=0.0, nlabels=4, perlabel=3, nchunks=3, nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter("partitions")
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
Example #24
    def test_splitter_gnbsearchlight(self):
        ds1 = datasets['3dsmall'].copy(deep=True)

        gnb_sl = GNBSearchlight(GNB(),
                                generator=CustomPartitioner([([0], [1])]),
                                qe=IndexQueryEngine(myspace=Sphere(2)),
                                splitter=Splitter(attr='partitions',
                                                  attr_values=[1, 2]),
                                errorfx=None)
        res = gnb_sl(ds1)
        assert_equal(res.nsamples, (ds1.chunks == 1).sum())
Example #25
 def _forward_dataset(self, ds):
     if self.__chunks_attr is None:
         return self._forward_dataset_helper(ds)
     else:
         # strip down the dataset to speed up local processing
         if self.__attr_strategy == "remove":
             keep_sa = []
         else:
             keep_sa = None
         proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
         # process all chunks individually
         # use a custom splitter to speed up splitting
         spl = Splitter(self.__chunks_attr)
         dses = [self._forward_dataset_helper(d) for d in spl.generate(proc_ds)]
         # and merge them again
         mds = vstack(dses)
         # put back attributes
         mds.fa.update(ds.fa)
         mds.a.update(ds.a)
         return mds
Example #26
    def test_transfer_measure(self):
        # come up with my own measure that only checks if training data
        # and test data are the same
        class MyMeasure(Measure):
            def _train(self, ds):
                self._tds = ds
            def _call(self, ds):
                return Dataset(ds.samples == self._tds.samples)

        tm = TransferMeasure(MyMeasure(), Splitter('chunks', count=2))
        # result should not be all True (== identical)
        assert_true((tm(self.dataset).samples == False).any())
Example #27
 def _forward_dataset(self, ds):
     if self.__chunks_attr is None:
         return self._forward_dataset_helper(ds)
     else:
         # strip down the dataset to speed up local processing
         if self.__attr_strategy == 'remove':
             keep_sa = []
         else:
             keep_sa = None
         proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
         # process all chunks individually
         # use a custom splitter to speed up splitting
         spl = Splitter(self.__chunks_attr)
         dses = [self._forward_dataset_helper(d)
                     for d in spl.generate(proc_ds)]
         # and merge them again
         mds = vstack(dses)
         # put back attributes
         mds.fa.update(ds.fa)
         mds.a.update(ds.a)
         return mds
Example #28
def get_partitioner(split_attr='group_split'):

    splitter = Splitter(attr='partitions', attr_values=(2, 3))

    if split_attr == 'group_split':

        splitrule = [
            # (leave, training, testing)
            (['3', '4'], ['1'], ['2']),
            (['3', '4'], ['2'], ['1']),
            (['1'], ['2'], ['3', '4']),
            (['2'], ['1'], ['3', '4']),
            (['1', '2'], ['3'], ['4']),
            (['1', '2'], ['4'], ['3']),
            (['3'], ['4'], ['1', '2']),
            (['4'], ['3'], ['1', '2']),
        ]
        partitioner = CustomPartitioner(splitrule=splitrule, attr=split_attr)

    elif split_attr == 'subject':

        partitioner = MemoryGroupSubjectPartitioner(group_attr='group_split',
                                                    subject_attr=split_attr,
                                                    attr=split_attr)

    elif split_attr == "group_mdm":
        partitioner = LeaveOneSubjectPerGroupPartitioner(
            group_attr='group', subject_attr="subject", attr="subject")

    elif split_attr == "subject_ofp":

        partitioner = NFoldPartitioner(attr="subject")
        splitter = Splitter(attr="partitions")

    elif split_attr == 'group':

        partitioner = NFoldPartitioner(attr=split_attr)
        splitter = Splitter(attr='partitions')

    return partitioner, splitter
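
A hedged sketch of how the returned pair would typically be consumed; get_partitioner is defined above, while ds and its group_split sample attribute are assumed to come from the surrounding project:

    partitioner, splitter = get_partitioner('group_split')
    for partition in partitioner.generate(ds):   # ds carries a 'group_split' attribute
        # attr_values=(2, 3) selects the training (2) and testing (3) partitions
        training, testing = list(splitter.generate(partition))[:2]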
Example #29
    def test_custom_split(self):
        #simulate half splitter
        hs = CustomPartitioner([(None,[0,1,2,3,4]),(None,[5,6,7,8,9])])
        spl = Splitter(attr='partitions')
        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
        self.assertTrue(len(splits) == 2)

        for i,p in enumerate(splits):
            self.assertTrue( len(p) == 2 )
            self.assertTrue( p[0].nsamples == 50 )
            self.assertTrue( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])


        # check fully customized split with working and validation set specified
        cs = CustomPartitioner([([0,3,4],[5,9])])
        # we want to discard the unselected partition of the data, hence attr_values
        # these two splitters should do exactly the same thing
        splitters = (Splitter(attr='partitions', attr_values=[1,2]),
                     Splitter(attr='partitions', ignore_values=(0,)))
        for spl in splitters:
            splits = [ list(spl.generate(p)) for p in cs.generate(self.data) ]
            self.assertTrue(len(splits) == 1)

            for i,p in enumerate(splits):
                self.assertTrue( len(p) == 2 )
                self.assertTrue( p[0].nsamples == 30 )
                self.assertTrue( p[1].nsamples == 20 )

            self.assertTrue((splits[0][1].sa['chunks'].unique == [5, 9]).all())
            self.assertTrue((splits[0][0].sa['chunks'].unique == [0, 3, 4]).all())
Example #30
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
                ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
Example #31
    def test_pseudo_cv_measure(self):
        clf = SMLR()
        enode = BinaryFxNode(mean_mismatch_error, 'targets')
        tm = TransferMeasure(clf, Splitter('partitions'), postproc=enode)
        cvgen = NFoldPartitioner()
        rm = RepeatedMeasure(tm, cvgen)
        res = rm(self.dataset)
        # one error per fold
        assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))

        # we can do the same with CrossValidation
        cv = CrossValidation(clf, cvgen, enable_ca=['stats', 'training_stats',
                                                    'datasets'])
        res = cv(self.dataset)
        assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))
Example #32
    def test_half_split(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]

        self.failUnless(len(splits) == 2)

        for i,p in enumerate(splits):
            self.failUnless( len(p) == 2 )
            self.failUnless( p[0].nsamples == 50 )
            self.failUnless( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.failUnless(split[0] is not None)
            self.failUnless(split[1] is not None)
Example #33
    def test_odd_even_split(self):
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in oes.generate(self.data) ]

        self.assertTrue(len(splits) == 2)

        for i,p in enumerate(splits):
            self.assertTrue( len(p) == 2 )
            self.assertTrue( p[0].nsamples == 50 )
            self.assertTrue( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in oes.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)
Example #34
    def test_half_split(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]

        self.assertTrue(len(splits) == 2)

        for i,p in enumerate(splits):
            self.assertTrue( len(p) == 2 )
            self.assertTrue( p[0].nsamples == 50 )
            self.assertTrue( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)
Example #35
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['confusion'])  # TODO -- it is stats
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        self.assertTrue(error < 0.2)
Example #36
    def test_single_class(self, clf):
        """Test if binary and multiclass can handle single class training/testing
        """
        ds = datasets['uni2small']
        ds = ds[ds.sa.targets == 'L0']  #  only 1 label
        assert(ds.sa['targets'].unique == ['L0'])

        ds_ = list(OddEvenPartitioner().generate(ds))[0]
        # Here is our "nice" PyMVPA 0.6-style substitute for TransferError:
        trerr = TransferMeasure(clf, Splitter('train'),
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        try:
            err = np.asscalar(trerr(ds_))
        except Exception as e:
            self.fail(str(e))
Example #37
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assessment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm,
                                   NFoldPartitioner(),
                                   postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  # go for lower tail selection as data_measure will return
                  # errors -> low is good
                  fselector=FixedNElementTailSelector(1, tail='lower',
                                                      mode='select'),
                  )
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertTrue(ds.nfeatures == orig_nfeatures)

        # check that the features set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertTrue(resds.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
Example #38
    def test_gnb(self):
        gnb = GNB()
        gnb_nc = GNB(common_variance=False)
        gnb_n = GNB(normalize=True)
        gnb_n_nc = GNB(normalize=True, common_variance=False)
        gnb_lin = GNB(common_variance=True)

        ds = datasets['uni2medium']

        # Generic silly coverage just to assure that it works in all
        # possible scenarios:
        bools = (True, False)
        # There should be a better way... heh
        for cv in bools:  # common_variance?
            for prior in ('uniform', 'laplacian_smoothing', 'ratio'):
                tp = None  # predictions -- all above should
                # result in the same predictions
                for n in bools:  # normalized?
                    for ls in bools:  # logspace?
                        for es in ((), ('estimates',)):
                            gnb_ = GNB(common_variance=cv,
                                       prior=prior,
                                       normalize=n,
                                       logprob=ls,
                                       enable_ca=es)
                            tm = TransferMeasure(gnb_, Splitter('train'))
                            predictions = tm(ds).samples[:, 0]
                            if tp is None:
                                tp = predictions
                            assert_array_equal(predictions, tp)
                            # if normalized -- check if estimates are such
                            if n and 'estimates' in es:
                                v = gnb_.ca.estimates
                                if ls:  # in log space -- take exp ;)
                                    v = np.exp(v)
                                d1 = np.sum(v, axis=1) - 1.0
                                self.assertTrue(np.max(np.abs(d1)) < 1e-5)
                            # smoke test to see whether invocation of sensitivity analyser blows
                            # if gnb classifier isn't linear, and to see whether it doesn't blow
                            # when it is linear.
                            if cv:
                                assert 'has_sensitivity' in gnb_.__tags__
                                gnb_.get_sensitivity_analyzer()
                            if not cv:
                                with self.assertRaises(NotImplementedError):
                                    gnb_.get_sensitivity_analyzer()
Example #39
 def test_slicing(self):
     hs = HalfPartitioner()
     spl = Splitter(attr='partitions')
     splits = list(hs.generate(self.data))
     for s in splits:
         # partitioned dataset shares the data
         assert_true(s.samples.base is self.data.samples)
     splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
     for s in splits:
         # we get slicing all the time
         assert_true(s[0].samples.base.base is self.data.samples)
         assert_true(s[1].samples.base.base is self.data.samples)
     spl = Splitter(attr='partitions', noslicing=True)
     splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
     for s in splits:
         # no slicing at all
         assert_false(s[0].samples.base is self.data.samples)
         assert_false(s[1].samples.base is self.data.samples)
     nfs = NFoldPartitioner()
     spl = Splitter(attr='partitions')
     splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
     for i, s in enumerate(splits):
         # training part is sliced only for the first and last splits
         if i == 0 or i == len(splits) - 1:
             assert_true(s[0].samples.base.base is self.data.samples)
         else:
             assert_true(s[0].samples.base is None)
         # we get slicing all the time
         assert_true(s[1].samples.base.base is self.data.samples)
     step_ds = Dataset(np.random.randn(20,2),
                       sa={'chunks': np.tile([0,1], 10)})
     oes = OddEvenPartitioner()
     spl = Splitter(attr='partitions')
     splits = list(oes.generate(step_ds))
     for s in splits:
         # partitioned dataset shares the data
         assert_true(s.samples.base is step_ds.samples)
     splits = [ list(spl.generate(p)) for p in oes.generate(step_ds) ]
     assert_equal(len(splits), 2)
     for s in splits:
         # we get slicing all the time
         assert_true(s[0].samples.base.base is step_ds.samples)
         assert_true(s[1].samples.base.base is step_ds.samples)
Example #40
    def test_gnbsearchlight_3partitions_and_splitter(self):
        ds = self.dataset[:, :20]
        # custom partitioner which provides 3 partitions
        part = CustomPartitioner([([2], [3], [1])])
        gnb_sl = sphere_gnbsearchlight(GNB(), part)
        res_gnb_sl = gnb_sl(ds)

        # compare results to full blown searchlight
        sl = sphere_searchlight(CrossValidation(GNB(), part))
        res_sl = sl(ds)

        assert_datasets_equal(res_gnb_sl, res_sl)

        # and theoretically for this simple single cross-validation we could
        # just use Splitter
        splitter = Splitter('chunks', [2, 3])
        # we have to pass an explicit None for the generator, since it
        # cannot be given as a keyword argument here
        gnb_sl_ = sphere_gnbsearchlight(GNB(), None, splitter=splitter)
        res_gnb_sl_ = gnb_sl_(ds)
        assert_datasets_equal(res_gnb_sl, res_gnb_sl_)
Example #41
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets', attr_values=[0, 1, 1, 2, 3, 3, 3], count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0,1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits x 2 roi splits x 10 chunk splits = 80
    assert_equal(len(splits), 80)
Example #42
    def test_gnb(self):
        gnb = GNB()
        gnb_nc = GNB(common_variance=False)
        gnb_n = GNB(normalize=True)
        gnb_n_nc = GNB(normalize=True, common_variance=False)

        ds = datasets['uni2medium']

        # Generic silly coverage just to assure that it works in all
        # possible scenarios:
        bools = (True, False)
        # There should be a better way... heh
        for cv in bools:  # common_variance?
            for prior in ('uniform', 'laplacian_smoothing', 'ratio'):
                tp = None  # predictions -- all above should
                # result in the same predictions
                for n in bools:  # normalized?
                    for ls in bools:  # logspace?
                        for es in ((), ('estimates',)):
                            gnb_ = GNB(common_variance=cv,
                                       prior=prior,
                                       normalize=n,
                                       logprob=ls,
                                       enable_ca=es)
                            tm = TransferMeasure(gnb_, Splitter('train'))
                            predictions = tm(ds).samples[:, 0]
                            if tp is None:
                                tp = predictions
                            assert_array_equal(predictions, tp)
                            # if normalized -- check if estimates are such
                            if n and 'estimates' in es:
                                v = gnb_.ca.estimates
                                if ls:  # in log space -- take exp ;)
                                    v = np.exp(v)
                                d1 = np.sum(v, axis=1) - 1.0
                                self.assertTrue(np.max(np.abs(d1)) < 1e-5)
Example #43
    def _sl_call(self, dataset, roi_ids, nproc):
        """Call to SimpleStatBaseSearchlight
        """
        # Local bindings
        generator = self.generator
        qe = self.queryengine
        errorfx = self.errorfx

        if __debug__:
            time_start = time.time()

        targets_sa_name = self._get_space()
        targets_sa = dataset.sa[targets_sa_name]

        if __debug__:
            debug_slc_ = 'SLC_' in debug.active

        # get the dataset information into easy vars
        X = dataset.samples
        if len(X.shape) != 2:
            raise ValueError(
                  'Unlike a classifier, %s (for now) operates on already '
                  'flattened datasets' % (self.__class__.__name__))
        labels = targets_sa.value
        ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))
        labels_numeric = np.array([label2index[l] for l in labels])
        self._ulabels_numeric = [label2index[l] for l in ulabels]
        # set the feature dimensions
        nsamples = len(X)
        nrois = len(roi_ids)
        s_shape = X.shape[1:]           # shape of a single sample
        # The shape of results
        r_shape = (nrois,) + X.shape[2:]

        #
        # Everything toward optimization ;)
        #
        # Silly Yarik thinks that it might be worthwhile to pre-compute
        # statistics for each feature within a block of the samples
        # which always come together in splits -- most often it is a
        # (chunk, label) combination, but since we simply use a
        # generator -- who knows! Therefore let's figure out what
        # those blocks are and operate on them instead of original samples.
        #
        # After additional thinking about this -- probably it would bring
        # just minor additional improvement (i.e. not worth it), but
        # since it is coded already -- let it be so

        # 1. Query generator for the splits we will have
        if __debug__:
            debug('SLC',
                  'Phase 1. Initializing partitions using %s on %s'
                  % (generator, dataset))

        # Let's just create a dummy ds which will store the actual sample
        # indices for us
        # XXX we could make it even more lightweight I guess...
        dataset_indices = Dataset(np.arange(nsamples), sa=dataset.sa)
        splitter = Splitter(attr=generator.get_space())
        partitions = list(generator.generate(dataset_indices))
        if __debug__:
            for p in partitions:
                if not (np.all(p.sa[targets_sa_name].value == labels)):
                    raise NotImplementedError(
                        "%s does not yet support partitioners altering the targets "
                        "(e.g. permutators)" % self.__class__)

        nsplits = len(partitions)
        # ATM we need to keep the splits since they are used
        # in two places in the code: steps 2 and 5
        splits = list(tuple(splitter.generate(ds_)) for ds_ in partitions)
        del partitions                    # not used any longer

        # 2. Figure out the new 'chunks x labels' blocks of combinations
        #    of samples
        if __debug__:
            debug('SLC',
                  'Phase 2. Blocking data for %i splits and %i labels'
                  % (nsplits, nlabels))
        # array of indices for label, split1, split2, ...
        # through which we will pass later on to figure out
        # unique combinations
        combinations = np.ones((nsamples, 1+nsplits), dtype=int)*-1
        # labels
        combinations[:, 0] = labels_numeric
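        # e.g. with two splits a row could read (0, 1, 2): numeric label 0,
        # placed in the first generated subset (value 1) of the first split
        # and in the second subset (value 2) of the second; -1 marks samples
        # not included in either subset of that split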
        for ipartition, (split1, split2) in enumerate(splits):
            combinations[split1.samples[:, 0], 1+ipartition] = 1
            combinations[split2.samples[:, 0], 1+ipartition] = 2
            # Check for over-sampling, i.e. that no sample is used twice here
            if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                    len(np.unique(split2.samples[:, 0])) == len(split2)):
                raise RuntimeError(
                    "%s needs a partitioner which does not reuse "
                    "the same the same samples more than once"
                    % self.__class__)
        # sample descriptions -- identical for samples within
        # the same block
        descriptions = [tuple(c) for c in combinations]
        udescriptions = sorted(list(set(descriptions)))
        nblocks = len(udescriptions)
        description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
        # Indices for samples to point to their block
        self.__sample2block = sample2block = \
            np.array([description2block[d] for d in descriptions])

        # 3. Compute statistics per each block
        #
        if __debug__:
            debug('SLC',
                  'Phase 3. Computing statistics for %i blocks' % (nblocks,))

        self._compute_pb_stats(labels_numeric, X, (nblocks,) + s_shape)

        # derived classes might decide differently on what they
        # actually need, so defer reserving the space and computing
        # stats to them
        self._reserve_pl_stats_space((nlabels, ) + s_shape)

        # results
        results = np.zeros((nsplits,) + r_shape)

        # 4. Let's deduce all neighbors... might need to be refactored
        #    into the parallel part later on
        # TODO: needs optimization since this step consumes 50% of the
        #       time or more; allow caching them entirely so this would
        #       not be an unnecessary burden during permutation testing
        if not self.reuse_neighbors or self.__roi_fids is None:
            if __debug__:
                debug('SLC',
                      'Phase 4. Deducing neighbors information for %i ROIs'
                      % (nrois,))
            roi_fids = [qe.query_byid(f) for f in roi_ids]

        else:
            if __debug__:
                debug('SLC',
                      'Phase 4. Reusing neighbors information for %i ROIs'
                      % (nrois,))
            roi_fids = self.__roi_fids

        self.ca.roi_feature_ids = roi_fids

        roi_sizes = []
        if isinstance(roi_fids, list):
            nroi_fids = len(roi_fids)
            if self.ca.is_enabled('roi_sizes'):
                roi_sizes = [len(x) for x in roi_fids]
        elif externals.exists('scipy') and isinstance(roi_fids, sps.spmatrix):
            nroi_fids = roi_fids.shape[1]
            if self.ca.is_enabled('roi_sizes'):
                # very expensive operation, so better not to ask over again
                # roi_sizes = [roi_fids.getrow(r).nnz for r in range(nroi_fids)]
                warning("Since 'sparse' trick is used, extracting sizes of "
                        "roi's are expensive at this point.  Get them from the "
                        ".ca value of the original instance before "
                        "calling again and using reuse_neighbors")
        else:
            raise RuntimeError("Should not be reachable")

        # Since this is an ad-hoc implementation of the searchlight, we are
        # not passing those via ds.a but rather assigning directly to self.ca
        self.ca.roi_sizes = roi_sizes

        indexsum = self._indexsum
        if indexsum == 'sparse':
            if not self.reuse_neighbors or self.__roi_fids is None:
                if __debug__:
                    debug('SLC',
                          'Phase 4b. Converting neighbors to sparse matrix '
                          'representation')
                # convert to "sparse representation" where column j contains
                # 1s only at the roi_fids[j] indices
                roi_fids = inds_to_coo(roi_fids,
                                       shape=(dataset.nfeatures, nroi_fids))
            indexsum_fx = lastdim_columnsums_spmatrix
        elif indexsum == 'fancy':
            indexsum_fx = lastdim_columnsums_fancy_indexing
        else:
            raise ValueError(
                "Do not know how to deal with indexsum=%s" % indexsum)

        # Store roi_fids
        if self.reuse_neighbors and self.__roi_fids is None:
            self.__roi_fids = roi_fids

        # 5. Let's do the actual "splitting" and "classification"
        if __debug__:
            debug('SLC', 'Phase 5. Major loop')


        for isplit, split in enumerate(splits):
            if __debug__:
                debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
            # figure out for a given split the blocks we want to work with
            # sample indices
            training_sis = split[0].samples[:, 0]
            testing_sis = split[1].samples[:, 0]

            # That is the GNB-specific part
            targets, predictions = self._sl_call_on_a_split(
                split, X,               # X2 might need to go here as well
                training_sis, testing_sis,
                ## training_nsamples,      # GO? == np.sum(pl.nsamples)
                ## training_non0labels,
                ## pl.sums, pl.means, pl.sums2, pl.variances,
                # passing nroi_fids as well since in the 'sparse' case it has no 'length'
                nroi_fids, roi_fids,
                indexsum_fx,
                labels_numeric,
                )

            # assess the errors
            if __debug__:
                debug('SLC', "  Assessing accuracies")

            if errorfx is mean_mismatch_error:
                results[isplit, :] = \
                    (predictions != targets[:, None]).sum(axis=0) \
                    / float(len(targets))
            else:
                # somewhat silly, but a way which allows us to use pre-crafted
                # error functions without a chance to screw up
                for i, fpredictions in enumerate(predictions.T):
                    results[isplit, i] = errorfx(fpredictions, targets)


        if __debug__:
            debug('SLC', "%s._call() is done in %.3g sec" %
                  (self.__class__.__name__, time.time() - time_start))

        return Dataset(results)
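The block bookkeeping of Phase 2 in isolation -- a standalone NumPy sketch
with hypothetical labels and split memberships, showing how samples sharing
the same (label, split-pattern) description get mapped onto the same block:

import numpy as np

labels_numeric = np.array([0, 0, 1, 1, 0, 1])
# membership of each sample in two hypothetical splits
# (1 = first subset, 2 = second subset, -1 = unused)
memberships = np.array([[1, 2],
                        [1, 2],
                        [1, 2],
                        [2, 1],
                        [2, 1],
                        [2, 1]])
combinations = np.hstack([labels_numeric[:, None], memberships])
descriptions = [tuple(c) for c in combinations]
udescriptions = sorted(set(descriptions))
description2block = dict((d, i) for i, d in enumerate(udescriptions))
sample2block = np.array([description2block[d] for d in descriptions])
print(sample2block)   # -> [0 0 2 3 1 3]: equal entries share per-block stats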
Exemplo n.º 44
0
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        # this also trains the classifier, so we can ask it about its error below
        senses.append(sensanasvm(split[0]))
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]), split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print(senses.samples, senses_rm.samples)
    #print(errors, errors_cv.samples)
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
Exemplo n.º 45
0
def test_multiclass_ties(clf):
    if 'lars' in clf.__tags__:
        raise SkipTest("Known to crash while running this test")
    ds = _dsties1

    # reassign data between ties, so we know that the decision is data- rather than order-driven
    ds_ = ds.copy(deep=True)
    ds_.samples[ds.a.ties_idx[1]] = ds.samples[ds.a.ties_idx[0]]
    ds_.samples[ds.a.ties_idx[0]] = ds.samples[ds.a.ties_idx[1]]
    ok_(np.any(ds_.samples != ds.samples))

    clf_ = clf.clone()
    clf = clf.clone()
    clf.ca.enable(['estimates', 'predictions'])
    clf_.ca.enable(['estimates', 'predictions'])
    te = TransferMeasure(clf,
                         Splitter('train'),
                         postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
                         enable_ca=['stats'])
    te_ = TransferMeasure(clf_,
                          Splitter('train'),
                          postproc=BinaryFxNode(mean_mismatch_error,
                                                'targets'),
                          enable_ca=['stats'])

    te = CrossValidation(clf,
                         NFoldPartitioner(),
                         postproc=mean_sample(),
                         enable_ca=['stats'])
    te_ = CrossValidation(clf_,
                          NFoldPartitioner(),
                          postproc=mean_sample(),
                          enable_ca=['stats'])

    error = te(ds)
    matrix = te.ca.stats.matrix

    # if ties were broken randomly we should have got nearly the same
    # number of hits for tied targets
    ties_indices = [te.ca.stats.labels.index(c) for c in ds.a.ties]
    hits = np.diag(te.ca.stats.matrix)[ties_indices]

    # The first check is to see that if we swap the data between tied
    # labels we get the same results when the labels are permuted
    # accordingly, i.e. that tie resolution is not dependent on the label
    # order but rather on the data
    te_(ds_)
    matrix_swapped = te_.ca.stats.matrix

    if False:  # 0 in hits:
        print(clf, matrix, matrix_swapped)
        print(clf.ca.estimates[:, 2] - clf.ca.estimates[:, 0])
        #print(clf.ca.estimates)

    # TODO: for now disabled all the non-compliant ones to pass the
    #       tests.  For visibility we skip them instead of excluding them
    #       outright, and skip only here to still catch crashes which
    #       might happen earlier
    if set(('libsvm', 'sg', 'skl', 'gpr',
            'blr')).intersection(clf.__tags__):
        raise SkipTest("Skipped %s because it is known to fail" % clf)

    ok_(not (np.array_equal(matrix, matrix_swapped) and 0 in hits))

    # this check is valid only if ties are not broken randomly
    # like it is the case with SMLR
    if not ('random_tie_breaking' in clf.__tags__
            or  # since __tags__ would not go that high up e.g. in
            # <knn on SMLR non-0>
            'SMLR' in str(clf)):
        assert_array_equal(hits, np.diag(matrix_swapped)[ties_indices[::-1]])

    # The second check is just to see that we didn't get an obvious bias,
    # i.e. 0 hits for one of the tied targets, although this check is labile
    if cfg.getboolean('tests', 'labile', default='yes'):
        ok_(0 not in hits)
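The essence of the bias check above as a self-contained sketch: build a
confusion matrix from hypothetical predictions and require that neither of
the (assumed) tied classes ends up with zero hits on the diagonal.

import numpy as np

labels = [0, 1, 2]
targets = np.array([0, 0, 1, 1, 2, 2])
predictions = np.array([0, 1, 1, 0, 2, 2])   # hypothetical tie-prone output
matrix = np.zeros((3, 3), dtype=int)
for t, p in zip(targets, predictions):
    matrix[p, t] += 1              # rows: predictions, columns: targets
ties_indices = [labels.index(c) for c in (0, 1)]  # assume classes 0, 1 tie
hits = np.diag(matrix)[ties_indices]
assert 0 not in hits               # neither tied class starved of hits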
Exemplo n.º 46
0
    def test_n_group_split(self):
        """Test NGroupSplitter alongside with the reversal of the
        order of spit out datasets
        """
        # Test 2 groups like HalfSplitter first
        hs = NGroupPartitioner(2)

        for isreversed, splitter in enumerate((hs, hs)):
            if isreversed:
                spl = Splitter(attr='partitions', reverse=True)
            else:
                spl = Splitter(attr='partitions')
            splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
            self.assertTrue(len(splits) == 2)

            for i, p in enumerate(splits):
                self.assertTrue(len(p) == 2)
                self.assertTrue(p[0].nsamples == 50)
                self.assertTrue(p[1].nsamples == 50)

            assert_array_equal(splits[0][1-isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4])
            assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                               [5, 6, 7, 8, 9])
            assert_array_equal(splits[1][1-isreversed].sa['chunks'].unique,
                               [5, 6, 7, 8, 9])
            assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)

        # now test more groups
        s5 = NGroupPartitioner(5)

        # get the splits
        for isreversed, s5splitter in enumerate((s5, s5)):
            if isreversed:
                spl = Splitter(attr='partitions', reverse=True)
            else:
                spl = Splitter(attr='partitions')
            splits = [list(spl.generate(p)) for p in s5splitter.generate(self.data)]

            # must have 5 splits
            self.assertTrue(len(splits) == 5)

            # check split content
            assert_array_equal(splits[0][1-isreversed].sa['chunks'].unique,
                               [0, 1])
            assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                               [2, 3, 4, 5, 6, 7, 8, 9])
            assert_array_equal(splits[1][1-isreversed].sa['chunks'].unique,
                               [2, 3])
            assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                               [0, 1, 4, 5, 6, 7, 8, 9])
            # ...
            assert_array_equal(splits[4][1-isreversed].sa['chunks'].unique,
                               [8, 9])
            assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4, 5, 6, 7])


        # Test for too many groups
        def splitcall(spl, dat):
            return list(spl.generate(dat))
        s20 = NGroupPartitioner(20)
        self.assertRaises(ValueError, splitcall, s20, self.data)
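What the n-group partitioning above does to the chunk ids, approximated
with plain NumPy (np.array_split here only mimics NGroupPartitioner's
grouping of the ten unique chunks):

import numpy as np

chunks = np.arange(10)                    # unique chunk ids as in self.data
for ngroups in (2, 5):
    groups = np.array_split(chunks, ngroups)
    print(ngroups, [list(g) for g in groups])
# 2 -> [0..4] vs [5..9]; 5 -> [0, 1], [2, 3], ..., [8, 9]
# asking NGroupPartitioner for more groups than chunks (e.g. 20) cannot
# work, hence the ValueError asserted above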
Exemplo n.º 47
0
    def _sl_call(self, dataset, roi_ids, nproc):
        """Call to GNBSearchlight
        """
        # Local bindings
        gnb = self.gnb
        params = gnb.params
        generator = self.generator
        errorfx = self.errorfx
        qe = self.queryengine

        ## if False:
        ##     class A(Learner):
        ##         pass
        ##     self = A()
        ##     import numpy as np
        ##     from mvpa2.clfs.gnb import GNB
        ##     from mvpa2.generators.partition import NFoldPartitioner
        ##     from mvpa2.misc.errorfx import mean_mismatch_error
        ##     from mvpa2.testing.datasets import datasets as tdatasets
        ##     from mvpa2.datasets import Dataset
        ##     from mvpa2.misc.neighborhood import IndexQueryEngine, Sphere
        ##     from mvpa2.clfs.distance import absmin_distance
        ##     import time
        ##     if __debug__:
        ##         from mvpa2.base import debug
        ##         debug.active += ['SLC.*']
        ##         # XXX is it that ugly?
        ##         debug.active.pop(debug.active.index('SLC_'))
        ##         debug.metrics += ['reltime']
        ##     dataset = tdatasets['3dlarge'].copy()
        ##     dataset.fa['voxel_indices'] = dataset.fa.myspace
        ##     sphere = Sphere(radius=1,
        ##                     distance_func=absmin_distance)
        ##     qe = IndexQueryEngine(myspace=sphere)

        ##     # Francisco's data
        ##     #dataset = ds_fp
        ##     qe = IndexQueryEngine(voxel_indices=sphere)

        ##     qe.train(dataset)
        ##     roi_ids = np.arange(dataset.nfeatures)
        ##     gnb = GNB()
        ##     params = gnb.params
        ##     generator = NFoldPartitioner()
        ##     errorfx = mean_mismatch_error

        if __debug__:
            time_start = time.time()

        targets_sa_name = gnb.get_space()
        targets_sa = dataset.sa[targets_sa_name]

        if __debug__:
            debug_slc_ = 'SLC_' in debug.active

        # get the dataset information into easy vars
        X = dataset.samples
        if len(X.shape) != 2:
            raise ValueError(
                'Unlike GNB, GNBSearchlight (for now) operates on already '
                'flattened datasets')
        labels = targets_sa.value
        ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))
        labels_numeric = np.array([label2index[l] for l in labels])
        ulabels_numeric = [label2index[l] for l in ulabels]
        # set the feature dimensions
        nsamples = len(X)
        nrois = len(roi_ids)
        s_shape = X.shape[1:]           # shape of a single sample
        # The shape of results
        r_shape = (nrois,) + X.shape[2:]

        #
        # Everything toward optimization ;)
        #
        # Silly Yarik thinks that it might be worthwhile to pre-compute
        # statistics per feature within each block of samples that
        # always comes together in splits -- most often it is a
        # (chunk, label) combination, but since we simply use a
        # generator -- who knows! Therefore let's figure out what
        # those blocks are and operate on them instead of the original
        # samples.
        #
        # After additional thinking about this -- probably it would be
        # just a minor additional improvement (i.e. not worth it), but
        # since it is coded already -- let it be so

        # 1. Query generator for the splits we will have
        if __debug__:
            debug('SLC',
                  'Phase 1. Initializing partitions using %s on %s'
                  % (generator, dataset))

        # Let's just create a dummy ds which will store the actual sample
        # indices for us
        # XXX we could make it even more lightweight I guess...
        dataset_indices = Dataset(np.arange(nsamples), sa=dataset.sa)
        splitter = Splitter(attr=generator.get_space())
        splits = list(tuple(splitter.generate(ds_))
                      for ds_ in generator.generate(dataset_indices))
        nsplits = len(splits)

        # 2. Figure out the new 'chunks x labels' blocks of combinations
        #    of samples
        if __debug__:
            debug('SLC',
                  'Phase 2. Blocking data for %i splits and %i labels'
                  % (nsplits, nlabels))
        # array of indices for label, split1, split2, ...
        # through which we will pass later on to figure out
        # unique combinations
        combinations = np.ones((nsamples, 1+nsplits), dtype=int)*-1
        # labels
        combinations[:, 0] = labels_numeric
        for ipartition, (split1, split2) in enumerate(splits):
            combinations[split1.samples[:, 0], 1+ipartition] = 1
            combinations[split2.samples[:, 0], 1+ipartition] = 2
            # Check for over-sampling, i.e. that no sample is used twice here
            if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                    len(np.unique(split2.samples[:, 0])) == len(split2)):
                raise RuntimeError(
                    "GNBSearchlight needs a partitioner which does not reuse "
                    "the same the same samples more than once")
        # sample descriptions -- identical for samples within
        # the same block
        descriptions = [tuple(c) for c in combinations]
        udescriptions = sorted(list(set(descriptions)))
        nblocks = len(udescriptions)
        description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
        # Indices for samples to point to their block
        sample2block = np.array([description2block[d] for d in descriptions])

        # 3. Compute statistics per each block
        #
        if __debug__:
            debug('SLC',
                  'Phase 3. Computing statistics for %i blocks' % (nblocks,))

        #
        # reusable containers which should stay of the same size
        #

        # sums and sums of squares per each block
        sums = np.zeros((nblocks, ) + s_shape)
        # sums of squares
        sums2 = np.zeros((nblocks, ) + s_shape)

        # per each label:
        means = np.zeros((nlabels, ) + s_shape)
        # means of squares for stddev computation
        means2 = np.zeros((nlabels, ) + s_shape)
        variances = np.zeros((nlabels, ) + s_shape)
        # degenerate dimensions are added for easy broadcasting later on
        nsamples_per_class = np.zeros((nlabels,) + (1,)*len(s_shape))

        # results
        results = np.zeros((nsplits,) + r_shape)

        block_counts = np.zeros((nblocks,))
        block_labels = [None] * nblocks

        X2 = np.square(X)
        # silly way for now
        for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
            sums[ib] += s
            sums2[ib] += s2
            block_counts[ib] += 1
            if block_labels[ib] is None:
                block_labels[ib] = l
            else:
                assert(block_labels[ib] == l)
        block_labels = np.asanyarray(block_labels)
        # additional silly tests for the paranoid
        assert(block_labels.dtype.kind == 'i')

        # 4. Let's deduce all neighbors... might need to be refactored
        #    into the parallel part later on
        if __debug__:
            debug('SLC',
                  'Phase 4. Deducing neighbors information for %i ROIs'
                  % (nrois,))
        roi_fids = [qe.query_byid(f) for f in roi_ids]
        nroi_fids = len(roi_fids)
        # makes sense to waste precious ms only if ca is enabled
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = [len(x) for x in roi_fids]
        else:
            roi_sizes = []

        indexsum = self._indexsum
        if indexsum == 'sparse':
            if __debug__:
                debug('SLC',
                      'Phase 4b. Converting neighbors to sparse matrix '
                      'representation')
            # convert to "sparse representation" where column j contains
            # 1s only at the roi_fids[j] indices
            roi_fids = inds_to_coo(roi_fids,
                                   shape=(dataset.nfeatures, nroi_fids))
            indexsum_fx = lastdim_columnsums_spmatrix
        elif indexsum == 'fancy':
            indexsum_fx = lastdim_columnsums_fancy_indexing
        else:
            raise ValueError(
                "Do not know how to deal with indexsum=%s" % indexsum)

        # 5. Let's do the actual "splitting" and "classification"
        if __debug__:
            debug('SLC', 'Phase 5. Major loop')

        for isplit, split in enumerate(splits):
            if __debug__:
                debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
            # figure out for a given split the blocks we want to work with
            # sample indices
            training_sis = split[0].samples[:, 0]
            # convert the training split to block indices
            training_bis = np.unique(sample2block[training_sis])

            # now let's do our GNB business
            training_nsamples = 0
            for il, l in enumerate(ulabels_numeric):
                bis_il = training_bis[block_labels[training_bis] == l]
                nsamples_per_class[il] = N_float = \
                                         float(np.sum(block_counts[bis_il]))
                training_nsamples += N_float
                if N_float == 0.0:
                    variances[il] = means[il] = means2[il] = 0.
                else:
                    means[il] = np.sum(sums[bis_il], axis=0) / N_float
                    # Not yet normed
                    means2[il] = np.sum(sums2[bis_il], axis=0)

            ## Actually compute the non-0 variances
            non0labels = (nsamples_per_class.squeeze() != 0)
            if np.all(non0labels):
                # for a possible tiny speed-up, avoid copying by using
                # a full slice instead of boolean indexing
                non0labels = slice(None)

            if params.common_variance:
                variances[:] = \
                    np.sum(means2 - nsamples_per_class*np.square(means),
                           axis=0) \
                    / training_nsamples
            else:
                variances[non0labels] = \
                    (means2 - nsamples_per_class*np.square(means))[non0labels] \
                    / nsamples_per_class[non0labels]

            # assign priors
            priors = gnb._get_priors(
                nlabels, training_nsamples, nsamples_per_class)

            # proceed in a way we have in GNB code with logprob=True,
            # i.e. operating within the exponents -- should lead to some
            # performance advantage
            norm_weight = -0.5 * np.log(2*np.pi*variances)
            # last added dimension would be for ROIs
            logpriors = np.log(priors[:, np.newaxis, np.newaxis])

            if __debug__:
                debug('SLC', "  'Training' is done")

            # Now it is time to "classify" our samples, and for that we
            # first need to compute the corresponding probabilities (or
            # maybe unnormalized values thereof)
            data = X[split[1].samples[:, 0]]
            targets = labels_numeric[split[1].samples[:, 0]]

            # argument of exponentiation
            scaled_distances = \
                 -0.5 * (((data - means[:, np.newaxis, ...])**2) \
                         / variances[:, np.newaxis, ...])

            # incorporate the normalization from normals
            lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances
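            # broadcasting yields shape (nlabels, ntesting) + s_shape:
            # per-class log-likelihood of every testing sample at each feature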

            ## First we need to reshape to get class x samples x features
            lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1,))

            ## Now we come to the naive part which requires looping
            ## through all spheres
            if __debug__:
                debug('SLC', "  Doing 'Searchlight'")
            # resultant logprobs for each class x sample x roi
            lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids,))
            indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

            lprob_cs_sl += logpriors
            lprob_cs_cp_sl = lprob_cs_sl
            # for each of the ROIs take the class with maximal (log)probability
            predictions = lprob_cs_cp_sl.argmax(axis=0)
            # no need to map back [self.ulabels[c] for c in winners]
            #predictions = winners
            # assess the errors
            if __debug__:
                debug('SLC', "  Assessing accuracies")

            if errorfx is mean_mismatch_error:
                results[isplit, :] = \
                    (predictions != targets[:, None]).sum(axis=0) \
                    / float(len(targets))
            else:
                # somewhat silly, but a way which allows us to use pre-crafted
                # error functions without a chance to screw up
                for i, fpredictions in enumerate(predictions.T):
                    results[isplit, i] = errorfx(fpredictions, targets)

        if __debug__:
            debug('SLC', "GNBSearchlight is done in %.3g sec" %
                  (time.time() - time_start))

        return Dataset(results), roi_sizes
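The per-split "training"/"classification" core above, reduced to a
standalone NumPy sketch for a single ROI with hypothetical means, variances
and priors: Gaussian log-likelihoods per class plus log-priors, then argmax
across classes.

import numpy as np

means = np.array([[0.0], [3.0]])          # (nlabels, nfeatures)
variances = np.array([[1.0], [1.0]])
priors = np.array([0.5, 0.5])
data = np.array([[-0.5], [0.2], [0.8], [2.5], [3.3], [4.0]])

norm_weight = -0.5 * np.log(2 * np.pi * variances)
scaled_distances = -0.5 * (data[None] - means[:, None]) ** 2 \
                   / variances[:, None]
lprob = norm_weight[:, None] + scaled_distances  # (nlabels, nsamples, nfeatures)
lprob = lprob.sum(axis=-1) + np.log(priors)[:, None]
predictions = lprob.argmax(axis=0)
print(predictions)                        # -> [0 0 0 1 1 1]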