示例#1
0
文件: test_rfe.py 项目: B-Rich/PyMVPA
    def __test_matthias_question(self):
        """Cross-validate an RFE-based feature-selection classifier.

        RFE is driven by a `SplitClassifier` wrapped around a linear C-SVM:
        its sensitivity analyzer is handed to RFE, and a
        `ConfusionBasedError` reading its "confusion" state serves as the
        transfer error.  The resulting classifier is cross-validated on
        ``datasets['uni2small']`` with an `MCNullDist` fed by an
        `AttributePermutator` over 'targets' (count=1000).  The test
        requires an error below 0.4 and ``null_prob`` below 0.05.
        """
        split_svm = SplitClassifier(LinearCSVMC(C=1))

        # Assemble the RFE ingredients one by one, in the same order in
        # which they would be evaluated if written as a single expression.
        final_svm = LinearCSVMC(C=1)
        sens_analyzer = split_svm.get_sensitivity_analyzer(
            combiner=first_axis_mean, transformer=np.abs)
        split_error = ConfusionBasedError(split_svm,
                                          confusion_state="confusion")
        stop_crit = FixedErrorThresholdStopCrit(0.20)
        tail_selector = FractionTailSelector(0.2, mode='discard',
                                             tail='lower')
        rfe = RFE(sensitivity_analyzer=sens_analyzer,
                  transfer_error=split_error,
                  stopping_criterion=stop_crit,
                  feature_selector=tail_selector,
                  update_sensitivity=True)
        fs_clf = FeatureSelectionClassifier(clf=final_svm,
                                            feature_selection=rfe)

        # Cross-validation with a permutation-based null distribution
        n_permutations = 1000
        target_shuffler = AttributePermutator('targets',
                                              count=n_permutations)
        partitioner = NFoldPartitioner()
        null_dist = MCNullDist(target_shuffler, tail='left')
        cross_val = CrossValidation(fs_clf, partitioner,
                                    null_dist=null_dist,
                                    enable_ca=['stats'])

        cv_error = cross_val(datasets['uni2small'])
        self.failUnless(cv_error < 0.4)
        self.failUnless(cross_val.ca.null_prob < 0.05)
示例#2
0
    def testSplitClassifierExtended(self, clf_):
        """Compare `SplitClassifier` error with cross-validated error.

        `clf_` is trained inside a `SplitClassifier` on
        ``datasets['uni2medium']`` while a clone of it goes through
        `CrossValidatedTransferError`; both errors must agree within 0.01.
        The number of confusion sets and of slave classifiers must match
        the number of unique chunks.
        """
        reference_clf = clf_.clone()
        dataset = datasets['uni2medium']

        split_clf = SplitClassifier(
            clf=clf_,
            splitter=NFoldSplitter(1),
            enable_states=['confusion', 'feature_ids'])
        split_clf.train(dataset)
        split_error = split_clf.confusion.error

        cross_val = CrossValidatedTransferError(
            TransferError(reference_clf),
            NFoldSplitter(),
            enable_states=['confusion', 'training_confusion'])
        cv_error = cross_val(dataset)

        errors_agree = abs(split_error - cv_error) < 0.01
        self.failUnless(
            errors_agree,
            msg="We should get the same error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (split_error, cv_error))

        # Generalization is only checked when labile tests are enabled
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(
                split_error < 0.25,
                msg="clf should generalize more or less fine. "
                    "Got error %s" % split_error)

        self.failUnlessEqual(
            len(split_clf.confusion.sets),
            len(dataset.uniquechunks),
            msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(split_clf.clfs),
            len(dataset.uniquechunks),
            msg="Should have number of classifiers equal # of epochs")
示例#3
0
文件: test_rfe.py 项目: arokem/PyMVPA
    def __test_matthias_question(self):
        """Cross-validate an RFE-based classifier against a null distribution.

        A `SplitClassifier` around a linear C-SVM supplies RFE with its
        sensitivity analyzer and (through `ConfusionBasedError` on the
        "confusion" state) with its transfer error.  The feature-selecting
        classifier is then run through `CrossValidatedTransferError` on
        ``datasets["uni2small"]`` with ``MCNullDist(permutations=1000)``;
        the error must be below 0.4 and ``null_prob`` below 0.05.
        """
        inner_split = SplitClassifier(LinearCSVMC(C=1))

        # Pieces of the RFE setup, created in their inline evaluation order
        outer_svm = LinearCSVMC(C=1)
        analyzer = inner_split.get_sensitivity_analyzer(
            combiner=first_axis_mean, transformer=np.abs)
        confusion_error = ConfusionBasedError(
            inner_split, confusion_state="confusion")
        threshold_stop = FixedErrorThresholdStopCrit(0.20)
        discard_lowest = FractionTailSelector(
            0.2, mode="discard", tail="lower")

        selecting_clf = FeatureSelectionClassifier(
            clf=outer_svm,
            feature_selection=RFE(
                sensitivity_analyzer=analyzer,
                transfer_error=confusion_error,
                stopping_criterion=threshold_stop,
                feature_selector=discard_lowest,
                update_sensitivity=True))

        fold_splitter = NFoldSplitter(cvtype=1)
        n_permutations = 1000

        transfer = TransferError(selecting_clf)
        null_dist = MCNullDist(permutations=n_permutations, tail="left")
        cross_val = CrossValidatedTransferError(
            transfer,
            fold_splitter,
            null_dist=null_dist,
            enable_ca=["confusion"])

        cv_error = cross_val(datasets["uni2small"])
        self.failUnless(cv_error < 0.4)
        self.failUnless(cross_val.ca.null_prob < 0.05)
示例#4
0
文件: test_clf.py 项目: esc/PyMVPA
    def test_split_classifier_extended(self, clf_):
        """Compare `SplitClassifier` error with `CrossValidation` error.

        The dataset is ``datasets['uni2<suffix>']`` where the suffix is
        given by ``self._get_clf_ds`` for the cloned classifier.  Both
        errors must agree within 0.01, and the number of statistics sets
        and of slave classifiers must equal ``len(ds.UC)``.
        """
        reference_clf = clf_.clone()
        dataset_key = 'uni2%s' % self._get_clf_ds(reference_clf)
        dataset = datasets[dataset_key]

        split_clf = SplitClassifier(clf=clf_,
                                    enable_ca=['stats', 'feature_ids'])
        split_clf.train(dataset)
        split_error = split_clf.ca.stats.error

        cross_val = CrossValidation(
            reference_clf,
            NFoldPartitioner(),
            postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cv_error = cross_val(dataset).samples.squeeze()

        errors_agree = abs(split_error - cv_error) < 0.01
        self.failUnless(
            errors_agree,
            msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s"
                % (split_error, cv_error))

        # Generalization is only checked when labile tests are enabled
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(
                split_error < 0.25,
                msg="clf should generalize more or less fine. "
                    "Got error %s" % split_error)

        self.failUnlessEqual(
            len(split_clf.ca.stats.sets),
            len(dataset.UC),
            msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(split_clf.clfs),
            len(dataset.UC),
            msg="Should have number of classifiers equal # of epochs")
示例#5
0
    def __test_matthias_question(self):
        """Check RFE-based feature selection under cross-validation.

        RFE takes its sensitivity analyzer and its `ConfusionBasedError`
        (on the "confusion" state) from a `SplitClassifier` built on a
        linear C-SVM.  The selecting classifier is evaluated with
        `CrossValidatedTransferError` on ``datasets['uni2small']`` using
        ``MCNullDist(permutations=1000, tail='left')``.  Expected: error
        below 0.4 and ``null_prob`` below 0.05.
        """
        svm_splits = SplitClassifier(LinearCSVMC(C=1))

        # Keyword arguments below are given in the original evaluation order
        rfe_kwargs = dict(
            sensitivity_analyzer=svm_splits.get_sensitivity_analyzer(
                combiner=first_axis_mean,
                transformer=np.abs),
            transfer_error=ConfusionBasedError(
                svm_splits,
                confusion_state="confusion"),
            stopping_criterion=FixedErrorThresholdStopCrit(0.20),
            feature_selector=FractionTailSelector(
                0.2, mode='discard', tail='lower'),
            update_sensitivity=True)
        selecting_clf = FeatureSelectionClassifier(
            clf=LinearCSVMC(C=1),
            feature_selection=RFE(**rfe_kwargs))

        fold_splitter = NFoldSplitter(cvtype=1)
        n_permutations = 1000

        cross_val = CrossValidatedTransferError(
            TransferError(selecting_clf),
            fold_splitter,
            null_dist=MCNullDist(permutations=n_permutations, tail='left'),
            enable_ca=['confusion'])

        cv_error = cross_val(datasets['uni2small'])
        self.failUnless(cv_error < 0.4)
        self.failUnless(cross_val.ca.null_prob < 0.05)
示例#6
0
    def test_split_classifier_extended(self, clf_):
        """Compare `SplitClassifier` error with cross-validated error.

        `clf_` is trained within a `SplitClassifier` on
        ``datasets['uni2medium']``; a clone of it is evaluated through
        `CrossValidatedTransferError` with ``mean_sample()``
        post-processing.  Both errors must agree within 0.01, and the
        number of confusion sets and of slave classifiers must equal
        ``len(ds.UC)``.
        """
        reference_clf = clf_.clone()
        dataset = datasets['uni2medium']

        split_clf = SplitClassifier(clf=clf_,
                                    splitter=NFoldSplitter(1),
                                    enable_ca=['confusion', 'feature_ids'])
        split_clf.train(dataset)
        split_error = split_clf.ca.confusion.error

        cross_val = CrossValidatedTransferError(
            TransferError(reference_clf),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cv_error = cross_val(dataset).samples.squeeze()

        errors_agree = abs(split_error - cv_error) < 0.01
        self.failUnless(
            errors_agree,
            msg="We should get the same error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (split_error, cv_error))

        # Generalization is only checked when labile tests are enabled
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(
                split_error < 0.25,
                msg="clf should generalize more or less fine. "
                    "Got error %s" % split_error)

        self.failUnlessEqual(
            len(split_clf.ca.confusion.sets),
            len(dataset.UC),
            msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(split_clf.clfs),
            len(dataset.UC),
            msg="Should have number of classifiers equal # of epochs")
示例#7
0
    def testAnalyzerWithSplitClassifier(self, clf):
        """Test sensitivity analyzers obtained from a split classifier.

        Wraps `clf` into a `SplitClassifier`, asks it for a sensitivity
        analyzer, runs the analyzer on ``self.dataset`` and checks that

        * the analyzer exposes the requested transformer (`Absolute`) and
          `FirstAxisMean` as combiner,
        * the sensitivity map has one entry per feature,
        * (labile) every confusion matrix reports more than 75% correct,
        * (labile) the features picked by `FixedNElementTailSelector` are
          exactly ``self.dataset.nonbogus_features``.
        """
        # assuming many defaults it is as simple as
        mclf = SplitClassifier(clf=clf,
                               enable_states=['training_confusion',
                                              'confusion'])
        sana = mclf.getSensitivityAnalyzer(transformer=Absolute,
                                           enable_states=["sensitivities"])

        # Test access to transformers and combiners
        self.failUnless(sana.transformer is Absolute)
        self.failUnless(sana.combiner is FirstAxisMean)
        # and lets look at all sensitivities

        # and we get sensitivity analyzer which works on splits
        map_ = sana(self.dataset)
        self.failUnlessEqual(len(map_), self.dataset.nfeatures)

        # Performance checks are only done when labile tests are enabled
        if cfg.getboolean('tests', 'labile', default='yes'):
            # training confusion first, followed by the entries of
            # `confusion.matrices`
            for conf_matrix in [sana.clf.training_confusion] \
                              + sana.clf.confusion.matrices:
                self.failUnless(
                    conf_matrix.percentCorrect>75,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percentCorrect,
                     len(self.dataset.uniquelabels)))

        # Only consumed by the disabled check below; not used otherwise
        errors = [x.percentCorrect
                    for x in sana.clf.confusion.matrices]

        # XXX
        # That is too much to ask if the dataset is easy - thus
        # disabled for now
        #self.failUnless(N.min(errors) != N.max(errors),
        #                msg="Splits should have slightly but different " \
        #                    "generalization")

        # lets go through all sensitivities and see if we selected the right
        # features
        # XXX yoh: disabled checking of each map separately since in
        #     BoostedClassifierSensitivityAnalyzer and
        #     ProxyClassifierSensitivityAnalyzer
        #     we don't have yet way to provide transformers thus internal call
        #     to getSensitivityAnalyzer in _call of them is not parametrized
        if 'meta' in clf._clf_internals and len(map_.nonzero()[0])<2:
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # Fewer than 2 non-zero sensitivities: skip the selection check
            return
        for map__ in [map_]: # + sana.combined_analyzer.sensitivities:
            # The selector is configured with the number of features that
            # are NOT listed in `nonbogus_features`
            selected = FixedNElementTailSelector(
                self.dataset.nfeatures -
                len(self.dataset.nonbogus_features))(map__)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnlessEqual(
                    list(selected),
                    list(self.dataset.nonbogus_features),
                    msg="At the end we should have selected the right features")
示例#8
0
    def testRegressions(self, regr):
        """Run basic sanity checks of a regression on a simple dataset.

        `regr` is evaluated on ``datasets['chirp_linear']`` both through
        `CrossValidatedTransferError` (with `CorrErrorFx`) and inside a
        `SplitClassifier` using `OddEvenSplitter`.  For each resulting
        confusion object the 'CCe' statistic, its string representations
        and the error value are checked.
        """
        dataset = datasets['chirp_linear']

        cross_val = CrossValidatedTransferError(
            TransferError(regr, CorrErrorFx()),
            splitter=NFoldSplitter(),
            enable_states=['training_confusion', 'confusion'])
        cv_corr = cross_val(dataset)

        # Returned value has to match the 'CCe' stored in the confusion
        self.failUnless(cv_corr == cross_val.confusion.stats['CCe'])

        split_regr = SplitClassifier(
            regr,
            splitter=OddEvenSplitter(),
            enable_states=['training_confusion', 'confusion'])
        split_regr.train(dataset)
        test_cce = split_regr.confusion.stats['CCe']
        train_cce = split_regr.training_confusion.stats['CCe']

        # (confusion object, error value expected to be small)
        cases = [
            (cross_val.confusion, cv_corr),
            (split_regr.confusion, test_cce),
            (split_regr.training_confusion, train_cce),
        ]
        for summary, err_value in cases:
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for per_split in summary.summaries:
                split_stats = per_split.stats
                self.failUnless(split_stats['CCe'] < 0.5)
                self.failUnlessEqual(split_stats['CCe'],
                                     split_stats['Summary CCe'])

            # Both the short and the long rendering must be non-trivial
            renderings = [summary.asstring(short=flag)
                          for flag in (True, False)]
            for text in renderings:
                self.failUnless(
                    len(text) > 10,
                    msg="We should get some string representation "
                        "of regression summary. Got %s" % text)

            self.failUnless(
                err_value < 0.2,
                msg="Regressions should perform well on a simple "
                    "dataset. Got correlation error of %s " % err_value)

            # Test access to summary statistics.
            # YOH: the p-value of getting this by accident is tiny, so a
            #      working regression had better stay below 0.5 -- hence
            #      checked regardless of the 'labile' setting.
            self.failUnless(summary.stats['CCe'] < 0.5)

        # Result is discarded -- just to check that predict() works fine
        split_regr.predict(dataset.samples)
示例#9
0
文件: test_clf.py 项目: esc/PyMVPA
 def test_harvesting(self):
     """Check attribute harvesting performed by `SplitClassifier`.

     Trains a `SplitClassifier` over `SameSignClassifier` on
     ``self.data_bin_1`` while harvesting 'clf.ca.training_time', then
     verifies one harvested item per unique chunk and that `descr` was
     stored.
     """
     harvest_key = 'clf.ca.training_time'
     dataset = self.data_bin_1

     split_clf = SplitClassifier(
         clf=SameSignClassifier(),
         enable_ca=['stats', 'training_stats'],
         harvest_attribs=[harvest_key],
         descr="DESCR")
     split_clf.train(dataset)

     # Number of harvested items should be equal to number of chunks
     harvested_items = split_clf.ca.harvested[harvest_key]
     self.failUnlessEqual(len(harvested_items), len(dataset.UC))

     # `descr` must survive multiple inheritance and
     # ClassWithCollections.__init__
     self.failUnlessEqual(split_clf.descr, "DESCR")
示例#10
0
 def test_harvesting(self):
     """Check attribute harvesting performed by `SplitClassifier`.

     Trains a `SplitClassifier` over `SameSignClassifier` on
     ``self.data_bin_1`` while harvesting 'clf.ca.feature_ids' and
     'clf.ca.training_time', then verifies one harvested 'feature_ids'
     item per unique chunk and that `descr` was stored.
     """
     ids_key = 'clf.ca.feature_ids'
     dataset = self.data_bin_1

     split_clf = SplitClassifier(
         clf=SameSignClassifier(),
         splitter=NFoldSplitter(1),
         enable_ca=['confusion', 'training_confusion', 'feature_ids'],
         harvest_attribs=[ids_key, 'clf.ca.training_time'],
         descr="DESCR")
     split_clf.train(dataset)

     # Number of harvested items should be equal to number of chunks
     harvested_ids = split_clf.ca.harvested[ids_key]
     self.failUnlessEqual(len(harvested_ids), len(dataset.UC))

     # `descr` must survive multiple inheritance and
     # ClassWithCollections.__init__
     self.failUnlessEqual(split_clf.descr, "DESCR")
示例#11
0
 def testHarvesting(self):
     """Check attribute harvesting performed by `SplitClassifier`.

     Trains a `SplitClassifier` over `SameSignClassifier` on
     ``self.data_bin_1`` while harvesting 'clf.feature_ids' and
     'clf.training_time', then verifies one harvested 'feature_ids' item
     per unique chunk and that `descr` was stored.
     """
     ids_key = 'clf.feature_ids'
     dataset = self.data_bin_1

     split_clf = SplitClassifier(
         clf=SameSignClassifier(),
         splitter=NFoldSplitter(1),
         enable_states=['confusion', 'training_confusion', 'feature_ids'],
         harvest_attribs=[ids_key, 'clf.training_time'],
         descr="DESCR")
     split_clf.train(dataset)

     # Number of harvested items should be equal to number of chunks
     harvested_ids = split_clf.harvested[ids_key]
     self.failUnlessEqual(len(harvested_ids), len(dataset.uniquechunks))

     # `descr` must survive multiple inheritance and
     # ClassWithCollections.__init__
     self.failUnlessEqual(split_clf.descr, "DESCR")
示例#12
0
    def test_split_classifier(self):
        """Check `SplitClassifier` against `CrossValidatedTransferError`.

        A `SplitClassifier` over `SameSignClassifier` is trained on
        ``self.data_bin_1``; a clone of it is cross-validated.  Test and
        training errors of both routes must be equal, the confusion must
        report 100 percent correct, there must be one confusion set and one
        slave classifier per unique chunk, and predictions must equal the
        targets.
        """
        dataset = self.data_bin_1

        split_clf = SplitClassifier(
            clf=SameSignClassifier(),
            splitter=NFoldSplitter(1),
            enable_ca=['confusion', 'training_confusion', 'feature_ids'])
        split_clf.train(dataset)
        split_err = split_clf.ca.confusion.error
        split_train_err = split_clf.ca.training_confusion.error

        # Same evaluation, this time via explicit cross-validation
        cloned_clf = split_clf.clone()
        cross_val = CrossValidatedTransferError(
            TransferError(cloned_clf),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cv_err = cross_val(dataset).samples.squeeze()
        cv_train_err = cross_val.ca.training_confusion.error

        self.failUnlessEqual(
            split_err, cv_err,
            msg="We should get the same error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (split_err, cv_err))

        self.failUnlessEqual(
            split_train_err, cv_train_err,
            msg="We should get the same training error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (split_train_err, cv_train_err))

        self.failUnlessEqual(
            split_clf.ca.confusion.percent_correct, 100,
            msg="Dummy clf should train perfectly")
        self.failUnlessEqual(
            len(split_clf.ca.confusion.sets), len(dataset.UC),
            msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(split_clf.clfs), len(dataset.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.failUnlessEqual(
            split_clf.predict(dataset.samples), list(dataset.targets),
            msg="Should classify correctly")

        # feature_ids used to be checked here as a list of lists covering
        # all features (no feature selection is involved).
        #  NOT ANYMORE -- for BoostedClassifier feature_ids is now the
        #  union of all features used across slave classifiers.  If you
        #  need to get deeper -- use the harvesting facility ;-)
        # self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.failUnless(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Result is discarded -- just check that a summary is produced
        split_clf.summary()
示例#13
0
    def __testFSPipelineWithAnalyzerWithSplitClassifier(self, basic_clf):
        """Run RFE fed by the sensitivity analyzer of a `SplitClassifier`.

        Intended as "proper" RFE: sensitivities are aggregated across
        multiple splits (and, for multi-class, need to be aggregated
        somehow as well), while the transfer error should be the
        'leave-1-out' error of the split classifier itself.

        NOTE(review): `trans_error` is not assigned anywhere in this
        method, so it has to be provided by an enclosing scope -- confirm
        it exists before enabling this test.
        """
        # Constructed but not used any further in this method
        unused_multiclass = MulticlassClassifier(clf=basic_clf)

        split_clf = SplitClassifier(clf=basic_clf)
        split_sens = split_clf.getSensitivityAnalyzer(
            enable_states=["sensitivities"])

        recursive_fe = RFE(
            sensitivity_analyzer=split_sens,
            transfer_error=trans_error,
            feature_selector=FeatureSelectionPipeline(
                [FractionTailSelector(0.5),
                 FixedNElementTailSelector(1)]),
            train_clf=True)

        # Run RFE on the dataset; the outcome is not inspected further
        selected_features = recursive_fe(self.dataset)
    def __test_fspipeline_with_split_classifier(self, basic_clf):
        """Run RFE fed by the sensitivity analyzer of a `SplitClassifier`.

        Intended as "proper" RFE: sensitivities are aggregated across
        multiple splits (and, for multi-class, need to be aggregated
        somehow as well), while the transfer error should be the
        'leave-1-out' error of the split classifier itself.

        NOTE(review): `trans_error` is not assigned anywhere in this
        method, so it has to be provided by an enclosing scope -- confirm
        it exists before enabling this test.
        """
        # Constructed but not used any further in this method
        unused_multiclass = MulticlassClassifier(clf=basic_clf)

        split_clf = SplitClassifier(clf=basic_clf)
        split_sens = split_clf.get_sensitivity_analyzer(
            enable_ca=["sensitivities"])

        recursive_fe = RFE(
            sensitivity_analyzer=split_sens,
            transfer_error=trans_error,
            feature_selector=FeatureSelectionPipeline(
                [FractionTailSelector(0.5),
                 FixedNElementTailSelector(1)]),
            train_clf=True)

        # Run RFE on the dataset; the outcome is not inspected further
        selected_features = recursive_fe(self.dataset)
示例#15
0
文件: test_clf.py 项目: arokem/PyMVPA
    def test_split_classifier(self):
        """Verify that `SplitClassifier` mirrors explicit cross-validation.

        Trains a `SplitClassifier` around `SameSignClassifier` on
        ``self.data_bin_1`` and cross-validates a clone of it with
        `CrossValidatedTransferError`.  Errors (test and training) of both
        approaches have to be equal; in addition the confusion must show
        100 percent correct, one confusion set and one slave classifier
        must exist per unique chunk, and predictions must equal targets.
        """
        data = self.data_bin_1

        splitting_clf = SplitClassifier(
            clf=SameSignClassifier(),
            splitter=NFoldSplitter(1),
            enable_ca=['confusion', 'training_confusion', 'feature_ids'])
        splitting_clf.train(data)
        err_split = splitting_clf.ca.confusion.error
        err_split_train = splitting_clf.ca.training_confusion.error

        # Reference values obtained through explicit cross-validation
        clone_for_cv = splitting_clf.clone()
        cv_runner = CrossValidatedTransferError(
            TransferError(clone_for_cv),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        err_cv = cv_runner(data).samples.squeeze()
        err_cv_train = cv_runner.ca.training_confusion.error

        same_error_msg = (
            "We should get the same error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s"
            % (err_split, err_cv))
        self.failUnlessEqual(err_split, err_cv, msg=same_error_msg)

        same_training_error_msg = (
            "We should get the same training error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s"
            % (err_split_train, err_cv_train))
        self.failUnlessEqual(err_split_train, err_cv_train,
                             msg=same_training_error_msg)

        self.failUnlessEqual(splitting_clf.ca.confusion.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        self.failUnlessEqual(len(splitting_clf.ca.confusion.sets),
                             len(data.UC),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(splitting_clf.clfs),
            len(data.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.failUnlessEqual(splitting_clf.predict(data.samples),
                             list(data.targets),
                             msg="Should classify correctly")

        # feature_ids used to be checked here as a list of lists covering
        # all features (no feature selection is involved).
        #  NOT ANYMORE -- for BoostedClassifier feature_ids is now the
        #  union of all features used across slave classifiers.  If you
        #  need to get deeper -- use the harvesting facility ;-)
        # self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.failUnless(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Result is discarded -- just check that a summary is produced
        splitting_clf.summary()
示例#16
0
    def test_regressions(self, regr):
        """Run basic sanity checks of a regression on a simple dataset.

        `regr` is evaluated on ``datasets['chirp_linear']`` both through
        `CrossValidatedTransferError` and inside a `SplitClassifier` using
        `OddEvenSplitter`.  The test verifies that

        * `TransferError` uses `CorrErrorFx` as its error function when
          none is given,
        * the cross-validated correlation error is not NaN and equals the
          'CCe' statistic stored in the confusion,
        * every confusion object yields string representations longer than
          10 characters and consistent 'CCe' / 'Summary CCe' values,
        * (labile only) the errors stay below the 0.5 / 0.2 thresholds.
        """
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        # NOTE(review): this assigns to the object obtained from the
        # `datasets` mapping; if that mapping hands out shared instances the
        # change is visible to other tests -- confirm.
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidatedTransferError(
            TransferError(regr),
            splitter=NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['training_confusion', 'confusion'])
        # check the default
        self.failUnless(isinstance(cve.transerror.errorfx,
                                   CorrErrorFx))
        # `.item()` is what the deprecated (and since removed)
        # `np.asscalar` did internally; it yields a plain Python scalar
        corr = cve(ds).samples.item()

        # Our CorrErrorFx should never return NaN
        self.failUnless(not np.isnan(corr))
        self.failUnless(corr == cve.ca.confusion.stats['CCe'])

        splitregr = SplitClassifier(
            regr, splitter=OddEvenSplitter(),
            enable_ca=['training_confusion', 'confusion'])
        splitregr.train(ds)
        split_corr = splitregr.ca.confusion.stats['CCe']
        split_corr_tr = splitregr.ca.training_confusion.stats['CCe']

        # (confusion object, error value expected to be small)
        for confusion, error in (
            (cve.ca.confusion, corr),
            (splitregr.ca.confusion, split_corr),
            (splitregr.ca.training_confusion, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.failUnless(stats['CCe'] < 0.5)
                self.failUnlessEqual(stats['CCe'], stats['Summary CCe'])

            # Both the short and the long rendering must be non-trivial
            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.failUnless(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
示例#17
0
    def test_analyzer_with_split_classifier(self, clfds):
        """Test sensitivity analyzers obtained from a split classifier.

        `clfds` is a ``(classifier, dataset)`` tuple.  The classifier is
        wrapped into a `SplitClassifier` limited to 3 partitions, its
        sensitivity analyzer is run on the dataset, and the result is
        checked for

        * shape: one column per feature and an acceptable number of rows,
        * presence of 'splits' and 1-D 'targets' sample attributes whose
          unique labels match those of the dataset,
        * (labile) at least 70% correct in every confusion matrix,
        * (labile) selection of exactly the non-bogus features, both
          globally and per label (or pair of labels).
        """
        clf, ds = clfds             # unroll the tuple
        # We need to skip some LARSes here
        _sclf = str(clf)
        if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
            # ADD KnownToFail thingie from NiPy
            return

        # To avoid wasting too much time testing, limit to 3 splits
        nsplits = 3
        partitioner = NFoldPartitioner(count=nsplits)
        mclf = SplitClassifier(clf=clf,
                               partitioner=partitioner,
                               enable_ca=['training_stats',
                                              'stats'])
        sana = mclf.get_sensitivity_analyzer(# postproc=absolute_features(),
                                           enable_ca=["sensitivities"])

        ulabels = ds.uniquetargets
        nlabels = len(ulabels)
        # Can't rely on splitcfg since count-limit is done in __call__
        assert(nsplits == len(list(partitioner.generate(ds))))
        sens = sana(ds)

        # Collect every acceptable number of sensitivity samples.
        # It should return either ...
        #  nlabels * nsplits
        req_nsamples = [ nlabels * nsplits ]
        if nlabels == 2:
            # A single sensitivity in case of binary
            req_nsamples += [ nsplits ]
        else:
            # and for pairs in case of multiclass
            # NOTE(review): `/` yields a float under Python 3 true division;
            # the membership test below still compares equal to an int
            req_nsamples += [ (nlabels * (nlabels-1) / 2) * nsplits ]
            # and for 1-vs-1 embedded within Multiclass operating on
            # pairs (e.g. SMLR)
            req_nsamples += [req_nsamples[-1]*2]

            # Also for regression_based -- they can do multiclass
            # but only 1 sensitivity is provided
            if 'regression_based' in clf.__tags__:
                req_nsamples += [ nsplits ]

        # # of features should correspond
        self.failUnlessEqual(sens.shape[1], ds.nfeatures)
        # # of samples/sensitivities should also be reasonable
        self.failUnless(sens.shape[0] in req_nsamples)

        # Check if labels are present
        self.failUnless('splits' in sens.sa)
        self.failUnless('targets' in sens.sa)
        # should be 1D -- otherwise dtype object
        self.failUnless(sens.sa.targets.ndim == 1)

        sens_ulabels = sens.sa['targets'].unique
        # Some labels might be pairs(tuples) so ndarray would be of
        # dtype object and we would need to get them all
        if sens_ulabels.dtype is np.dtype('object'):
            # Concatenate all pairs into one flat list before np.unique.
            # NOTE(review): `reduce` is a builtin only on Python 2
            sens_ulabels = np.unique(
                reduce(lambda x,y: x+y, [list(x) for x in sens_ulabels]))

        assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

        # Collected but not used any further in this method
        errors = [x.percent_correct
                    for x in sana.clf.ca.stats.matrices]

        # lets go through all sensitivities and see if we selected the right
        # features
        #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
        if '5%' in clf.descr \
               or (nlabels > 2 and 'regression_based' in clf.__tags__):
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # if we get less than 2 features with non-zero sensitivities we
            # cannot really test
            # Also -- regression based classifiers performance for multiclass
            # is expected to suck in general
            return

        # Performance checks are only done when labile tests are enabled
        if cfg.getboolean('tests', 'labile', default='yes'):
            for conf_matrix in [sana.clf.ca.training_stats] \
                              + sana.clf.ca.stats.matrices:
                self.failUnless(
                    conf_matrix.percent_correct>=70,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct,
                     nlabels))


        # Since now we have sensitivities per split and possibly per label,
        # combine them with np.sum through an FxMapper on 'samples' using
        # uattrs=['targets']
        sensm = FxMapper('samples', lambda x: np.sum(x),
                         uattrs=['targets']).forward(sens)
        sensgm = maxofabs_sample().forward(sensm)    # global max of abs of means

        assert_equal(sensgm.shape[0], 1)
        assert_equal(sensgm.shape[1], ds.nfeatures)

        # The selector is configured with the number of bogus features
        selected = FixedNElementTailSelector(
            len(ds.a.bogus_features))(sensgm.samples[0])

        if cfg.getboolean('tests', 'labile', default='yes'):

            self.failUnlessEqual(
                set(selected), set(ds.a.nonbogus_features),
                msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s"
                % (selected, ds.a.nonbogus_features))

            # Now test each one per label
            # TODO: collect all failures and spit them out at once --
            #       that would make it easy to see if the sensitivity
            #       just has incorrect order of labels assigned
            for sens1 in sensm:
                labels1 = sens1.targets  # labels (1) for this sensitivity
                lndim = labels1.ndim
                label = labels1[0]      # current label

                # XXX whole lndim comparison should be gone after
                #     things get fixed and we arrive here with a tuple!
                if lndim == 1: # just a single label
                    self.failUnless(label in ulabels)

                    ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                    # should have just 1 feature for the label
                    self.failUnlessEqual(len(ilabel_all), 1)
                    ilabel = ilabel_all[0]

                    maxsensi = np.argmax(sens1) # index of max sensitivity
                    self.failUnlessEqual(maxsensi, ilabel,
                        "Maximal sensitivity for %s was found in %i whenever"
                        " original feature was %i for nonbogus features %s"
                        % (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
                elif lndim == 2 and labels1.shape[1] == 2: # pair of labels
                    # we should have highest (in abs) coefficients in
                    # those two labels
                    maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                    ilabel2 = [np.where(ds.fa.nonbogus_targets == l)[0][0]
                                    for l in label]
                    self.failUnlessEqual(
                        set(maxsensi2), set(ilabel2),
                        "Maximal sensitivity for %s was found in %s whenever"
                        " original features were %s for nonbogus features %s"
                        % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                    # The bare string below is a no-op statement used as a
                    # block comment for the sign checks that follow
                    """
                    # Now test for the sign of each one in pair ;) in
                    # all binary problems L1 (-1) -> L2(+1), then
                    # weights for L2 should be positive.  to test for
                    # L1 -- invert the sign
                    # We already know (if we haven't failed in previous test),
                    # that those 2 were the strongest -- so check only signs
                    """
                    self.failUnless(
                        sens1.samples[0, ilabel2[0]]<0,
                        "With %i classes in pair %s got feature %i for %r >= 0"
                        % (nlabels, label, ilabel2[0], label[0]))
                    self.failUnless(sens1.samples[0, ilabel2[1]]>0,
                        "With %i classes in pair %s got feature %i for %r <= 0"
                        % (nlabels, label, ilabel2[1], label[1]))
                else:
                    # yoh could be wrong at this assumption... time will show
                    self.fail("Got unknown number labels per sensitivity: %s."
                              " Should be either a single label or a pair"
                              % labels1)
    def test_analyzer_with_split_classifier(self, clfds):
        """Test sensitivity analyzers obtained from a SplitClassifier.

        The classifier is wrapped into a SplitClassifier limited to 3 splits
        and its sensitivity analyzer is run on the dataset.  The resulting
        sensitivities are then checked for a plausible shape, for the
        presence of 'splits' and 'targets' sample attributes and, if the
        'labile' tests are enabled in the configuration, for pointing at the
        dataset's non-bogus features.

        Parameters
        ----------
        clfds : tuple
          A (classifier, dataset) pair.
        """
        clf, ds = clfds  # unroll the tuple
        # We need to skip some LARSes here
        _sclf = str(clf)
        if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
            # ADD KnownToFail thingie from NiPy
            return

        # To avoid wasting too much time on testing, limit to 3 splits
        nsplits = 3
        splitter = NFoldSplitter(count=nsplits)
        mclf = SplitClassifier(clf=clf,
                               splitter=splitter,
                               enable_ca=['training_confusion', 'confusion'])
        sana = mclf.get_sensitivity_analyzer(  # postproc=absolute_features(),
            enable_ca=["sensitivities"])

        ulabels = ds.uniquetargets
        nlabels = len(ulabels)
        # Can't rely on splitcfg since count-limit is done in __call__
        assert (nsplits == len(list(splitter(ds))))
        sens = sana(ds)

        # Collect all acceptable numbers of sensitivities (samples in `sens`).
        # It should return either ...
        #  nlabels * nsplits
        req_nsamples = [nlabels * nsplits]
        if nlabels == 2:
            # A single sensitivity in case of binary
            req_nsamples += [nsplits]
        else:
            # and for pairs in case of multiclass
            # NOTE(review): with true division this entry is a float; the
            #     membership test below compares by equality, so an integral
            #     float still matches an int count
            req_nsamples += [(nlabels * (nlabels - 1) / 2) * nsplits]
            # and for 1-vs-1 embedded within Multiclass operating on
            # pairs (e.g. SMLR)
            req_nsamples += [req_nsamples[-1] * 2]

            # Also for regression_based -- they can do multiclass
            # but only 1 sensitivity is provided
            if 'regression_based' in clf.__tags__:
                req_nsamples += [nsplits]

        # # of features should correspond
        self.failUnlessEqual(sens.shape[1], ds.nfeatures)
        # # of samples/sensitivities should also be reasonable
        self.failUnless(sens.shape[0] in req_nsamples)

        # Check if labels are present
        self.failUnless('splits' in sens.sa)
        self.failUnless('targets' in sens.sa)
        # should be 1D -- otherwise dtype object
        self.failUnless(sens.sa.targets.ndim == 1)

        sens_ulabels = sens.sa['targets'].unique
        # Some labels might be pairs(tuples) so ndarray would be of
        # dtype object and we would need to get them all
        # NOTE(review): dtypes are compared by identity here; `==` would be
        #     the conventional comparison
        if sens_ulabels.dtype is np.dtype('object'):
            # flatten all pairs into a single list and take unique labels
            sens_ulabels = np.unique(
                reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

        assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

        # NOTE(review): `errors` is not used anywhere below in this method
        errors = [x.percent_correct for x in sana.clf.ca.confusion.matrices]

        # lets go through all sensitivities and see if we selected the right
        # features
        #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
        if '5%' in clf.descr \
               or (nlabels > 2 and 'regression_based' in clf.__tags__):
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # if we get less than 2 features with non-zero sensitivities we
            # cannot really test
            # Also -- regression based classifiers performance for multiclass
            # is expected to suck in general
            return

        # Accuracy checks are performed only if 'labile' tests are enabled
        if cfg.getboolean('tests', 'labile', default='yes'):
            for conf_matrix in [sana.clf.ca.training_confusion] \
                              + sana.clf.ca.confusion.matrices:
                self.failUnless(
                    conf_matrix.percent_correct>=70,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct,
                     nlabels))

        # Since now we have per split and possibly per label -- lets just
        # aggregate per each feature per label across splits
        # NOTE(review): the aggregation function is np.sum, not a mean;
        #     presumably applied per unique 'targets' value (uattrs) --
        #     confirm against FxMapper
        sensm = FxMapper('samples', lambda x: np.sum(x),
                         uattrs=['targets'])(sens)
        sensgm = maxofabs_sample()(sensm)  # global max of abs of means

        assert_equal(sensgm.shape[0], 1)
        assert_equal(sensgm.shape[1], ds.nfeatures)

        # select as many features as there are entries in ds.a.bogus_features
        selected = FixedNElementTailSelector(len(ds.a.bogus_features))(
            sensgm.samples[0])

        if cfg.getboolean('tests', 'labile', default='yes'):

            self.failUnlessEqual(
                set(selected),
                set(ds.a.nonbogus_features),
                msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s" %
                (selected, ds.a.nonbogus_features))

            # Now test each one per label
            # TODO: collect all failures and spit them out at once --
            #       that would make it easy to see if the sensitivity
            #       just has incorrect order of labels assigned
            for sens1 in sensm:
                labels1 = sens1.targets  # labels (1) for this sensitivity
                lndim = labels1.ndim
                label = labels1[0]  # current label

                # XXX whole lndim comparison should be gone after
                #     things get fixed and we arrive here with a tuple!
                if lndim == 1:  # just a single label
                    self.failUnless(label in ulabels)

                    # indices of features whose feature attribute 'targets'
                    # equals this label
                    ilabel_all = np.where(ds.fa.targets == label)[0]
                    # should have just 1 feature for the label
                    self.failUnlessEqual(len(ilabel_all), 1)
                    ilabel = ilabel_all[0]

                    maxsensi = np.argmax(sens1)  # index of max sensitivity
                    self.failUnlessEqual(
                        maxsensi, ilabel,
                        "Maximal sensitivity for %s was found in %i whenever"
                        " original feature was %i for nonbogus features %s" %
                        (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
                elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                    # we should have highest (in abs) coefficients in
                    # those two labels
                    maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                    # first matching feature index for each label of the pair
                    ilabel2 = [
                        np.where(ds.fa.targets == l)[0][0] for l in label
                    ]
                    self.failUnlessEqual(
                        set(maxsensi2), set(ilabel2),
                        "Maximal sensitivity for %s was found in %s whenever"
                        " original features were %s for nonbogus features %s" %
                        (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                    # NOTE: the string literal below is a no-op expression
                    #       statement serving as a block comment
                    """
                    # Now test for the sign of each one in pair ;) in
                    # all binary problems L1 (-1) -> L2(+1), then
                    # weights for L2 should be positive.  to test for
                    # L1 -- invert the sign
                    # We already know (if we haven't failed in previous test),
                    # that those 2 were the strongest -- so check only signs
                    """
                    # feature of the first label must carry a negative weight
                    self.failUnless(
                        sens1.samples[0, ilabel2[0]] < 0,
                        "With %i classes in pair %s got feature %i for %r >= 0"
                        % (nlabels, label, ilabel2[0], label[0]))
                    # feature of the second label must carry a positive weight
                    self.failUnless(
                        sens1.samples[0, ilabel2[1]] > 0,
                        "With %i classes in pair %s got feature %i for %r <= 0"
                        % (nlabels, label, ilabel2[1], label[1]))
                else:
                    # yoh could be wrong at this assumption... time will show
                    self.fail("Got unknown number labels per sensitivity: %s."
                              " Should be either a single label or a pair" %
                              labels1)
示例#19
0
    def test_regressions(self, regr):
        """Simple tests on regressions.

        Cross-validates `regr` on the 'chirp_linear' dataset, trains it
        within an odd/even SplitClassifier, and checks the 'CCe' statistic
        and the string summaries of every resulting confusion.

        Parameters
        ----------
        regr
          Regression under test; it is handed to TransferError and to
          SplitClassifier.
        """
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidatedTransferError(
            TransferError(regr),
            splitter=NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['training_confusion', 'confusion'])
        # check the default
        self.failUnless(isinstance(cve.transerror.errorfx, CorrErrorFx))
        # np.asscalar() is deprecated/removed in recent NumPy; it was a plain
        # alias for the .item() method used here
        corr = cve(ds).samples.item()

        # Our CorrErrorFx should never return NaN
        self.failUnless(not np.isnan(corr))
        self.failUnless(corr == cve.ca.confusion.stats['CCe'])

        splitregr = SplitClassifier(
            regr,
            splitter=OddEvenSplitter(),
            enable_ca=['training_confusion', 'confusion'])
        splitregr.train(ds)
        split_corr = splitregr.ca.confusion.stats['CCe']
        split_corr_tr = splitregr.ca.training_confusion.stats['CCe']

        # every confusion is paired with the error value computed for it above
        for confusion, error in (
            (cve.ca.confusion, corr),
            (splitregr.ca.confusion, split_corr),
            (splitregr.ca.training_confusion, split_corr_tr),
        ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.failUnless(stats['CCe'] < 0.5)
                self.failUnlessEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.failUnless(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(
                    error < 0.2,
                    msg="Regressions should perform well on a simple "
                    "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(confusion.stats['CCe'] < 0.5)

        # just to check if prediction works fine; the result is not inspected
        splitregr.predict(ds.samples)
示例#20
0
    def test_rfe(self, clf):
        """Exercise RFE with a few sensitivity/error-measure recipes.

        Each recipe pairs an RFE instance with a dataset.  The RFE is trained
        on and applied to that dataset, after which the conditional
        attributes stored in ``rfe.ca`` (errors, nfeatures, history) are
        checked for consistency with the resulting dataset.
        """
        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sensitivity = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        mismatch_node = BinaryFxNode(mean_mismatch_error, 'targets')
        proxy_error = ProxyMeasure(clf, postproc=mismatch_node)
        cv_error = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=mean_mismatch_error,
                                   postproc=mean_sample())

        split_clf = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes; each entry is an (rfe, dataset) pair
        recipes = []

        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        recipes.append(
            (RFE(sensitivity,
                 proxy_error,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()))

        # use cross-validation within training to get error for the stopping
        # point but use full training data to derive sensitivity
        recipes.append(
            (RFE(sensitivity,
                 cv_error,
                 # give the same full dataset to sensitivity and cv_error
                 Repeater(2),
                 fselector=FractionTailSelector(
                     0.70, mode='select', tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)))

        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        normed_mean_abs = ChainMapper([FxMapper('features', l2_normed),
                                       FxMapper('samples', np.mean),
                                       FxMapper('samples', np.abs)])
        recipes.append(
            (RFE(split_clf.get_sensitivity_analyzer(postproc=normed_mean_abs),
                 ConfusionBasedError(split_clf, confusion_state='stats'),
                 # we will use the same full cv-training dataset
                 Repeater(2),
                 fselector=FractionTailSelector(
                     0.50, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 # we just extract it from existing confusion
                 train_pmeasure=False,
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)))

        for rfe, data in recipes:
            # remember the original size to detect in-place modification
            nfeatures_before = data.nfeatures

            rfe.train(data)
            reduced = rfe(data)

            # fail if orig datasets are changed
            self.failUnless(data.nfeatures == nfeatures_before)

            # check that the features set with the least error is selected
            if not len(rfe.ca.errors):
                # no elimination step was recorded -- nothing may be removed
                self.failUnless(reduced.nfeatures == nfeatures_before)
            else:
                errors = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    expected = nfeatures_before - errors.argmin()
                    self.failUnless(reduced.nfeatures == expected)
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    best = np.argmin(errors)
                    self.failUnless(1 < best < len(errors) - 1)

            # silly check if nfeatures is in decreasing order
            descending = np.sort(np.array(rfe.ca.nfeatures))[::-1]
            self.failUnless((descending == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            nsteps = len(np.array(rfe.ca.errors))
            self.failUnless(set(rfe.ca.history) == set(range(nsteps)))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            last_step = max(rfe.ca.history)
            survivors = np.where(rfe.ca.history == last_step)[0]
            self.failUnless(rfe.ca.nfeatures[-1] == len(survivors))
示例#21
0
               linearSVMC.get_sensitivity_analyzer(postproc=maxofabs_sample()),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(SVM)")

    ### Imports which are specific to RFEs
    # from mvpa.datasets.splitters import OddEvenSplitter
    # from mvpa.clfs.transerror import TransferError
    # from mvpa.featsel.rfe import RFE
    # from mvpa.featsel.helpers import FixedErrorThresholdStopCrit
    # from mvpa.clfs.transerror import ConfusionBasedError

    # SVM with unbiased RFE -- transfer-error to another splits, or in
    # other terms leave-1-out error on the same dataset
    # Has to be bound outside of the RFE definition since both analyzer and
    # error should use the same instance.
    rfesvm_split = SplitClassifier(linearSVMC)  #clfswh['LinearSVMC'][0])

    # "Almost" classical RFE. If this works it would differ only that
    # our transfer_error is based on internal splitting and classifier used
    # within RFE is a split classifier and its sensitivities per split will get
    # averaged
    #

    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(), #clfswh['LinearSVMC'][0],         # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(),
    #        transfer_error=ConfusionBasedError(
    #           rfesvm_split,