Example #1
    def test_sensitivity_based_feature_selection(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

        # number of features to remove
        Nremove = 2

        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        fe = SensitivityBasedFeatureSelection(
            sens_ana,
            feature_selector=FixedNElementTailSelector(Nremove),
            enable_ca=["sensitivity", "selected_ids"])

        data = self.get_data()

        data_nfeatures = data.nfeatures

        fe.train(data)
        resds = fe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # silly check that exactly Nremove features were removed
        self.assertEqual(data.nfeatures,
                         resds.nfeatures + Nremove,
                         msg="Expected exactly %d features to be removed"
                             % Nremove)

        self.assertEqual(
            fe.ca.sensitivity.nfeatures,
            data_nfeatures,
            msg="Sensitivity have to have # of features equal to original")
Example #2
 def __repr__(self, prefixes=[]):
     return super(SplitRFE, self).__repr__(
         prefixes=prefixes
         + _repr_attrs(self, ['lrn', 'partitioner'])
         + _repr_attrs(self, ['errorfx'], default=mean_mismatch_error)
         + _repr_attrs(self, ['analyzer_postproc'], default=maxofabs_sample())
         )
Example #3
    def test_sensitivity_based_feature_selection(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

        # number of features to remove
        Nremove = 2

        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        fe = SensitivityBasedFeatureSelection(sens_ana,
                feature_selector=FixedNElementTailSelector(Nremove),
                enable_ca=["sensitivity", "selected_ids"])

        data = self.get_data()

        data_nfeatures = data.nfeatures

        fe.train(data)
        resds = fe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # silly check that exactly Nremove features were removed
        self.assertEqual(data.nfeatures, resds.nfeatures + Nremove,
            msg="Expected exactly %d features to be removed" % Nremove)

        self.assertEqual(fe.ca.sensitivity.nfeatures, data_nfeatures,
            msg="Sensitivity have to have # of features equal to original")
Example #4
File: rfe.py  Project: velvetcheng/nidata
    def __init__(
            self,
            lrn,
            partitioner,
            fselector,
            errorfx=mean_mismatch_error,
            analyzer_postproc=maxofabs_sample(),
            # callback?
            **kwargs):
        """
        Parameters
        ----------
        lrn : Learner
          Learner with a sensitivity analyzer which will be used both
          for the sensitivity analysis and transfer error estimation
        partitioner : Partitioner
          Used to generate cross-validation partitions to deduce the optimal
          number of features to maintain
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        errorfx : func, optional
          Functor to use for estimation of cross-validation error
        analyzer_postproc : func, optional
          Function to provide to the sensitivity analyzer as postproc
        """
        # Initialize itself, preparing for the 2nd invocation
        # with the determined nfeatures_min
        fmeasure = lrn.get_sensitivity_analyzer(postproc=analyzer_postproc)

        RFE.__init__(self,
                     fmeasure,
                     None,
                     Repeater(2),
                     fselector=fselector,
                     bestdetector=None,
                     train_pmeasure=False,
                     stopping_criterion=None,
                     **kwargs)
        self._lrn = lrn  # should not be modified, thus _
        self.partitioner = partitioner
        self.errorfx = errorfx
        self.analyzer_postproc = analyzer_postproc
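
For orientation, a hedged sketch of constructing a SplitRFE with this signature (classifier and selector choices are illustrative, not taken from the nidata project); analyzer_postproc is left at its maxofabs_sample() default:

from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.featsel.helpers import FractionTailSelector
from mvpa2.featsel.rfe import SplitRFE

# the default analyzer_postproc=maxofabs_sample() collapses the SVM weights
# to one max-of-absolute-values sensitivity per feature
rfe = SplitRFE(LinearCSVMC(),
               NFoldPartitioner(),
               fselector=FractionTailSelector(0.5, mode='select', tail='upper'))

Per the docstring above, training such an instance uses the partitioner's cross-validation to deduce how many features to keep before the final selection.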
Example #5
File: rfe.py  Project: Arthurkorn/PyMVPA
    def __init__(self, lrn, partitioner,
                 fselector,
                 errorfx=mean_mismatch_error,
                 analyzer_postproc=maxofabs_sample(),
                 # callback?
                 **kwargs):
        """
        Parameters
        ----------
        lrn : Learner
          Learner with a sensitivity analyzer which will be used both
          for the sensitivity analysis and transfer error estimation
        partitioner : Partitioner
          Used to generate cross-validation partitions to deduce the optimal
          number of features to maintain
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        errorfx : func, optional
          Functor to use for estimation of cross-validation error
        analyzer_postproc : func, optional
          Function to provide to the sensitivity analyzer as postproc
        """
        # Initialize itself, preparing for the 2nd invocation
        # with the determined nfeatures_min
        fmeasure = lrn.get_sensitivity_analyzer(postproc=analyzer_postproc)

        RFE.__init__(self,
                     fmeasure,
                     None,
                     Repeater(2),
                     fselector=fselector,
                     bestdetector=None,
                     train_pmeasure=False,
                     stopping_criterion=None,
                     **kwargs)
        self._lrn = lrn                   # should not be modified, thus _
        self.partitioner = partitioner
        self.errorfx = errorfx
        self.analyzer_postproc = analyzer_postproc
Example #6
File: rfe.py  Project: Anhmike/PyMVPA
    def __init__(self, lrn, partitioner,
                 fselector,
                 errorfx=mean_mismatch_error,
                 fmeasure_postproc=None,
                 fmeasure=None,
                 nproc=1,
                 # callback?
                 **kwargs):
        """
        Parameters
        ----------
        lrn : Learner
          Learner with a sensitivity analyzer which will be used both
          for the sensitivity analysis and transfer error estimation
        partitioner : Partitioner
          Used to generate cross-validation partitions to deduce the optimal
          number of features to maintain
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        errorfx : func, optional
          Functor to use for estimation of cross-validation error
        fmeasure_postproc : func, optional
          Function to provide to the sensitivity analyzer as postproc.  If no
          fmeasure is provided and the classifier's sensitivity is used,
          maxofabs_sample() is used for this postproc unless another value
          is provided
        fmeasure : Function, optional
          Featurewise measure.  If None, lrn's sensitivity analyzer
          will be used.
        """
        # Initialize itself, preparing for the 2nd invocation
        # with the determined nfeatures_min
        # TODO:  move this into _train since better not to assign anything here
        # to avoid possible problems with copies needing to deal with the same
        # lrn... but then we might like again to reconsider delegation instead
        # of subclassing here....
        if fmeasure is None:
            if __debug__:
                debug('RFE', 'fmeasure was not provided, will be using the '
                             'sensitivity analyzer for %s' % lrn)
            fmeasure = lrn.get_sensitivity_analyzer(
                postproc=fmeasure_postproc if fmeasure_postproc is not None
                                           else maxofabs_sample())
            train_pmeasure = False
        else:
            assert fmeasure_postproc is None, "There should be no explicit " \
                    "fmeasure_postproc when fmeasure is specified"
            # if user provided explicit value -- use it! otherwise, we do want
            # to train an arbitrary fmeasure
            train_pmeasure = kwargs.pop('train_pmeasure', True)

        RFE.__init__(self,
                     fmeasure,
                     None,
                     Repeater(2),
                     fselector=fselector,
                     bestdetector=None,
                     train_pmeasure=train_pmeasure,
                     stopping_criterion=None,
                     **kwargs)
        self._lrn = lrn                   # should not be modified, thus _
        self.partitioner = partitioner
        self.errorfx = errorfx
        self.fmeasure_postproc = fmeasure_postproc
        self.nproc = nproc
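
A hedged sketch of the explicit-fmeasure path in this newer signature (parameter choices are illustrative, not from the Anhmike/PyMVPA project): providing a featurewise measure such as OneWayAnova() bypasses the classifier's sensitivity analyzer, and fmeasure_postproc must then stay None, as the assertion in the constructor above requires.

from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.featsel.helpers import FractionTailSelector
from mvpa2.featsel.rfe import SplitRFE
from mvpa2.measures.anova import OneWayAnova

# explicit fmeasure: ANOVA F-scores instead of SVM sensitivities;
# fmeasure_postproc stays at its default of None
rfe_anova = SplitRFE(LinearCSVMC(),
                     NFoldPartitioner(),
                     fselector=FractionTailSelector(0.5, mode='select',
                                                    tail='upper'),
                     fmeasure=OneWayAnova())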
Example #7
    def test_analyzer_with_split_classifier(self, clfds):
        """Test analyzers in split classifier
        """
        clf, ds = clfds  # unroll the tuple
        # We need to skip some LARSes here
        _sclf = str(clf)
        if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
            # ADD KnownToFail thingie from NiPy
            return

        # To not waste too much time testing, let's limit to 3 splits
        nsplits = 3
        partitioner = NFoldPartitioner(count=nsplits)
        mclf = SplitClassifier(clf=clf,
                               partitioner=partitioner,
                               enable_ca=['training_stats', 'stats'])
        sana = mclf.get_sensitivity_analyzer(  # postproc=absolute_features(),
            pass_attr=['fa.nonbogus_targets'],
            enable_ca=["sensitivities"])

        ulabels = ds.uniquetargets
        nlabels = len(ulabels)
        # Can't rely on splitcfg since count-limit is done in __call__
        assert (nsplits == len(list(partitioner.generate(ds))))
        sens = sana(ds)
        assert ('nonbogus_targets' in sens.fa)  # were they passed?
        # TODO: those few do not expose biases
        if not len(set(clf.__tags__).intersection(('lars', 'glmnet', 'gpr'))):
            assert ('biases' in sens.sa)
            # print sens.sa.biases
        # It should return either ...
        #  nlabels * nsplits
        req_nsamples = [nlabels * nsplits]
        if nlabels == 2:
            # A single sensitivity in case of binary
            req_nsamples += [nsplits]
        else:
            # and for pairs in case of multiclass
            req_nsamples += [(nlabels * (nlabels - 1) / 2) * nsplits]
            # and for 1-vs-1 embedded within Multiclass operating on
            # pairs (e.g. SMLR)
            req_nsamples += [req_nsamples[-1] * 2]

            # Also for regression_based -- they can do multiclass
            # but only 1 sensitivity is provided
            if 'regression_based' in clf.__tags__:
                req_nsamples += [nsplits]

        # # of features should correspond
        self.assertEqual(sens.shape[1], ds.nfeatures)
        # # of samples/sensitivities should also be reasonable
        self.assertTrue(sens.shape[0] in req_nsamples)

        # Check if labels are present
        self.assertTrue('splits' in sens.sa)
        self.assertTrue('targets' in sens.sa)
        # should be 1D -- otherwise dtype object
        self.assertTrue(sens.sa.targets.ndim == 1)

        sens_ulabels = sens.sa['targets'].unique
        # Some labels might be pairs(tuples) so ndarray would be of
        # dtype object and we would need to get them all
        if sens_ulabels.dtype is np.dtype('object'):
            sens_ulabels = np.unique(
                reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

        assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

        errors = [x.percent_correct for x in sana.clf.ca.stats.matrices]

        # let's go through all sensitivities and see if we selected the right
        # features
        #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
        if '5%' in clf.descr \
               or (nlabels > 2 and 'regression_based' in clf.__tags__):
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # if we get fewer than 2 features with non-zero sensitivities we
            # cannot really test
            # Also -- regression based classifiers' performance for multiclass
            # is expected to suck in general
            return

        if cfg.getboolean('tests', 'labile', default='yes'):
            for conf_matrix in [sana.clf.ca.training_stats] \
                              + sana.clf.ca.stats.matrices:
                self.assertTrue(
                    conf_matrix.percent_correct >= 70,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct,
                     nlabels))

        # Since now we have per-split and possibly per-label sensitivities --
        # let's sum each feature per label across splits
        sensm = FxMapper('samples', lambda x: np.sum(x),
                         uattrs=['targets']).forward(sens)
        sensgm = maxofabs_sample().forward(sensm)  # global max of abs across the per-label maps

        assert_equal(sensgm.shape[0], 1)
        assert_equal(sensgm.shape[1], ds.nfeatures)

        selected = FixedNElementTailSelector(len(ds.a.bogus_features))(
            sensgm.samples[0])

        if cfg.getboolean('tests', 'labile', default='yes'):

            self.assertEqual(
                set(selected),
                set(ds.a.nonbogus_features),
                msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s" %
                (selected, ds.a.nonbogus_features))

            # Now test each one per label
            # TODO: collect all failures and spit them out at once --
            #       that would make it easy to see if the sensitivity
            #       just has incorrect order of labels assigned
            for sens1 in sensm:
                labels1 = sens1.targets  # labels (1) for this sensitivity
                lndim = labels1.ndim
                label = labels1[0]  # current label

                # XXX whole lndim comparison should be gone after
                #     things get fixed and we arrive here with a tuple!
                if lndim == 1:  # just a single label
                    self.assertTrue(label in ulabels)

                    ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                    # should have just 1 feature for the label
                    self.assertEqual(len(ilabel_all), 1)
                    ilabel = ilabel_all[0]

                    maxsensi = np.argmax(sens1)  # index of max sensitivity
                    self.assertEqual(
                        maxsensi, ilabel,
                        "Maximal sensitivity for %s was found in %i whenever"
                        " original feature was %i for nonbogus features %s" %
                        (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
                elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                    # we should have highest (in abs) coefficients in
                    # those two labels
                    maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                    ilabel2 = [
                        np.where(ds.fa.nonbogus_targets == l)[0][0]
                        for l in label
                    ]
                    self.assertEqual(
                        set(maxsensi2), set(ilabel2),
                        "Maximal sensitivity for %s was found in %s whenever"
                        " original features were %s for nonbogus features %s" %
                        (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                    """
                    # Now test for the sign of each one in pair ;) in
                    # all binary problems L1 (-1) -> L2(+1), then
                    # weights for L2 should be positive.  to test for
                    # L1 -- invert the sign
                    # We already know (if we haven't failed in previous test),
                    # that those 2 were the strongest -- so check only signs
                    """
                    self.assertTrue(
                        sens1.samples[0, ilabel2[0]] < 0,
                        "With %i classes in pair %s got feature %i for %r >= 0"
                        % (nlabels, label, ilabel2[0], label[0]))
                    self.assertTrue(
                        sens1.samples[0, ilabel2[1]] > 0,
                        "With %i classes in pair %s got feature %i for %r <= 0"
                        % (nlabels, label, ilabel2[1], label[1]))
                else:
                    # yoh could be wrong at this assumption... time will show
                    self.fail("Got unknown number labels per sensitivity: %s."
                              " Should be either a single label or a pair" %
                              labels1)
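
As a hedged, self-contained illustration of the pattern this test exercises (classifier and dataset parameters are invented for the sketch, not taken from the test suite): a SplitClassifier's sensitivity analyzer returns one sensitivity map per split, with 'splits' and 'targets' attached as sample attributes.

from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.clfs.meta import SplitClassifier
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=10, nlabels=2, nchunks=3,
                            nfeatures=20, nonbogus_features=[0, 1], snr=3.0)
mclf = SplitClassifier(clf=LinearCSVMC(),
                       partitioner=NFoldPartitioner(count=3))
# calling the analyzer trains the split classifier and collects one
# sensitivity per split; for this binary problem sens.shape should be (3, 20)
sens = mclf.get_sensitivity_analyzer()(ds)
print(sens.shape, sens.sa.splits)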
Example #8
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        # computing the sensitivities also trains the classifier, so we can ask
        # it about the prediction error next
        senses.append(sensanasvm(split[0]))
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]), split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
Example #9
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore a few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    # give the same full dataset to sens_ana and cvmeasure
                    Repeater(2),
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    # we will use the same full cv-training dataset
                    Repeater(2),
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    # we just extract it from the existing confusion
                    train_pmeasure=False,
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the feature set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
Example #10
    def __init__(
            self,
            lrn,
            partitioner,
            fselector,
            errorfx=mean_mismatch_error,
            fmeasure_postproc=None,
            fmeasure=None,
            nproc=1,
            # callback?
            **kwargs):
        """
        Parameters
        ----------
        lrn : Learner
          Learner with a sensitivity analyzer which will be used both
          for the sensitivity analysis and transfer error estimation
        partitioner : Partitioner
          Used to generate cross-validation partitions to deduce the optimal
          number of features to maintain
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        errorfx : func, optional
          Functor to use for estimation of cross-validation error
        fmeasure_postproc : func, optional
          Function to provide to the sensitivity analyzer as postproc.  If no
          fmeasure is provided and the classifier's sensitivity is used,
          maxofabs_sample() is used for this postproc unless another value
          is provided
        fmeasure : Function, optional
          Featurewise measure.  If None, lrn's sensitivity analyzer
          will be used.
        """
        # Initialize itself, preparing for the 2nd invocation
        # with the determined nfeatures_min
        # TODO:  move this into _train since better not to assign anything here
        # to avoid possible problems with copies needing to deal with the same
        # lrn... but then we might like again to reconsider delegation instead
        # of subclassing here....
        if fmeasure is None:
            if __debug__:
                debug(
                    'RFE', 'fmeasure was not provided, will be using the '
                    'sensitivity analyzer for %s' % lrn)
            fmeasure = lrn.get_sensitivity_analyzer(
                postproc=fmeasure_postproc
                if fmeasure_postproc is not None else maxofabs_sample())
            train_pmeasure = False
        else:
            assert fmeasure_postproc is None, "There should be no explicit " \
                    "fmeasure_postproc when fmeasure is specified"
            # if user provided explicit value -- use it! otherwise, we do want
            # to train an arbitrary fmeasure
            train_pmeasure = kwargs.pop('train_pmeasure', True)

        RFE.__init__(self,
                     fmeasure,
                     None,
                     Repeater(2),
                     fselector=fselector,
                     bestdetector=None,
                     train_pmeasure=train_pmeasure,
                     stopping_criterion=None,
                     **kwargs)
        self._lrn = lrn  # should not be modified, thus _
        self.partitioner = partitioner
        self.errorfx = errorfx
        self.fmeasure_postproc = fmeasure_postproc
        self.nproc = nproc
Example #11
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'))
        cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore a few recipes
        for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                pmeasure,
                Splitter('train'),
                fselector=FixedNElementTailSelector(1),
                train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(
                     0.70,
                     mode='select', tail='upper'),
                train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([ FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),             #  we will use the same full cv-training dataset
                 fselector=FractionTailSelector(
                     0.50,
                     mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,    # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the feature set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less( imin, len(e) )
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue( 1 < imin < len(e) - 1 )
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue( (nfeatures[::-1] == rfe.ca.nfeatures).all() )

            # check if history has elements for every step
            self.assertTrue(set(rfe.ca.history)
                            == set(range(len(np.array(rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1]
                            == len(np.where(rfe.ca.history
                                           ==max(rfe.ca.history))[0]))
Example #12
    def test_analyzer_with_split_classifier(self, clfds):
        """Test analyzers in split classifier
        """
        clf, ds = clfds             # unroll the tuple
        # We need to skip some LARSes here
        _sclf = str(clf)
        if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
            # ADD KnownToFail thingie from NiPy
            return

        # To not waste too much time testing, let's limit to 3 splits
        nsplits = 3
        partitioner = NFoldPartitioner(count=nsplits)
        mclf = SplitClassifier(clf=clf,
                               partitioner=partitioner,
                               enable_ca=['training_stats',
                                              'stats'])
        sana = mclf.get_sensitivity_analyzer(# postproc=absolute_features(),
                                           pass_attr=['fa.nonbogus_targets'],
                                           enable_ca=["sensitivities"])

        ulabels = ds.uniquetargets
        nlabels = len(ulabels)
        # Can't rely on splitcfg since count-limit is done in __call__
        assert(nsplits == len(list(partitioner.generate(ds))))
        sens = sana(ds)
        assert('nonbogus_targets' in sens.fa) # were they passed?
        # TODO: those few do not expose biases
        if not len(set(clf.__tags__).intersection(('lars', 'glmnet', 'gpr'))):
            assert('biases' in sens.sa)
            # print sens.sa.biases
        # It should return either ...
        #  nlabels * nsplits
        req_nsamples = [ nlabels * nsplits ]
        if nlabels == 2:
            # A single sensitivity in case of binary
            req_nsamples += [ nsplits ]
        else:
            # and for pairs in case of multiclass
            req_nsamples += [ (nlabels * (nlabels - 1) / 2) * nsplits ]
            # and for 1-vs-1 embedded within Multiclass operating on
            # pairs (e.g. SMLR)
            req_nsamples += [req_nsamples[-1] * 2]

            # Also for regression_based -- they can do multiclass
            # but only 1 sensitivity is provided
            if 'regression_based' in clf.__tags__:
                req_nsamples += [ nsplits ]

        # # of features should correspond
        self.assertEqual(sens.shape[1], ds.nfeatures)
        # # of samples/sensitivities should also be reasonable
        self.assertTrue(sens.shape[0] in req_nsamples)

        # Check if labels are present
        self.assertTrue('splits' in sens.sa)
        self.assertTrue('targets' in sens.sa)
        # should be 1D -- otherwise dtype object
        self.assertTrue(sens.sa.targets.ndim == 1)

        sens_ulabels = sens.sa['targets'].unique
        # Some labels might be pairs(tuples) so ndarray would be of
        # dtype object and we would need to get them all
        if sens_ulabels.dtype is np.dtype('object'):
            sens_ulabels = np.unique(
                reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

        assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

        errors = [x.percent_correct
                    for x in sana.clf.ca.stats.matrices]

        # let's go through all sensitivities and see if we selected the right
        # features
        #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
        if '5%' in clf.descr \
               or (nlabels > 2 and 'regression_based' in clf.__tags__):
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # if we get fewer than 2 features with non-zero sensitivities we
            # cannot really test
            # Also -- regression based classifiers' performance for multiclass
            # is expected to suck in general
            return

        if cfg.getboolean('tests', 'labile', default='yes'):
            for conf_matrix in [sana.clf.ca.training_stats] \
                              + sana.clf.ca.stats.matrices:
                self.assertTrue(
                    conf_matrix.percent_correct >= 70,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct,
                     nlabels))


        # Since now we have per-split and possibly per-label sensitivities --
        # let's sum each feature per label across splits
        sensm = FxMapper('samples', lambda x: np.sum(x),
                         uattrs=['targets']).forward(sens)
        sensgm = maxofabs_sample().forward(sensm)    # global max of abs across the per-label maps

        assert_equal(sensgm.shape[0], 1)
        assert_equal(sensgm.shape[1], ds.nfeatures)

        selected = FixedNElementTailSelector(
            len(ds.a.bogus_features))(sensgm.samples[0])

        if cfg.getboolean('tests', 'labile', default='yes'):

            self.assertEqual(
                set(selected), set(ds.a.nonbogus_features),
                msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s"
                % (selected, ds.a.nonbogus_features))

            # Now test each one per label
            # TODO: collect all failures and spit them out at once --
            #       that would make it easy to see if the sensitivity
            #       just has incorrect order of labels assigned
            for sens1 in sensm:
                labels1 = sens1.targets  # labels (1) for this sensitivity
                lndim = labels1.ndim
                label = labels1[0]      # current label

                # XXX whole lndim comparison should be gone after
                #     things get fixed and we arrive here with a tuple!
                if lndim == 1: # just a single label
                    self.assertTrue(label in ulabels)

                    ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                    # should have just 1 feature for the label
                    self.assertEqual(len(ilabel_all), 1)
                    ilabel = ilabel_all[0]

                    maxsensi = np.argmax(sens1) # index of max sensitivity
                    self.assertEqual(maxsensi, ilabel,
                        "Maximal sensitivity for %s was found in %i whenever"
                        " original feature was %i for nonbogus features %s"
                        % (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
                elif lndim == 2 and labels1.shape[1] == 2: # pair of labels
                    # we should have highest (in abs) coefficients in
                    # those two labels
                    maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                    ilabel2 = [np.where(ds.fa.nonbogus_targets == l)[0][0]
                                    for l in label]
                    self.assertEqual(
                        set(maxsensi2), set(ilabel2),
                        "Maximal sensitivity for %s was found in %s whenever"
                        " original features were %s for nonbogus features %s"
                        % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                    """
                    # Now test for the sign of each one in pair ;) in
                    # all binary problems L1 (-1) -> L2(+1), then
                    # weights for L2 should be positive.  to test for
                    # L1 -- invert the sign
                    # We already know (if we haven't failed in previous test),
                    # that those 2 were the strongest -- so check only signs
                    """
                    self.assertTrue(
                        sens1.samples[0, ilabel2[0]] < 0,
                        "With %i classes in pair %s got feature %i for %r >= 0"
                        % (nlabels, label, ilabel2[0], label[0]))
                    self.assertTrue(sens1.samples[0, ilabel2[1]] > 0,
                        "With %i classes in pair %s got feature %i for %r <= 0"
                        % (nlabels, label, ilabel2[1], label[1]))
                else:
                    # yoh could be wrong at this assumption... time will show
                    self.fail("Got unknown number labels per sensitivity: %s."
                              " Should be either a single label or a pair"
                              % labels1)
Example #13
File: rfe.py  Project: mfalkiewicz/PyMVPA
 def __repr__(self, prefixes=[]):
     return super(SplitRFE, self).__repr__(
         prefixes=prefixes + _repr_attrs(self, ['lrn', 'partitioner']) +
         _repr_attrs(self, ['errorfx'], default=mean_mismatch_error) +
         _repr_attrs(self, ['analyzer_postproc'], default=maxofabs_sample())
     )
Example #14
                                           descr='skl.LassoLarsIC()')
        regrswh += [_lasso_lars_ic]
        clfswh += [
            RegressionAsClassifier(_lasso_lars_ic, descr='skl.LassoLarsIC_C()')
        ]

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")
Example #15
                                           tags=_lars_tags,
                                           descr='skl.LassoLarsIC()')
        regrswh += [_lasso_lars_ic]
        clfswh += [RegressionAsClassifier(_lasso_lars_ic,
                                          descr='skl.LassoLarsIC_C()')]

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")
Example #16
        _lasso_lars = SKLLearnerAdapter(sklLassoLARS(), tags=_lars_tags, descr="skl.LassoLARS()")

        regrswh += [_lars, _lasso_lars]
        clfswh += [
            RegressionAsClassifier(_lars, descr="skl.LARS_C()"),
            RegressionAsClassifier(_lasso_lars, descr="skl.LassoLARS_C()"),
        ]

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting="majority", descr="kNN(k=5, voting='majority')")

clfswh += FeatureSelectionClassifier(
    kNN(),
    SensitivityBasedFeatureSelection(
        SMLRWeights(SMLR(lm=1.0, implementation="C"), postproc=maxofabs_sample()), RangeElementSelector(mode="select")
    ),
    descr="kNN on SMLR(lm=1) non-0",
)

clfswh += FeatureSelectionClassifier(
    kNN(),
    SensitivityBasedFeatureSelection(OneWayAnova(), FractionTailSelector(0.05, mode="select", tail="upper")),
    descr="kNN on 5%(ANOVA)",
)

clfswh += FeatureSelectionClassifier(
    kNN(),
    SensitivityBasedFeatureSelection(OneWayAnova(), FixedNElementTailSelector(50, mode="select", tail="upper")),
    descr="kNN on 50(ANOVA)",
)
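
To round off, a hedged sketch (dataset parameters are illustrative) of using one of the sensitivity-based feature selection classifiers defined above outside the warehouse; it behaves like a plain classifier, with the ANOVA-based selection applied internally during training:

from mvpa2.clfs.knn import kNN
from mvpa2.clfs.meta import FeatureSelectionClassifier
from mvpa2.featsel.base import SensitivityBasedFeatureSelection
from mvpa2.featsel.helpers import FractionTailSelector
from mvpa2.measures.anova import OneWayAnova
from mvpa2.misc.data_generators import normal_feature_dataset

fsclf = FeatureSelectionClassifier(
    kNN(),
    SensitivityBasedFeatureSelection(
        OneWayAnova(),
        FractionTailSelector(0.05, mode='select', tail='upper')),
    descr="kNN on 5%(ANOVA)")

ds = normal_feature_dataset(perlabel=10, nchunks=2, nfeatures=40,
                            nonbogus_features=[0, 1], snr=2.0)
fsclf.train(ds)                      # feature selection happens during training
predictions = fsclf.predict(ds.samples)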