示例#1
0
    def test_feature_selection_pipeline(self):
        sens_ana = SillySensitivityAnalyzer()

        data = self.get_data()
        data_nfeatures = data.nfeatures

        # test silly one first ;-)
        self.assertEqual(
            sens_ana(data).samples[0, 0], -int(data_nfeatures / 2))

        # OLD: first remove 25% == 6, and then 4, total removing 10
        # NOW: test should be independent of the numerical number of features
        feature_selections = [
            SensitivityBasedFeatureSelection(sens_ana,
                                             FractionTailSelector(0.25)),
            SensitivityBasedFeatureSelection(sens_ana,
                                             FixedNElementTailSelector(4))
        ]

        # create a FeatureSelection pipeline
        feat_sel_pipeline = ChainMapper(feature_selections)

        feat_sel_pipeline.train(data)
        resds = feat_sel_pipeline(data)

        self.assertEqual(len(feat_sel_pipeline),
                         len(feature_selections),
                         msg="Test the property feature_selections")

        desired_nfeatures = int(np.ceil(data_nfeatures * 0.75))
        self.assertEqual([fe._oshape[0] for fe in feat_sel_pipeline],
                         [desired_nfeatures, desired_nfeatures - 4])
示例#2
0
    def __init__(self,
                 sensitivity_analyzer,
                 feature_selector=FractionTailSelector(0.05),
                 train_analyzer=True,
                 **kwargs
                 ):
        """Initialize feature selection

        Parameters
        ----------
        sensitivity_analyzer : FeaturewiseMeasure
          sensitivity analyzer to come up with sensitivity
        feature_selector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        train_analyzer : bool
          Flag whether to train the sensitivity analyzer on the input dataset
          during train(). If False, the employed sensitivity measure has to be
          already trained before.
        """

        # base init first
        FeatureSelection.__init__(self, **kwargs)

        self.__sensitivity_analyzer = sensitivity_analyzer
        """Sensitivity analyzer to use once"""

        self.__feature_selector = feature_selector
        """Functor which takes care about removing some features."""

        self.__train_analyzer = train_analyzer
示例#3
0
    def __test_matthias_question(self):
        rfe_clf = LinearCSVMC(C=1)

        rfesvm_split = SplitClassifier(rfe_clf)
        clf = \
            FeatureSelectionClassifier(
            clf = LinearCSVMC(C=1),
            feature_selection = RFE(
                sensitivity_analyzer = rfesvm_split.get_sensitivity_analyzer(
                    combiner=first_axis_mean,
                    transformer=np.abs),
                transfer_error=ConfusionBasedError(
                    rfesvm_split,
                    confusion_state="confusion"),
                stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                feature_selector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                update_sensitivity=True))

        no_permutations = 1000
        permutator = AttributePermutator('targets', count=no_permutations)
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             null_dist=MCNullDist(permutator, tail='left'),
                             enable_ca=['stats'])
        error = cv(datasets['uni2small'])
        self.assertTrue(error < 0.4)
        self.assertTrue(cv.ca.null_prob < 0.05)
示例#4
0
    def test_split_samples_probability_mapper(self):
        skip_if_no_external('scipy')
        nf = 10
        ns = 100
        nsubj = 5
        nchunks = 5
        data = np.random.normal(size=(ns, nf))
        ds = AttrDataset(data,
                         sa=dict(sidx=np.arange(ns),
                                 targets=np.arange(ns) % nchunks,
                                 chunks=np.floor(np.arange(ns) * nchunks / ns),
                                 subjects=np.arange(ns) /
                                 (ns / nsubj / nchunks) % nsubj),
                         fa=dict(fidx=np.arange(nf)))
        analyzer = OneWayAnova()
        element_selector = FractionTailSelector(.4,
                                                mode='select',
                                                tail='upper')
        common = True
        m = SplitSamplesProbabilityMapper(analyzer,
                                          'subjects',
                                          probability_label='fprob',
                                          select_common_features=common,
                                          selector=element_selector)

        m.train(ds)
        y = m(ds)
        z = m(ds.samples)

        assert_array_equal(z, y.samples)
        assert_equal(y.shape, (100, 4))
示例#5
0
    def __init__(self,
                 sensitivity_analyzer,
                 split_by_labels,
                 select_common_features=True,
                 probability_label=None,
                 probability_combiner=None,
                 selector=FractionTailSelector(0.05),
                 **kwargs):
        '''
        Parameters
        ----------
        sensitivity_analyzer: FeaturewiseMeasure
            Sensitivity analyzer to come up with sensitivity.
        split_by_labels: str or list of str
            Sample labels on which input datasets are split before
            data is selected.
        select_common_features: bool
            True means that the same features are selected after the split.
        probablity_label: None or str
            If None, then the output dataset ds from the
            sensitivity_analyzer is taken to select the samples.
            If not None it takes ds.sa['probablity_label'].
            For example if sensitivity_analyzer=OneWayAnova then
            probablity_label='fprob' is a sensible value.
        probability_combiner: function
            If select_common_features is True, then this function is
            applied to the feature scores across splits. If None,
            it uses lambda x:np.sum(-np.log(x)) which is sensible if
            the scores are probability values
        selector: Selector
            function that returns the indices to keep.
        '''

        SliceMapper.__init__(self, None, **kwargs)

        if probability_combiner is None:

            def f(x):
                y = -np.log(x.ravel())

                # address potential NaNs
                # set to max value in y
                m = np.isnan(y)
                if np.all(m):
                    return 0  # p=1

                y[m] = np.max(y[np.logical_not(m)])
                return np.sum(y)

            probability_combiner = f  # avoid lambda as h5py doesn't like it

        self._sensitivity_analyzer = sensitivity_analyzer
        self._split_by_labels = split_by_labels
        self._select_common_features = select_common_features
        self._probability_label = probability_label
        self._probability_combiner = probability_combiner
        self._selector = selector
示例#6
0
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
                ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
示例#7
0
    def test_SplitRFE(self):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20,
                                         nlabels=2,
                                         nfeatures=30,
                                         snr=1.,
                                         nonbogus_features=[1, 5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
        # 4 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=4)

        rfeclf = MappedClassifier(
            clf,
            SplitRFE(clf,
                     partitioner,
                     fselector=FractionTailSelector(0.2,
                                                    mode='discard',
                                                    tail='lower')))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        ok_(
            len(
                set(dataset.a.nonbogus_features).intersection(
                    rfeclf.mapper.slicearg)) > 0)
        # check repr to have all needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)
示例#8
0
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['confusion'])  # TODO -- it is stats
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        self.assertTrue(error < 0.2)
示例#9
0
    def test_mapped_classifier_sensitivity_analyzer(self, clf):
        """Test sensitivity of the mapped classifier
        """
        # Assuming many defaults it is as simple as
        mclf = FeatureSelectionClassifier(
            clf,
            SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FractionTailSelector(0.5, mode='select', tail='upper')),
            enable_ca=['training_stats'])

        sana = mclf.get_sensitivity_analyzer(postproc=sumofabs_sample(),
                                             enable_ca=["sensitivities"])
        # and lets look at all sensitivities
        dataset = datasets['uni2small']
        # and we get sensitivity analyzer which works on splits
        sens = sana(dataset)
        self.assertEqual(sens.shape, (1, dataset.nfeatures))
示例#10
0
    def __init__(self,
                 fmeasure,
                 pmeasure,
                 splitter,
                 fselector=FractionTailSelector(0.05),
                 update_sensitivity=True,
                 nfeatures_min=0,
                 **kwargs):
        # XXX Allow for multiple stopping criterions, e.g. error not decreasing
        # anymore OR number of features less than threshold
        """Initialize recursive feature elimination

        Parameters
        ----------
        fmeasure : FeaturewiseMeasure
        pmeasure : Measure
          used to compute the transfer error of a classifier based on a
          certain feature set on the test dataset.
          NOTE: If sensitivity analyzer is based on the same
          classifier as transfer_error is using, make sure you
          initialize transfer_error with train=False, otherwise
          it would train classifier twice without any necessity.
        splitter: Splitter
          This splitter instance has to generate at least two dataset splits
          when called with the input dataset. The first split serves as the
          training dataset and the second as the evaluation dataset.
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        update_sensitivity : bool
          If False the sensitivity map is only computed once and reused
          for each iteration. Otherwise the sensitivities are
          recomputed at each selection step.
        nfeatures_min : int
          Number of features for RFE to stop if reached.
        """
        # bases init first
        IterativeFeatureSelection.__init__(self, fmeasure, pmeasure, splitter,
                                           fselector, **kwargs)

        self.__update_sensitivity = update_sensitivity
        """Flag whether sensitivity map is recomputed for each step."""

        self._nfeatures_min = nfeatures_min
示例#11
0
    def test_union_feature_selection(self):
        # two methods: 5% highes F-scores, non-zero SMLR weights
        fss = [SensitivityBasedFeatureSelection(
                    OneWayAnova(),
                    FractionTailSelector(0.05, mode='select', tail='upper')),
               SensitivityBasedFeatureSelection(
                    SMLRWeights(SMLR(lm=1, implementation="C"),
                                postproc=sumofabs_sample()),
                    RangeElementSelector(mode='select'))]

        fs = CombinedFeatureSelection(fss, method='union')

        od_union = fs(self.dataset)

        self.assertTrue(fs.method == 'union')
        # check output dataset
        self.assertTrue(od_union.nfeatures <= self.dataset.nfeatures)
        # again for intersection
        fs = CombinedFeatureSelection(fss, method='intersection')
        od_intersect = fs(self.dataset)
        assert_true(od_intersect.nfeatures < od_union.nfeatures)
示例#12
0
    def __test_fspipeline_with_split_classifier(self, basic_clf):
        #basic_clf = LinearNuSVMC()
        multi_clf = MulticlassClassifier(clf=basic_clf)
        #svm_weigths = LinearSVMWeights(svm)

        # Proper RFE: aggregate sensitivities across multiple splits,
        # but also due to multi class those need to be aggregated
        # somehow. Transfer error here should be 'leave-1-out' error
        # of split classifier itself
        sclf = SplitClassifier(clf=basic_clf)
        rfe = RFE(sensitivity_analyzer=sclf.get_sensitivity_analyzer(
            enable_ca=["sensitivities"]),
                  transfer_error=trans_error,
                  feature_selector=FeatureSelectionPipeline([
                      FractionTailSelector(0.5),
                      FixedNElementTailSelector(1)
                  ]),
                  train_pmeasure=True)

        # and we get sensitivity analyzer which works on splits and uses
        # sensitivity
        selected_features = rfe(self.dataset)
示例#13
0
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    Repeater(
                        2
                    ),  # give the same full dataset to sens_ana and cvmeasure
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    Repeater(
                        2),  #  we will use the same full cv-training dataset
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    train_pmeasure=
                    False,  # we just extract it from existing confusion
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
示例#14
0
    def test_feature_selector(self):
        """Test feature selector"""
        # remove 10% weekest
        selector = FractionTailSelector(0.1)
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        # == rank [4, 5, 6, 7, 0, 3, 2, 9, 1, 8]
        target10 = np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])
        target30 = np.array([0, 1, 2, 3, 7, 8, 9])

        self.assertRaises(UnknownStateError, selector.ca.__getattribute__,
                          'ndiscarded')
        self.assertTrue((selector(data) == target10).all())
        selector.felements = 0.30  # discard 30%
        self.assertTrue(selector.felements == 0.3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)  # se 3 were discarded

        selector = FixedNElementTailSelector(1)
        #                   0   1   2  3   4    5  6  7  8   9
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        self.assertTrue((selector(data) == target10).all())

        selector.nelements = 3
        self.assertTrue(selector.nelements == 3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)

        # test range selector
        # simple range 'above'
        self.assertTrue((RangeElementSelector(lower=0)(data) == \
                         np.array([0,1,2,3,7,8,9])).all())

        self.assertTrue((RangeElementSelector(lower=0,
                                              inclusive=True)(data) == \
                         np.array([0,1,2,3,5,6,7,8,9])).all())

        self.assertTrue((RangeElementSelector(lower=0, mode='discard',
                                              inclusive=True)(data) == \
                         np.array([4])).all())

        # simple range 'below'
        self.assertTrue((RangeElementSelector(upper=2)(data) == \
                         np.array([4,5,6])).all())

        self.assertTrue((RangeElementSelector(upper=2,
                                              inclusive=True)(data) == \
                         np.array([4,5,6,7])).all())

        self.assertTrue((RangeElementSelector(upper=2, mode='discard',
                                              inclusive=True)(data) == \
                         np.array([0,1,2,3,8,9])).all())

        # ranges
        self.assertTrue((RangeElementSelector(lower=2, upper=9)(data) == \
                         np.array([0,2,3])).all())

        self.assertTrue((RangeElementSelector(lower=2, upper=9,
                                              inclusive=True)(data) == \
                         np.array([0,2,3,7,9])).all())

        self.assertTrue((RangeElementSelector(
            upper=2, lower=9, mode='discard',
            inclusive=True)(data) == RangeElementSelector(
                lower=2, upper=9, inclusive=False)(data)).all())

        # non-0 elements -- should be equivalent to np.nonzero()[0]
        self.assertTrue((RangeElementSelector()(data) == \
                         np.nonzero(data)[0]).all())
示例#15
0
clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")

# GNB
clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
clfswh += \
示例#16
0
    def test_SplitRFE(self, fmeasure):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20,
                                         nlabels=2,
                                         nfeatures=11,
                                         snr=1.,
                                         nonbogus_features=[1, 5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
        # 3 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=3)

        rfeclf = MappedClassifier(
            clf,
            SplitRFE(
                clf,
                partitioner,
                fselector=FractionTailSelector(0.5,
                                               mode='discard',
                                               tail='lower'),
                fmeasure=fmeasure,
                # need to update only when using clf's sens anal
                update_sensitivity=fmeasure is None))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        ok_(
            len(
                set(dataset.a.nonbogus_features).intersection(
                    rfeclf.mapper.slicearg)) > 0)

        # check repr to have all needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)

        if externals.exists('joblib'):
            rfeclf.mapper.nproc = -1
            # compare results against the one ran in parallel
            _slicearg = rfeclf.mapper.slicearg
            _predictions = predictions
            rfeclf.train(dataset)
            predictions = rfeclf(dataset).samples
            assert_array_equal(predictions, _predictions)
            assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

        # Test that we can collect stats from cas within cross-validation
        sensitivities = []
        nested_errors = []
        nested_nfeatures = []

        def store_me(data, node, result):
            sens = node.measure.get_sensitivity_analyzer(
                force_train=False)(data)
            sensitivities.append(sens)
            nested_errors.append(node.measure.mapper.ca.nested_errors)
            nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)

        cv = CrossValidation(rfeclf,
                             NFoldPartitioner(count=1),
                             callback=store_me,
                             enable_ca=['stats'])
        _ = cv(dataset)
        # just to make sure we collected them
        assert_equal(len(sensitivities), 1)
        assert_equal(len(nested_errors), 1)
        assert_equal(len(nested_nfeatures), 1)
示例#17
0
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0])) # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]), split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
示例#18
0
    def test_feature_selector(self):
        """Test feature selector"""
        # remove 10% weekest
        selector = FractionTailSelector(0.1)
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        # == rank [4, 5, 6, 7, 0, 3, 2, 9, 1, 8]
        target10 = np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])
        target30 = np.array([0, 1, 2, 3, 7, 8, 9])

        self.assertRaises(UnknownStateError,
                              selector.ca.__getattribute__, 'ndiscarded')
        self.assertTrue((selector(data) == target10).all())
        selector.felements = 0.30      # discard 30%
        self.assertTrue(selector.felements == 0.3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3) # se 3 were discarded

        selector = FixedNElementTailSelector(1)
        #                   0   1   2  3   4    5  6  7  8   9
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        self.assertTrue((selector(data) == target10).all())

        selector.nelements = 3
        self.assertTrue(selector.nelements == 3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)

        # test range selector
        # simple range 'above'
        self.assertTrue((RangeElementSelector(lower=0)(data) == \
                         np.array([0,1,2,3,7,8,9])).all())

        self.assertTrue((RangeElementSelector(lower=0,
                                              inclusive=True)(data) == \
                         np.array([0,1,2,3,5,6,7,8,9])).all())

        self.assertTrue((RangeElementSelector(lower=0, mode='discard',
                                              inclusive=True)(data) == \
                         np.array([4])).all())

        # simple range 'below'
        self.assertTrue((RangeElementSelector(upper=2)(data) == \
                         np.array([4,5,6])).all())

        self.assertTrue((RangeElementSelector(upper=2,
                                              inclusive=True)(data) == \
                         np.array([4,5,6,7])).all())

        self.assertTrue((RangeElementSelector(upper=2, mode='discard',
                                              inclusive=True)(data) == \
                         np.array([0,1,2,3,8,9])).all())


        # ranges
        self.assertTrue((RangeElementSelector(lower=2, upper=9)(data) == \
                         np.array([0,2,3])).all())

        self.assertTrue((RangeElementSelector(lower=2, upper=9,
                                              inclusive=True)(data) == \
                         np.array([0,2,3,7,9])).all())

        self.assertTrue((RangeElementSelector(upper=2, lower=9, mode='discard',
                                              inclusive=True)(data) ==
                         RangeElementSelector(lower=2, upper=9,
                                              inclusive=False)(data)).all())

        # non-0 elements -- should be equivalent to np.nonzero()[0]
        self.assertTrue((RangeElementSelector()(data) == \
                         np.nonzero(data)[0]).all())
示例#19
0
 def custom_tail_selector(seq):
     seq1 = FractionTailSelector(0.01, mode='discard',
                                 tail='upper')(seq)
     seq2 = FractionTailSelector(0.05, mode='select', tail='upper')(seq)
     return list(set(seq1).intersection(seq2))
示例#20
0
clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")

# GNB
clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
clfswh += \