예제 #1
0
    def test_sensitivity_based_feature_selection(self, clf):
        """Check feature counts after a sensitivity-based feature selection.

        A `SensitivityBasedFeatureSelection` is built around the sensitivity
        analyzer of `clf`, trained on the test dataset and applied to it.
        The test then verifies that the input dataset is left untouched, that
        exactly `Nremove` features are gone from the output, and that the
        stored sensitivity map still covers all original features.

        Parameters
        ----------
        clf
            Classifier providing `get_sensitivity_analyzer()`.
        """
        # the sensitivity analyzer is derived from the very same clf
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

        # number of features to remove; shared by the selector and the check
        # below so the two cannot drift apart
        Nremove = 2

        fe = SensitivityBasedFeatureSelection(
            sens_ana,
            feature_selector=FixedNElementTailSelector(Nremove),
            enable_ca=["sensitivity", "selected_ids"])

        data = self.get_data()

        data_nfeatures = data.nfeatures

        fe.train(data)
        resds = fe(data)

        # fail if the original dataset got changed
        self.assertEqual(data.nfeatures, data_nfeatures)

        # the output must have lost exactly Nremove features
        self.assertEqual(data.nfeatures,
                         resds.nfeatures + Nremove,
                         msg="We had to remove exactly %d features" % Nremove)

        self.assertEqual(
            fe.ca.sensitivity.nfeatures,
            data_nfeatures,
            msg="Sensitivity have to have # of features equal to original")
예제 #2
0
    def test_feature_selection_pipeline(self):
        """Chain two sensitivity-based selections and check their output sizes.

        The first stage uses `FractionTailSelector(0.25)`, the second
        `FixedNElementTailSelector(4)`; both are wrapped into a `ChainMapper`
        and the per-stage output shapes are compared with the expected counts.
        """
        analyzer = SillySensitivityAnalyzer()

        dataset = self.get_data()
        n_orig = dataset.nfeatures

        # sanity check of the silly analyzer itself
        first_sensitivity = analyzer(dataset).samples[0, 0]
        self.assertEqual(first_sensitivity, -int(n_orig / 2))

        # expected sizes are derived from the dataset, so the test does not
        # depend on a particular number of features
        fraction_stage = SensitivityBasedFeatureSelection(
            analyzer, FractionTailSelector(0.25))
        fixed_stage = SensitivityBasedFeatureSelection(
            analyzer, FixedNElementTailSelector(4))
        stages = [fraction_stage, fixed_stage]

        # assemble, train and run the pipeline
        pipeline = ChainMapper(stages)
        pipeline.train(dataset)
        pipeline(dataset)

        self.assertEqual(len(pipeline),
                         len(stages),
                         msg="Test the property feature_selections")

        n_after_first = int(np.ceil(n_orig * 0.75))
        observed_sizes = [stage._oshape[0] for stage in pipeline]
        expected_sizes = [n_after_first, n_after_first - 4]
        self.assertEqual(observed_sizes, expected_sizes)
예제 #3
0
    def test_sensitivity_based_feature_selection(self, clf):
        """Check feature counts after a sensitivity-based feature selection.

        A `SensitivityBasedFeatureSelection` is built around the sensitivity
        analyzer of `clf`, trained on the test dataset and applied to it.
        The test then verifies that the input dataset is left untouched, that
        exactly `Nremove` features are gone from the output, and that the
        stored sensitivity map still covers all original features.

        Parameters
        ----------
        clf
            Classifier providing `get_sensitivity_analyzer()`.
        """
        # the sensitivity analyzer is derived from the very same clf
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

        # number of features to remove; shared by the selector and the check
        # below so the two cannot drift apart
        Nremove = 2

        fe = SensitivityBasedFeatureSelection(
            sens_ana,
            feature_selector=FixedNElementTailSelector(Nremove),
            enable_ca=["sensitivity", "selected_ids"])

        data = self.get_data()

        data_nfeatures = data.nfeatures

        fe.train(data)
        resds = fe(data)

        # fail if the original dataset got changed
        self.assertEqual(data.nfeatures, data_nfeatures)

        # the output must have lost exactly Nremove features
        self.assertEqual(data.nfeatures,
                         resds.nfeatures + Nremove,
                         msg="We had to remove exactly %d features" % Nremove)

        self.assertEqual(
            fe.ca.sensitivity.nfeatures,
            data_nfeatures,
            msg="Sensitivity have to have # of features equal to original")
예제 #4
0
    def test_feature_selection_classifier(self):
        """Check FeatureSelectionClassifier with two opposite feature selections.

        `self.clf_sign` is wrapped twice: once with a selection driven by
        `SillySensitivityAnalyzer()` and once with the reversed analyzer
        (`mult=-1`).  Predictions on a fixed 3-feature dataset are compared
        with the hand-written expectations `res011` and `res110`.
        """
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector

        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()
        # should give lowest weight to the feature with highest index
        sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        feat_sel_rev = SensitivityBasedFeatureSelection(
            sens_ana_rev, FixedNElementTailSelector(1))

        samples = np.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1], [-1, 0, 1],
                            [1, -1, 1]])

        testdata3 = dataset_wizard(samples=samples, targets=1)
        # dummy train data so proper mapper gets created
        traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                                   targets=[1, 2])

        # expected predictions
        res110 = [1, 1, 1, -1, -1]
        res011 = [-1, 1, -1, 1, -1]

        # first classifier -- 0th feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign,
                                            feat_sel,
                                            enable_ca=['feature_ids'])

        self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
        try:
            clf011.train(traindata)

            self.assertEqual(clf011.predict(testdata3.samples), res011)
            # just silly test if we get values assigned in the
            # 'ProxyClassifier'
            self.assertEqual(
                len(clf011.ca.estimates), len(res110),
                msg="We need to pass values into ProxyClassifier")
        finally:
            # restore the shared classifier's state even if a check above
            # failed, so that other tests are not affected
            self.clf_sign.ca.reset_changed_temporarily()

        # the message used to be a dangling string statement; pass it as msg
        self.assertEqual(
            clf011.mapper._oshape, (2, ),
            msg="Feature selection classifier had to be trained on 2 features")

        # second classifier -- last feature should be discarded
        clf110 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
        clf110.train(traindata)
        self.assertEqual(clf110.predict(testdata3.samples), res110)
예제 #5
0
def test_remove_invariant_as_a_mapper():
    """Compare a std-based feature selection with remove_invariant_features.

    Two feature columns are overwritten with a constant, then the dataset is
    passed through a `SensitivityBasedFeatureSelection` that uses the
    per-feature standard deviation as its measure.  The result must match
    what `remove_invariant_features` produces for the same dataset.
    """
    from mvpa2.featsel.helpers import RangeElementSelector
    from mvpa2.featsel.base import (StaticFeatureSelection,
                                    SensitivityBasedFeatureSelection)
    from mvpa2.testing.datasets import datasets
    from mvpa2.datasets.miscfx import remove_invariant_features

    def per_feature_std(x):
        # standard deviation along the first axis
        return np.std(x, axis=0)

    selector = RangeElementSelector(lower=0, inclusive=False)
    std_based_selection = SensitivityBasedFeatureSelection(
        per_feature_std,
        selector,
        train_analyzer=False,
        auto_train=True)

    ds = datasets['uni2large'].copy()
    feature_ids = np.arange(ds.nfeatures)

    ds.a['mapper'] = StaticFeatureSelection(feature_ids)
    ds.fa['index'] = np.arange(ds.nfeatures)
    # make columns 1 and 8 constant
    ds.samples[:, [1, 8]] = 10

    selected = std_based_selection(ds)

    # Validate that we are getting the same results as
    # remove_invariant_features
    reference = remove_invariant_features(ds)
    assert_array_equal(selected.samples, reference.samples)
    assert_array_equal(selected.fa.index, reference.fa.index)

    # with columns 1 and 8 gone, the surviving indices shift accordingly
    surviving = selected.fa.index
    assert_equal(surviving[1], 2)
    assert_equal(surviving[8], 10)
예제 #6
0
    def test_feature_selection_classifier_with_regression(self):
        """Check that estimates are not overwritten by predictions.

        A `FeatureSelectionClassifier` is built around `sample_clf_reg`,
        trained on random data and asked for predictions.  Afterwards the
        stored `ca.estimates` must differ from `ca.predictions`.  The test
        returns silently when no regression is available
        (`sample_clf_reg is None`).
        """
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        # now test with regression-based classifier. The problem is
        # that it is determining predictions twice from values and
        # then setting the values from the results, which the second
        # time is set to predictions.  The final outcome is that the
        # values are actually predictions...
        dat = dataset_wizard(samples=np.random.randn(4, 10),
                             targets=[-1, -1, 1, 1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        _ = clf_reg.predict(dat.samples)
        # assertFalse replaces the deprecated failIf alias, which is gone
        # from unittest in Python 3.12
        self.assertFalse(
            (np.array(clf_reg.ca.estimates) -
             clf_reg.ca.predictions).sum() == 0,
            msg="Values were set to the predictions in %s." % sample_clf_reg)
예제 #7
0
    def test_custom_combined_selectors(self):
        """Use a plain function combining two tail selectors as a selector.

        The function intersects the output of a 'discard upper 1%' selector
        with that of a 'select upper 5%' selector.  It is checked directly on
        a 100-element sequence and then used as the feature selector of a
        `SensitivityBasedFeatureSelection`.
        """
        def custom_tail_selector(seq):
            # what is left after discarding the upper 1% ...
            without_top = FractionTailSelector(
                0.01, mode='discard', tail='upper')(seq)
            # ... intersected with the upper 5%
            upper_part = FractionTailSelector(
                0.05, mode='select', tail='upper')(seq)
            common = set(without_top).intersection(upper_part)
            return list(common)

        values = np.arange(100)
        picked = custom_tail_selector(values)
        assert_array_equal(sorted(picked), [95, 96, 97, 98])

        # verify that this function could be used in place of the selector
        selection = SensitivityBasedFeatureSelection(
            OneWayAnova(), custom_tail_selector)
        ds = datasets['3dsmall']
        selection.train(ds)  # XXX: why needs to be trained here explicitly?
        reduced = selection(ds)
        expected_nfeatures = int(ds.nfeatures * 0.04)
        assert_equal(reduced.nfeatures, expected_nfeatures)
예제 #8
0
    def test_custom_combined_selectors(self):
        """Use a plain function combining two tail selectors as a selector.

        The function intersects the output of a 'discard upper 1%' selector
        with that of a 'select upper 5%' selector.  It is checked directly on
        a 100-element sequence and then used as the feature selector of a
        `SensitivityBasedFeatureSelection`.
        """
        def custom_tail_selector(seq):
            # what is left after discarding the upper 1% ...
            without_top = FractionTailSelector(
                0.01, mode='discard', tail='upper')(seq)
            # ... intersected with the upper 5%
            upper_part = FractionTailSelector(
                0.05, mode='select', tail='upper')(seq)
            common = set(without_top).intersection(upper_part)
            return list(common)

        values = np.arange(100)
        picked = custom_tail_selector(values)
        assert_array_equal(sorted(picked), [95, 96, 97, 98])

        # verify that this function could be used in place of the selector
        selection = SensitivityBasedFeatureSelection(
            OneWayAnova(), custom_tail_selector)
        ds = datasets['3dsmall']
        selection.train(ds)  # XXX: why needs to be trained here explicitly?
        reduced = selection(ds)
        expected_nfeatures = int(ds.nfeatures * 0.04)
        assert_equal(reduced.nfeatures, expected_nfeatures)
예제 #9
0
    def test_union_feature_selection(self):
        """Combine two feature selections by union and by intersection.

        The union must not yield more features than the input dataset has,
        and the intersection must yield fewer features than the union.
        """
        # two methods: 5% highest F-scores, non-zero SMLR weights
        anova_selection = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        smlr_sensitivity = SMLRWeights(SMLR(lm=1, implementation="C"),
                                       postproc=sumofabs_sample())
        smlr_selection = SensitivityBasedFeatureSelection(
            smlr_sensitivity,
            RangeElementSelector(mode='select'))
        selections = [anova_selection, smlr_selection]

        union_fs = CombinedFeatureSelection(selections, method='union')
        od_union = union_fs(self.dataset)

        self.assertTrue(union_fs.method == 'union')
        # check output dataset
        self.assertTrue(od_union.nfeatures <= self.dataset.nfeatures)

        # again for intersection
        intersection_fs = CombinedFeatureSelection(selections,
                                                   method='intersection')
        od_intersect = intersection_fs(self.dataset)
        assert_true(od_intersect.nfeatures < od_union.nfeatures)
예제 #10
0
    def test_mapped_classifier_sensitivity_analyzer(self, clf):
        """Sensitivities of a feature-selecting classifier span all features.

        `clf` is wrapped into a `FeatureSelectionClassifier` with an
        ANOVA-based selection (`FractionTailSelector(0.5, ...)`); the
        sensitivity map obtained from the wrapper must have shape
        ``(1, nfeatures)`` for the original dataset.
        """
        # Assuming many defaults it is as simple as
        anova_selection = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.5, mode='select', tail='upper'))
        mapped_clf = FeatureSelectionClassifier(
            clf, anova_selection, enable_ca=['training_stats'])

        analyzer = mapped_clf.get_sensitivity_analyzer(
            postproc=sumofabs_sample(),
            enable_ca=["sensitivities"])

        # and lets look at all sensitivities
        dataset = datasets['uni2small']
        sensitivities = analyzer(dataset)

        expected_shape = (1, dataset.nfeatures)
        self.assertEqual(sensitivities.shape, expected_shape)
예제 #11
0
def setup_classifier(**kwargs):
    """Build a classifier and a matching cross-validation measure.

    All options are passed as keyword arguments; unknown keywords are
    ignored.

    Parameters
    ----------
    clf_type : str
        One of 'SVM', 'GNB', 'LDA', 'QDA', 'SMLR', 'RbfSVM', 'GP'.  Any other
        value falls back to the linear C-SVM used for 'SVM'.
    fsel : str
        'True' wraps the classifier with an ANOVA selection of the upper 5%
        of features, 'Fixed' with an ANOVA selection of the upper 100
        features, 'PCA' with a 45-component PCA transformer.  Any other value
        leaves the classifier unwrapped.
    cv_type : str
        'n_fold' selects an `NFoldPartitioner`; anything else a
        `HalfPartitioner`.
    cv_folds : number or numeric string
        Converted with ``int()``; when that yields 0 the value is converted
        with ``float()`` instead.  Passed as ``cvtype`` to the partitioner.
    permutations : number or numeric string
        Converted with ``int()``.  A non-zero value enables a Monte-Carlo
        null distribution with that many repetitions.
    cv_attribute : str
        Sample attribute used by the partitioners.

    Returns
    -------
    list
        ``[fclf, cvte]`` -- the (possibly feature-selecting) classifier and
        the `CrossValidation` measure built around it.

    Raises
    ------
    TypeError
        If any of the options listed above is missing.
    """
    required = ('clf_type', 'fsel', 'cv_type', 'cv_folds', 'permutations',
                'cv_attribute')
    missing = [name for name in required if name not in kwargs]
    if missing:
        # every option is used unconditionally below; fail early with a
        # readable message instead of an UnboundLocalError further down
        raise TypeError('setup_classifier() missing required keyword '
                        'argument(s): %s' % ', '.join(missing))

    clf_type = kwargs['clf_type']
    f_sel = kwargs['fsel']
    cv_approach = kwargs['cv_type']
    attribute = kwargs['cv_attribute']

    # builtin int/float replace the np.int/np.float aliases, which were
    # removed from NumPy
    permutations = int(kwargs['permutations'])
    folds = kwargs['cv_folds']
    cv_type = float(folds) if int(folds) == 0 else int(folds)

    ################# Classifier #######################
    if clf_type == 'SVM':
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    elif clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        # unknown type: same default as 'SVM'
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)

    elif f_sel == 'Fixed':
        logger.info('Fixed Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(100, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)

    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        # this branch used to log the 'Fixed' message (copy-paste)
        logger.info('PCA Feature Selection selected.')
        fsel = SKLTransformer(PCA(n_components=45))

        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets',
                                         limit={'partitions': 1},
                                         count=1)
        # NOTE(review): the null distribution always uses an NFoldPartitioner
        # with the raw cv_type (even when it is 0 or cv_type != 'n_fold') and
        # the bare clf rather than fclf -- confirm this is intended.
        partitioner = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        null_cv = CrossValidation(clf,
                                  ChainNode([partitioner, permutator],
                                            space=partitioner.get_space()),
                                  errorfx=mean_mismatch_error)

        distr_est = MCNullDist(repeater,
                               tail='left',
                               measure=null_cv,
                               enable_ca=['dist_samples'])
    else:
        distr_est = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            # a fold setting of 0 falls back to cvtype=1
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    chain_splitter = ChainNode(
        [
            splitter_used,
            Balancer(attr='targets',
                     count=1,
                     limit='partitions',
                     apply_selection=True),
        ],
        space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')

    return [fclf, cvte]
예제 #12
0
                                           tags=_lars_tags,
                                           descr='skl.LassoLarsIC()')
        regrswh += [_lasso_lars_ic]
        clfswh += [
            RegressionAsClassifier(_lasso_lars_ic, descr='skl.LassoLarsIC_C()')
        ]

# kNN: plain variants first, then variants running on selected features only
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

# kNN on the features picked from SMLR weights
clfswh += FeatureSelectionClassifier(
    kNN(),
    SensitivityBasedFeatureSelection(
        SMLRWeights(SMLR(lm=1.0, implementation="C"),
                    postproc=maxofabs_sample()),
        RangeElementSelector(mode='select')),
    descr="kNN on SMLR(lm=1) non-0")

# kNN on the upper 5% of features ranked by one-way ANOVA
clfswh += FeatureSelectionClassifier(
    kNN(),
    SensitivityBasedFeatureSelection(
        OneWayAnova(),
        FractionTailSelector(0.05, mode='select', tail='upper')),
    descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
예제 #13
0
    
    data = np.concatenate(data)
    labels = np.concatenate(labels)
    
    return data, labels.astype(np.int)

# regions of interest whose data are merged for every subject
rois = ['aSTG', 'HG', 'pSTG']

for sub_id in range(1, 21):
    # load every ROI and join them along axis 1
    data = []
    for roi in rois:
        data_path = os.path.join(data_dir, roi)
        tmp_data, label = load_data(data_path, sub_id)
        data.append(tmp_data)
    data = np.concatenate(data, axis=1)
    # transpose each leading-axis slice and stack the results row-wise
    data = np.concatenate(
        [data[idx, :, :].T for idx in range(data.shape[0])])

    ds = Dataset(data)
    ds.sa['time_coords'] = np.linspace(0, len(ds) - 1, num=len(ds))
    # one event per block of 5 consecutive samples; `label` is the one
    # returned for the last ROI loaded above
    events = [
        dict(onset=5 * k, duration=5, targets=label[k], chunks=k + 1)
        for k in range(len(ds) // 5)
    ]

    hrf_estimates = fit_event_hrf_model(
        ds,
        events,
        time_attr='time_coords',
        condition_attr=('targets', 'chunks'),
        design_kwargs={'drift_model': 'blank'},
        glmfit_kwargs={'model': 'ols'},
        return_model=True)

    # keep the 5000 features from the upper tail of the one-way ANOVA measure
    fsel = SensitivityBasedFeatureSelection(
        OneWayAnova(),
        FixedNElementTailSelector(5000, mode='select', tail='upper'))

    fsel.train(hrf_estimates)
    ds_p = fsel(hrf_estimates)

    np.save('feat_sub{:03d}'.format(sub_id), ds_p.samples)