def test_feature_selection_classifier_with_regression(self):
    """Check that a regression-backed classifier keeps its raw estimates.

    Wraps ``sample_clf_reg`` in a ``FeatureSelectionClassifier``, trains
    and predicts on a small random dataset, and asserts that the stored
    ``estimates`` were not overwritten by the ``predictions``.

    Returns early (nothing to test) when ``sample_clf_reg`` is None.
    """
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.helpers import FixedNElementTailSelector

    if sample_clf_reg is None:
        # no regression was found, so nothing to test
        return

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # corresponding feature selection
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))

    # now test with regression-based classifier.  The problem is
    # that it is determining predictions twice from values and
    # then setting the values from the results, which the second
    # time is set to predictions.  The final outcome is that the
    # values are actually predictions...
    dat = dataset_wizard(samples=np.random.randn(4, 10),
                         targets=[-1, -1, 1, 1])
    clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
    clf_reg.train(dat)
    _ = clf_reg.predict(dat.samples)

    # assertFalse replaces the deprecated failIf alias (removed in
    # Python 3.12); the checked condition itself is unchanged.
    self.assertFalse(
        (np.array(clf_reg.ca.estimates)
         - clf_reg.ca.predictions).sum() == 0,
        msg="Values were set to the predictions in %s." % sample_clf_reg)
def test_feature_selection_classifier_with_regression(self):
    """Check that a regression-backed classifier keeps its raw estimates.

    Wraps ``sample_clf_reg`` in a ``FeatureSelectionClassifier``, trains
    and predicts on a small random dataset, and asserts that the stored
    ``estimates`` were not overwritten by the ``predictions``.

    Returns early (nothing to test) when ``sample_clf_reg`` is None.
    """
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.helpers import FixedNElementTailSelector

    if sample_clf_reg is None:
        # no regression was found, so nothing to test
        return

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # corresponding feature selection
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))

    # now test with regression-based classifier.  The problem is
    # that it is determining predictions twice from values and
    # then setting the values from the results, which the second
    # time is set to predictions.  The final outcome is that the
    # values are actually predictions...
    dat = dataset_wizard(samples=np.random.randn(4, 10),
                         targets=[-1, -1, 1, 1])
    clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
    clf_reg.train(dat)
    _ = clf_reg.predict(dat.samples)

    # assertFalse replaces the deprecated failIf alias (removed in
    # Python 3.12); the checked condition itself is unchanged.
    self.assertFalse(
        (np.array(clf_reg.ca.estimates)
         - clf_reg.ca.predictions).sum() == 0,
        msg="Values were set to the predictions in %s." % sample_clf_reg)
def test_feature_selection_classifier(self):
    """Check FeatureSelectionClassifier with two opposite feature selections.

    Two ``SillySensitivityAnalyzer`` instances (default and ``mult=-1``)
    drive selections that discard the first and the last feature
    respectively.  The wrapped ``self.clf_sign`` must then produce the
    expected predictions (``res011`` / ``res110``) on the test samples,
    expose ``estimates`` through the proxy, and be trained on 2 features.
    """
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.helpers import FixedNElementTailSelector

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # should give lowest weight to the feature with highest index
    sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))
    feat_sel_rev = SensitivityBasedFeatureSelection(
        sens_ana_rev, FixedNElementTailSelector(1))

    samples = np.array([[0, 0, -1],
                        [1, 0, 1],
                        [-1, -1, 1],
                        [-1, 0, 1],
                        [1, -1, 1]])
    testdata3 = dataset_wizard(samples=samples, targets=1)
    # dummy train data so proper mapper gets created
    traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                               targets=[1, 2])

    # expected predictions
    res110 = [1, 1, 1, -1, -1]
    res011 = [-1, 1, -1, 1, -1]

    # first classifier -- 0th feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                        enable_ca=['feature_ids'])

    self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
    try:
        clf011.train(traindata)

        self.assertEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.assertEqual(len(clf011.ca.estimates), len(res110),
                         msg="We need to pass values into ProxyClassifier")
    finally:
        # restore the shared classifier's ca even if an assertion fails
        self.clf_sign.ca.reset_changed_temporarily()

    # The message used to be a dangling string statement after the
    # assertion and was never reported; pass it as msg instead.
    self.assertEqual(
        clf011.mapper._oshape, (2,),
        msg="Feature selection classifier had to be trained on 2 features")

    # second classifier -- last feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
    clf011.train(traindata)
    self.assertEqual(clf011.predict(testdata3.samples), res110)
def __test_matthias_question(self):
    """Cross-validate an RFE-selecting linear SVM against a permutation null.

    Builds a ``FeatureSelectionClassifier`` whose RFE is driven by a
    ``SplitClassifier`` over a linear C-SVM, runs n-fold cross-validation
    on ``datasets['uni2small']`` with a 1000-permutation Monte-Carlo null
    distribution, and expects error < 0.4 and null_prob < 0.05.

    NOTE(review): the name does not start with ``test``, so this is
    presumably kept out of normal test collection on purpose -- confirm.
    """
    split_svm = SplitClassifier(LinearCSVMC(C=1))

    # Pieces of the RFE, created in the same order as the original call.
    final_clf = LinearCSVMC(C=1)
    sensitivity = split_svm.get_sensitivity_analyzer(
        combiner=first_axis_mean, transformer=np.abs)
    transfer_err = ConfusionBasedError(split_svm,
                                       confusion_state="confusion")
    stop_crit = FixedErrorThresholdStopCrit(0.20)
    tail_selector = FractionTailSelector(0.2, mode='discard', tail='lower')
    rfe = RFE(sensitivity_analyzer=sensitivity,
              transfer_error=transfer_err,
              stopping_criterion=stop_crit,
              feature_selector=tail_selector,
              update_sensitivity=True)
    selecting_clf = FeatureSelectionClassifier(clf=final_clf,
                                               feature_selection=rfe)

    n_perms = 1000
    shuffler = AttributePermutator('targets', count=n_perms)
    cross_val = CrossValidation(
        selecting_clf,
        NFoldPartitioner(),
        null_dist=MCNullDist(shuffler, tail='left'),
        enable_ca=['stats'])
    cv_error = cross_val(datasets['uni2small'])

    self.assertTrue(cv_error < 0.4)
    self.assertTrue(cross_val.ca.null_prob < 0.05)
def test_mapped_classifier_sensitivity_analyzer(self, clf):
    """Sensitivities of a feature-selecting classifier span all features.

    ``clf`` is wrapped in a ``FeatureSelectionClassifier`` that keeps the
    upper 50% of features ranked by ``OneWayAnova``.  The sensitivity
    analyzer of that wrapper, applied to ``datasets['uni2small']``, must
    return a single row with one value per original feature.
    """
    anova = OneWayAnova()
    upper_half = FractionTailSelector(0.5, mode='select', tail='upper')
    selection = SensitivityBasedFeatureSelection(anova, upper_half)
    wrapped = FeatureSelectionClassifier(clf, selection,
                                         enable_ca=['training_stats'])

    # relying on defaults otherwise, obtaining the analyzer is this simple
    analyzer = wrapped.get_sensitivity_analyzer(
        postproc=sumofabs_sample(), enable_ca=["sensitivities"])

    data = datasets['uni2small']
    sensitivities = analyzer(data)

    expected_shape = (1, data.nfeatures)
    self.assertEqual(sensitivities.shape, expected_shape)
def test_mapped_classifier_sensitivity_analyzer(self, clf):
    """Sensitivities of a feature-selecting classifier span all features.

    ``clf`` is wrapped in a ``FeatureSelectionClassifier`` that keeps the
    upper 50% of features ranked by ``OneWayAnova``.  The sensitivity
    analyzer of that wrapper, applied to ``datasets['uni2small']``, must
    return a single row with one value per original feature.
    """
    anova = OneWayAnova()
    upper_half = FractionTailSelector(0.5, mode='select', tail='upper')
    selection = SensitivityBasedFeatureSelection(anova, upper_half)
    wrapped = FeatureSelectionClassifier(clf, selection,
                                         enable_ca=['training_stats'])

    # relying on defaults otherwise, obtaining the analyzer is this simple
    analyzer = wrapped.get_sensitivity_analyzer(
        postproc=sumofabs_sample(), enable_ca=["sensitivities"])

    data = datasets['uni2small']
    sensitivities = analyzer(data)

    expected_shape = (1, data.nfeatures)
    self.assertEqual(sensitivities.shape, expected_shape)
def test_feature_selection_classifier(self):
    """Check FeatureSelectionClassifier with two opposite feature selections.

    Two ``SillySensitivityAnalyzer`` instances (default and ``mult=-1``)
    drive selections that discard the first and the last feature
    respectively.  The wrapped ``self.clf_sign`` must then produce the
    expected predictions (``res011`` / ``res110``) on the test samples,
    expose ``estimates`` through the proxy, and be trained on 2 features.
    """
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.helpers import FixedNElementTailSelector

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # should give lowest weight to the feature with highest index
    sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))
    feat_sel_rev = SensitivityBasedFeatureSelection(
        sens_ana_rev, FixedNElementTailSelector(1))

    samples = np.array([[0, 0, -1],
                        [1, 0, 1],
                        [-1, -1, 1],
                        [-1, 0, 1],
                        [1, -1, 1]])
    testdata3 = dataset_wizard(samples=samples, targets=1)
    # dummy train data so proper mapper gets created
    traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                               targets=[1, 2])

    # expected predictions
    res110 = [1, 1, 1, -1, -1]
    res011 = [-1, 1, -1, 1, -1]

    # first classifier -- 0th feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                        enable_ca=['feature_ids'])

    self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
    try:
        clf011.train(traindata)

        self.assertEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.assertEqual(len(clf011.ca.estimates), len(res110),
                         msg="We need to pass values into ProxyClassifier")
    finally:
        # restore the shared classifier's ca even if an assertion fails
        self.clf_sign.ca.reset_changed_temporarily()

    # The message used to be a dangling string statement after the
    # assertion and was never reported; pass it as msg instead.
    self.assertEqual(
        clf011.mapper._oshape, (2,),
        msg="Feature selection classifier had to be trained on 2 features")

    # second classifier -- last feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
    clf011.train(traindata)
    self.assertEqual(clf011.predict(testdata3.samples), res110)
def test_james_problem_multiclass(self):
    """Smoke-test CrossValidation of an RFE-selecting classifier.

    Runs n-fold cross-validation of a linear C-SVM preceded by RFE on
    ``datasets['uni4large']``; the test fails if cross-validation raises.
    A callback collects the RFE ``history`` and ``errors`` of each fold.
    """
    percent = 80
    dataset = datasets['uni4large']
    rfesvm_split = LinearCSVMC()
    fs = RFE(
        rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                mean_sample(),
            ])),
        ProxyMeasure(rfesvm_split,
                     postproc=BinaryFxNode(mean_mismatch_error, 'targets')),
        Splitter('train'),
        fselector=FractionTailSelector(
            percent / 100.0, mode='select', tail='upper'),
        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)
        update_sensitivity=True)

    # final classification happens on the features selected via RFE
    clf = FeatureSelectionClassifier(LinearCSVMC(), fs)

    class StoreResults(object):
        """Callback keeping (history, errors) of the RFE for each fold."""

        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['stats'])
    try:
        cv(dataset).samples.squeeze()
    except Exception as e:
        # "except Exception, e" was Python 2-only syntax
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e, ))
def test_james_problem(self):
    """Cross-validate an RFE-selecting classifier and inspect per-fold results.

    Runs n-fold cross-validation of a linear C-SVM preceded by RFE on
    ``datasets['uni2small']``.  A callback collects the RFE ``history``
    and ``errors`` of each fold; the test checks there is one entry per
    unique chunk, that each entry is a pair, that the history has one
    element per feature, and that the mean error is below 0.2.
    """
    percent = 80
    dataset = datasets['uni2small']
    rfesvm_split = LinearCSVMC()
    fs = RFE(
        rfesvm_split.get_sensitivity_analyzer(),
        ProxyMeasure(rfesvm_split,
                     postproc=BinaryFxNode(mean_mismatch_error, 'targets')),
        Splitter('train'),
        fselector=FractionTailSelector(
            percent / 100.0, mode='select', tail='upper'),
        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)
        update_sensitivity=True)

    # final classification happens on the features selected via RFE
    clf = FeatureSelectionClassifier(LinearCSVMC(), fs)

    class StoreResults(object):
        """Callback keeping (history, errors) of the RFE for each fold."""

        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['confusion'])  # TODO -- it is stats
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e, ))

    # unittest assertions instead of bare asserts: they survive -O and
    # report both values on failure
    self.assertEqual(len(cv_storage.storage),
                     len(dataset.sa['chunks'].unique))
    self.assertEqual(len(cv_storage.storage[0]), 2)
    self.assertEqual(len(cv_storage.storage[0][0]), dataset.nfeatures)
    self.assertTrue(error < 0.2)
def test_rfe_sensmap():
    """Smoke test: RFE sensitivities via manual splits vs. RepeatedMeasure.

    Computes sensitivities and errors of an RFE-selecting linear SVM by
    partitioning/splitting by hand, checks the errors equal those from
    ``CrossValidation``, asserts that the sensitivities currently do NOT
    match those from ``RepeatedMeasure``, and finally raises ``SkipTest``
    pointing at the known issue.
    """
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack
    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    data = normal_feature_dataset(
        nlabels=3,
        snr=1,  # 100,   # pure signal! ;)
        perlabel=9,
        nfeatures=6,
        nonbogus_features=range(3),
        nchunks=3)

    svm_clf = LinearCSVMC()
    # RFE ingredients, created in the same order as the original call
    rfe_sens = svm_clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    rfe_error = CrossValidation(
        svm_clf,
        NFoldPartitioner(),
        errorfx=mean_mismatch_error,
        postproc=mean_sample())
    rfe_repeater = Repeater(2)
    rfe_selector = FractionTailSelector(0.70, mode='select', tail='upper')
    rfe_stop = NBackHistoryStopCrit(BestDetector(), 10)
    rfe = RFE(rfe_sens,
              rfe_error,
              rfe_repeater,
              fselector=rfe_selector,
              stopping_criterion=rfe_stop,
              update_sensitivity=True)

    selecting_clf = FeatureSelectionClassifier(svm_clf, rfe)
    sens_analyzer = selecting_clf.get_sensitivity_analyzer(
        postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and
    # classification
    fold_senses = []
    fold_errors = []
    for partitioned in NFoldPartitioner().generate(data):
        # split partitioned dataset
        parts = list(Splitter('partitions').generate(partitioned))
        training, testing = parts[0], parts[1]
        fold_senses.append(sens_analyzer(training))
        # and it also should train the classifier so we would ask it
        # about error
        predicted = selecting_clf.predict(testing)
        fold_errors.append(mean_mismatch_error(predicted, testing.targets))

    senses = vstack(fold_senses)
    errors = vstack(fold_errors)

    # Let's compare against rerunning the beast simply for classification
    # with CV
    cv_measure = CrossValidation(selecting_clf, NFoldPartitioner(),
                                 errorfx=mean_mismatch_error)
    errors_cv = cv_measure(data)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    repeated_sens = RepeatedMeasure(sens_analyzer, NFoldPartitioner())
    senses_rm = repeated_sens(data)

    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
# Set up the analysis objects: a linear C-SVM wrapped in ANOVA-based
# feature selection, a chunk-wise cross-validation around it, the sample
# attributes file, and a pairwise-distance (RSA) measure.
# NOTE(review): nVox, paths, con and os are not defined in this fragment;
# they are expected to come from surrounding code -- confirm.
from mvpa2.clfs import svm
clf = svm.LinearCSVMC()
# label for the preprocessing in use; presumably only used for naming
# outputs -- TODO confirm
preproc = 'None'
# feature selection
# (alternative label / measure kept for reference)
# preproc = 'fsel-'+str(nVox)
# fsel=LDA()
from mvpa2.clfs.warehouse import OneWayAnova, LDA
# per-feature measure that drives the selection
fsel = OneWayAnova()
import mvpa2.featsel as fs
# select a fixed number (nVox) of features from the upper tail of the scores
fselector = fs.helpers.FixedNElementTailSelector(nVox, tail='upper', mode='select', sort=False)
# (alternative: keep a fixed fraction instead of a fixed count)
# fselector = fs.helpers.FractionTailSelector(0.05, mode='select', tail='upper')
# enable_ca=['sensitivities'] keeps the computed scores available afterwards
sbfs = fs.base.SensitivityBasedFeatureSelection(fsel, fselector, enable_ca=['sensitivities'])
from mvpa2.clfs.meta import FeatureSelectionClassifier, MappedClassifier
# classifier that applies the feature selection before the SVM
fclf = FeatureSelectionClassifier(clf, sbfs)
from mvpa2.measures.base import CrossValidation
from mvpa2.misc import errorfx
from mvpa2.generators.partition import NFoldPartitioner
# n-fold partitioning over the 'chunks' attribute; mean_match_accuracy is
# passed as the error function
cv = CrossValidation(fclf, NFoldPartitioner(attr='chunks'), errorfx=errorfx.mean_match_accuracy)
import numpy as np
from mvpa2.misc.io.base import SampleAttributes
# attributes are read from "<con>_attribute_labels.txt" under paths[3]
cv_attr = SampleAttributes(os.path.join(paths[3], (con + "_attribute_labels.txt")))
from mvpa2.measures import rsa
# pairwise distance measure; square=True presumably requests the full
# square matrix form -- TODO confirm
dsm = rsa.PDist(square=True)
def setup_classifier(**kwargs):
    """Build a classifier and the cross-validation measure that wraps it.

    All arguments are passed as keywords and all of them are required.

    Parameters
    ----------
    clf_type : str
        One of 'SVM', 'GNB', 'LDA', 'QDA', 'SMLR', 'RbfSVM', 'GP'.  Any
        other value falls back to the linear C-SVM used for 'SVM'.
    fsel : str
        'True' selects the upper 5% of features by one-way ANOVA,
        'Fixed' selects 100 features by one-way ANOVA, 'PCA' applies a
        45-component scikit-learn PCA.  Any other value disables feature
        selection and the bare classifier is used.
    cv_type : str
        'n_fold' uses ``NFoldPartitioner``; anything else uses
        ``HalfPartitioner``.
    cv_folds : number or numeric string
        Passed as ``cvtype`` to ``NFoldPartitioner``.  A value whose
        integer part is 0 is kept as a float, otherwise it is truncated
        to int.  An exact 0 is replaced by 1 for the main partitioner.
    permutations : int or numeric string
        Number of permutations for the Monte-Carlo null distribution;
        0 disables the null distribution.
    cv_attribute : str
        Sample attribute used for partitioning.

    Returns
    -------
    list
        ``[fclf, cvte]``: the (possibly feature-selecting) classifier and
        the ``CrossValidation`` measure built around it.

    Raises
    ------
    TypeError
        If any of the required keywords is missing.
    ValueError
        If ``cv_folds`` or ``permutations`` cannot be converted to a number.
    """
    required = ('clf_type', 'fsel', 'cv_type', 'cv_folds',
                'permutations', 'cv_attribute')
    missing = [key for key in required if key not in kwargs]
    if missing:
        # Every option is used unconditionally below; fail early and
        # clearly instead of with an UnboundLocalError later on.
        raise TypeError('setup_classifier() missing required keyword '
                        'argument(s): %s' % ', '.join(missing))

    clf_type = kwargs['clf_type']
    f_sel = kwargs['fsel']
    cv_approach = kwargs['cv_type']
    attribute = kwargs['cv_attribute']

    # Builtin int/float replace np.int/np.float, which were plain aliases
    # of the builtins and have been removed from NumPy.
    folds = kwargs['cv_folds']
    cv_type = float(folds) if int(folds) == 0 else int(folds)
    permutations = int(kwargs['permutations'])

    cv_n = cv_type

    ################# Classifier #######################
    if clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        # 'SVM' and every unrecognised value: linear C-SVM
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'Fixed':
        logger.info('Fixed Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(100, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        # message used to be a copy of the 'Fixed' branch's message
        logger.info('PCA Feature Selection selected.')
        fsel = SKLTransformer(PCA(n_components=45))
        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets',
                                         limit={'partitions': 1},
                                         count=1)
        partitioner = NFoldPartitioner(cvtype=cv_n, attr=attribute)
        # NOTE(review): the null distribution is estimated with the bare
        # ``clf``, not the feature-selecting ``fclf`` -- confirm intended.
        null_cv = CrossValidation(
            clf,
            ChainNode([partitioner, permutator],
                      space=partitioner.get_space()),
            errorfx=mean_mismatch_error)
        distr_est = MCNullDist(repeater, tail='left',
                               measure=null_cv,
                               enable_ca=['dist_samples'])
    else:
        distr_est = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    chain_splitter = ChainNode([
        splitter_used,
        Balancer(attr='targets',
                 count=1,
                 limit='partitions',
                 apply_selection=True),
    ], space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')

    return [fclf, cvte]
tags=_lars_tags, descr='skl.LassoLarsIC()') regrswh += [_lasso_lars_ic] clfswh += [ RegressionAsClassifier(_lasso_lars_ic, descr='skl.LassoLarsIC_C()') ] # kNN clfswh += kNN(k=5, descr="kNN(k=5)") clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( SMLRWeights(SMLR(lm=1.0, implementation="C"), postproc=maxofabs_sample()), RangeElementSelector(mode='select')), descr="kNN on SMLR(lm=1) non-0") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( OneWayAnova(), FractionTailSelector(0.05, mode='select', tail='upper')), descr="kNN on 5%(ANOVA)") clfswh += \ FeatureSelectionClassifier( kNN(),
# clf.fit(desX.matrix, thisDS.samples[:,1]) # clf.predict(desX.matrix) # make class that takes skl regression models as input and wraps into mapper cname = 'LinearSVM' clf = svm.LinearCSVMC() preproc = 'None' # feature selection # preproc = 'fsel-'+str(nVox) # fsel=LDA() fsel = OneWayAnova() fselector = fs.helpers.FixedNElementTailSelector(nVox, tail='upper', mode='select', sort=False) # fselector = fs.helpers.FractionTailSelector(0.05, mode='select', tail='upper') sbfs = fs.base.SensitivityBasedFeatureSelection(fsel, fselector, enable_ca=['sensitivities']) fclf = FeatureSelectionClassifier(clf, sbfs) # SVD # preproc = 'SVD-' + str(nComp) # svdmapper = SVDMapper() # get_SVD_sliced = lambda x: ChainMapper([svdmapper, StaticFeatureSelection(x)]) # fclf = MappedClassifier(clf, get_SVD_sliced(slice(0, nComp))) ################################################ cv = CrossValidation(fclf, NFoldPartitioner(attr='chunks'), errorfx=errorfx.mean_match_accuracy) from mvpa2.measures import rsa dsm = rsa.PDist(square=True) lresults = []
tags=_lars_tags, descr='skl.LassoLarsIC()') regrswh += [_lasso_lars_ic] clfswh += [ RegressionAsClassifier(_lasso_lars_ic, descr='skl.LassoLarsIC_C()') ] # kNN clfswh += kNN(k=5, descr="kNN(k=5)") clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( SMLRWeights(SMLR(lm=1.0, implementation="C"), postproc=maxofabs_sample()), RangeElementSelector(mode='select')), descr="kNN on SMLR(lm=1) non-0") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( OneWayAnova(), FractionTailSelector(0.05, mode='select', tail='upper')), descr="kNN on 5%(ANOVA)") clfswh += \ FeatureSelectionClassifier( kNN(),