def test_split_classifier_extended(self, clf_):
    clf2 = clf_.clone()
    ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
    clf = SplitClassifier(clf=clf_, #SameSignClassifier(),
                          enable_ca=['stats', 'feature_ids'])
    clf.train(ds)                        # train the beast
    error = clf.ca.stats.error

    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()

    self.failUnless(abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s"
                % (error, cverror))

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(error < 0.25,
                        msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
    self.failUnlessEqual(len(clf.ca.stats.sets), len(ds.UC),
                         msg="Should have 1 confusion per each split")
    self.failUnlessEqual(len(clf.clfs), len(ds.UC),
                         msg="Should have number of classifiers equal # of epochs")

def test_split_classifier_extended(self, clf_):
    clf2 = clf_.clone()
    ds = datasets['uni2medium'] #self.data_bin_1
    clf = SplitClassifier(clf=clf_, #SameSignClassifier(),
                          splitter=NFoldSplitter(1),
                          enable_ca=['confusion', 'feature_ids'])
    clf.train(ds)                        # train the beast
    error = clf.ca.confusion.error

    cv = CrossValidatedTransferError(
        TransferError(clf2),
        NFoldSplitter(),
        postproc=mean_sample(),
        enable_ca=['confusion', 'training_confusion'])
    cverror = cv(ds).samples.squeeze()

    self.failUnless(abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (error, cverror))

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(error < 0.25,
                        msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
    self.failUnlessEqual(len(clf.ca.confusion.sets), len(ds.UC),
                         msg="Should have 1 confusion per each split")
    self.failUnlessEqual(len(clf.clfs), len(ds.UC),
                         msg="Should have number of classifiers equal # of epochs")

def test_james_problem(self):
    percent = 80
    dataset = datasets['uni2small']
    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),           # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['confusion'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception, e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

def test_regression_as_classifier(self, regr):
    """Basic tests of metaclass for using regressions as classifiers
    """
    for dsname in 'uni2small', 'uni4small':
        ds = datasets[dsname]

        clf = RegressionAsClassifier(regr, enable_ca=['distances'])
        cv = CrossValidatedTransferError(
            TransferError(clf),
            OddEvenSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])

        error = cv(ds).samples.squeeze()

        nlabels = len(ds.uniquetargets)
        if nlabels == 2 \
           and cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.3)

        # Check if does not puke on repr and str
        self.failUnless(str(clf) != "")
        self.failUnless(repr(clf) != "")

        self.failUnlessEqual(clf.ca.distances.shape,
                             (ds.nsamples / 2, nlabels))

def test_regression_as_classifier(self, regr):
    """Basic tests of metaclass for using regressions as classifiers
    """
    for dsname in 'uni2small', 'uni4small':
        ds = datasets[dsname]

        clf = RegressionAsClassifier(regr, enable_ca=['distances'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])

        error = cv(ds).samples.squeeze()

        nlabels = len(ds.uniquetargets)
        if nlabels == 2 \
           and cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.3,
                            msg="Got error %.2f on %s dataset"
                                % (error, dsname))

        # Check if does not puke on repr and str
        self.failUnless(str(clf) != "")
        self.failUnless(repr(clf) != "")

        self.failUnlessEqual(clf.ca.distances.shape,
                             (ds.nsamples / 2, nlabels))

def test_james_problem(self): percent = 80 dataset = datasets["uni2small"] rfesvm_split = LinearCSVMC() fs = RFE( sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(), transfer_error=TransferError(rfesvm_split), feature_selector=FractionTailSelector(percent / 100.0, mode="select", tail="upper"), update_sensitivity=True, ) clf = FeatureSelectionClassifier( clf=LinearCSVMC(), # on features selected via RFE feature_selection=fs, ) # update sensitivity at each step (since we're not using the # same CLF as sensitivity analyzer) clf.ca.enable("feature_ids") cv = CrossValidatedTransferError( TransferError(clf), NFoldSplitter(cvtype=1), postproc=mean_sample(), enable_ca=["confusion"], expose_testdataset=True, ) # cv = SplitClassifier(clf) try: error = cv(dataset).samples.squeeze() except Exception, e: self.fail( "CrossValidation cannot handle classifier with RFE " "feature selection. Got exception: %s" % (e,) )
def test_split_classifier(self):
    ds = self.data_bin_1
    clf = SplitClassifier(
        clf=SameSignClassifier(),
        splitter=NFoldSplitter(1),
        enable_ca=['confusion', 'training_confusion', 'feature_ids'])
    clf.train(ds)                        # train the beast
    error = clf.ca.confusion.error
    tr_error = clf.ca.training_confusion.error

    clf2 = clf.clone()
    cv = CrossValidatedTransferError(
        TransferError(clf2),
        NFoldSplitter(),
        postproc=mean_sample(),
        enable_ca=['confusion', 'training_confusion'])
    cverror = cv(ds).samples.squeeze()
    tr_cverror = cv.ca.training_confusion.error

    self.failUnlessEqual(
        error, cverror,
        msg="We should get the same error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s"
            % (error, cverror))

    self.failUnlessEqual(
        tr_error, tr_cverror,
        msg="We should get the same training error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s"
            % (tr_error, tr_cverror))

    self.failUnlessEqual(clf.ca.confusion.percent_correct, 100,
                         msg="Dummy clf should train perfectly")
    self.failUnlessEqual(len(clf.ca.confusion.sets), len(ds.UC),
                         msg="Should have 1 confusion per each split")
    self.failUnlessEqual(
        len(clf.clfs), len(ds.UC),
        msg="Should have number of classifiers equal # of epochs")
    self.failUnlessEqual(clf.predict(ds.samples), list(ds.targets),
                         msg="Should classify correctly")

    # feature_ids must be list of lists, and since it is not
    # feature-selecting classifier used - we expect all features
    # to be utilized
    #  NOT ANYMORE -- for BoostedClassifier we have now union of all
    #  used features across slave classifiers. That makes
    #  semantics clear. If you need to get deeper -- use upcoming
    #  harvesting facility ;-)
    # self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
    # self.failUnless(np.array([len(ids)==ds.nfeatures
    #                           for ids in clf.feature_ids]).all())

    # Just check if we get it at all ;-)
    summary = clf.summary()

def test_tree_classifier(self):
    """Basic tests for TreeClassifier
    """
    ds = datasets['uni4small']
    clfs = clfswh['binary']         # pool of classifiers
    # Lets permute so each time we try some different combination
    # of the classifiers
    clfs = [clfs[i] for i in np.random.permutation(len(clfs))]

    # Test conflicting definition
    tclf = TreeClassifier(clfs[0], {
        'L0+2': (('L0', 'L2'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.failUnlessRaises(ValueError, tclf.train, ds)
    """Should raise exception since label 2 is in both"""

    # Test insufficient definition
    tclf = TreeClassifier(clfs[0], {
        'L0+5': (('L0', 'L5'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.failUnlessRaises(ValueError, tclf.train, ds)
    """Should raise exception since no group for L1"""

    # proper definition now
    tclf = TreeClassifier(clfs[0], {
        'L0+1': (('L0', 'L1'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})

    # Lets test train/test cycle using CVTE
    cv = CrossValidatedTransferError(
        TransferError(tclf),
        OddEvenSplitter(),
        postproc=mean_sample(),
        enable_ca=['confusion', 'training_confusion'])
    cverror = cv(ds).samples.squeeze()
    try:
        rtclf = repr(tclf)
    except:
        self.fail(msg="Could not obtain repr for TreeClassifier")

    # Test accessibility of .clfs
    self.failUnless(tclf.clfs['L0+1'] is clfs[1])
    self.failUnless(tclf.clfs['L2+3'] is clfs[2])

    cvtrc = cv.ca.training_confusion
    cvtc = cv.ca.confusion
    if cfg.getboolean('tests', 'labile', default='yes'):
        # just a dummy check to make sure everything is working
        self.failUnless(cvtrc != cvtc)
        self.failUnless(cverror < 0.3)

    # TODO: whenever implemented
    tclf = TreeClassifier(clfs[0], {
        'L0': (('L0',), clfs[1]),
        'L1+2+3': (('L1', 'L2', 'L3'), clfs[2])})

def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets', count=num_perm)
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(
        l_clf,
        Repeater(count=2),
        postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
        null_dist=MCNullDist(permutator, tail='left'))

    # check reasonable error range
    err = terr(train)
    self.failUnless(np.mean(err) < 0.4)

    # Lets do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.failUnless(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

    # we should be able to access the actual samples of the distribution
    # yoh: why it is 3D really?
    # mih: because these are the distribution samples for the ONE error
    #      collapsed into ONE value across all folds. It will also be
    #      3d if the return value of the measure isn't a scalar and it is
    #      not collapsed across folds. it simply corresponds to the shape
    #      of the output dataset of the respective measure (+1 axis)
    # Some permutations could have been skipped since classifier failed
    # to train due to degenerate situation etc, thus accounting for them
    self.failUnlessEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)

def test_function_ptrs():
    if not externals.exists('nifti') and not externals.exists('nibabel'):
        raise SkipTest
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    f = tempfile.NamedTemporaryFile()
    h5save(f.name, ds)
    ds_loaded = h5load(f.name)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh),
                       ds.samples)

def test_classifier_generalization(self, clf):
    """Simple test if classifiers can generalize ok on simple data
    """
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
    # check the default
    #self.failUnless(te.transerror.errorfx is mean_mismatch_error)

    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

    ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
    try:
        cve = te(ds).samples.squeeze()
    except Exception, e:
        self.fail("Failed with %s" % e)

def test_classifier_generalization(self, clf):
    """Simple test if classifiers can generalize ok on simple data
    """
    te = CrossValidatedTransferError(TransferError(clf),
                                     NFoldSplitter(),
                                     postproc=mean_sample())
    # check the default
    self.failUnless(isinstance(te.transerror.errorfx, MeanMismatchErrorFx))

    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

    ds = datasets['uni%dmedium' % nclasses]
    try:
        cve = te(ds).samples.squeeze()
    except Exception, e:
        self.fail("Failed with %s" % e)

def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets', count=num_perm)
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(
        l_clf,
        Splitter(None, count=2),
        postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
        null_dist=MCNullDist(permutator, tail='left'))

    # check reasonable error range
    err = terr(train)
    self.failUnless(np.mean(err) < 0.4)

    # Lets do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = terr.ca.null_prob
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.failUnless(
            cvte.ca.null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % cvte.ca.null_prob)

    # and we should be able to access the actual samples of the distribution
    self.failUnlessEqual(len(cvte.null_dist.ca.dist_samples), num_perm)

def _run_core(self):
    """Core routine: run a cross-validated searchlight analysis on the
    input fMRI dataset and save the resulting error map back into the
    original dataspace.
    """
    attr = SampleAttributes(self.inputs.attributes_file)
    dataset = fmri_dataset(
        samples=self.inputs.samples_file,
        labels=attr.labels,
        chunks=attr.chunks,
        mask=self.inputs.mask_file)

    # remove baseline samples before classification
    if 'rest' in dataset.uniquelabels:
        dataset = dataset[dataset.sa.labels != 'rest']

    # zscore dataset per chunk (note: 'chunks_attr' expects the name of a
    # samples attribute, not a boolean flag)
    zscore(dataset, chunks_attr='chunks', dtype='float32')

    # choose classifier
    clf = LinearCSVMC()

    # setup measure to be computed by Searchlight
    # cross-validated mean transfer using an N-fold dataset splitter
    cv = CrossValidatedTransferError(TransferError(clf),
                                     NFoldSplitter())

    sl = sphere_searchlight(cv, radius=self.inputs.radius,
                            space='voxel_indices',
                            nproc=2, mapper=mean_sample())
    # strip attributes not needed for the searchlight to save memory
    ds = dataset.copy(deep=False,
                      sa=['labels', 'chunks'],
                      fa=['voxel_indices'],
                      a=[])
    sl_map = sl(ds)
    # map sensitivity map into original dataspace
    orig_sl_map = dataset.map2nifti(sl_map)
    orig_sl_map.save(self._get_output_filename())

def test_ifs(self, svm):
    # measure for feature selection criterion and performance assessment
    # use the SAME clf!
    errorfx = mean_mismatch_error
    fmeasure = CrossValidation(svm, NFoldPartitioner(), postproc=mean_sample())
    pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

    ifs = IFS(fmeasure,
              pmeasure,
              Splitter('purpose', attr_values=['train', 'test']),
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              fselector=FixedNElementTailSelector(1, tail='lower',
                                                  mode='select'))

    wdata = self.get_data()
    wdata.sa['purpose'] = np.repeat('train', len(wdata))
    tdata = self.get_data()
    tdata.sa['purpose'] = np.repeat('test', len(tdata))
    ds = vstack((wdata, tdata))
    orig_nfeatures = ds.nfeatures

    ifs.train(ds)
    resds = ifs(ds)

    # fail if orig datasets are changed
    self.failUnless(ds.nfeatures == orig_nfeatures)

    # check that the features set with the least error is selected
    self.failUnless(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.failUnless(resds.nfeatures == e.argmin() + 1)

    # repeat with dataset where selection order is known
    wsignal = datasets['dumb2'].copy()
    wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
    tsignal = datasets['dumb2'].copy()
    tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
    signal = vstack((wsignal, tsignal))
    ifs.train(signal)
    resds = ifs(signal)
    self.failUnless((resds.samples[:, 0] == signal.samples[:, 0]).all())

def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferError(clf=l_clf,
                         null_dist=MCNullDist(permutations=num_perm,
                                              tail='left'))

    # check reasonable error range
    err = terr(train, train)
    self.failUnless(err < 0.4)

    # Lets do the same for CVTE
    cvte = CrossValidatedTransferError(
        TransferError(clf=l_clf),
        OddEvenSplitter(),
        null_dist=MCNullDist(permutations=num_perm,
                             tail='left',
                             enable_ca=['dist_samples']),
        postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = terr.ca.null_prob
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.failUnless(
            cvte.ca.null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % cvte.ca.null_prob)

    # and we should be able to access the actual samples of the distribution
    self.failUnlessEqual(len(cvte.null_dist.ca.dist_samples), num_perm)

def test_ifs(self, svm):
    # data measure and transfer error quantifier use the SAME clf!
    trans_error = TransferError(svm)
    data_measure = CrossValidatedTransferError(trans_error,
                                               NFoldSplitter(1),
                                               postproc=mean_sample())

    ifs = IFS(data_measure,
              trans_error,
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              feature_selector=FixedNElementTailSelector(1, tail='lower',
                                                         mode='select'))

    wdata = self.get_data()
    wdata_nfeatures = wdata.nfeatures
    tdata = self.get_data()
    tdata_nfeatures = tdata.nfeatures

    sdata, stdata = ifs(wdata, tdata)

    # fail if orig datasets are changed
    self.failUnless(wdata.nfeatures == wdata_nfeatures)
    self.failUnless(tdata.nfeatures == tdata_nfeatures)

    # check that the features set with the least error is selected
    self.failUnless(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.failUnless(sdata.nfeatures == e.argmin() + 1)

    # repeat with dataset where selection order is known
    signal = datasets['dumb2']
    sdata, stdata = ifs(signal, signal)
    self.failUnless((sdata.samples[:, 0] == signal.samples[:, 0]).all())

def test_regressions(self, regr):
    """Simple tests on regressions
    """
    ds = datasets['chirp_linear']
    # we want numeric labels to maintain the previous behavior, especially
    # since we deal with regressions here
    ds.sa.targets = AttributeMap().to_numeric(ds.targets)

    cve = CrossValidatedTransferError(
        TransferError(regr),
        splitter=NFoldSplitter(),
        postproc=mean_sample(),
        enable_ca=['training_confusion', 'confusion'])
    # check the default
    self.failUnless(isinstance(cve.transerror.errorfx, CorrErrorFx))

    corr = np.asscalar(cve(ds).samples)

    # Our CorrErrorFx should never return NaN
    self.failUnless(not np.isnan(corr))
    self.failUnless(corr == cve.ca.confusion.stats['CCe'])

    splitregr = SplitClassifier(
        regr, splitter=OddEvenSplitter(),
        enable_ca=['training_confusion', 'confusion'])
    splitregr.train(ds)
    split_corr = splitregr.ca.confusion.stats['CCe']
    split_corr_tr = splitregr.ca.training_confusion.stats['CCe']

    for confusion, error in (
        (cve.ca.confusion, corr),
        (splitregr.ca.confusion, split_corr),
        (splitregr.ca.training_confusion, split_corr_tr),
        ):
        #TODO: test confusion statistics
        # Part of it for now -- CCe
        for conf in confusion.summaries:
            stats = conf.stats
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(stats['CCe'] < 0.5)
            self.failUnlessEqual(stats['CCe'], stats['Summary CCe'])

        s0 = confusion.as_string(short=True)
        s1 = confusion.as_string(short=False)

        for s in [s0, s1]:
            self.failUnless(len(s) > 10,
                            msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.2,
                            msg="Regressions should perform well on a simple "
                                "dataset. Got correlation error of %s " % error)

        # Test access to summary statistics
        # YOH: lets start making testing more reliable.
        #      p-value for such accident to have is verrrry tiny,
        #      so if regression works -- it better has at least 0.5 ;)
        #      otherwise fix it! ;)
        # YOH: not now -- issues with libsvr in SG and linear kernel
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(confusion.stats['CCe'] < 0.5)

    # just to check if it works fine
    split_predictions = splitregr.predict(ds.samples)

from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
from mvpa.measures.searchlight import sphere_searchlight
from mvpa.testing.datasets import datasets
from mvpa.mappers.fx import mean_sample
# additional imports this excerpt relies on (module paths assume the
# 0.6-era layout)
import numpy as np
from mvpa.clfs.svm import LinearCSVMC
from mvpa.clfs.transerror import TransferError
from mvpa.datasets.splitters import OddEvenSplitter

"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidatedTransferError(TransferError(LinearCSVMC()),
                                 OddEvenSplitter())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the
resulting searchlight map (`sl_map`) can be mapped back into the original
dataspace and viewed as a brain overlay. :ref:`Another example
<example_searchlight>` shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""

def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        (RFE(sens_ana,
             pmeasure,
             Splitter('train'),
             fselector=FixedNElementTailSelector(1),
             train_pmeasure=False),
         self.get_data()),
        # use cross-validation within training to get error for the stopping point
        # but use full training data to derive sensitivity
        (RFE(sens_ana,
             cvmeasure,
             Repeater(2),          # give the same full dataset to sens_ana and cvmeasure
             fselector=FractionTailSelector(
                 0.70,
                 mode='select', tail='upper'),
             train_pmeasure=True),
         normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5)),
        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        (RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([FxMapper('features', l2_normed),
                                      FxMapper('samples', np.mean),
                                      FxMapper('samples', np.abs)])),
             ConfusionBasedError(rfesvm_split, confusion_state='stats'),
             Repeater(2),          # we will use the same full cv-training dataset
             fselector=FractionTailSelector(
                 0.50,
                 mode='select', tail='upper'),
             stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
             train_pmeasure=False,  # we just extract it from existing confusion
             update_sensitivity=True),
         normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5))
        ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.failUnless(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.failUnless(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                # in this case we can even check if we had actual
                # going down/up trend... although -- why up???
                imin = np.argmin(e)
                self.failUnless(1 < imin < len(e) - 1)
        else:
            self.failUnless(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.failUnless((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.failUnless(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.failUnless(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history
                                        == max(rfe.ca.history))[0]))

def test_tree_classifier(self):
    """Basic tests for TreeClassifier
    """
    ds = datasets['uni4medium']
    # make it simple of the beast -- take only informative ones
    # because classifiers for the tree are selected randomly, so
    # performance varies a lot and we just need to check on
    # correct operation
    ds = ds[:, ds.fa.nonbogus_targets != [None]]

    clfs = clfswh['binary']         # pool of classifiers
    # Lets permute so each time we try some different combination
    # of the classifiers but exclude those operating on %s of
    # features since we might not have enough for that
    clfs = [clfs[i] for i in np.random.permutation(len(clfs))
            if not '%' in str(clfs[i])]

    # Test conflicting definition
    tclf = TreeClassifier(clfs[0], {
        'L0+2': (('L0', 'L2'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.failUnlessRaises(ValueError, tclf.train, ds)
    """Should raise exception since label 2 is in both"""

    # Test insufficient definition
    tclf = TreeClassifier(clfs[0], {
        'L0+5': (('L0', 'L5'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.failUnlessRaises(ValueError, tclf.train, ds)
    """Should raise exception since no group for L1"""

    # proper definition now
    tclf = TreeClassifier(clfs[0], {
        'L0+1': (('L0', 'L1'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})

    # Lets test train/test cycle using CVTE
    cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()
    try:
        rtclf = repr(tclf)
    except:
        self.fail(msg="Could not obtain repr for TreeClassifier")

    # Test accessibility of .clfs
    self.failUnless(tclf.clfs['L0+1'] is clfs[1])
    self.failUnless(tclf.clfs['L2+3'] is clfs[2])

    cvtrc = cv.ca.training_stats
    cvtc = cv.ca.stats
    if cfg.getboolean('tests', 'labile', default='yes'):
        # just a dummy check to make sure everything is working
        self.failUnless(cvtrc != cvtc)
        self.failUnless(cverror < 0.3,
                        msg="Got too high error = %s using %s"
                            % (cverror, tclf))

    # Test trailing nodes with no classifier
    # NB: It is necessary that the same classifier was not used at
    #     different nodes, since it would be re-trained for a new set
    #     of targets, thus leading to incorrect behavior/high error.
    #     That is why we use separate pool of classifiers here
    clfs_mc = clfswh['multiclass']         # pool of classifiers
    clfs_mc = [clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
               if not '%' in str(clfs_mc[i])]
    tclf = TreeClassifier(clfs_mc[0], {
        'L0': (('L0',), None),
        'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])})

    cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = np.asscalar(cv(ds))
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(cverror < 0.3,
                        msg="Got too high error = %s using %s"
                            % (cverror, tclf))

from mvpa.mappers.fx import mean_sample
# additional imports this excerpt relies on (module paths assume the
# 0.6-era layout)
import numpy as np
from mvpa.clfs.svm import LinearCSVMC
from mvpa.generators.partition import OddEvenPartitioner
from mvpa.measures.base import CrossValidation
from mvpa.measures.searchlight import sphere_searchlight
from mvpa.testing.datasets import datasets

"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the
resulting searchlight map (`sl_map`) can be mapped back into the original
dataspace and viewed as a brain overlay; a minimal sketch of that
back-projection follows below. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""

"""For the sake of simplicity, let's use a small artificial dataset.""" # Lets just use our tiny 4D dataset from testing battery dataset = datasets['3dlarge'] """Now it only takes three lines for a searchlight analysis.""" # setup measure to be computed in each sphere (cross-validated # generalization error on odd/even splits) cv = CrossValidatedTransferError( TransferError(LinearCSVMC()), OddEvenSplitter()) # setup searchlight with 2 voxels radius and measure configured above sl = sphere_searchlight(cv, radius=2, space='myspace', postproc=mean_sample()) # run searchlight on dataset sl_map = sl(dataset) print 'Best performing sphere error:', np.min(sl_map.samples) """ If this analysis is done on a fMRI dataset using `NiftiDataset` the resulting searchlight map (`sl_map`) can be mapped back into the original dataspace and viewed as a brain overlay. :ref:`Another example <example_searchlight>` shows a typical application of this algorithm. .. Mention the fact that it also is a special `SensitivityAnalyzer` """