def test_BinaryFxFeatureMeasure_mi(fx, ds_mi):
    ds, (mi_min, mi_max) = ds_mi
    res = fx(ds)
    assert_equal(res.shape, (1, ds.nfeatures))
    mi = res.samples[0]
    assert_array_less(mi_min, mi)
    assert_array_less(mi, mi_max)
    fx_str = str(fx)
    if not (__debug__ and 'ID_IN_REPR' in debug.active):
        assert_equal(fx_str,
                     "<BinaryFxFeaturewiseMeasure: %s>" % fx.fx.__name__)
def test_BinaryFxFeatureMeasure(ds):
    if not isinstance(ds.samples, np.ndarray):
        return
    # some simple function
    f = lambda x, y: np.sum((x.T * y).T, axis=0)
    fx = BinaryFxFeaturewiseMeasure(f, uni=False, numeric=True)
    fx_uni = BinaryFxFeaturewiseMeasure(f, uni=True, numeric=True)
    out = fx(ds)
    out_uni = fx_uni(ds)
    assert len(out) == 1
    assert_array_almost_equal(out.samples, out_uni)
    assert_equal(out.fa, out_uni.fa)
    ok_(str(fx).startswith("<BinaryFxFeaturewiseMeasure: lambda x, y:"))
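# Illustrative sketch (not part of the test suite): the same measure as
# above, but built from a named function, so that str()/repr() report a
# meaningful function name rather than the lambda source.  `some_ds` is a
# placeholder for any dataset with numeric targets; everything else
# mirrors the API exercised by the tests above.
def _example_named_binary_fx(some_ds):
    def weighted_sum(x, y):
        # per-feature sum of samples weighted by the (numeric) targets
        return np.sum((x.T * y).T, axis=0)
    fx = BinaryFxFeaturewiseMeasure(weighted_sum, uni=False, numeric=True)
    # returns a 1 x nfeatures result; str(fx) mentions 'weighted_sum'
    return fx(some_ds)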
def test_SplitRFE(self):
    # just a smoke test ATM
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.featsel.rfe import RFE, SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    from mvpa2.testing import ok_, assert_equal

    clf = LinearCSVMC(C=1)
    dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=30,
                                     snr=1., nonbogus_features=[1, 5])
    # flip one of the meaningful features around to see
    # if we are still getting proper selection
    dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
    # 4 partitions should be enough for testing
    partitioner = NFoldPartitioner(count=4)
    rfeclf = MappedClassifier(
        clf,
        SplitRFE(clf,
                 partitioner,
                 fselector=FractionTailSelector(0.2, mode='discard',
                                                tail='lower')))
    r0 = repr(rfeclf)

    ok_(rfeclf.mapper.nfeatures_min == 0)
    rfeclf.train(dataset)
    ok_(rfeclf.mapper.nfeatures_min > 0)
    predictions = rfeclf(dataset).samples
    # at least 1 of the nonbogus-features should be chosen
    ok_(len(set(dataset.a.nonbogus_features).intersection(
        rfeclf.mapper.slicearg)) > 0)
    # check repr to have all needed pieces
    r = repr(rfeclf)
    s = str(rfeclf)
    ok_(('partitioner=NFoldP' in r) or
        ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
    ok_('lrn=' in r)
    ok_('slicearg=' not in r)
    assert_equal(r, r0)
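# The pattern exercised above in a nutshell (a sketch under the same
# imports as the test; `n_folds` and `discard_fraction` are illustrative
# parameters, not fixtures of this suite): SplitRFE runs recursive feature
# elimination across partitions, and MappedClassifier applies the
# surviving feature slice before delegating to the wrapped classifier.
def _example_split_rfe_pipeline(clf, n_folds=4, discard_fraction=0.2):
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.featsel.rfe import SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    # discard the lowest-ranked fraction of features on each RFE step
    return MappedClassifier(
        clf,
        SplitRFE(clf,
                 NFoldPartitioner(count=n_folds),
                 fselector=FractionTailSelector(discard_fraction,
                                                mode='discard',
                                                tail='lower')))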
def test_SplitRFE(self, fmeasure):
    # just a smoke test ATM
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.featsel.rfe import RFE, SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    from mvpa2.testing import ok_, assert_equal

    clf = LinearCSVMC(C=1)
    dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=11,
                                     snr=1., nonbogus_features=[1, 5])
    # flip one of the meaningful features around to see
    # if we are still getting proper selection
    dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
    # 3 partitions should be enough for testing
    partitioner = NFoldPartitioner(count=3)
    rfeclf = MappedClassifier(
        clf,
        SplitRFE(clf,
                 partitioner,
                 fselector=FractionTailSelector(0.5, mode='discard',
                                                tail='lower'),
                 fmeasure=fmeasure,
                 # need to update only when using clf's sens anal
                 update_sensitivity=fmeasure is None))
    r0 = repr(rfeclf)

    ok_(rfeclf.mapper.nfeatures_min == 0)
    rfeclf.train(dataset)
    ok_(rfeclf.mapper.nfeatures_min > 0)
    predictions = rfeclf(dataset).samples
    # at least 1 of the nonbogus-features should be chosen
    ok_(len(set(dataset.a.nonbogus_features).intersection(
        rfeclf.mapper.slicearg)) > 0)
    # check repr to have all needed pieces
    r = repr(rfeclf)
    s = str(rfeclf)
    ok_(('partitioner=NFoldP' in r) or
        ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
    ok_('lrn=' in r)
    ok_('slicearg=' not in r)
    assert_equal(r, r0)

    if externals.exists('joblib'):
        rfeclf.mapper.nproc = -1
        # compare results against the ones computed in parallel
        _slicearg = rfeclf.mapper.slicearg
        _predictions = predictions
        rfeclf.train(dataset)
        predictions = rfeclf(dataset).samples
        assert_array_equal(predictions, _predictions)
        assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

    # Test that we can collect stats from conditional attributes (ca)
    # within cross-validation
    sensitivities = []
    nested_errors = []
    nested_nfeatures = []

    def store_me(data, node, result):
        sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
        sensitivities.append(sens)
        nested_errors.append(node.measure.mapper.ca.nested_errors)
        nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)

    cv = CrossValidation(rfeclf, NFoldPartitioner(count=1),
                         callback=store_me, enable_ca=['stats'])
    _ = cv(dataset)
    # just to make sure we collected them
    assert_equal(len(sensitivities), 1)
    assert_equal(len(nested_errors), 1)
    assert_equal(len(nested_nfeatures), 1)
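# The callback hookup used above, in isolation (a sketch; `learner` stands
# for any trainable measure and `dataset` for any labeled dataset -- both
# hypothetical placeholders).  CrossValidation invokes the callback once
# per fold with the fold's data, the measure node, and its result.
def _example_cv_callback(learner, dataset):
    from mvpa2.generators.partition import NFoldPartitioner
    collected = []

    def grab(data, node, result):
        # node.measure is the learner instance used on this fold
        collected.append(result)

    cv = CrossValidation(learner, NFoldPartitioner(count=1), callback=grab)
    cv(dataset)
    return collected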
def test_cluster_count():
    skip_if_no_external('scipy', min_version='0.10')
    # we get a ZERO cluster count of one if there are no clusters at all
    # this is needed to keep track of the number of bootstrap samples that
    # yield no cluster at all (high threshold) in order to compute p-values
    # when there is no actual cluster size histogram
    assert_equal(gct._get_map_cluster_sizes([0, 0, 0, 0]), [0])
    # if there is at least one cluster: no ZERO count
    assert_equal(gct._get_map_cluster_sizes([0, 0, 1, 0]), [1])
    for i in range(2):  # rerun tests for bool type of test_M
        test_M = np.array([[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
                           [0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                           [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
                           [0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]])
        expected_result = [5, 4, 3, 3, 2, 0, 2]  # 5 clusters of size 1,
                                                 # 4 clusters of size 2 ...
        test_ds = Dataset([test_M])
        if i == 1:
            test_M = test_M.astype(bool)
        test_M_3d = np.hstack(
            (test_M.flatten(), test_M.flatten())).reshape(2, 9, 16)
        test_ds_3d = Dataset([test_M_3d])
        # same clusters, but all sizes doubled, since two identical slices
        # are stacked and connect across the new axis
        expected_result_3d = np.array(
            [0, 5, 0, 4, 0, 3, 0, 3, 0, 2, 0, 0, 0, 2])
        size = 10000  # how many times bigger than test_M_3d
        test_M_3d_big = np.hstack((test_M_3d.flatten(), np.zeros(144)))
        test_M_3d_big = np.hstack(
            [test_M_3d_big for i in range(size)]).reshape(3 * size, 9, 16)
        test_ds_3d_big = Dataset([test_M_3d_big])
        expected_result_3d_big = expected_result_3d * size

        # check basic cluster size determination for plain arrays and
        # datasets with a single sample
        for t, e in ((test_M, expected_result),
                     (test_ds, expected_result),
                     (test_M_3d, expected_result_3d),
                     (test_ds_3d, expected_result_3d),
                     (test_M_3d_big, expected_result_3d_big),
                     (test_ds_3d_big, expected_result_3d_big)):
            assert_array_equal(
                np.bincount(gct._get_map_cluster_sizes(t))[1:], e)

        # old
        M = np.vstack([test_M_3d.flatten()] * 10)
        # new
        ds = dataset_wizard([test_M_3d] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack([gct._get_map_cluster_sizes(test_M_3d)] * 10))
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))

        # test the same with some arbitrary per-feature threshold
        thr = 4
        labels, num = measurements.label(test_M_3d)
        area = measurements.sum(test_M_3d, labels,
                                index=np.arange(labels.max() + 1))
        cluster_sizes_map = area[labels]  # .astype(int)
        thresholded_cluster_sizes_map = cluster_sizes_map > thr
        # old
        M = np.vstack([cluster_sizes_map.flatten()] * 10)
        # new
        ds = dataset_wizard([cluster_sizes_map] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(np.hstack(
            [gct._get_map_cluster_sizes(thresholded_cluster_sizes_map)]
            * 10))
        th_map = np.ones(cluster_sizes_map.flatten().shape) * thr
        # threshold dataset by hand
        ds.samples = ds.samples > th_map
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))
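# For reference, a sketch of what _get_map_cluster_sizes boils down to for
# a boolean map, using only the scipy.ndimage machinery already imported
# above as `measurements` (the actual helper may differ in details):
def _example_cluster_sizes(binary_map):
    labels, num = measurements.label(binary_map)
    if num == 0:
        # the ZERO-count convention verified above: no clusters at all
        return [0]
    # per-cluster voxel counts for labels 1..num
    return measurements.sum(binary_map, labels,
                            index=np.arange(1, num + 1)).astype(int).tolist()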
def test_group_clusterthreshold_simple(n_proc):
    if n_proc > 1:
        skip_if_no_external('joblib')
    feature_thresh_prob = 0.005
    nsubj = 10
    # make a nice 1D blob and a speck
    blob = np.array([0, 0, .5, 3, 5, 3, 3, 0, 2, 0])
    blob = Dataset([blob])
    # and some nice random permutations
    nperms = 100 * nsubj
    perm_samples = np.random.randn(nperms, blob.nfeatures)
    perms = Dataset(perm_samples,
                    sa=dict(chunks=np.repeat(range(nsubj),
                                             len(perm_samples) // nsubj)),
                    fa=dict(fid=range(perm_samples.shape[1])))
    # the algorithm instance
    # scale number of bootstraps to match desired probability
    # plus a safety margin to minimize bad luck in sampling
    clthr = gct.GroupClusterThreshold(
        n_bootstrap=int(3. / feature_thresh_prob),
        feature_thresh_prob=feature_thresh_prob,
        fwe_rate=0.01, n_blocks=3, n_proc=n_proc)
    clthr.train(perms)
    # get the FE thresholds
    thr = clthr._thrmap
    # perms are normally distributed, hence the CDF should be close;
    # the std of the distribution will scale with 1/sqrt(nsubj)
    assert_true(np.abs(
        feature_thresh_prob
        - (1 - norm.cdf(thr.mean(), loc=0,
                        scale=1. / np.sqrt(nsubj)))) < 0.01)

    clstr_sizes = clthr._null_cluster_sizes
    # getting anything but a lonely one-feature cluster is very unlikely
    assert_true(max([c[0] for c in clstr_sizes.keys()]) <= 1)
    # threshold orig map
    res = clthr(blob)
    #
    # check output
    #
    # samples unchanged
    assert_array_equal(blob.samples, res.samples)
    # need to find the big cluster
    assert_true(len(res.a.clusterstats) > 0)
    assert_equal(len(res.a.clusterstats),
                 res.fa.clusters_featurewise_thresh.max())
    # probs need to decrease with size; clusters are sorted by size
    # (decreasing)
    assert_true(res.a.clusterstats['prob_raw'][0]
                <= res.a.clusterstats['prob_raw'][1])
    # corrected probs for every uncorrected cluster
    assert_true('prob_corrected' in res.a.clusterstats.dtype.names)
    # fwe correction always increases the p-values (if anything)
    assert_true(np.all(res.a.clusterstats['prob_raw']
                       <= res.a.clusterstats['prob_corrected']))
    # check expected cluster sizes, ordered large -> small
    assert_array_equal(res.a.clusterstats['size'], [4, 1])
    # check max position
    assert_array_equal(res.a.clusterlocations['max'], [[4], [8]])
    # center of mass: eyeballed
    assert_array_almost_equal(res.a.clusterlocations['center_of_mass'],
                              [[4.429], [8]], 3)
    # other simple stats
    # [0, 0, .5, 3, 5, 3, 3, 0, 2, 0]
    assert_array_equal(res.a.clusterstats['mean'], [3.5, 2])
    assert_array_equal(res.a.clusterstats['min'], [3, 2])
    assert_array_equal(res.a.clusterstats['max'], [5, 2])
    assert_array_equal(res.a.clusterstats['median'], [3, 2])
    assert_array_almost_equal(res.a.clusterstats['std'], [0.866, 0], 3)
    # fwe thresholding only ever removes clusters
    assert_true(np.all(np.abs(res.fa.clusters_featurewise_thresh
                              - res.fa.clusters_fwe_thresh) >= 0))
    # FWE should kill the small one
    assert_greater(res.fa.clusters_featurewise_thresh.max(),
                   res.fa.clusters_fwe_thresh.max())

    # check that the cluster results do not depend on the actual location
    # of the clusters
    shifted_blob = Dataset([[.5, 3, 5, 3, 3, 0, 0, 0, 2, 0]])
    shifted_res = clthr(shifted_blob)
    assert_array_equal(res.a.clusterstats, shifted_res.a.clusterstats)

    # check that it averages multi-sample datasets
    # also checks that scenarios work where all features are part of one
    # big cluster
    multisamp = Dataset(np.arange(30).reshape(3, 10) + 100)
    avgres = clthr(multisamp)
    assert_equal(len(avgres), 1)
    assert_array_equal(avgres.samples[0], np.mean(multisamp.samples, axis=0))

    # retrain, this time with data from only a single subject
    perms = Dataset(perm_samples,
                    sa=dict(chunks=np.repeat(1, len(perm_samples))),
                    fa=dict(fid=range(perms.shape[1])))
    clthr.train(perms)
    # same blob -- 1st this should work without issues
    sglres = clthr(blob)
    # NULL estimation does no averaging
    # -> more noise -> fewer clusters -> higher p
    assert_greater_equal(len(res.a.clusterstats), len(sglres.a.clusterstats))
    assert_greater_equal(np.round(sglres.a.clusterstats[0]['prob_raw'], 4),
                         np.round(res.a.clusterstats[0]['prob_raw'], 4))

    # now again for real scientists: no FWE correction
    superclthr = gct.GroupClusterThreshold(
        n_bootstrap=int(3. / feature_thresh_prob),
        feature_thresh_prob=feature_thresh_prob,
        multicomp_correction=None, n_blocks=3, n_proc=n_proc)
    superclthr.train(perms)
    superres = superclthr(blob)
    assert_true('prob_corrected' in res.a.clusterstats.dtype.names)
    assert_true('clusters_fwe_thresh' in res.fa)
    assert_false('prob_corrected' in superres.a.clusterstats.dtype.names)
    assert_false('clusters_fwe_thresh' in superres.fa)

    # check validity test
    assert_raises(ValueError, gct.GroupClusterThreshold,
                  n_bootstrap=10, feature_thresh_prob=.09, n_proc=n_proc)

    # check mapped datasets
    blob = np.array([[0, 0, .5, 3, 5, 3, 3, 0, 2, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    blob = dataset_wizard([blob])
    # and some nice random permutations
    nperms = 100 * nsubj
    perm_samples = np.random.randn(*((nperms,) + blob.shape))
    perms = dataset_wizard(perm_samples,
                           chunks=np.repeat(range(nsubj),
                                            len(perm_samples) // nsubj))
    clthr.train(perms)
    twodres = clthr(blob)
    # finds two clusters of the same size
    assert_array_equal(twodres.a.clusterstats['size'],
                       res.a.clusterstats['size'])
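# The end-to-end workflow the test above exercises, as a minimal sketch.
# Parameter values are illustrative (n_bootstrap follows the same
# 3/feature_thresh_prob scaling used in the test); `group_perms` must
# carry a `chunks` sample attribute identifying subjects, and `orig_map`
# is the statistic map to threshold -- both hypothetical placeholders.
def _example_group_cluster_threshold(group_perms, orig_map):
    clthr = gct.GroupClusterThreshold(n_bootstrap=600,
                                      feature_thresh_prob=0.005,
                                      fwe_rate=0.05, n_blocks=3)
    clthr.train(group_perms)   # bootstrap the NULL cluster-size histogram
    res = clthr(orig_map)      # threshold and annotate the original map
    # per-cluster stats (size, prob_raw, prob_corrected, ...) live in
    # res.a.clusterstats; featurewise cluster labels in res.fa
    return res.a.clusterstats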