def test_simple_cluster_level_thresholding():
    nf = 13
    nperms = 100
    pthr_feature = 0.5  # just for testing
    pthr_cluster = 0.5
    rand_acc = np.random.normal(size=(nperms, nf))
    acc = np.random.normal(size=(1, nf))

    # Step 1 is to "fit" "Nonparametrics" for each of the features
    from mvpa2.clfs.stats import Nonparametric
    dists = [Nonparametric(samples) for samples in rand_acc.T]
    # we should be able to assess the "p" value of each random sample for
    # each feature
    rand_acc_p = np.array(
        [dist.rcdf(v) for dist, v in zip(dists, rand_acc.T)]
    ).T
    rand_acc_p_slow = np.array([
        [dist.rcdf(v) for dist, v in zip(dists, sample)]
        for sample in rand_acc])

    assert_array_equal(rand_acc_p_slow, rand_acc_p)
    assert_equal(rand_acc_p.shape, rand_acc.shape)
    assert np.all(rand_acc_p <= 1)
    assert np.all(rand_acc_p > 0)

    # Step 2: apply the same to our acc
    acc_p = np.array([dist.rcdf(v) for dist, v in zip(dists, acc[0])])[None, :]
    assert np.all(acc_p <= 1)
    assert np.all(acc_p > 0)

    skip_if_no_external('scipy')
    # Now we need to do our fancy cluster-level madness
    from mvpa2.algorithms.group_clusterthr import \
        get_cluster_sizes, _transform_to_pvals, get_cluster_pvals, \
        get_thresholding_map, repeat_cluster_vals

    rand_acc_p_thr = rand_acc_p < pthr_feature
    acc_p_thr = acc_p < pthr_feature

    rand_cluster_sizes = get_cluster_sizes(rand_acc_p_thr)
    acc_cluster_sizes = get_cluster_sizes(acc_p_thr)

    # This is how we can compute it within the present implementation.
    # It will be a bit different (since it doesn't account for the target
    # value, if I got it right), and would work only for accuracies
    thr_map = get_thresholding_map(rand_acc, pthr_feature)
    rand_cluster_sizes_ = get_cluster_sizes(rand_acc > thr_map)
    acc_cluster_sizes_ = get_cluster_sizes(acc > thr_map)

    assert_equal(rand_cluster_sizes, rand_cluster_sizes_)
    assert_equal(acc_cluster_sizes, acc_cluster_sizes_)

    #print rand_cluster_sizes
    #print acc_cluster_sizes

    # That is how it is done in group_clusterthr atm:
    # store the cluster size histogram for later p-value evaluation;
    # use a sparse matrix for easy consumption (max dim is the number of
    # features, i.e. the biggest possible cluster)
    from scipy.sparse import dok_matrix
    scl = dok_matrix((1, nf + 1), dtype=int)
    for s in rand_cluster_sizes:
        scl[0, s] = rand_cluster_sizes[s]

    test_count_sizes = repeat_cluster_vals(acc_cluster_sizes)
    test_pvals = _transform_to_pvals(test_count_sizes, scl.astype('float'))
    # needs conversion to array for comparisons
    test_pvals = np.asanyarray(test_pvals)
    # the critical cluster-level threshold (without FW correction between
    # clusters) would be
    clusters_passed_threshold = test_count_sizes[test_pvals <= pthr_cluster]

    if len(clusters_passed_threshold):
        thr_cluster_size = min(clusters_passed_threshold)
        #print("Min cluster size which passed threshold: %d" % thr_cluster_size)
    else:
        #print("No clusters passed threshold")
        pass
    #print test_count_sizes, test_pvals

    acc_cluster_ps = get_cluster_pvals(acc_cluster_sizes, rand_cluster_sizes)

    for test_pval, test_count_size in zip(test_pvals, test_count_sizes):
        assert_almost_equal(acc_cluster_ps[test_count_size], test_pval)

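
# Illustrative sketch (not part of the original test suite): the quantity
# exercised above -- a cluster-level p-value derived from permutation maps --
# boils down to a right-tail probability of an observed cluster size under the
# pooled null distribution of cluster sizes.  The helper below is a
# hypothetical plain-numpy restatement of that idea; exact conventions (e.g.
# whether the observed cluster is itself counted into the null histogram) are
# left to get_cluster_pvals()/_transform_to_pvals() and may differ slightly.
def _illustrate_cluster_pval(observed_size, null_sizes):
    """Fraction of null cluster sizes that are >= ``observed_size``."""
    # relies on the module-level ``np`` (numpy) import the tests already use
    return np.mean(np.asanyarray(null_sizes) >= observed_size)
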

def test_cluster_count():
    skip_if_no_external('scipy', min_version='0.10')
    # we get a ZERO cluster count of one if there are no clusters at all;
    # this is needed to keep track of the number of bootstrap samples that
    # yield no cluster at all (high threshold) in order to compute p-values
    # when there is no actual cluster size histogram
    assert_equal(gct._get_map_cluster_sizes([0, 0, 0, 0]), [0])
    # if there is at least one cluster: no ZERO count
    assert_equal(gct._get_map_cluster_sizes([0, 0, 1, 0]), [1])
    for i in range(2):  # rerun tests for bool type of test_M
        test_M = np.array([[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
                           [0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                           [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
                           [0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]])
        expected_result = [5, 4, 3, 3, 2, 0, 2]  # 5 clusters of size 1,
                                                 # 4 clusters of size 2 ...
        test_ds = Dataset([test_M])
        if i == 1:
            test_M = test_M.astype(bool)
        test_M_3d = np.hstack(
            (test_M.flatten(), test_M.flatten())).reshape(2, 9, 16)
        test_ds_3d = Dataset([test_M_3d])
        # expected_result^2
        expected_result_3d = np.array(
            [0, 5, 0, 4, 0, 3, 0, 3, 0, 2, 0, 0, 0, 2])

        size = 10000  # how many times bigger than test_M_3d
        test_M_3d_big = np.hstack((test_M_3d.flatten(), np.zeros(144)))
        test_M_3d_big = np.hstack(
            (test_M_3d_big for i in range(size))).reshape(3 * size, 9, 16)
        test_ds_3d_big = Dataset([test_M_3d_big])
        expected_result_3d_big = expected_result_3d * size

        # check basic cluster size determination for plain arrays and datasets
        # with a single sample
        for t, e in ((test_M, expected_result),
                     (test_ds, expected_result),
                     (test_M_3d, expected_result_3d),
                     (test_ds_3d, expected_result_3d),
                     (test_M_3d_big, expected_result_3d_big),
                     (test_ds_3d_big, expected_result_3d_big)):
            assert_array_equal(
                np.bincount(gct._get_map_cluster_sizes(t))[1:], e)

        # old
        M = np.vstack([test_M_3d.flatten()] * 10)
        # new
        ds = dataset_wizard([test_M_3d] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack([gct._get_map_cluster_sizes(test_M_3d)] * 10))
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))

        # test the same with some arbitrary per-feature threshold
        thr = 4
        labels, num = measurements.label(test_M_3d)
        area = measurements.sum(test_M_3d, labels,
                                index=np.arange(labels.max() + 1))
        cluster_sizes_map = area[labels]  # .astype(int)
        thresholded_cluster_sizes_map = cluster_sizes_map > thr
        # old
        M = np.vstack([cluster_sizes_map.flatten()] * 10)
        # new
        ds = dataset_wizard([cluster_sizes_map] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack(
                [gct._get_map_cluster_sizes(thresholded_cluster_sizes_map)]
                * 10))
        th_map = np.ones(cluster_sizes_map.flatten().shape) * thr
        # threshold dataset by hand
        ds.samples = ds.samples > th_map
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))

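
# Illustrative sketch (not part of the original test suite): the cluster sizes
# asserted above can be reproduced directly with scipy.ndimage by labelling
# the connected components of a thresholded (boolean) map and counting the
# elements per label.  The helper name is hypothetical and only mirrors what
# gct._get_map_cluster_sizes() is checked against, including the [0]
# placeholder for maps without any cluster.
def _illustrate_map_cluster_sizes(thresholded_map):
    from scipy.ndimage import measurements
    labels, num = measurements.label(np.asanyarray(thresholded_map))
    if num == 0:
        # keep a zero entry so "no cluster" bootstrap samples remain countable
        return [0]
    return [int(np.sum(labels == i)) for i in range(1, num + 1)]
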