def test_factorialpartitioner():
    """Verify FactorialPartitioner against an equivalent ChainNode+Sifter recipe.

    NOTE(review): depends on PyMVPA helpers imported elsewhere in this module
    (``normal_feature_dataset``, ``Dataset``, ``ChainNode``, ``Sifter``,
    ``NFoldPartitioner``, ``FactorialPartitioner``, ``assert_warnings``).
    """
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories, derived from the digit in each target label
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]})

    # reference implementation built from generic pieces
    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    # both approaches must yield the same set of partitionings; ordering may
    # differ, hence comparison after sorting
    assert_array_equal(np.sort(partitions_npart),
                       np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    # -- it should then degenerate to a plain NFold over subord
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [p.sa.partitions
                               for p in factpart.generate(ds_unbalanced)]

    # expected partitionings and the superord/subord values ending up in
    # partitions 1 and 2 respectively
    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(partitions_factpart, partitions_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset where the full factorial set of
    # partitionings can be enumerated by hand
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1, 2] * 2})
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_dummy)]
    assert_array_equal(partitions_factpart,
                       [[2, 2, 1, 1], [2, 1, 1, 2],
                        [1, 2, 2, 1], [1, 1, 2, 2]])
def partition(ds_=ds, **kwargs):
    """Collect the partition assignments produced by a FactorialPartitioner.

    Builds a FactorialPartitioner (NFold over 'targets', crossed with
    'chunks'), forwarding any extra keyword arguments, and returns one
    ``partitions`` sample-attribute array per generated split.

    NOTE(review): the default ``ds_=ds`` binds the enclosing scope's ``ds``
    at definition time -- this is a nested test helper.
    """
    fp = FactorialPartitioner(partitioner=NFoldPartitioner(attr='targets'),
                              attr='chunks',
                              **kwargs)
    collected = []
    for split in fp.generate(ds_):
        collected.append(split.sa.partitions)
    return collected
def test_factorialpartitioner():
    """Verify FactorialPartitioner: equivalence to ChainNode+Sifter, plus
    ``count``/``selection_strategy`` handling.

    NOTE(review): depends on PyMVPA helpers imported elsewhere in this module
    (``normal_feature_dataset``, ``Dataset``, ``ChainNode``, ``Sifter``,
    ``NFoldPartitioner``, ``FactorialPartitioner``, ``assert_warnings``,
    ``assert_equal``).  Uses ``xrange`` -- Python 2 module.
    """
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories, derived from the digit in each target label
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]})

    # reference implementation built from generic pieces
    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    def partition(partitioner, ds_=ds):
        # collect one 'partitions' array per generated split
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'), attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)

    # both approaches must yield the same set of partitionings; ordering may
    # differ, hence comparison after sorting
    assert_array_equal(np.sort(p_npart),
                       np.sort(p_fpart))

    # count + 'first' strategy must return the leading partitionings
    fpart2 = FactorialPartitioner(count=2,
                                  selection_strategy='first',
                                  **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant
    # (default selection strategy picks every len/count-th partitioning)
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but if with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random',
                                   count=2,
                                   **factkw)
    # Let's generate a number of random selections:
    rand2_partitions = [partition(fpartr2) for i in xrange(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # majority of them must be different
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    # -- it should then degenerate to a plain NFold over subord
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    # expected partitionings and the superord/subord values ending up in
    # partitions 1 and 2 respectively
    p_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset where the full factorial set of
    # partitionings can be enumerated by hand
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1, 2] * 2})
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(p_fpart,
                       [[2, 2, 1, 1], [2, 1, 1, 2],
                        [1, 2, 2, 1], [1, 1, 2, 2]])
def main(infile, outdir, radius, mask, zscoring, classification,
         derivs=True, debugging=False, permute=None, decoder='svm',
         errors=False):
    """Run a searchlight cross-validation analysis and save the result map.

    Parameters
    ----------
    infile : str
        Path to an HDF5 dataset loadable with ``h5load``.
    outdir : str
        Base output directory; results go to ``outdir/<fnout>/<classification>``.
    radius : int
        Searchlight sphere radius (in voxels, per ``Sphere(radius)`` usage).
    mask : str or None
        Optional mask image loadable with ``fmri_dataset``; restricts features.
    zscoring : bool
        If true, z-score the dataset within each chunk.
    classification : str
        One of 'familiar_vs_unfamiliar', 'familiar_vs_unfamiliar-id',
        'familiar_vs_unfamiliar-id-chunks', 'identity-all',
        'identity-familiar', 'identity-unfamiliar'.
    derivs : bool
        Use derivative features (dataset must carry a 'derivs' fa).
    debugging : bool
        Enable PyMVPA searchlight debug output ("SLC").
    permute : int or None
        If set, permute sample attributes; also used as seed / permutation
        index and appended to the output filename.
    decoder : str
        'svm' (LinearCSVMC) or 'gnb' (GNB).
    errors : bool
        If false, convert the CV error map to an accuracy map (1 - err).
    """
    # gime more
    if debugging:
        debug.active += ["SLC"]
    print('Loading {0}'.format(infile))
    ds = h5load(infile)
    # check we have derivatives too
    if derivs and 'derivs' not in ds.fa:
        raise ValueError(
            'Dataset {0} does not contain derivatives'.format(infile))
    # let's try familiar vs unfamiliar
    if classification in ['familiar_vs_unfamiliar',
                          'familiar_vs_unfamiliar-id',
                          'familiar_vs_unfamiliar-id-chunks',
                          'identity-all',
                          'identity-familiar',
                          'identity-unfamiliar']:
        # drop 'self' condition for all supported classifications
        ds = ds[ds.sa.condition != 'self']
        # permute if needed
        if permute:
            if classification != 'familiar_vs_unfamiliar-id':
                ds = shuffle_sa(ds, rand_seed=permute)
            else:
                # for familiar_vs_unfamiliar-id we need a fancier perm:
                # remap condition labels according to the permute-th unique
                # combination (permute is 1-based here)
                perm = get_unique_combs(8)[permute - 1]
                perm = flatten(perm)
                unique_conds = np.unique(ds.sa.condition)
                mapperm = dict()
                for i, p in enumerate(perm):
                    mapperm[unique_conds[i]] = unique_conds[p]
                for i in range(ds.nsamples):
                    this_cond = ds.sa.condition[i]
                    ds.sa.condition[i] = mapperm[this_cond]
                print("USING PERMUTATION {0}".format(mapperm))
        # derive familiarity from the condition label
        ds.sa['familiarity'] = ['familiar' if 'friend' in a else 'control'
                                for a in ds.sa.condition]
    else:
        raise NotImplementedError('Classification not implemented')

    # if we are using a dataset with derivatives but we don't want to use them
    # as features, extract only the non-derivatives features
    sfx = ''
    if 'derivs' in ds.fa and not derivs:
        ds = ds[:, ds.fa.derivs == 0]
        sfx += '_betaderivs'

    # set up clf and cv
    if decoder == 'svm':
        clf = LinearCSVMC()
    elif decoder == 'gnb':
        clf = GNB()
    else:
        raise ValueError(
            'I have no clue about this classifier {0}'.format(decoder))
    if classification == 'familiar_vs_unfamiliar':
        ds.sa['targets'] = ds.sa['familiarity']
        partitioner = NFoldPartitioner()
    elif classification == 'familiar_vs_unfamiliar-id':
        ds.sa['targets'] = ds.sa['familiarity']
        # leave-one-identity-out within familiarity
        partitioner = FactorialPartitioner(NFoldPartitioner(attr='condition'),
                                           attr='targets')
        #if permute:
        #    rng = np.random.RandomState(permute)
        #    permutator = AttributePermutator(['familiarity'],
        #                                     limit=['partitions', 'chunks'],
        #                                     rng=rng)
        #    partitioner = ChainNode([partitioner, permutator], space='partitions')
    elif classification == 'familiar_vs_unfamiliar-id-chunks':
        ds.sa['targets'] = ds.sa['familiarity']
        # to do within chunks cross-validation across identities
        partitioner = ChainNode([
            FactorialPartitioner(NFoldPartitioner(attr='condition'),
                                 attr='familiarity'),
            ExcludeTargetsCombinationsPartitioner(k=1,
                                                  targets_attr='chunks',
                                                  space='partitions')],
            space='partitions')
    elif classification == 'identity-all':
        ds.sa['targets'] = ds.sa['condition']
        partitioner = NFoldPartitioner()
    elif classification == 'identity-familiar':
        ds.sa['targets'] = ds.sa['condition']
        # keep only the four 'friend' identities
        ds = ds.select(sadict={'condition':
                               ['friend' + str(i) for i in range(1, 5)]})
        assert(ds.nsamples == 44)
        partitioner = NFoldPartitioner()
    elif classification == 'identity-unfamiliar':
        ds.sa['targets'] = ds.sa['condition']
        # keep only the four 'control' identities
        ds = ds.select(sadict={'condition':
                               ['control' + str(i) for i in range(1, 5)]})
        assert(ds.nsamples == 44)
        partitioner = NFoldPartitioner()
    cv = CrossValidation(clf, partitioner)

    if mask:
        mask_ds = fmri_dataset(mask)
        # the mask must align with the (non-derivative) voxel grid
        if derivs:
            assert(np.all(mask_ds.fa.voxel_indices ==
                          ds.fa.voxel_indices[ds.fa.derivs == 0]))
        else:
            assert(np.all(mask_ds.fa.voxel_indices == ds.fa.voxel_indices))
        assert(len(mask_ds) == 1)
        mask_ = mask_ds.samples[0]  # extract mask as the first sample
        #assert(np.all(mask_ == mask_ds.samples.flatten()))
        if derivs:
            # need to make the mask bigger
            # (features are doubled: betas followed by their derivatives)
            mask_ = np.tile(mask_, 2)
        ds = ds[:, mask_ > 0]
        if derivs:
            assert(np.all(ds.fa.voxel_indices[ds.fa.derivs == 0] ==
                          ds.fa.voxel_indices[ds.fa.derivs == 1]))
    #ds = remove_invariant_features(ds)

    # zscore within each chunk
    if zscoring:
        zscore(ds, chunks_attr='chunks', dtype='float32')

    # copy for efficiency -- keep only the attributes the searchlight needs
    ds_ = ds.copy(deep=False,
                  sa=['targets', 'chunks', 'familiarity', 'condition'],
                  fa=['voxel_indices', 'derivs'],
                  a=['mapper'])
    print(ds_)

    if derivs:
        # pair each voxel with its derivative feature via the derivs axis
        sl = Searchlight(cv,
                         IndexQueryEngine(voxel_indices=Sphere(radius),
                                          derivs=Sphere(2)),
                         postproc=mean_sample(),
                         roi_ids=np.where(ds_.fa.derivs == 0)[0],
                         nproc=8)
    else:
        sl = sphere_searchlight(cv,
                                radius=radius,
                                space='voxel_indices',
                                #center_ids=range(0, 1000),
                                postproc=mean_sample(),
                                nproc=8)
    # run it! -- oh, PyMVPA!
    sl_map = sl(ds_)
    # copy mapper
    sl_map.a = ds.a
    # remove unnecessary field to make file smaller
    del sl_map.a['add_regs']

    if not errors:
        # convert error map to accuracy map: acc = 1 - err
        sl_map.samples *= -1
        sl_map.samples += 1
    # reduce size
    sl_map.samples = sl_map.samples.astype('float32')

    # save -- encode the analysis options into the output directory name
    fnout = 'sl'
    if mask:
        fnout += 'msk'
    if zscoring:
        fnout += 'z'
    fnout += str(radius) + 'vx'
    if derivs:
        fnout += '_featderivs'
        # NOTE(review): sfx is cleared here; with derivs=True it is already
        # '' (the '_betaderivs' branch requires derivs=False), so this is a
        # redundant safeguard
        sfx = ''
    fnout += sfx
    fnout += '_' + decoder
    sl_out = pjoin(outdir, fnout, classification)
    try:
        os.makedirs(sl_out)
    except OSError:
        # directory already exists
        pass
    print('Saving in {0}'.format(sl_out))
    fnslmap = 'sl_map'
    if permute:
        fnslmap += '_perm{0:03d}'.format(permute)
    fnslmap += '.hdf5'
    h5save(pjoin(sl_out, fnslmap), sl_map)
def test_factorialpartitioner():
    """Check that FactorialPartitioner reproduces the ChainNode+Sifter recipe.

    Covers: equivalence on a balanced factorial design, degeneration to a
    plain NFold split with a single superordinate class, a warning on
    unbalanced designs, and an exhaustively enumerated tiny dataset.
    """
    # Simulate 6 subordinate categories nested in 3 superordinate ones,
    # with no actual superordinate effect (subordinates independent).
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories derived from the digit in each target label
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,) for i in ds.targets]
    # wipe targets so nothing below can accidentally rely on them
    ds.targets[:] = 0

    # variant with a single superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # variant where one superordinate category has a single subordinate
    ds_unbalanced = Dataset(range(4),
                            sa={'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]})

    # reference implementation: NFold over subord, then keep only the
    # splits that draw one subordinate from each superordinate category
    npart = ChainNode([
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    # implementation under test
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    def _collect(node, data):
        # gather one 'partitions' array per generated split
        acc = []
        for split in node.generate(data):
            acc.append(split.sa.partitions)
        return acc

    # same set of partitionings from both (order may differ, hence sort)
    assert_array_equal(np.sort(_collect(npart, ds)),
                       np.sort(_collect(factpart, ds)))

    # a single superordinate class must reduce to a plain NFold over subord
    nfold = NFoldPartitioner(attr='subord')
    assert_array_equal(np.sort(_collect(nfold, ds_1super)),
                       np.sort(_collect(factpart, ds_1super)))

    # unbalanced subordinate counts must trigger the balance warning
    warning_msg = ('One or more superordinate attributes do not have the same '
                   'number of subordinate attributes. This could yield to '
                   'unbalanced partitions.')
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        got_parts = _collect(factpart, ds_unbalanced)

    # expected partitionings, plus the superord/subord values that end up
    # in partitions 1 and 2 respectively
    want_parts = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    want_super = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    want_sub = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for got, want, sup, sub in zip(got_parts, want_parts,
                                   want_super, want_sub):
        assert_array_equal(got, want)
        assert_array_equal((ds_unbalanced[got == 1].sa.superord.tolist(),
                            ds_unbalanced[got == 2].sa.superord.tolist()),
                           sup)
        assert_array_equal((ds_unbalanced[got == 1].sa.subord.tolist(),
                            ds_unbalanced[got == 2].sa.subord.tolist()),
                           sub)

    # exhaustive check on a tiny fully-crossed dataset
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1, 2] * 2})
    assert_array_equal(_collect(factpart, ds_dummy),
                       [[2, 2, 1, 1], [2, 1, 1, 2],
                        [1, 2, 2, 1], [1, 1, 2, 2]])