def test_log_exclusions():
    ds = give_data()
    ds.sa['time_coords'] = np.arange(len(ds))
    # only mark the selection in an attribute
    bal = Balancer()
    balanced = bal(ds)
    tmpfile = tempfile.mktemp()
    logex = LogExclusions(tmpfile, append=False)
    logged = logex(balanced)
    subds = balanced[~balanced.sa['balanced_set'].value]
    assert_true(logged is balanced)
    with open(tmpfile, 'r') as fobj:
        assert_true(fobj.readline().startswith('# New entry'))
    excluded = np.genfromtxt(tmpfile, dtype='u1', delimiter=',')
    assert_array_equal(excluded[:, 0], subds.sa.chunks)
    assert_array_equal(excluded[:, 1], subds.sa.targets)
    assert_array_equal(excluded[:, 2], subds.sa.time_coords)
    os.unlink(tmpfile)
def test_permute_chunks():
    def is_sorted(x):
        return np.array_equal(np.sort(x), x)

    ds = give_data()
    # change target labels; there is no permuting of target labels within
    # chunks, so assure=True would be an error
    ds.sa['targets'] = list(range(len(ds.sa.targets)))
    permutation = AttributePermutator(attr='targets',
                                      chunk_attr='chunks',
                                      strategy='chunks',
                                      assure=True)
    pds = permutation(ds)

    assert_false(is_sorted(pds.sa.targets))
    assert_true(np.array_equal(pds.samples, ds.samples))
    for chunk_id in np.unique(pds.sa.chunks):
        chunk_ds = pds[pds.sa.chunks == chunk_id]
        assert_true(is_sorted(chunk_ds.sa.targets))

    permutation = AttributePermutator(attr='targets', strategy='chunks')
    assert_raises(ValueError, permutation, ds)
def test_eep_bin():
    eb = EEPBin(os.path.join(pymvpa_dataroot, 'eep.bin'))

    assert_equal(eb.nchannels, 32)
    assert_equal(eb.nsamples, 2)
    assert_equal(eb.ntimepoints, 4)
    assert_true(eb.t0 - eb.dt < 0.00000001)
    assert_equal(len(eb.channels), 32)
    assert_equal(eb.data.shape, (2, 32, 4))
def test_cosmo_repr_and_str():
    # simple smoke test for __repr__ and __str__
    creators = (_create_small_mat_nbrhood_dict,
                _create_small_mat_dataset_dict)
    for creator in creators:
        obj = cosmo.from_any(creator())
        for fmt in 'rs':
            obj_str = ("%%%s" % fmt) % obj
            assert_true(obj.__class__.__name__ in obj_str)
def test_cosmo_do_not_store_unsupported_datatype():
    ds = Dataset(np.zeros((0, 0)))

    class ArbitraryClass(object):
        pass

    ds.a['unused'] = ArbitraryClass()
    c = cosmo.map2cosmo(ds)
    assert_false('a' in c.keys())

    ds.a['foo'] = np.zeros((1,))
    c = cosmo.map2cosmo(ds)
    assert_true('a' in c.keys())
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets', attr_values=[0, 1, 1, 2, 3, 3, 3], count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(splits), 80)
def test_glmnet_r():
    # not the perfect dataset with which to test, but
    # it will do for now.
    #data = datasets['dumb2']  # for some reason the R code fails with the dumb data
    data = datasets['chirp_linear']

    clf = GLMNET_R()

    clf.train(data)

    # prediction has to be almost perfect
    # test with a correlation
    pre = clf.predict(data.samples)
    corerr = corr_error(pre, data.targets)
    if cfg.getboolean('tests', 'labile', default='yes'):
        assert_true(corerr < .2)
def test_collections():
    sa = SampleAttributesCollection()
    assert_equal(len(sa), 0)

    assert_raises(ValueError, sa.__setitem__, 'test', 0)
    l = range(5)
    sa['test'] = l  # auto-wrapped
    assert_true(isinstance(sa['test'], ArrayCollectable))
    assert_equal(len(sa), 1)

    # names which are already present in dict interface
    assert_raises(ValueError, sa.__setitem__, 'values', range(5))

    sa_c = copy.deepcopy(sa)
    assert_equal(len(sa), len(sa_c))
    assert_array_equal(sa.test, sa_c.test)
def test_identity_mapper(s):
    idm = IdentityMapper()
    # doesn't matter what you throw at it
    assert_true(idm.forward(s) is s)
    assert_true(idm.forward1(s) is s)
    assert_true(idm.reverse(s) is s)
    assert_true(idm.reverse1(s) is s)
    # even like this it should work, but type conversion
    # can happen
    assert_array_equal(_verified_reverse1(idm, s), s)
    assert_array_equal(idm.reverse1(s), s)
def test_array_collectable():
    c = ArrayCollectable()

    # empty by default
    assert_equal(c.name, None)
    assert_equal(c.value, None)

    # late assignment
    c.name = 'somename'
    assert_raises(ValueError, c._set, 12345)
    assert_equal(c.value, None)
    c.value = np.arange(5)
    assert_equal(c.name, 'somename')
    assert_array_equal(c.value, np.arange(5))

    # immediate content
    data = np.random.random(size=(3, 10))
    c = ArrayCollectable(data.copy(), 'myname', "This is a test", length=3)
    assert_equal(c.name, 'myname')
    assert_array_equal(c.value, data)
    assert_equal(c.__doc__, "This is a test")
    assert_equal(str(c), 'myname')

    # repr
    from numpy import array
    e = eval(repr(c))
    assert_equal(e.name, 'myname')
    assert_array_almost_equal(e.value, data)
    assert_equal(e.__doc__, "This is a test")

    # cannot assign array of wrong length
    assert_raises(ValueError, c._set, np.arange(5))
    assert_equal(len(c), 3)

    # shallow copy DOES create a view of value array
    c.value = np.arange(3)
    d = copy.copy(c)
    assert_true(d.value.base is c.value)

    # names starting with _ are not allowed
    assert_raises(ValueError, c._set_name, "_underscore")
def test_transpose():
    from mvpa2.mappers.shape import TransposeMapper
    ds = Dataset(np.arange(24).reshape(2, 3, 4),
                 sa={"testsa": np.arange(2)},
                 fa={"testfa": np.arange(3)})
    tp = TransposeMapper()
    tds = tp(ds)
    assert_equal(tds.shape, (3, 2, 4))
    assert_true("testfa" in tds.sa)
    assert_true("testsa" in tds.fa)
    assert_false(tds.fa is tds.sa)
    # and back
    ttds = tp(tds)
    assert_array_equal(ttds.samples, ds.samples)
    assert_equal(ttds.sa, ds.sa)
    assert_equal(ttds.fa, ds.fa)
    # or this way
    rds = tp.reverse(tds)
    assert_array_equal(rds.samples, ds.samples)
    assert_equal(rds.sa, ds.sa)
    assert_equal(rds.fa, ds.fa)
    assert_array_equal(rds.samples, ttds.samples)
    assert_equal(rds.sa, ttds.sa)
    assert_equal(rds.fa, ttds.fa)
def _assert_array_collectable_less_or_equal(x, y):
    # test for the keys in x to be a subset of those in y,
    # and the values corresponding to each key k in x being equal to those in y
    assert_true(set(x.keys()).issubset(set(y.keys())))
    for k in x.keys():
        assert_array_equal(x[k].value, y[k].value)
def test_attrpermute():
    # Was about to use borrowkwargs but it didn't work out. Test doesn't hurt.
    doc = AttributePermutator.__init__.__doc__
    assert_in('limit : ', doc)
    assert_not_in('collection : ', doc)

    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2

    def assert_all_different_permutations(pds):
        assert_equal(len(pds), nruns)
        for i, p in enumerate(pds):
            assert_false(np.all(p.sa.ids == ds.sa.ids))
            for p_ in pds[i + 1:]:
                assert_false(np.all(p.sa.ids == p_.sa.ids))

    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_all_different_permutations(pds)

    # if we provide seeding, and generate, it should also return different datasets
    permutation = AttributePermutator(['targets', 'ids'], count=nruns, rng=1)
    pds1 = list(permutation.generate(ds))
    assert_all_different_permutations(pds1)

    # but if we regenerate -- all should be the same as before
    pds2 = list(permutation.generate(ds))
    assert_equal(len(pds1), len(pds2))
    for p1, p2 in zip(pds1, pds2):
        assert_datasets_equal(p1, p2)

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(zip(ds.targets), zip(pds.targets))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings
        assert_true(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(zip(ds.targets, ds.sa.odds),
                     zip(pds.targets, pds.sa.odds))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))
def test_flatten():
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                             [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                             [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                             [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # test only flattening the first two dimensions
    fm_max = FlattenMapper(maxdims=2)
    fm_max.train(data)
    assert_equal(fm_max(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that only has a FlattenMapper as the one element
    for fm in [FlattenMapper(space='voxel'),
               ChainMapper([FlattenMapper(space='voxel'),
                            StaticFeatureSelection(slice(None))])]:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse1(target[0]),
                           _verified_reverse1(fm, target[0]))
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)

        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        dsflat = fm.forward(ds)
        ok_(isinstance(dsflat, Dataset))
        ok_(isinstance(dsflat.samples, myarray))
        assert_array_equal(dsflat.samples, target)
        assert_array_equal(dsflat.fa.awesome, np.arange(np.prod(samples_shape)))
        assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))
        # test index creation
        assert_array_equal(index_target, dsflat.fa.voxel)

        # and back
        revds = fm.reverse(dsflat)
        ok_(isinstance(revds, Dataset))
        ok_(isinstance(revds.samples, myarray))
        assert_array_equal(revds.samples, data)
        assert_array_equal(revds.fa.awesome, fattr)
        assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in revds.fa)
def test_simpleboxcar():
    data = np.atleast_2d(np.arange(10)).T
    sp = np.arange(10)

    # check that the stupid thing doesn't work
    assert_raises(ValueError, BoxcarMapper, sp, 0)

    # now do an identity transformation
    bcm = BoxcarMapper(sp, 1)
    trans = bcm.forward(data)
    # ,0 is a feature below, so we get explicit 2D out of 1D
    assert_array_equal(trans[:, 0], data)

    # now check for illegal boxes
    if __debug__:
        # condition is checked only in __debug__
        assert_raises(ValueError, BoxcarMapper(sp, 2).train, data)

    # now something that should work
    nbox = 9
    boxlength = 2
    sp = np.arange(nbox)
    bcm = BoxcarMapper(sp, boxlength)
    trans = bcm.forward(data)
    # check that it properly upcasts the dimensionality
    assert_equal(trans.shape, (nbox, boxlength) + data.shape[1:])
    # check actual values, squeezing the last dim for simplicity
    assert_array_equal(trans.squeeze(),
                       np.vstack((np.arange(9), np.arange(9) + 1)).T)

    # now test for proper data shape
    data = np.ones((10, 3, 4, 2))
    sp = [2, 4, 3, 5]
    trans = BoxcarMapper(sp, 4).forward(data)
    assert_equal(trans.shape, (4, 4, 3, 4, 2))

    # test reverse
    data = np.arange(240).reshape(10, 3, 4, 2)
    sp = [2, 4, 3, 5]
    boxlength = 2
    m = BoxcarMapper(sp, boxlength)
    m.train(data)
    mp = m.forward(data)
    assert_equal(mp.shape, (4, 2, 3, 4, 2))

    # try full reconstruct
    mr = m.reverse(mp)
    # shape has to match
    assert_equal(mr.shape, (len(sp) * boxlength,) + data.shape[1:])
    # only known samples are part of the results
    assert_true((mr >= 24).all())
    assert_true((mr < 168).all())

    # check proper reconstruction of non-conflicting sample
    assert_array_equal(mr[0].ravel(), np.arange(48, 72))
    # check proper reconstruction of samples being part of multiple
    # mapped samples
    assert_array_equal(mr[1].ravel(), np.arange(72, 96))

    # test reverse of a single sample
    singlesample = np.arange(48).reshape(2, 3, 4, 2)
    assert_array_equal(singlesample, m.reverse1(singlesample))
    # now in a dataset
    ds = Dataset([singlesample])
    assert_equal(ds.shape, (1,) + singlesample.shape)
    # after reverse mapping the 'sample axis' should vanish and the original 3d
    # shape of the samples should be restored
    assert_equal(ds.shape[1:], m.reverse(ds).shape)
    # multiple samples should just be concatenated along the samples axis
    ds = Dataset([singlesample, singlesample])
    assert_equal((np.prod(ds.shape[:2]),) + singlesample.shape[1:],
                 m.reverse(ds).shape)
    # should not work for shape mismatch, but it does work and is useful when
    # reverse mapping sample attributes
    #assert_raises(ValueError, m.reverse, singlesample[0])

    # check broadcasting of 'raw' samples into proper boxcars on forward()
    bc = m.forward1(np.arange(24).reshape(3, 4, 2))
    assert_array_equal(bc, np.array(2 * [np.arange(24).reshape(3, 4, 2)]))
def test_chainmapper():
    # the chain needs at least one mapper
    assert_raises(ValueError, ChainMapper, [])
    # a typical first mapper is to flatten
    cm = ChainMapper([FlattenMapper()])

    # few container checks
    assert_equal(len(cm), 1)
    assert_true(isinstance(cm[0], FlattenMapper))

    # now training
    # come up with data
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape)
    target = [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target)

    # if it is not trained it knows nothing
    cm.train(data)

    # a new mapper should appear when doing feature selection
    cm.append(StaticFeatureSelection(list(range(1, 16))))
    assert_equal(cm.forward1(data[0]).shape, (15,))
    assert_equal(len(cm), 2)
    # multiple slicing
    cm.append(StaticFeatureSelection([9, 14]))
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # check reproduction
    if __debug__:
        # debug mode needs special test as it enhances the repr output
        # with module info and id() appendix for objects
        import mvpa2
        cm_clone = eval(repr(cm))
        assert_equal('#'.join(repr(cm_clone).split('#')[:-1]),
                     '#'.join(repr(cm).split('#')[:-1]))
    else:
        cm_clone = eval(repr(cm))
        assert_equal(repr(cm_clone), repr(cm))

    # what happens if we retrain the whole beast on the same data as before
    cm.train(data)
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # let's map something
    mdata = cm.forward(data)
    assert_array_equal(mdata, target[:, [10, 15]])
    # and back
    rdata = cm.reverse(mdata)
    # original shape
    assert_equal(rdata.shape, data.shape)
    # content as far as it could be restored
    assert_array_equal(rdata[rdata > 0], data[rdata > 0])
    assert_equal(np.sum(rdata > 0), 8)

    # Lets construct a dataset with mapper assigned and see
    # if sub-selecting a feature adjusts trailing StaticFeatureSelection
    # appropriately
    ds_subsel = Dataset.from_wizard(data, mapper=cm)[:, 1]
    tail_sfs = ds_subsel.a.mapper[-1]
    assert_equal(repr(tail_sfs), 'StaticFeatureSelection(slicearg=array([14]))')
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)

    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))

    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun again, it would be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != bal(ds).sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those multiple
    # balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)

    seq1, seq2, seq3 = [], [], []

    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))

    # Produces expected sequences
    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)

    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
                 [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])

    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)

    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))

    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(),
                 [4] * 2)
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # check function appropriate for the given numpy version
    _a = np.arange(5)
    __a = _a[:4][:3]
    if __a.base is _a:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif __a.base.base is _a:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))
    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # we get no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(s[1].samples.base.base is self.data.samples)

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is step_ds.samples)
        assert_true(s[1].samples.base.base is step_ds.samples)
def _assert_subset(x, y):
    # test that the first argument is a subset of the second
    assert_true(set(x).issubset(set(y)))