def test_factorialpartitioner_big():
    # just to see that we can cope with relatively large datasets/numbers
    ds = normal_feature_dataset(nlabels=6,
                                perlabel=66,
                                nfeatures=2,
                                nchunks=11)

    # and now let's do factorial partitioner
    def partition(ds_=ds, **kwargs):
        partitioner = FactorialPartitioner(
            partitioner=NFoldPartitioner(attr='targets'),
            attr='chunks',
            **kwargs)
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # prohibitively large
    # print len(partition(ds))
    t0 = time()
    assert_equal(len(partition(ds, count=2, selection_strategy='first')), 2)
    # Those time limits are really a stretch: on any reasonable box that is
    # not too busy this should be done in a fraction of a second, but allow
    # enough slack to still catch a "naive" implementation
    assert(time() - t0 < 3)

    assert_equal(len(partition(ds, count=2, selection_strategy='random')), 2)
    assert(time() - t0 < 3)
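
# A minimal sketch (not part of the original suite) of why the unrestricted
# enumeration above would be "prohibitively large": FactorialPartitioner
# crosses the target-fold partitionings with the chunk assignments, so the
# number of generated partition sets grows combinatorially with nlabels and
# nchunks.  The hypothetical helper below reuses only API already exercised
# by the test above.
def _demo_factorialpartitioner_small():
    ds = normal_feature_dataset(nlabels=2, perlabel=4, nfeatures=2, nchunks=2)
    partitioner = FactorialPartitioner(
        partitioner=NFoldPartitioner(attr='targets'),
        attr='chunks')
    # on a tiny dataset the full enumeration is still cheap
    assert sum(1 for _ in partitioner.generate(ds)) > 0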
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shares the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # a check function appropriate for the given numpy version
    _a = np.arange(5)
    __a = _a[:4][:3]
    if __a.base is _a:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif __a.base.base is _a:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))
    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(s[1].samples))

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shares the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples, step_ds.samples))
        assert_true(is_the_same_base(s[1].samples, step_ds.samples))
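
# A standalone illustration (pure numpy, hypothetical helper) of the .base
# chaining difference that the version check above accounts for: with
# numpy >= 1.7.0b1 a view of a view points directly at the original array,
# while older numpy kept the intermediate view in the .base chain.
def _demo_numpy_base_chaining():
    a = np.arange(5)
    v = a[:4][:3]  # a view of a view
    # exactly one of the two layouts holds, depending on the numpy version
    assert (v.base is a) or (v.base.base is a)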
def test_product_flatten():
    nsamples = 17
    product_name_values = [('chan', ['C1', 'C2']),
                           ('freq', np.arange(4, 20, 6)),
                           ('time', np.arange(-200, 800, 200))]
    shape = (nsamples,) + tuple(len(v) for _, v in product_name_values)

    sample_names = ['samp%d' % i for i in xrange(nsamples)]

    # generate random data in four dimensions
    data = np.random.normal(size=shape)
    ds = Dataset(data, sa=dict(sample_names=sample_names))

    # apply flattening to ds
    flattener = ProductFlattenMapper(product_name_values)

    # test I/O (only if h5py is available)
    if externals.exists('h5py'):
        from mvpa2.base.hdf5 import h5save, h5load
        import tempfile
        import os

        fd, testfn = tempfile.mkstemp('mapper.h5py', 'test_product')
        os.close(fd)
        h5save(testfn, flattener)
        flattener = h5load(testfn)
        os.unlink(testfn)

    mds = flattener(ds)

    prod = lambda x: reduce(operator.mul, x)

    # ensure the size is ok
    assert_equal(mds.shape, (nsamples,) + (prod(shape[1:]),))

    ndim = len(product_name_values)

    idxs = [range(len(v)) for _, v in product_name_values]
    for si in xrange(nsamples):
        for fi, p in enumerate(itertools.product(*idxs)):
            data_tup = (si,) + p

            x = mds[si, fi]

            # value should match
            assert_equal(data[data_tup], x.samples[0, 0])

            # indices should match as well
            all_idxs = tuple(x.fa['chan_freq_time_indices'].value.ravel())
            assert_equal(p, all_idxs)

            # values and indices in each dimension should match
            for i, (name, value) in enumerate(product_name_values):
                assert_equal(x.fa[name].value, value[p[i]])
                assert_equal(x.fa[name + '_indices'].value, p[i])

    product_name_values += [('foo', [1, 2, 3])]
    flattener = ProductFlattenMapper(product_name_values)
    assert_raises(ValueError, flattener, ds)
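
# A small sketch (pure numpy/itertools, illustrative helper) of the
# flattening convention the verification loop above relies on: a C-order
# reshape enumerates the trailing axes in exactly the same order as
# itertools.product over the per-axis index ranges.
def _demo_product_order_matches_reshape():
    data = np.random.normal(size=(2, 3, 4))
    flat = data.reshape((2, 3 * 4))
    for si in range(2):
        for fi, p in enumerate(itertools.product(range(3), range(4))):
            assert flat[si, fi] == data[(si,) + p]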
def test_conditional_attr():
    import copy
    import cPickle
    for node in (TestNodeOnDefault(enable_ca=['test', 'stats']),
                 TestNodeOffDefault(enable_ca=['test', 'stats'])):
        node.ca.test = range(5)
        node.ca.stats = ConfusionMatrix(labels=['one', 'two'])
        node.ca.stats.add(('one', 'two', 'one', 'two'),
                          ('one', 'two', 'two', 'one'))
        node.ca.stats.compute()

        dc_node = copy.deepcopy(node)
        assert_equal(set(node.ca.enabled), set(dc_node.ca.enabled))
        assert(node.ca['test'].enabled)
        assert(node.ca['stats'].enabled)
        assert_array_equal(node.ca['test'].value,
                           dc_node.ca['test'].value)
        assert_array_equal(node.ca['stats'].value.matrix,
                           dc_node.ca['stats'].value.matrix)

        # check whether values survive pickling
        pickled = cPickle.dumps(node)
        up_node = cPickle.loads(pickled)
        assert_array_equal(up_node.ca['test'].value, range(5))
        assert_array_equal(up_node.ca['stats'].value.matrix,
                           node.ca['stats'].value.matrix)
def test_repeater():
    reps = 4
    r = Repeater(reps, space='OMG')
    dsl = [ds for ds in r.generate(Dataset([0, 1]))]
    assert_equal(len(dsl), reps)
    for i, ds in enumerate(dsl):
        assert_equal(ds.a.OMG, i)
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks':  [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']})

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but would care only
    # about those partitions where we have a balanced number of 'c' and 'p'
    # entries
    assert_raises(ValueError,
                  lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
                  ds)

    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks'),
                     Sifter([('partitions', 2),
                             ('targets',
                              dict(uvalues=['c', 'p'],
                                   balanced=True))])
                     ])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)

    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
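
# Sanity sketch for the combinatorics in the comment above: NFoldPartitioner
# with cvtype=4 over 6 chunks produces one partitioning per 4-chunk subset,
# i.e. C(6, 4) = 6*5*4*3/4! = 15 (pure itertools, hypothetical helper).
def _demo_nfold_partition_count():
    assert len(list(itertools.combinations(range(6), 4))) == 15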
def test_remove_invariant_as_a_mapper():
    from mvpa2.featsel.helpers import RangeElementSelector
    from mvpa2.featsel.base import StaticFeatureSelection, \
        SensitivityBasedFeatureSelection
    from mvpa2.testing.datasets import datasets
    from mvpa2.datasets.miscfx import remove_invariant_features

    mapper = SensitivityBasedFeatureSelection(
        lambda x: np.std(x, axis=0),
        RangeElementSelector(lower=0, inclusive=False),
        train_analyzer=False,
        auto_train=True)

    ds = datasets['uni2large'].copy()

    ds.a['mapper'] = StaticFeatureSelection(np.arange(ds.nfeatures))
    ds.fa['index'] = np.arange(ds.nfeatures)
    ds.samples[:, [1, 8]] = 10

    ds_out = mapper(ds)

    # Validate that we are getting the same results as remove_invariant_features
    ds_rifs = remove_invariant_features(ds)

    assert_array_equal(ds_out.samples, ds_rifs.samples)
    assert_array_equal(ds_out.fa.index, ds_rifs.fa.index)

    assert_equal(ds_out.fa.index[1], 2)
    assert_equal(ds_out.fa.index[8], 10)
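
# A plain-numpy sketch of what this particular sensitivity-based selection
# boils down to for the samples array: keep only columns with strictly
# positive standard deviation (illustrative only -- the mapper additionally
# maintains the fa/sa bookkeeping checked above).
def _demo_drop_invariant_columns():
    X = np.random.randn(10, 5)
    X[:, [1, 3]] = 10  # make two columns constant
    keep = np.std(X, axis=0) > 0
    assert X[:, keep].shape == (10, 3)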
def assert_coordinates_almost_equal_modulo_rotation(p_xyz, q_xyz,
                                                    max_difference):
    assert_equal(p_xyz.shape, q_xyz.shape)
    n, three = p_xyz.shape
    assert_equal(three, 3)

    n_pairs_to_test = 50

    get_random_int = lambda: int(random.uniform(0, n))
    get_distance = lambda x, y: np.linalg.norm(x - y)

    # ensure that we test for at least some distances, i.e.
    # that the presence of nans everywhere would not lead to a 'skipped'
    # test
    did_distance_test = False

    # compute some pairwise distances between nodes, and verify these
    # are more or less the same in p_xyz and q_xyz
    for _ in xrange(n_pairs_to_test):
        a = get_random_int()
        b = get_random_int()

        d_p = get_distance(p_xyz[a], p_xyz[b])
        d_q = get_distance(q_xyz[a], q_xyz[b])

        if not any(np.isnan([d_p, d_q])):
            assert(abs(d_p - d_q) < max_difference)
            did_distance_test = True

    assert(did_distance_test)
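
# Standalone sketch (pure numpy, hypothetical helper) of the property the
# assertion above exploits: pairwise Euclidean distances are invariant under
# rotation, so comparing distances is a valid rotation-agnostic check of
# coordinates.  Q from a QR decomposition of a random matrix is orthogonal.
def _demo_distances_survive_rotation():
    p = np.random.randn(10, 3)
    q, _ = np.linalg.qr(np.random.randn(3, 3))
    p_rot = p.dot(q)
    d = np.linalg.norm(p[0] - p[1])
    d_rot = np.linalg.norm(p_rot[0] - p_rot[1])
    assert abs(d - d_rot) < 1e-10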
def test_addaxis():
    from mvpa2.mappers.shape import AddAxisMapper
    ds = Dataset(np.arange(24).reshape(2, 3, 4),
                 sa={'testsa': np.arange(2)},
                 fa={'testfa': np.arange(3)})
    ds0 = AddAxisMapper(pos=0)(ds)
    assert_array_equal(ds0.shape, (1,) + ds.shape)
    # sas have extra dimension
    assert_array_equal(ds0.sa.testsa[0], ds.sa.testsa)
    # fas are duplicated
    assert_array_equal(ds0.fa.testfa[0], ds0.fa.testfa[1])
    ds1 = AddAxisMapper(pos=1)(ds)
    assert_array_equal(ds1.shape, (2, 1, 3, 4))
    # same sample attribute
    assert_equal(ds1.sa, ds.sa)
    # fas have extra dimension
    assert_array_equal(ds1.fa.testfa[0], ds.fa.testfa)
    ds2 = AddAxisMapper(pos=2)(ds)
    assert_array_equal(ds2.shape, (2, 3, 1, 4))
    # no change to attribute collections
    assert_equal(ds2.sa, ds.sa)
    assert_equal(ds2.fa, ds.fa)
    # append an axis
    ds3 = AddAxisMapper(pos=3)(ds)
    assert_array_equal(ds3.shape, ds.shape + (1,))
    # reverse indexing
    ds_1 = AddAxisMapper(pos=-1)(ds)
    assert_array_equal(ds3.samples, ds_1.samples)
    assert_equal(ds3.sa, ds_1.sa)
    assert_equal(ds3.fa, ds_1.fa)
    # add multiple axes
    ds4 = AddAxisMapper(pos=4)(ds)
    assert_array_equal(ds4.shape, ds.shape + (1, 1))
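
# The effect of AddAxisMapper on the samples array corresponds to numpy's
# expand_dims; a sketch of the shape behavior only (attribute handling is
# what the test above actually exercises).
def _demo_expand_dims_shapes():
    a = np.arange(24).reshape(2, 3, 4)
    assert np.expand_dims(a, 0).shape == (1, 2, 3, 4)
    assert np.expand_dims(a, 1).shape == (2, 1, 3, 4)
    assert np.expand_dims(a, -1).shape == (2, 3, 4, 1)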
def _predict(self, ds_):
    # also called for estimating training error
    assert(ds_ is not ds)  # we pass a shallow copy
    assert(len(ds_) < len(ds))
    assert_equal(len(ds_.sa['partitions'].unique), 1)
    return ['c', 'd']
def test_forward_dense_array_mapper():
    mask = np.ones((3, 2), dtype='bool')
    map_ = mask_mapper(mask)

    # test shape reports
    assert_equal(map_.forward1(mask).shape, (6,))

    # test 1sample mapping
    assert_array_equal(map_.forward1(np.arange(6).reshape(3, 2)),
                       [0, 1, 2, 3, 4, 5])

    # test 4sample mapping
    foursample = map_.forward(np.arange(24).reshape(4, 3, 2))
    assert_array_equal(foursample,
                       [[0, 1, 2, 3, 4, 5],
                        [6, 7, 8, 9, 10, 11],
                        [12, 13, 14, 15, 16, 17],
                        [18, 19, 20, 21, 22, 23]])

    # check incomplete masks
    mask[1, 1] = 0
    map_ = mask_mapper(mask)
    assert_equal(map_.forward1(mask).shape, (5,))
    assert_array_equal(map_.forward1(np.arange(6).reshape(3, 2)),
                       [0, 1, 2, 4, 5])

    # check that it doesn't accept wrong dataspace
    assert_raises(ValueError, map_.forward, np.arange(4).reshape(2, 2))

    # check fail if neither mask nor shape
    assert_raises(ValueError, mask_mapper)

    # check that a full mask is automatically created when providing shape
    m = mask_mapper(shape=(2, 3, 4))
    mp = m.forward1(np.arange(24).reshape(2, 3, 4))
    assert_array_equal(mp, np.arange(24))
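
# A pure-numpy sketch of the forward mapping checked above: indexing with a
# boolean mask flattens the selected elements in C (row-major) order, which
# is why the incomplete mask skips exactly the masked-out position.
def _demo_boolean_mask_flattening():
    mask = np.ones((3, 2), dtype='bool')
    mask[1, 1] = 0
    data = np.arange(6).reshape(3, 2)
    assert list(data[mask]) == [0, 1, 2, 4, 5]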
def test_sampleslicemapper():
    # this does nothing but Dataset.__getitem__ which is tested elsewhere -- but
    # at least we run it
    ds = datasets['uni2small']
    ssm = SampleSliceMapper(slice(3, 8, 2))
    sds = ssm(ds)
    assert_equal(len(sds), 3)
def test_repr():
    # this time give mask only by its target length
    sm = StaticFeatureSelection(slice(None), space='myspace')

    # check reproduction
    sm_clone = eval(repr(sm))
    assert_equal(repr(sm_clone), repr(sm))
def test_corrstability_smoketest(ds):
    if not 'chunks' in ds.sa:
        return
    if len(ds.sa['targets'].unique) > 30:
        # was regression dataset
        return
    # very basic testing
    cs = CorrStability()
    #ds = datasets['uni2small']
    out = cs(ds)

    assert_equal(out.shape, (ds.nfeatures,))
    ok_(np.all(out >= -1.001))  # it should be a correlation after all
    ok_(np.all(out <= 1.001))

    # and theoretically those nonbogus features should have higher values
    if 'nonbogus_targets' in ds.fa:
        bogus_features = np.array([x is None for x in ds.fa.nonbogus_targets])
        assert_array_less(np.mean(out[bogus_features]),
                          np.mean(out[~bogus_features]))

    # and if we move targets to alternative location
    ds = ds.copy(deep=True)
    ds.sa['alt'] = ds.T
    ds.sa.pop('targets')
    assert_raises(KeyError, cs, ds)
    cs = CorrStability('alt')
    out_ = cs(ds)
    assert_array_equal(out, out_)
def test_strip_boundary():
    ds = datasets['hollow']
    ds.sa['btest'] = np.repeat([0, 1], 20)
    sn = StripBoundariesSamples('btest', 1, 2)
    sds = sn(ds)
    assert_equal(len(sds), len(ds) - 3)
    for i in [19, 20, 21]:
        assert_false(i in sds.samples.sid)
def test_eep_bin():
    eb = EEPBin(os.path.join(pymvpa_dataroot, 'eep.bin'))

    assert_equal(eb.nchannels, 32)
    assert_equal(eb.nsamples, 2)
    assert_equal(eb.ntimepoints, 4)
    assert_true(eb.t0 - eb.dt < 0.00000001)
    assert_equal(len(eb.channels), 32)
    assert_equal(eb.data.shape, (2, 32, 4))
def test_nfold_random_counted_selection_partitioner_huge(self):
    # Just test that it completes in a reasonable time and does
    # not blow up as it would do if it were not limited by count
    kwargs = dict(count=10)
    ds = dataset_wizard(np.arange(1000).reshape((-1, 1)),
                        targets=range(1000),
                        chunks=range(500) * 2)
    split_partitions_random = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(100,
                                  selection_strategy='random',
                                  **kwargs).generate(ds)]
    assert_equal(len(split_partitions_random), 10)  # we get just 10
def test_searchlight_errors_per_trial():
    # To make sure that searchlight can return error/accuracy per trial
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.partition import OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.testing.datasets import datasets
    from mvpa2.misc.errorfx import prediction_target_matches

    dataset = datasets['3dsmall'].copy()
    # randomly permute samples so we break any random correspondence
    # to strengthen tests below
    sample_idx = np.arange(len(dataset))
    dataset = dataset[np.random.permutation(sample_idx)]

    dataset.sa.targets = ['L%d' % l for l in dataset.sa.targets]
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()  # fast and deterministic

    part = OddEvenPartitioner()
    # only do partial to save time
    cv = CrossValidation(sample_clf, part, errorfx=None)  # prediction_target_matches
    # Just to compare error
    cv_error = CrossValidation(sample_clf, part)

    # Large searchlight radius so we get entire ROI, 2 centers just to make sure
    # that all stacking works correctly
    sl = sphere_searchlight(cv, radius=10, center_ids=[0, 1])
    results = sl(dataset)

    sl_gnb = sphere_gnbsearchlight(sample_clf, part, radius=10, errorfx=None,
                                   center_ids=[0, 1])
    results_gnbsl = sl_gnb(dataset)

    # inspect both results
    # verify that partitioning was done correctly
    partitions = list(part.generate(dataset))
    for res in (results, results_gnbsl):
        assert('targets' in res.sa.keys())  # should carry targets
        assert('cvfolds' in res.sa.keys())  # should carry cvfolds
        for ipart in xrange(len(partitions)):
            assert_array_equal(dataset[partitions[ipart].sa.partitions == 2].targets,
                               res.sa.targets[res.sa.cvfolds == ipart])

    assert_datasets_equal(results, results_gnbsl)

    # one "accuracy" per each trial
    assert_equal(results.shape, (len(dataset), 2))
    # with accuracies the same in both searchlights since the same
    # features were to be selected in both cases due to the too-large radii
    errors_dataset = cv(dataset)
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 0])
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 1])
    # and error matching (up to precision) the one we get if we run
    # with the default error function
    assert_array_almost_equal(np.mean(results.targets[:, None] != results.samples,
                                      axis=0)[0],
                              np.mean(cv_error(dataset)))
def test_sphere():
    # test sphere initialization
    s = ne.Sphere(1)
    center0 = (0, 0, 0)
    center1 = (1, 1, 1)
    assert_equal(len(s(center0)), 7)
    target = array([array([-1, 0, 0]),
                    array([0, -1, 0]),
                    array([0, 0, -1]),
                    array([0, 0, 0]),
                    array([0, 0, 1]),
                    array([0, 1, 0]),
                    array([1, 0, 0])])
    # test of internals -- no recomputation of increments should be done
    prev_increments = s._increments
    assert_array_equal(s(center0), target)
    ok_(prev_increments is s._increments)
    # query lower dimensionality
    _ = s((0, 0))
    ok_(not prev_increments is s._increments)

    # test Sphere call
    target = [array([0, 1, 1]),
              array([1, 0, 1]),
              array([1, 1, 0]),
              array([1, 1, 1]),
              array([1, 1, 2]),
              array([1, 2, 1]),
              array([2, 1, 1])]
    res = s(center1)
    assert_array_equal(array(res), target)
    # They all should be tuples
    ok_(np.all([isinstance(x, tuple) for x in res]))

    # test for larger diameter
    s = ne.Sphere(4)
    assert_equal(len(s(center1)), 257)

    # test extent keyword
    #s = ne.Sphere(4,extent=(1,1,1))
    #assert_array_equal(array(s((0,0,0))), array([[0,0,0]]))

    # test Errors during initialisation and call
    #assert_raises(ValueError, ne.Sphere, 2)
    #assert_raises(ValueError, ne.Sphere, 1.0)

    # no longer extent available
    assert_raises(TypeError, ne.Sphere, 1, extent=(1))
    assert_raises(TypeError, ne.Sphere, 1, extent=(1.0, 1.0, 1.0))

    s = ne.Sphere(1)
    #assert_raises(ValueError, s, (1))
    if __debug__:
        # No float coordinates allowed for now...
        # XXX might like to change that ;)
        # assert_raises(ValueError, s, (1.0, 1.0, 1.0))
        pass
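
# Brute-force sketch confirming the element counts asserted above, assuming
# a Sphere of radius r contains every integer offset with Euclidean norm
# <= r (consistent with the radius-1 target list).
def _demo_sphere_element_counts():
    def count(radius):
        r = int(radius)
        return sum(1
                   for x in range(-r, r + 1)
                   for y in range(-r, r + 1)
                   for z in range(-r, r + 1)
                   if x * x + y * y + z * z <= radius ** 2)
    assert count(1) == 7
    assert count(4) == 257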
def test_attrpermute():
    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_equal(len(pds), nruns)
    for p in pds:
        assert_false(np.all(p.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))
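
# Minimal sketch (pure numpy, hypothetical helper) of the bookkeeping relied
# on above: when an 'ids' attribute is permuted together with 'targets', the
# permuted targets stay recoverable by indexing the originals with the
# permuted ids -- exactly the invariant asserted for pds.sa.targets.
def _demo_permutation_via_ids():
    targets = np.array(['a', 'b', 'c', 'd'])
    ids = np.arange(4)
    perm = np.random.permutation(4)
    p_targets, p_ids = targets[perm], ids[perm]
    assert np.all(p_targets == targets[p_ids])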
def test_read_fsl_design(self):
    fname = os.path.join(pymvpa_dataroot, 'sample_design.fsf')
    # use our function
    design = read_fsl_design(fname)
    # and just load manually to see whether we match fine
    set_lines = [x for x in open(fname).readlines()
                 if x.startswith('set ')]
    assert_equal(len(set_lines), len(design))
    # figure out which one is missing
    """TODO: would require the same special treatment for _files fields"""
def test_glmnet_r_sensitivities():
    data = datasets['chirp_linear']

    clf = GLMNET_R()

    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_train=False)(None)

    assert_equal(sens.shape, (1, data.nfeatures))
def test_glmnet_c_sensitivities():
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # use GLMNET on binary problem
    clf = GLMNET_C()
    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_train=False)(None)

    #failUnless(sens.shape == (data.nfeatures,))
    assert_equal(sens.shape, (len(data.UT), data.nfeatures))
def test_simple_n_minus_one_cv(self):
    data = get_mv_pattern(3)
    data.init_origids('samples')

    self.assertTrue(data.nsamples == 120)
    self.assertTrue(data.nfeatures == 2)
    self.assertTrue((data.sa.targets ==
                     [0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1, 0, 0, 0, 0, 0] * 6).all())
    self.assertTrue((data.sa.chunks ==
                     [k for k in range(1, 7) for i in range(20)]).all())
    assert_equal(len(np.unique(data.sa.origids)), data.nsamples)

    cv = CrossValidation(sample_clf_nl, NFoldPartitioner(),
                         enable_ca=['stats', 'training_stats'])
                         # 'samples_error'])

    results = cv(data)
    self.assertTrue((results.samples < 0.2).all()
                    and (results.samples >= 0.0).all())
def test_attrmap_repr():
    assert_equal(repr(AttributeMap()), "AttributeMap()")
    d = dict(a=2, b=1)
    assert_equal(repr(AttributeMap(d)),
                 "AttributeMap(%r)" % (d,))
    assert_equal(repr(AttributeMap(dict(a=2, b=1), mapnumeric=True)),
                 "AttributeMap(%r, mapnumeric=True)" % (d,))
    assert_equal(repr(AttributeMap(dict(a=2, b=1), mapnumeric=True,
                                   collisions_resolution='tuple')),
                 "AttributeMap(%r, mapnumeric=True,"
                 " collisions_resolution='tuple')" % (d,))
def test_mean_tpr():
    # Let's test now on some imbalanced sets
    assert_raises(ValueError, mean_tpr, [1], [])
    assert_raises(ValueError, mean_tpr, [], [1])
    assert_raises(ValueError, mean_tpr, [], [])
    # now an interesting one, where a label shows up in the predictions
    # without ever appearing among the targets
    assert_raises(ValueError, mean_tpr, [1], [0])
    assert_raises(ValueError, mean_tpr, [0, 1], [0, 0])
    # but it should be ok to have some targets not present in prediction
    assert_equal(mean_tpr([0, 0], [0, 1]), .5)
    # the same regardless how many samples in 0-class, if all misclassified
    # (winner by # of samples takes all)
    assert_equal(mean_tpr([0, 0, 0], [0, 0, 1]), .5)
    # whenever mean-accuracy would be different
    assert_almost_equal(mean_match_accuracy([0, 0, 0], [0, 0, 1]), 2 / 3.)
def test_static_reverse_doesnt_work_after_feature_selection_tuneup_1():
    ds_orig = datasets['uni2small'].copy()  # doesn't matter which
    m = StaticFeatureSelection(np.arange(4))
    m.train(ds_orig)
    ds = ds_orig.get_mapped(m)

    ds0_rev = ds.a.mapper.reverse1(ds.samples[0])  # should work
    assert_equal(ds0_rev.shape, (ds_orig.nfeatures,))

    # direct feature selection
    ds_ = ds[:, [0, 2]]
    # should work but doesn't due to
    # RuntimeError: Cannot reverse-map data since the original data shape is
    # unknown. Either set `dshape` in the constructor, or call train().
    ds0_rev_ = ds_.a.mapper.reverse1(ds_.samples[0])
    #ds0_rev_ = _verified_reverse1(ds_.a.mapper, ds_.samples[0])
    assert_equal(ds0_rev_.shape, (ds_orig.nfeatures,))
def test_distances():
    a = np.array([3, 8])
    b = np.array([6, 4])
    # test distances or yarik recalls unit testing ;)
    assert_equal(cartesian_distance(a, b), 5.0)
    assert_equal(manhattan_distance(a, b), 7)
    assert_equal(absmin_distance(a, b), 4)
    # test that fixing the typo didn't impact results
    assert_equal(manhattan_distance(a, b), manhatten_distance(a, b))
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4, 2)),
                 sa={'chunks':  [0, 1, 2, 3],
                     'targets': ['c', 'c', 'p', 'p']})
    for sift_targets_definition in (['c', 'p'],
                                    dict(uvalues=['c', 'p'])):
        par = ChainNode([NFoldPartitioner(cvtype=2, attr='chunks'),
                         Sifter([('partitions', 2),
                                 ('targets', sift_targets_definition)])
                         ])
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for ds_ in dss:
            testing = ds[ds_.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
            # and we still have both targets present in training
            training = ds[ds_.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
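
# Combinatorial sketch for the count asserted above: of the C(4, 2) = 6
# possible two-chunk testing sets, only those pairing one 'c' chunk (0 or 1)
# with one 'p' chunk (2 or 3) survive the sift -- 2 * 2 = 4 of them
# (pure itertools, hypothetical helper).
def _demo_sifter_surviving_count():
    labels = ['c', 'c', 'p', 'p']  # target carried by each chunk
    survivors = [pair for pair in itertools.combinations(range(4), 2)
                 if set(labels[i] for i in pair) == set(['c', 'p'])]
    assert len(survivors) == 4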
def test_discarded_boundaries(self):
    ds = datasets['hollow']
    # four runs
    ds.sa['chunks'] = np.repeat(np.arange(4), 10)
    # do odd even splitting for lots of boundaries in few splits
    part = ChainNode([OddEvenPartitioner(),
                      StripBoundariesSamples('chunks', 1, 2)])

    parts = [d.samples.sid for d in part.generate(ds)]

    # both datasets should have the same samples, because the boundaries are
    # identical and the same samples should be stripped
    assert_array_equal(parts[0], parts[1])

    # we strip 3 samples per boundary
    assert_equal(len(parts[0]), len(ds) - (3 * 3))

    for i in [9, 10, 11, 19, 20, 21, 29, 30, 31]:
        assert_false(i in parts[0])
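
# Sketch deriving the stripped indices checked above: with chunks of 10
# samples the boundaries fall at samples 10, 20 and 30, and stripping
# 1 sample before plus 2 samples after each boundary removes
# {b - 1, b, b + 1} for every boundary b (plain Python, illustrative).
def _demo_stripped_boundary_indices():
    boundaries = [10, 20, 30]
    stripped = sorted(i for b in boundaries for i in range(b - 1, b + 2))
    assert stripped == [9, 10, 11, 19, 20, 21, 29, 30, 31]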
def test_mean_tpr_balanced():
    # in case of the balanced sets we should expect to match mean_match_accuracy
    for nclass in range(2, 4):
        for nsample in range(1, 3):
            target = np.repeat(np.arange(nclass), nsample)
            # perfect match
            assert_equal(mean_match_accuracy(target, target), 1.0)
            assert_equal(mean_tpr(target, target), 1.0)
            # perfect mismatch -- shift by nsample, so no target matches
            estimate = np.roll(target, nsample)
            assert_equal(mean_match_accuracy(target, estimate), 0)
            assert_equal(mean_tpr(target, estimate), 0)
            # do few permutations and see if both match
            for i in range(5):
                np.random.shuffle(estimate)
                assert_equal(mean_tpr(target, estimate),
                             mean_match_accuracy(target, estimate))
                assert_almost_equal(mean_tpr(target, estimate),
                                    1 - mean_fnr(target, estimate))
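
# Numpy sketch of the identity exercised above: with equally many samples
# per class, every class contributes the same weight to both averages, so
# the mean of per-class recalls (mean TPR) equals plain accuracy.
def _demo_balanced_tpr_equals_accuracy():
    target = np.repeat(np.arange(3), 4)
    estimate = np.random.permutation(target)
    acc = np.mean(target == estimate)
    tprs = [np.mean(estimate[target == c] == c) for c in np.arange(3)]
    assert abs(np.mean(tprs) - acc) < 1e-12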
def _assert_rotation_maps_vector(r, x, y):
    # rotation must be 3x3 numpy array
    assert_equal(r.shape, (3, 3))
    assert_is_instance(r, np.ndarray)

    # rotation applied to x must yield direction of y
    # (modulo rounding errors)
    def normed(v):
        n_v = np.linalg.norm(v)
        return 0 if n_v == 0 else v / n_v

    rx = r.dot(x)
    rx_normed = normed(rx)
    y_normed = normed(y)
    assert_vector_direction_almost_equal(rx_normed, y_normed)

    # since it is a rotation, the result must have the same
    # L2 norm as the input
    assert_almost_equal(np.linalg.norm(x), np.linalg.norm(rx))
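
# Standalone sketch (pure numpy, hypothetical helper) of constructing a
# rotation that satisfies the contract above: Rodrigues' formula rotates
# unit vector u onto unit vector v, assuming u and v are not anti-parallel.
def _demo_rotation_between_vectors():
    u = np.array([1.0, 0.0, 0.0])
    v = np.array([0.0, 1.0, 0.0])
    axis = np.cross(u, v)                      # rotation axis, |axis| = sin
    s, c = np.linalg.norm(axis), np.dot(u, v)  # sine and cosine of the angle
    k = np.array([[0, -axis[2], axis[1]],
                  [axis[2], 0, -axis[0]],
                  [-axis[1], axis[0], 0]])     # cross-product matrix of axis
    r = np.eye(3) + k + k.dot(k) * ((1 - c) / s ** 2)
    assert np.allclose(r.dot(u), v)
    # a rotation preserves the L2 norm
    assert np.allclose(np.linalg.norm(r.dot(u)), np.linalg.norm(u))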