def test_chained_crossvalidation_searchlight():
    """Check that two ways of z-scoring inside a searchlight agree.

    A searchlight over a cross-validated ``MappedClassifier`` is compared
    with a searchlight over a ``ChainMapper`` that applies the same mapper
    and then cross-validates the bare classifier; both result sets are
    asserted to be equal.
    """
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper, Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper standardizing all features within each sample
        (row) separately.
        """

        def _forward_data(self, data):
            row_mean = np.mean(data, axis=1)[:, None]
            row_std = np.std(data, axis=1)[:, None]
            return (data - row_mean) / row_std

    ds = datasets['3dlarge'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace
    base_clf = GNB()  # fast and deterministic

    # only two centers are visited to save time
    searchlight_args = {'radius': 2, 'center_ids': [3, 50]}

    # variant 1: the mapper is wrapped together with the classifier
    mapped_cv = CrossValidation(
        MappedClassifier(base_clf, ZScoreFeaturesMapper()),
        NFoldPartitioner())
    mapped_sl = sphere_searchlight(mapped_cv, **searchlight_args)
    results_mapped = mapped_sl(ds)

    # variant 2: the mapper is chained in front of the cross-validation
    chained_cv = ChainMapper([
        ZScoreFeaturesMapper(auto_train=True),
        CrossValidation(base_clf, NFoldPartitioner()),
    ])
    chained_sl = sphere_searchlight(chained_cv, **searchlight_args)
    results_chained = chained_sl(ds)

    assert_array_equal(results_mapped, results_chained)
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function.

    For a synthetic dataset and every dataset in ``datasets`` the test
    verifies that an untrained ``ZScoreMapper`` refuses to forward, that
    training and forwarding leave the input dataset and its samples
    untouched (checked via ``idhash``), and that the forwarded samples
    match what ``zscore`` produces in place on a copy.
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    line_ds = dataset_wizard(
        np.concatenate([np.arange(40) for _ in range(20)]).reshape(20, -1).T,
        targets=1, chunks=1)
    # list() is needed because on Python 3 ``dict.values()`` is a view that
    # cannot be concatenated to a list; on Python 2 it is a harmless copy
    dss = [line_ds] + list(datasets.values())

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        # forwarding before training must be rejected
        assert_raises(RuntimeError, zsm.forward, ds1.samples)

        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset; only the side effects on ds1 are of interest here
        zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        # reference result: zscore() is applied to the second copy
        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
def test_remove_invariant_as_a_mapper():
    """Compare a sensitivity-based feature selection (std > 0) against
    ``remove_invariant_features`` on a dataset with two constant features.
    """
    from mvpa2.datasets.miscfx import remove_invariant_features
    from mvpa2.featsel.base import SensitivityBasedFeatureSelection
    from mvpa2.featsel.base import StaticFeatureSelection
    from mvpa2.featsel.helpers import RangeElementSelector
    from mvpa2.testing.datasets import datasets

    # keep only features whose standard deviation across samples is
    # strictly above zero
    selector = RangeElementSelector(lower=0, inclusive=False)
    mapper = SensitivityBasedFeatureSelection(
        lambda x: np.std(x, axis=0),
        selector,
        train_analyzer=False,
        auto_train=True)

    ds = datasets['uni2large'].copy()
    nfeatures_range = np.arange(ds.nfeatures)
    ds.a['mapper'] = StaticFeatureSelection(nfeatures_range)
    ds.fa['index'] = np.arange(ds.nfeatures)
    # make features 1 and 8 invariant
    ds.samples[:, [1, 8]] = 10

    selected = mapper(ds)

    # Validate that we are getting the same results as
    # remove_invariant_features
    reference = remove_invariant_features(ds)
    assert_array_equal(selected.samples, reference.samples)
    assert_array_equal(selected.fa.index, reference.fa.index)

    # the original indices are shifted once the two features are dropped
    assert_equal(selected.fa.index[1], 2)
    assert_equal(selected.fa.index[8], 10)
def test_basic(self):
    """Train a GPR with a generalized linear kernel and check that the
    predictions have the same shape as the targets.
    """
    dataset = data_generators.linear1d_gaussian_noise()
    regressor = GPR(GeneralizedLinearKernel())
    regressor.train(dataset)
    predicted = regressor.predict(dataset.samples)
    assert_array_equal(predicted.shape, dataset.targets.shape)
def test_corrstability_smoketest(ds):
    """Smoke test of ``CorrStability`` on dataset `ds`.

    Datasets without a ``chunks`` sample attribute or with more than 30
    unique targets are skipped.  Otherwise the measure must return one
    value per feature within [-1, 1] (with a small tolerance), must give a
    lower mean for bogus than for non-bogus features when that information
    is available, and must give the same result when the targets live
    under a different sample attribute name.
    """
    if 'chunks' not in ds.sa:
        return
    if len(ds.sa['targets'].unique) > 30:
        # was regression dataset
        return

    # very basic testing
    cs = CorrStability()
    out = cs(ds)
    assert_equal(out.shape, (ds.nfeatures, ))
    ok_(np.all(out >= -1.001))  # it should be a correlation after all
    ok_(np.all(out <= 1.001))

    # and theoretically those nonbogus features should have higher values
    if 'nonbogus_targets' in ds.fa:
        # identity test: a feature is bogus when no target is recorded for it
        bogus_features = np.array(
            [x is None for x in ds.fa.nonbogus_targets])
        assert_array_less(np.mean(out[bogus_features]),
                          np.mean(out[~bogus_features]))

    # and if we move targets to alternative location
    ds = ds.copy(deep=True)
    ds.sa['alt'] = ds.T
    ds.sa.pop('targets')
    # the default-configured measure can no longer find its targets
    assert_raises(KeyError, cs, ds)
    cs = CorrStability('alt')
    out_ = cs(ds)
    assert_array_equal(out, out_)
def test_sphere_scaled():
    """Exercise ``ne.Sphere`` with the ``element_sizes`` argument."""

    def _length(coord):
        # Euclidean length of a coordinate given in element units
        return np.sqrt(np.sum(np.array(coord)**2))

    plain = ne.Sphere(3)
    scaled = ne.Sphere(3, element_sizes=(1, 1))

    # Should give exactly the same results since element_sizes are 1s
    for center in ((0, 0), (-23, 1)):
        assert_array_equal(plain(center), scaled(center))
        # no duplicated coordinates
        ok_(len(scaled(center)) == len(set(scaled(center))))

    # Raise exception if query dimensionality does not match element_sizes
    assert_raises(ValueError, scaled, (1,))

    scaled = ne.Sphere(3, element_sizes=(1.5, 2))
    expected = [(-2, 0),
                (-1, -1), (-1, 0), (-1, 1),
                (0, -1), (0, 0), (0, 1),
                (1, -1), (1, 0), (1, 1),
                (2, 0)]
    assert_array_equal(scaled((0, 0)), expected)

    scaled = ne.Sphere(1.5, element_sizes=(1.5, 1.5, 1.5))
    res = scaled((0, 0, 0))
    ok_(np.all([_length(x) <= 1.5 for x in res]))
    ok_(len(res) == 7)

    # all neighbors so no more than 1 voxel away -- just a cube, for
    # some "sphere" effect radius had to be 3.0 ;)
    td = np.sqrt(3*1.5**2)
    scaled = ne.Sphere(td, element_sizes=(1.5, 1.5, 1.5))
    res = scaled((0, 0, 0))
    ok_(np.all([_length(x) <= td for x in res]))
    ok_(np.all([np.sum(np.abs(x) > 1) == 0 for x in res]))
    ok_(len(res) == 27)
def test_identity():
    """``IdentityNeighborhood`` must return the same result as
    ``Sphere(0.5)`` for coordinates of several dimensionalities.
    """
    # IdentityNeighborhood() behaves like Sphere(0.5) without all of the
    # computation.  Test on a few different coordinates.
    identity = ne.IdentityNeighborhood()
    tiny_sphere = ne.Sphere(0.5)
    centers = ((0, 0, 0), (1, 1, 1), (0, 0), (0, ))
    for coord in centers:
        assert_array_equal(identity(coord), tiny_sphere(coord))
def test_sifter_with_balancing():
    """Sift n-fold partitions so that only those with balanced targets in
    the testing portion remain.
    """
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    chunk_ids = [0, 1, 2, 3, 4, 5]
    labels = ['c', 'c', 'c', 'p', 'p', 'p']
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks': chunk_ids, 'targets': labels})

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    unsifted = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(unsifted.generate(ds))), 15)

    # an unknown specification keyword must be rejected
    def _generate_with_bad_spec(dataset):
        return list(Sifter([('targets', dict(wrong=1))]).generate(dataset))

    assert_raises(ValueError, _generate_with_bad_spec, ds)

    # take 4 chunks out for testing, but care only about those partitions
    # where we have balanced number of 'c' and 'p' entries
    balanced_spec = dict(uvalues=['c', 'p'], balanced=True)
    sifted = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2), ('targets', balanced_spec)]),
    ])
    partitioned = list(sifted.generate(ds))
    assert_equal(len(partitioned), 9)

    for part in partitioned:
        testing = ds[part.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[part.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
def test_corrstability_smoketest(ds):
    """Smoke test of ``CorrStability`` on dataset `ds`.

    Returns early for datasets lacking a ``chunks`` sample attribute and
    for those with more than 30 unique targets.  For the rest it checks
    the output shape and value range, the ordering of mean values between
    bogus and non-bogus features (if ``nonbogus_targets`` is present), and
    that targets stored under the name ``alt`` yield identical output.
    """
    if 'chunks' not in ds.sa:
        return
    if len(ds.sa['targets'].unique) > 30:
        # was regression dataset
        return

    # very basic testing
    measure = CorrStability()
    out = measure(ds)
    assert_equal(out.shape, (ds.nfeatures,))
    # it should be a correlation after all
    ok_(np.all(out >= -1.001))
    ok_(np.all(out <= 1.001))

    # and theoretically those nonbogus features should have higher values
    if 'nonbogus_targets' in ds.fa:
        # ``is None`` instead of ``== None``: singleton identity check
        is_bogus = np.array([x is None for x in ds.fa.nonbogus_targets])
        assert_array_less(np.mean(out[is_bogus]), np.mean(out[~is_bogus]))

    # and if we move targets to alternative location
    ds = ds.copy(deep=True)
    ds.sa['alt'] = ds.T
    ds.sa.pop('targets')
    # without 'targets' the default-configured measure has to fail
    assert_raises(KeyError, measure, ds)
    measure = CorrStability('alt')
    out_ = measure(ds)
    assert_array_equal(out, out_)
def test_cv_no_generator_custom_splitter(self):
    """Run ``CrossValidation`` with a custom ``Splitter`` and a classifier
    that verifies which portion of the data it is given.
    """
    categories = ['to', 'to', 'from', 'from']
    labels = ['a', 'b', 'c', 'd']
    ds = Dataset(np.arange(4), sa={'category': categories,
                                   'targets': labels})

    class Measure(Classifier):
        """Classifier stub asserting on the datasets passed to it."""

        def _train(self, ds_):
            # training must happen on the 'from' half only
            assert_array_equal(ds_.samples, ds.samples[2:])
            assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

        def _predict(self, ds_):
            assert ds_ is not ds  # we pass a shallow copy
            # could be called to predict training or testing data
            if not np.all(ds_.sa.targets != ['c', 'd']):
                assert_array_equal(ds_.sa.category, ['from'] * len(ds_))
            else:
                assert_array_equal(ds_.samples, ds.samples[:2])
                assert_array_equal(ds_.sa.category, ['to'] * len(ds_))
            return ['c', 'd']

    splitter = Splitter('category', ['from', 'to'])
    cv = CrossValidation(Measure(), splitter=splitter)
    res = cv(ds)
    assert_array_equal(res, [[1]])  # failed perfectly ;-)
def test_sphere_scaled():
    """Check ``ne.Sphere`` results when ``element_sizes`` is provided."""
    unit_sphere = ne.Sphere(3)
    sphere = ne.Sphere(3, element_sizes=(1, 1))

    # Should give exactly the same results since element_sizes are 1s
    for point in ((0, 0), (-23, 1)):
        assert_array_equal(unit_sphere(point), sphere(point))
        # every neighbor is reported once
        ok_(len(sphere(point)) == len(set(sphere(point))))

    # Raise exception if query dimensionality does not match element_sizes
    assert_raises(ValueError, sphere, (1, ))

    # anisotropic elements
    sphere = ne.Sphere(3, element_sizes=(1.5, 2))
    assert_array_equal(
        sphere((0, 0)),
        [(-2, 0), (-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 0),
         (0, 1), (1, -1), (1, 0), (1, 1), (2, 0)])

    origin = (0, 0, 0)
    cubic = (1.5, 1.5, 1.5)

    sphere = ne.Sphere(1.5, element_sizes=cubic)
    neighbors = sphere(origin)
    within = [np.sqrt(np.sum(np.array(x)**2)) <= 1.5 for x in neighbors]
    ok_(np.all(within))
    ok_(len(neighbors) == 7)

    # all neighbors so no more than 1 voxel away -- just a cube, for
    # some "sphere" effect radius had to be 3.0 ;)
    td = np.sqrt(3 * 1.5**2)
    sphere = ne.Sphere(td, element_sizes=cubic)
    neighbors = sphere(origin)
    within = [np.sqrt(np.sum(np.array(x)**2)) <= td for x in neighbors]
    ok_(np.all(within))
    adjacent = [np.sum(np.abs(x) > 1) == 0 for x in neighbors]
    ok_(np.all(adjacent))
    ok_(len(neighbors) == 27)
def test_cv_no_generator_custom_splitter(self):
    """Cross-validate without a generator, using only a custom splitter,
    with a classifier stub that asserts on the data it receives.
    """
    ds = Dataset(
        np.arange(4),
        sa={'category': ['to', 'to', 'from', 'from'],
            'targets': ['a', 'b', 'c', 'd']})

    class Measure(Classifier):
        """Stub classifier always predicting ``['c', 'd']``."""

        def _train(self, ds_):
            n = len(ds_)
            assert_array_equal(ds_.samples, ds.samples[2:])
            assert_array_equal(ds_.sa.category, ['from'] * n)

        def _predict(self, ds_):
            # we pass a shallow copy
            assert ds_ is not ds
            n = len(ds_)
            # could be called to predict training or testing data
            given_testing = np.all(ds_.sa.targets != ['c', 'd'])
            if given_testing:
                assert_array_equal(ds_.samples, ds.samples[:2])
                assert_array_equal(ds_.sa.category, ['to'] * n)
            else:
                assert_array_equal(ds_.sa.category, ['from'] * n)
            return ['c', 'd']

    measure = Measure()
    cv = CrossValidation(
        measure, splitter=Splitter('category', ['from', 'to']))
    errors = cv(ds)
    # failed perfectly ;-)
    assert_array_equal(errors, [[1]])
def test_sifter_with_balancing():
    """Verify ``Sifter`` with a ``balanced`` target specification chained
    after an n-fold partitioner.
    """
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    sample_attrs = {
        'chunks': [0, 1, 2, 3, 4, 5],
        'targets': ['c', 'c', 'c', 'p', 'p', 'p'],
    }
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)), sa=sample_attrs)

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    plain = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    n_unsifted = len(list(plain.generate(ds)))
    assert_equal(n_unsifted, 15)

    # a specification with an unsupported key has to raise
    assert_raises(
        ValueError,
        lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
        ds)

    # keep only partitions whose testing part (partition 2) has a balanced
    # number of 'c' and 'p' entries
    sifter = Sifter([
        ('partitions', 2),
        ('targets', dict(uvalues=['c', 'p'], balanced=True)),
    ])
    chain = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks'), sifter])
    dss = list(chain.generate(ds))
    assert_equal(len(dss), 9)

    both_targets = ['c', 'p']
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), both_targets)
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), both_targets)
def test_aggregation(self):
    """Aggregating a 4x5 dataset with ``np.mean`` must leave four samples
    with a single feature holding the per-sample means.
    """
    samples = np.arange(20).reshape((4, 5))
    data = dataset_wizard(samples, targets=1, chunks=1)

    ag_data = aggregate_features(data, np.mean)

    ok_(ag_data.nsamples == 4)
    ok_(ag_data.nfeatures == 1)
    # row means of 0..19 laid out as four rows of five
    assert_array_equal(ag_data.samples[:, 0], [2, 7, 12, 17])
def _assert_ds_mat_attributes_equal(ds, m, attr_keys=('a', 'sa', 'fa')):
    """Assert that attribute values of `ds` match those stored in `m`.

    Parameters
    ----------
    ds
      Object (a Dataset in the callers' terms) exposing one collection per
      name in `attr_keys`; each collection provides ``keys()`` and items
      with a ``.value``.
    m : dict
      Matlab-like dictionary; ``m[name][key][0, 0]`` is raveled and
      compared to the corresponding value.
    attr_keys : tuple of str
      Names of the collections to compare.
    """
    for collection_name in attr_keys:
        collection = getattr(ds, collection_name)
        for key in collection.keys():
            expected = collection[key].value
            stored = m[collection_name][key][0, 0]
            assert_array_equal(stored.ravel(), expected)
def test_basic(self):
    """Train a GPR with a generalized linear kernel and verify the shape
    of its predictions; skipped when scipy is unavailable.
    """
    skip_if_no_external('scipy')  # needed by GPR code
    dataset = data_generators.linear1d_gaussian_noise()
    kernel = GeneralizedLinearKernel()
    regressor = GPR(kernel)
    regressor.train(dataset)
    predictions = regressor.predict(dataset.samples)
    assert_array_equal(predictions.shape, dataset.targets.shape)
def test_size_random_prototypes(self):
    """Train a ``PrototypeMapper`` on randomly drawn prototypes and check
    the shape of the resulting projection.
    """
    self.build_vector_based_pm()

    # use half of the samples as prototypes, but at least one
    fraction = 0.5
    n_prototypes = max(int(len(self.samples) * fraction), 1)
    drawn = random.sample(list(self.samples), n_prototypes)
    self.prototypes2 = np.array(drawn)

    self.pm2 = PrototypeMapper(similarities=self.similarities,
                               prototypes=self.prototypes2)
    self.pm2.train(self.samples)

    # one column per (prototype, similarity) pair
    n_rows = self.samples.shape[0]
    n_cols = self.pm2.prototypes.shape[0] * len(self.similarities)
    assert_array_equal(self.pm2.proj.shape, (n_rows, n_cols))
def _assert_ds_less_or_equal(x, y):
    """Assert that dataset `x` is contained in dataset `y`.

    `x` and `y` must have equal samples; the ``a``, ``fa`` and ``sa``
    collections of both are handed pairwise to
    ``_assert_array_collectable_less_or_equal``.

    Note: no support for fancy objects such as mappers
    """
    assert_array_equal(x.samples, y.samples)
    for collection_name in ('a', 'fa', 'sa'):
        _assert_array_collectable_less_or_equal(
            getattr(x, collection_name), getattr(y, collection_name))
def test_partitionmapper():
    """``OddEvenPartitioner`` must generate two partitionings, each
    labelling all samples with partition 1 or 2.
    """
    ds = give_data()
    partitioner = OddEvenPartitioner()
    generated = list(partitioner.generate(ds))
    assert_equal(len(generated), 2)

    index = 0
    for part in generated:
        assert_array_equal(part.sa['partitions'].unique, [1, 2])
        # partitionings are numbered consecutively
        assert_equal(part.a.partitions_set, index)
        # no samples are dropped
        assert_equal(len(part), len(ds))
        index += 1
def test_sphere():
    """Exercise ``ne.Sphere``: neighbor coordinates, reuse of the cached
    increments, result types and rejected arguments.
    """
    # test sphere initialization
    sphere = ne.Sphere(1)
    center0 = (0, 0, 0)
    center1 = (1, 1, 1)
    assert_equal(len(sphere(center0)), 7)
    expected = array([[-1, 0, 0],
                      [0, -1, 0],
                      [0, 0, -1],
                      [0, 0, 0],
                      [0, 0, 1],
                      [0, 1, 0],
                      [1, 0, 0]])

    # test of internals -- no recomputation of increments should be done
    increments_before = sphere._increments
    assert_array_equal(sphere(center0), expected)
    ok_(increments_before is sphere._increments)
    # query lower dimensionality: now the increments must be replaced
    sphere((0, 0))
    ok_(increments_before is not sphere._increments)

    # test Sphere call
    expected = [array(row) for row in ([0, 1, 1],
                                       [1, 0, 1],
                                       [1, 1, 0],
                                       [1, 1, 1],
                                       [1, 1, 2],
                                       [1, 2, 1],
                                       [2, 1, 1])]
    res = sphere(center1)
    assert_array_equal(array(res), expected)
    # They all should be tuples
    ok_(np.all([isinstance(x, tuple) for x in res]))

    # test for larger diameter
    sphere = ne.Sphere(4)
    assert_equal(len(sphere(center1)), 257)

    # NOTE: tests of the 'extent' keyword and of ValueError upon
    # initialisation were already disabled in the original code

    # no longer extent available
    assert_raises(TypeError, ne.Sphere, 1, extent=1)
    assert_raises(TypeError, ne.Sphere, 1, extent=(1.0, 1.0, 1.0))

    sphere = ne.Sphere(1)
    if __debug__:
        # No float coordinates allowed for now...
        # XXX might like to change that ;)
        assert_raises(ValueError, sphere, (1.0, 1.0, 1.0))
def test_balancer():
    """Exercise ``Balancer``: marking vs. applying the selection, use as a
    generator, ``limit``/``include_offlimit``, fixed and fractional
    ``amount``, and balancing on a feature attribute.
    """

    def _counts(attr):
        # Materialize the per-value counts as a list so the comparisons
        # below also hold where ``.values()`` returns a view rather than a
        # list (Python 3); on Python 2 this is just a copy.
        return list(get_nelements_per_value(attr).values())

    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)

    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3, ))
    assert_equal(_counts(res.sa.targets), [2] * 4)

    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples
    # included
    assert_equal(_counts(res[res.sa.chunks == 3].sa.targets), [2] * 4)
    assert_equal(_counts(res.sa.chunks),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])

    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(_counts(res.sa.targets), [1] * 4)

    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(np.array(_counts(ds.sa.targets)) * 0.5),
        np.array(_counts(res.sa.targets)))

    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(_counts(res.fa.one), [4] * 2)
def test_nonfinite_features_removal(self):
    """Features containing NaN or Inf must be dropped by
    ``remove_nonfinite_features`` while all other features are kept.
    """
    r = np.random.normal(size=(4, 5))
    ds = dataset_wizard(r, targets=1, chunks=1)
    # Contaminate features 0 and 3.  The lowercase constants are used
    # because the ``np.NaN`` / ``np.Inf`` aliases no longer exist in
    # NumPy 2.0.
    ds.samples[2, 0] = np.nan
    ds.samples[3, 3] = np.inf

    dsc = remove_nonfinite_features(ds)

    self.assertTrue(dsc.nfeatures == 3)
    # exactly the untouched features must remain
    assert_array_equal(ds[:, [1, 2, 4]].samples, dsc.samples)
def test_balancer():
    """Exercise ``Balancer`` in its different modes of operation.

    All results of ``get_nelements_per_value(...).values()`` are wrapped
    in ``list()`` before being compared or converted, so that the checks
    do not depend on ``.values()`` returning a list (it is a view on
    Python 3).
    """
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)

    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [2] * 4)

    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples
    # included
    chunk3_targets = res[res.sa.chunks == 3].sa.targets
    assert_equal(list(get_nelements_per_value(chunk3_targets).values()),
                 [2] * 4)
    assert_equal(list(get_nelements_per_value(res.sa.chunks).values()),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])

    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [1] * 4)

    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    counts_before = list(get_nelements_per_value(ds.sa.targets).values())
    counts_after = list(get_nelements_per_value(res.sa.targets).values())
    assert_array_equal(np.round(np.array(counts_before) * 0.5),
                       np.array(counts_after))

    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.fa.one).values()),
                 [4] * 2)
def test_identity_mapper(s):
    """``IdentityMapper`` must hand back the very object `s` it is given,
    in forward and reverse direction.
    """
    idm = IdentityMapper()
    # doesn't matter what you throw at it
    for method_name in ('forward', 'forward1', 'reverse', 'reverse1'):
        mapped = getattr(idm, method_name)(s)
        assert_true(mapped is s)

    # even like this it should work, but type conversion
    # can happen
    assert_array_equal(_verified_reverse1(idm, s), s)
    assert_array_equal(idm.reverse1(s), s)
def test_attrpermute():
    """Exercise ``AttributePermutator``: whole-dataset and limited
    permutation of sample attributes, generation of multiple
    permutations, and permutation of a feature attribute.
    """
    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    samples_before = ds.samples.copy()

    permutator = AttributePermutator(['targets', 'ids'], assure=True)
    permuted = permutator(ds)
    # should not touch the data
    assert_array_equal(samples_before, permuted.samples)
    # even keep the very same array
    assert_true(permuted.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(permuted.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(permuted.sa.targets, ds.sa.targets[permuted.sa.ids])
    # other attribute should remain intact
    assert_array_equal(permuted.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutator = AttributePermutator('ids', limit='chunks')
    permuted = permutator(ds)
    # first ten should remain first ten
    assert_false(np.any(permuted.sa.ids[:10] > 9))

    # same thing, but only permute single chunk
    permutator = AttributePermutator('ids', limit={'chunks': 3})
    permuted = permutator(ds)
    # one chunk should change
    assert_false(np.any(permuted.sa.ids[30:40] > 39))
    assert_false(np.any(permuted.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(permuted.sa.ids[:30], range(30))

    # or a list of chunks
    permutator = AttributePermutator('ids', limit={'chunks': [3, 4]})
    permuted = permutator(ds)
    # two chunks should change
    assert_false(np.any(permuted.sa.ids[30:50] > 49))
    assert_false(np.any(permuted.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(permuted.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutator = AttributePermutator(['targets', 'ids'],
                                     assure=True, count=nruns)
    generated = list(permutator.generate(ds))
    assert_equal(len(generated), nruns)
    for one_run in generated:
        assert_false(np.all(one_run.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutator = AttributePermutator('fa.ids', assure=True)
    permuted = permutator(ds)
    assert_false(np.all(permuted.fa.ids == ds.fa.ids))
def test_streamline_equal_mapper(self):
    """Use all samples as prototypes and check that the projection has
    the expected shape and is (almost) symmetric.
    """
    self.build_streamline_things()
    samples = self.dataset.samples
    self.prototypes_equal = samples
    self.pm = PrototypeMapper(similarities=self.similarities,
                              prototypes=self.prototypes_equal,
                              demean=False)
    self.pm.train(samples)

    # check size:
    expected_shape = (
        len(self.dataset.samples),
        len(self.prototypes_equal) * len(self.similarities))
    assert_array_equal(self.pm.proj.shape, expected_shape)

    # test symmetry
    assert_array_almost_equal(self.pm.proj, self.pm.proj.T)
def test_glmnet_state():
    """Predictions returned by ``GLMNET_R.predict`` must match those
    stored in the ``predictions`` conditional attribute.
    """
    # for some reason the R code fails with the dumb ('dumb2') data,
    # hence 'chirp_linear' is used
    data = datasets['chirp_linear']

    clf = GLMNET_R()
    clf.train(data)
    clf.ca.enable('predictions')

    predicted = clf.predict(data.samples)
    assert_array_equal(predicted, clf.ca.predictions)
def test_glmnet_c():
    """``GLMNET_C`` trained on a binary problem must reproduce the
    targets of its training data.
    """
    # define binary prob
    data = datasets['dumb2']

    # use GLMNET on binary problem
    clf = GLMNET_C()
    clf.ca.enable('estimates')
    clf.train(data)

    # test predictions
    predicted = clf.predict(data.samples)
    assert_array_equal(predicted, data.targets)
def test_custom_split(self):
    """Split data partitioned by ``CustomPartitioner``.

    First a half-splitter is simulated; then a fully customized split
    with working and validation sets is checked with two splitter
    configurations that are expected to behave identically.

    ``assertTrue`` is used instead of the ``failUnless`` alias, which is
    deprecated and absent from recent Python versions.
    """
    # simulate half splitter
    hs = CustomPartitioner([(None, [0, 1, 2, 3, 4]),
                            (None, [5, 6, 7, 8, 9])])
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    self.assertTrue(len(splits) == 2)

    for p in splits:
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 50)
        self.assertTrue(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check fully customized split with working and validation set
    # specified
    cs = CustomPartitioner([([0, 3, 4], [5, 9])])
    # we want to discard the unselected partition of the data, hence
    # attr_values; these two splitters should do exactly the same thing
    splitters = (Splitter(attr='partitions', attr_values=[1, 2]),
                 Splitter(attr='partitions', ignore_values=(0,)))
    for spl in splitters:
        splits = [list(spl.generate(p)) for p in cs.generate(self.data)]
        self.assertTrue(len(splits) == 1)

        for p in splits:
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 30)
            self.assertTrue(p[1].nsamples == 20)

        self.assertTrue(
            (splits[0][1].sa['chunks'].unique == [5, 9]).all())
        self.assertTrue(
            (splits[0][0].sa['chunks'].unique == [0, 3, 4]).all())
def test_conditional_attr():
    """Conditional attributes must survive ``copy.deepcopy`` and a
    pickle round-trip, for nodes with either default enabling state.
    """
    import copy
    try:
        # Python 2: keep using the C implementation, as before
        import cPickle as pickle
    except ImportError:
        # Python 3 has no cPickle module
        import pickle

    nodes = (TestNodeOnDefault(enable_ca=['test', 'stats']),
             TestNodeOffDefault(enable_ca=['test', 'stats']))
    for node in nodes:
        node.ca.test = range(5)
        node.ca.stats = ConfusionMatrix(labels=['one', 'two'])
        node.ca.stats.add(('one', 'two', 'one', 'two'),
                          ('one', 'two', 'two', 'one'))
        node.ca.stats.compute()

        dc_node = copy.deepcopy(node)
        assert_equal(set(node.ca.enabled), set(dc_node.ca.enabled))
        assert (node.ca['test'].enabled)
        assert (node.ca['stats'].enabled)
        assert_array_equal(node.ca['test'].value, dc_node.ca['test'].value)
        assert_array_equal(node.ca['stats'].value.matrix,
                           dc_node.ca['stats'].value.matrix)

        # check whether values survive pickling
        pickled = pickle.dumps(node)
        up_node = pickle.loads(pickled)
        assert_array_equal(up_node.ca['test'].value, range(5))
        assert_array_equal(up_node.ca['stats'].value.matrix,
                           node.ca['stats'].value.matrix)
def test_odd_even_split(self):
    """Split data partitioned by ``OddEvenPartitioner`` and check the
    chunk ids ending up in every split.
    """
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in oes.generate(self.data)]
    self.assertTrue(len(splits) == 2)

    for p in splits:
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 50)
        self.assertTrue(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in oes.generate(splits[0][0])]

    for split in moresplits:
        # identity check: whether a split exists must not depend on how
        # the split object implements (in)equality comparison
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)
def test_forward_dense_array_mapper():
    """Forward-map data through mappers created by ``mask_mapper`` from
    full masks, incomplete masks, and a bare shape.
    """
    mask = np.ones((3, 2), dtype='bool')
    mapper = mask_mapper(mask)

    # test shape reports
    assert_equal(mapper.forward1(mask).shape, (6, ))

    # test 1sample mapping
    single = np.arange(6).reshape(3, 2)
    assert_array_equal(mapper.forward1(single), [0, 1, 2, 3, 4, 5])

    # test 4sample mapping
    mapped4 = mapper.forward(np.arange(24).reshape(4, 3, 2))
    expected4 = [[0, 1, 2, 3, 4, 5],
                 [6, 7, 8, 9, 10, 11],
                 [12, 13, 14, 15, 16, 17],
                 [18, 19, 20, 21, 22, 23]]
    assert_array_equal(mapped4, expected4)

    # check incomplete masks
    mask[1, 1] = 0
    mapper = mask_mapper(mask)
    assert_equal(mapper.forward1(mask).shape, (5, ))
    assert_array_equal(mapper.forward1(np.arange(6).reshape(3, 2)),
                       [0, 1, 2, 4, 5])

    # check that it doesn't accept wrong dataspace
    assert_raises(ValueError, mapper.forward, np.arange(4).reshape(2, 2))

    # check fail if neither mask nor shape
    assert_raises(ValueError, mask_mapper)

    # check that a full mask is automatically created when providing shape
    shaped = mask_mapper(shape=(2, 3, 4))
    flattened = shaped.forward1(np.arange(24).reshape(2, 3, 4))
    assert_array_equal(flattened, np.arange(24))
def test_custom_split(self):
    """Split data partitioned by ``CustomPartitioner``: first a simulated
    half-splitter, then a fully customized working/validation split.
    """
    lower_half = [0, 1, 2, 3, 4]
    upper_half = [5, 6, 7, 8, 9]

    # simulate half splitter
    half_partitioner = CustomPartitioner([(None, lower_half),
                                          (None, upper_half)])
    splitter = Splitter(attr='partitions')
    splits = [list(splitter.generate(part))
              for part in half_partitioner.generate(self.data)]
    self.assertTrue(len(splits) == 2)

    for pair in splits:
        self.assertTrue(len(pair) == 2)
        self.assertTrue(pair[0].nsamples == 50)
        self.assertTrue(pair[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check fully customized split with working and validation set
    # specified
    custom_partitioner = CustomPartitioner([([0, 3, 4], [5, 9])])
    # the unselected partition of the data is to be discarded, hence
    # attr_values; these two splitters should do exactly the same thing
    by_values = Splitter(attr='partitions', attr_values=[1, 2])
    by_ignore = Splitter(attr='partitions', ignore_values=(0,))
    for splitter in (by_values, by_ignore):
        splits = [list(splitter.generate(part))
                  for part in custom_partitioner.generate(self.data)]
        self.assertTrue(len(splits) == 1)

        for pair in splits:
            self.assertTrue(len(pair) == 2)
            self.assertTrue(pair[0].nsamples == 30)
            self.assertTrue(pair[1].nsamples == 20)

        validation_chunks = splits[0][1].sa['chunks'].unique
        self.assertTrue((validation_chunks == [5, 9]).all())
        working_chunks = splits[0][0].sa['chunks'].unique
        self.assertTrue((working_chunks == [0, 3, 4]).all())
def test_splitter():
    """Exercise ``Splitter`` on sample and feature attributes, with
    custom attribute values, and chained through ``ChainNode``.
    """
    ds = give_data()

    # split with defaults
    by_chunks = Splitter('chunks')
    # a direct call is expected to raise; only generate() is used
    assert_raises(NotImplementedError, by_chunks, ds)
    chunk_splits = list(by_chunks.generate(ds))
    assert_equal(len(chunk_splits), len(ds.sa['chunks'].unique))
    for part in chunk_splits:
        # it should have performed basic slicing!
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.sa['chunks'].unique), 1)
        assert_true('lastsplit' in part.a)
    assert_true(chunk_splits[-1].a.lastsplit)

    # now again, more customized
    by_targets = Splitter('targets',
                          attr_values=[0, 1, 1, 2, 3, 3, 3],
                          count=4,
                          noslicing=True)
    target_splits = list(by_targets.generate(ds))
    assert_equal(len(target_splits), 4)
    for part in target_splits:
        # it should NOT have performed basic slicing!
        assert_false(part.samples.base is ds.samples)
        assert_equal(len(part.sa['targets'].unique), 1)
        assert_equal(len(part.sa['chunks'].unique), 10)
    assert_true(target_splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(target_splits[1].samples, target_splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    by_roi = Splitter('roi')
    roi_splits = list(by_roi.generate(ds))
    assert_equal(len(roi_splits), 2)
    for part in roi_splits:
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.fa['roi'].unique), 1)
        assert_equal(part.shape, (100, 5))

    # and finally test chained splitters
    chained = ChainNode([by_targets, by_roi, by_chunks])
    chained_splits = list(chained.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(chained_splits), 80)
def test_cosmo_queryengine(fn):
    """Build query engines via ``cosmo.from_any`` from a matlab-form
    dict, from the file `fn` it is saved to, and from an existing
    ``CosmoQueryEngine``, and check ids, neighbors and attributes.
    """
    skip_if_no_external('scipy', min_version='0.8')

    nbrhood_mat = _create_small_mat_nbrhood_dict()
    neighbors = nbrhood_mat['neighbors']
    savemat(fn, nbrhood_mat)

    # test dict in matlab form, filename, and through QueryEngine loader
    sources = (nbrhood_mat,
               fn,
               cosmo.CosmoQueryEngine.from_mat(neighbors))
    for source in sources:
        qe = cosmo.from_any(source)
        assert_array_equal(qe.ids, [0, 1, 2, 3])

        for center_id in qe.ids:
            # stored neighbor ids are shifted by one to compare against
            # what the query engine reports
            expected_ids = neighbors[0, center_id][0] - 1
            assert_array_equal(qe.query_byid(center_id), expected_ids)

        _assert_ds_mat_attributes_equal(qe, nbrhood_mat, ('fa', 'a'))
def test_streamline_random_mapper(self):
    """Train a ``PrototypeMapper`` with randomly chosen streamline
    prototypes and two similarities; check the projection shape.
    """
    self.build_streamline_things()

    # Adding one more similarity to test multiple similarities in the
    # streamline case:
    self.similarities.append(StreamlineSimilarity(distance=corouge))

    # half of the samples become prototypes, but at least one
    fraction = 0.5
    n_prototypes = max(int(len(self.dataset.samples) * fraction), 1)
    shuffled_order = np.random.permutation(self.dataset.samples.size)
    shuffled_samples = self.dataset.samples[shuffled_order]
    self.prototypes_random = shuffled_samples[:n_prototypes]

    self.pm = PrototypeMapper(similarities=self.similarities,
                              prototypes=self.prototypes_random,
                              demean=False)
    self.pm.train(self.dataset.samples)

    # test size:
    expected_shape = (
        len(self.dataset.samples),
        len(self.prototypes_random) * len(self.similarities))
    assert_array_equal(self.pm.proj.shape, expected_shape)
def test_splitter():
    """Check ``Splitter`` with default and customized settings, on a
    feature attribute, and as part of a ``ChainNode``.
    """
    ds = give_data()
    n_chunks = len(ds.sa['chunks'].unique)

    # split with defaults
    spl_chunks = Splitter('chunks')
    assert_raises(NotImplementedError, spl_chunks, ds)
    pieces = list(spl_chunks.generate(ds))
    assert_equal(len(pieces), n_chunks)
    for piece in pieces:
        # it should have performed basic slicing!
        assert_true(piece.samples.base is ds.samples)
        assert_equal(len(piece.sa['chunks'].unique), 1)
        assert_true('lastsplit' in piece.a)
    last_piece = pieces[-1]
    assert_true(last_piece.a.lastsplit)

    # now again, more customized
    requested_values = [0, 1, 1, 2, 3, 3, 3]
    spl_targets = Splitter('targets', attr_values=requested_values,
                           count=4, noslicing=True)
    pieces = list(spl_targets.generate(ds))
    assert_equal(len(pieces), 4)
    for piece in pieces:
        # it should NOT have performed basic slicing!
        assert_false(piece.samples.base is ds.samples)
        assert_equal(len(piece.sa['targets'].unique), 1)
        assert_equal(len(piece.sa['chunks'].unique), 10)
    last_piece = pieces[-1]
    assert_true(last_piece.a.lastsplit)

    # two should be identical
    assert_array_equal(pieces[1].samples, pieces[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl_roi = Splitter('roi')
    pieces = list(spl_roi.generate(ds))
    assert_equal(len(pieces), 2)
    for piece in pieces:
        assert_true(piece.samples.base is ds.samples)
        assert_equal(len(piece.fa['roi'].unique), 1)
        assert_equal(piece.shape, (100, 5))

    # and finally test chained splitters
    chain = ChainNode([spl_targets, spl_roi, spl_chunks])
    pieces = list(chain.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(pieces), 80)
def test_collections():
    """Basic behaviour of SampleAttributesCollection.

    Checks that a fresh collection is empty, that a scalar value and a
    name clashing with the dict interface are both rejected with
    ValueError, that an assigned sequence gets wrapped into an
    ArrayCollectable, and that a deep copy keeps length and content.
    """
    attrs = SampleAttributesCollection()
    assert_equal(len(attrs), 0)

    # a scalar value is not accepted
    assert_raises(ValueError, attrs.__setitem__, 'test', 0)

    attrs['test'] = range(5)
    # the assigned sequence is auto-wrapped
    assert_true(isinstance(attrs['test'], ArrayCollectable))
    assert_equal(len(attrs), 1)

    # names which are already present in dict interface
    assert_raises(ValueError, attrs.__setitem__, 'values', range(5))

    attrs_copy = copy.deepcopy(attrs)
    assert_equal(len(attrs), len(attrs_copy))
    assert_array_equal(attrs.test, attrs_copy.test)
def test_sifter():
    """Sifter keeps only partitionings whose testing part has both targets.

    Somewhat duplicates the doctest: a 4-sample dataset is partitioned
    with NFoldPartitioner(cvtype=2) over chunks and sifted so that
    partition 2 contains both targets 'c' and 'p'; four partitionings are
    expected to survive.
    """
    samples = np.arange(8).reshape((4, 2))
    ds = Dataset(samples=samples,
                 sa={'chunks': [0, 1, 2, 3],
                     'targets': ['c', 'c', 'p', 'p']})

    partitioner = NFoldPartitioner(cvtype=2, attr='chunks')
    sifter = Sifter([('partitions', 2),
                     ('targets', ['c', 'p'])])
    generator = ChainNode([partitioner, sifter])

    partitioned = list(generator.generate(ds))
    assert_equal(len(partitioned), 4)

    for ds_ in partitioned:
        # testing part (2) first, then training part (1): both targets
        # must be present in each of them
        for partition_id in (2, 1):
            subset = ds[ds_.sa.partitions == partition_id]
            assert_array_equal(np.unique(subset.sa.targets), ['c', 'p'])
def test_label_splitter(self):
    """Split odd/even target partitionings of ``self.data`` by 'partitions'.

    The first partitioning is expected to split into targets [0, 2] and
    [1, 3]; the second one into the same groups in swapped order.
    """
    partitioner = OddEvenPartitioner(attr='targets')
    splitter = Splitter(attr='partitions')

    splits = []
    for partitioned in partitioner.generate(self.data):
        splits.append(list(splitter.generate(partitioned)))

    # expected unique targets, indexed as [partitioning][split]
    expected = (([0, 2], [1, 3]),
                ([1, 3], [0, 2]))
    for fold, per_split in enumerate(expected):
        for part, targets in enumerate(per_split):
            assert_array_equal(splits[fold][part].sa['targets'].unique,
                               targets)
def test_label_splitter(self):
    """Verify target groups after odd/even partitioning and splitting.

    ``self.data`` is partitioned by OddEvenPartitioner on 'targets' and
    every partitioning is split on the 'partitions' attribute; the unique
    targets of each resulting split are compared to the expected groups.
    """
    odd_even = OddEvenPartitioner(attr='targets')
    by_partition = Splitter(attr='partitions')
    splits = [list(by_partition.generate(partitioning))
              for partitioning in odd_even.generate(self.data)]

    def unique_targets(fold, part):
        # unique target values of one split of one partitioning
        return splits[fold][part].sa['targets'].unique

    evens, odds = [0, 2], [1, 3]
    assert_array_equal(unique_targets(0, 0), evens)
    assert_array_equal(unique_targets(0, 1), odds)
    assert_array_equal(unique_targets(1, 0), odds)
    assert_array_equal(unique_targets(1, 1), evens)
def test_subset_filler():
    """Check the ``filler`` argument of StaticFeatureSelection.

    Forward mapping must be the same for every filler value; on reverse
    mapping the columns from index 3 onwards are expected to hold the
    filler (-1 and NaN are checked).
    """
    plain = StaticFeatureSelection(np.arange(3))
    zero_filled = StaticFeatureSelection(np.arange(3), filler=0)
    minus_one_filled = StaticFeatureSelection(np.arange(3), filler=-1)
    nan_filled = StaticFeatureSelection(np.arange(3), filler=np.nan)

    data = np.arange(12).astype(float).reshape((2, -1))

    plain.train(data)
    data_forwarded = plain.forward(data)

    # the filler must not influence forward mapping
    for mapper in (plain, zero_filled, minus_one_filled, nan_filled):
        mapper.train(data)
        assert_array_equal(data_forwarded, mapper.forward(data))

    # columns beyond the selected ones carry the filler after reverse
    reversed_minus_one = minus_one_filled.reverse(data_forwarded)
    tail_minus_one = reversed_minus_one[:, 3:]
    ok_(np.all(tail_minus_one == -1))

    reversed_nan = nan_filled.reverse(data_forwarded)
    tail_nan = reversed_nan[:, 3:]
    ok_(np.all(np.isnan(tail_nan)))
def test_subset_filler():
    """StaticFeatureSelection: fillers affect reverse() only, not forward().

    Four mappers selecting features 0..2 are built (no filler, 0, -1 and
    NaN).  All must forward the data identically, while reversing is
    expected to put the filler into the columns from index 3 onwards.
    """
    sel_default = StaticFeatureSelection(np.arange(3))
    sel_zero = StaticFeatureSelection(np.arange(3), filler=0)
    sel_neg = StaticFeatureSelection(np.arange(3), filler=-1)
    sel_nan = StaticFeatureSelection(np.arange(3), filler=np.nan)

    # 2 samples x 6 features of floats
    data = np.arange(12).astype(float).reshape((2, -1))

    # reference forward mapping from the mapper without explicit filler
    sel_default.train(data)
    reference = sel_default.forward(data)

    for selection in (sel_default, sel_zero, sel_neg, sel_nan):
        selection.train(data)
        forwarded = selection.forward(data)
        assert_array_equal(reference, forwarded)

    # reverse mapping: non-selected columns are set to the filler
    restored_neg = sel_neg.reverse(reference)
    ok_(np.all(restored_neg[:, 3:] == -1))

    restored_nan = sel_nan.reverse(reference)
    ok_(np.all(np.isnan(restored_nan[:, 3:])))