def test_query_engine():
    data = np.arange(54)
    # indices in 3D
    ind = np.transpose((np.ones((3, 3, 3)).nonzero()))
    # sphere generator with a diameter of 3 elements
    sphere = ne.Sphere(1)
    # dataset with just one "space"
    ds = Dataset([data, data], fa={'s_ind': np.concatenate((ind, ind))})
    # and the query engine attaching the generator to the "index-space"
    qe = ne.IndexQueryEngine(s_ind=sphere)
    # cannot train since the engine does not know about the second space
    assert_raises(ValueError, qe.train, ds)
    # now do it again with a full spec
    ds = Dataset([data, data],
                 fa={'s_ind': np.concatenate((ind, ind)),
                     't_ind': np.repeat([0, 1], 27)})
    qe = ne.IndexQueryEngine(s_ind=sphere, t_ind=None)
    qe.train(ds)
    # internal representation check
    # YOH: invalid for new implementation with lookup tables (dictionaries)
    #assert_array_equal(qe._searcharray,
    #                   np.arange(54).reshape(qe._searcharray.shape) + 1)
    # should give us one corner, collapsing the 't_ind'
    assert_array_equal(qe(s_ind=(0, 0, 0)),
                       [0, 1, 3, 9, 27, 28, 30, 36])
    # directly specifying an index for 't_ind' without having an ROI
    # generator, should give the same corner, but just once
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=0), [0, 1, 3, 9])
    # just out of the mask -- no match
    assert_array_equal(qe(s_ind=(3, 3, 3)), [])
    # also out of the mask -- but single match
    assert_array_equal(qe(s_ind=(2, 2, 3), t_ind=1), [53])
    # query by id
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=0), qe[0])
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=[0, 1]),
                       qe(s_ind=(0, 0, 0)))
    # should not fail if t_ind is outside
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=[0, 1, 10]),
                       qe(s_ind=(0, 0, 0)))
    # should fail if asked about some unknown thing
    assert_raises(ValueError, qe.__call__, s_ind=(0, 0, 0), buga=0)

    # Test by using some literal feature attribute
    ds.fa['lit'] = ['roi1', 'ro2', 'r3'] * 18
    # should work as well as before
    assert_array_equal(qe(s_ind=(0, 0, 0)),
                       [0, 1, 3, 9, 27, 28, 30, 36])
    # should fail if asked about some unknown (yet) thing
    assert_raises(ValueError, qe.__call__, s_ind=(0, 0, 0), lit='roi1')

    # Create qe which can query literals as well
    qe_lit = ne.IndexQueryEngine(s_ind=sphere, t_ind=None, lit=None)
    qe_lit.train(ds)
    # should work as well as before
    assert_array_equal(qe_lit(s_ind=(0, 0, 0)),
                       [0, 1, 3, 9, 27, 28, 30, 36])
    # and subselect nicely -- only /3 ones
    assert_array_equal(qe_lit(s_ind=(0, 0, 0), lit='roi1'),
                       [0, 3, 9, 27, 30, 36])
    assert_array_equal(qe_lit(s_ind=(0, 0, 0), lit=['roi1', 'ro2']),
                       [0, 1, 3, 9, 27, 28, 30, 36])
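# A small self-contained sanity sketch (not part of the original test) for
# the ids expected above: the flat index of coordinate (x, y, z) in a 3x3x3
# grid is 9*x + 3*y + z, so the Sphere(1) neighborhood of the (0, 0, 0)
# corner maps to ids 0, 1, 3, 9, offset by 27 for the second t_ind plane.
def _sketch_corner_ids():
    neighborhood = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0)]
    flat = [9 * x + 3 * y + z for x, y, z in neighborhood]
    assert flat == [0, 1, 3, 9]
    assert [i + 27 for i in flat] == [27, 28, 30, 36]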
def test_mergeds():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['one'] = np.ones(5)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1, chunks=1)
    data1.fa['one'] = np.zeros(5)
    data2 = Dataset.from_wizard(np.ones((3, 5)), targets=2, chunks=1)
    data3 = Dataset.from_wizard(np.ones((4, 5)), targets=2)
    data4 = Dataset.from_wizard(np.ones((2, 5)), targets=3, chunks=2)
    data4.fa['test'] = np.arange(5)

    # cannot merge if there are attributes missing in one of the datasets
    assert_raises(DatasetError, data1.append, data0)

    merged = data1.copy()
    merged.append(data2)

    ok_(merged.nfeatures == 5)
    l12 = [1] * 5 + [2] * 3
    l1 = [1] * 8
    ok_((merged.targets == l12).all())
    ok_((merged.chunks == l1).all())

    data_append = data1.copy()
    data_append.append(data2)

    ok_(data_append.nfeatures == 5)
    ok_((data_append.targets == l12).all())
    ok_((data_append.chunks == l1).all())

    #
    # appending
    #

    # we need the same samples attributes in both datasets
    assert_raises(DatasetError, data2.append, data3)

    #
    # vstacking
    #
    if __debug__:
        # tested only in __debug__
        assert_raises(ValueError, vstack, (data0, data1, data2, data3))
    datasets = (data1, data2, data4)
    merged = vstack(datasets)
    assert_equal(merged.shape,
                 (np.sum([len(ds) for ds in datasets]), data1.nfeatures))
    assert_true('test' in merged.fa)
    assert_array_equal(merged.sa.targets, [1] * 5 + [2] * 3 + [3] * 2)

    #
    # hstacking
    #
    assert_raises(ValueError, hstack, datasets)
    datasets = (data0, data1)
    merged = hstack(datasets)
    assert_equal(merged.shape,
                 (len(data1), np.sum([ds.nfeatures for ds in datasets])))
    assert_true('chunks' in merged.sa)
    assert_array_equal(merged.fa.one, [1] * 5 + [0] * 5)
def _call(self, dataset):
    # This code is based on SciPy's stats.f_oneway()
    # Copyright (c) Gary Strangman. All rights reserved
    # License: BSD
    #
    # However, it got tweaked and optimized to better fit into PyMVPA.

    # number of groups
    targets_sa = dataset.sa[self._targets_attr]
    labels = targets_sa.value
    ul = targets_sa.unique

    na = len(ul)
    bign = float(dataset.nsamples)
    alldata = dataset.samples

    # total squares of sums
    sostot = np.sum(alldata, axis=0)
    sostot *= sostot
    sostot /= bign

    # total sum of squares
    sstot = np.sum(alldata * alldata, axis=0) - sostot

    # between group sum of squares
    ssbn = 0
    for l in ul:
        # all samples for the respective label
        d = alldata[labels == l]
        sos = np.sum(d, axis=0)
        sos *= sos
        ssbn += sos / float(len(d))

    ssbn -= sostot
    # within
    sswn = sstot - ssbn

    # degrees of freedom
    dfbn = na - 1
    dfwn = bign - na

    # mean sums of squares
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # ensure there are no NaNs -- otherwise, instead of a sane unittest
    # failure (check for NaNs), it leads to a crazy
    #   File "mtrand.pyx", line 1661, in mtrand.shuffle
    #   TypeError: object of type 'numpy.int64' has no len()
    # without any sane backtrace
    f[np.isnan(f)] = 0

    if externals.exists('scipy'):
        from scipy.stats import fprob
        return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
    else:
        return Dataset(f[np.newaxis])
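# If SciPy is available, the vectorized F-score computation above can be
# cross-checked featurewise against scipy.stats.f_oneway. This is an
# illustrative sketch with made-up data, not part of the original module.
def _sketch_f_oneway_crosscheck():
    import numpy as np
    from scipy.stats import f_oneway
    rng = np.random.RandomState(42)
    groups = [rng.standard_normal((10, 3)),
              rng.standard_normal((12, 3)) + 1.,
              rng.standard_normal((8, 3))]
    # one F value per feature (column), matching the featurewise f above
    f_ref = np.array([f_oneway(*[g[:, i] for g in groups])[0]
                      for i in range(3)])
    return f_ref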
def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 1)), targets=range(5), chunks=1)
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 1, targets=range(5), chunks=2))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 2, targets=range(5), chunks=3))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 3, targets=range(5), chunks=4))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 4, targets=range(5), chunks=5))
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [2, 2, 2, 2, 2])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())

    # keep the orig labels
    orig_labels = ds.targets[:]
    # also keep the orig dataset, but SHALLOW copy and leave everything
    # else as a view!
    ods = copy.copy(ds)

    ds.permute_targets()
    # some permutation should have happened
    assert_false((ds.targets == orig_labels).all())
    # but the original dataset should be unaffected
    assert_array_equal(ods.targets, orig_labels)
    # array subclass survives
    ok_(isinstance(ods.samples, myarray))

    # samples are really shared
    ds.samples[0, 0] = 123456
    assert_array_equal(ds.samples, ods.samples)

    # and other samples attributes too
    ds.chunks[0] = 9876
    assert_array_equal(ds.chunks, ods.chunks)

    # try to permute on custom target
    ds = ods.copy()
    otargets = ods.sa.targets.copy()
    ds.sa['custom'] = ods.sa.targets.copy()
    assert_array_equal(ds.sa.custom, otargets)
    assert_array_equal(ds.sa.targets, otargets)

    ds.permute_targets(targets_attr='custom')
    # original targets should still match
    assert_array_equal(ds.sa.targets, otargets)
    # but custom should get permuted
    assert_false((ds.sa.custom == otargets).all())
def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 10)), targets=range(5), chunks=1)
    for i in xrange(1, 5):
        ds.append(Dataset.from_wizard(np.ones((5, 10)) + i,
                                      targets=range(5), chunks=i + 1))
    # assign some feature attributes
    ds.fa['roi'] = np.repeat(np.arange(5), 2)
    ds.fa['lucky'] = np.arange(10) % 2
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [2, 2, 2, 2, 2])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())
def _call(self, dataset):
    # just for the beauty of it
    X = self._design

    # precompute transformation if not yet done
    if self._inv_design is None:
        self._inv_ip = (X.T * X).I
        self._inv_design = self._inv_ip * X.T

    # get parameter estimations for all features at once
    # (betas x features)
    betas = self._inv_design * dataset.samples

    # charge state
    self.ca.pe = pe = betas.T.A

    # if betas and no z-stats are desired return them right away
    if not self._voi == 'pe' or self.ca.is_enabled('zstat'):
        # compute residuals
        residuals = X * betas
        residuals -= dataset.samples

        # estimates of the parameter variance and compute zstats
        # assumption of mean(E) == 0 and equal variance
        # XXX next lines ignore off-diagonal elements and hence covariance
        # between regressors. The humble being writing these lines asks the
        # god of statistics for forgiveness, because it knows not what it does
        diag_ip = np.diag(self._inv_ip)
        # (features x betas)
        beta_vars = np.array([r.var() * diag_ip for r in residuals.T])

        # (parameter x feature)
        zstat = pe / np.sqrt(beta_vars)

        # charge state
        self.ca.zstat = zstat

    if self._voi == 'pe':
        # return as (beta x feature)
        result = Dataset(pe.T)
    elif self._voi == 'zstat':
        # return as (zstat x feature)
        result = Dataset(zstat.T)
    else:
        # we shall never get to this point
        raise ValueError, \
              "Unknown variable of interest '%s'" % str(self._voi)
    result.sa['regressor'] = np.arange(len(result))
    return result
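# Plain-NumPy sketch of the normal-equation estimate used above,
# betas = (X'X)^-1 X'Y, with a made-up design and data (illustration only;
# np.linalg.pinv is used instead of the explicit inverse for stability).
def _sketch_ols_betas():
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.standard_normal((20, 3))      # design: 20 samples x 3 regressors
    Y = rng.standard_normal((20, 5))      # data: 20 samples x 5 features
    betas = np.dot(np.linalg.pinv(X), Y)  # (regressors x features)
    residuals = np.dot(X, betas) - Y      # same orientation as above
    return betas, residuals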
def test_slicing(self):
    spl = HalfSplitter()
    splits = [(train, test) for (train, test) in spl(self.data)]
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base is self.data.samples)
        assert_true(s[1].samples.base is self.data.samples)
    spl = HalfSplitter(noslicing=True)
    splits = [(train, test) for (train, test) in spl(self.data)]
    for s in splits:
        # we get no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    spl = NFoldSplitter()
    splits = [(train, test) for (train, test) in spl(self.data)]
    for i, s in enumerate(splits):
        # the training part is sliced only for the first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(s[0].samples.base is self.data.samples)
        else:
            assert_false(s[0].samples.base is self.data.samples)
        # we get slicing all the time
        assert_true(s[1].samples.base is self.data.samples)
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    spl = OddEvenSplitter()
    splits = [(train, test) for (train, test) in spl(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base is step_ds.samples)
        assert_true(s[1].samples.base is step_ds.samples)
def test_feature_masking():
    mask = np.zeros((5, 3), dtype='bool')
    mask[2, 1] = True
    mask[4, 0] = True
    data = Dataset.from_wizard(np.arange(60).reshape((4, 5, 3)),
                               targets=1, chunks=1, mask=mask)

    # check simple masking
    ok_(data.nfeatures == 2)

    # selection should be idempotent
    ok_(data[:, mask].nfeatures == data.nfeatures)

    # check that the correct feature gets selected
    assert_array_equal(data[:, 1].samples[:, 0], [12, 27, 42, 57])
    # XXX put back when coord -> fattr is implemented
    #ok_(tuple(data[:, 1].a.mapper.getInId(0)) == (4, 0))
    ok_(data[:, 1].a.mapper.forward1(mask).shape == (1,))

    # check sugarings
    # XXX put me back
    #self.failUnless(np.all(data.I == data.origids))
    assert_array_equal(data.C, data.chunks)
    assert_array_equal(data.UC, np.unique(data.chunks))
    assert_array_equal(data.T, data.targets)
    assert_array_equal(data.UT, np.unique(data.targets))
    assert_array_equal(data.S, data.samples)
    assert_array_equal(data.O, data.mapper.reverse(data.samples))
def get_data(self):
    data = np.random.standard_normal((100, 2, 2, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    return Dataset.from_wizard(samples=data, targets=labels, chunks=chunks)
def test_labelschunks_access():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = range(4)
    chunks = [1, 1, 2, 2]
    ds = Dataset.from_wizard(samples, labels, chunks)

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    # moreover they should point to the same thing
    ok_(ds.targets is ds.sa.targets)
    ok_(ds.targets is ds.sa['targets'].value)
    ok_(ds.chunks is ds.sa.chunks)
    ok_(ds.chunks is ds.sa['chunks'].value)

    # assignment should work at all levels including 1st
    ds.targets = chunks
    assert_array_equal(ds.targets, chunks)
    ok_(ds.targets is ds.sa.targets)
    ok_(ds.targets is ds.sa['targets'].value)

    # test broadcasting
    # but not for plain scalars
    assert_raises(ValueError, ds.set_attr, 'sa.bc', 5)
    # and not for a plain str
    assert_raises(TypeError, ds.set_attr, 'sa.bc', "mike")
    # but for any iterable of len == 1
    ds.set_attr('sa.bc', (5,))
    ds.set_attr('sa.dc', ["mike"])
    assert_array_equal(ds.sa.bc, [5] * len(ds))
    assert_array_equal(ds.sa.dc, ["mike"] * len(ds))
def test_h5py_io():
    skip_if_no_external('h5py')

    tempdir = tempfile.mkdtemp()

    # store random dataset to file
    ds = datasets['3dlarge']
    ds.save(os.path.join(tempdir, 'plain.hdf5'))

    # reload and check for identity
    ds2 = Dataset.from_hdf5(os.path.join(tempdir, 'plain.hdf5'))
    assert_array_equal(ds.samples, ds2.samples)
    for attr in ds.sa:
        assert_array_equal(ds.sa[attr].value, ds2.sa[attr].value)
    for attr in ds.fa:
        assert_array_equal(ds.fa[attr].value, ds2.fa[attr].value)
    assert_true(len(ds.a.mapper), 2)
    # since we have no __equal__ do at least some comparison
    if __debug__:
        # debug mode needs special test as it enhances the repr output
        # with module info and id() appendix for objects
        assert_equal('#'.join(repr(ds.a.mapper).split('#')[:-1]),
                     '#'.join(repr(ds2.a.mapper).split('#')[:-1]))
    else:
        assert_equal(repr(ds.a.mapper), repr(ds2.a.mapper))

    # cleanup temp dir
    shutil.rmtree(tempdir, ignore_errors=True)
def zscore(ds, **kwargs):
    """In-place Z-scoring of a `Dataset` or `ndarray`.

    This function behaves identically to `ZScoreMapper`. The only difference
    is that the actual Z-scoring is done in-place -- potentially causing a
    significant reduction of memory demands.

    Parameters
    ----------
    ds : Dataset or ndarray
      The data that will be Z-scored in-place.
    **kwargs
      For all other arguments, please see the documentation of
      `ZScoreMapper`.
    """
    zm = ZScoreMapper(**kwargs)
    zm._secret_inplace_zscore = True
    # train
    if isinstance(ds, Dataset):
        zm.train(ds)
    else:
        zm.train(Dataset(ds))
    # map
    mapped = zm(ds)
    # and append the mapper to the dataset
    if isinstance(mapped, Dataset):
        mapped._append_mapper(zm)
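# Hypothetical usage sketch for zscore() above (the keyword argument name
# is an assumption inherited from ZScoreMapper; see its documentation):
#
#   ds = Dataset(np.random.randn(20, 5),
#                sa={'chunks': np.repeat([0, 1], 10)})
#   zscore(ds, chunks_attr='chunks')  # assumed kwarg; zscoring is in-place
#   # ds.samples now has roughly zero mean and unit variance per chunk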
def test_multidim_attrs():
    samples = np.arange(24).reshape(2, 3, 4)
    # have a dataset with two samples -- mapped from 2d into 1d
    # but have 2d labels and 3d chunks -- whatever that is
    ds = Dataset.from_wizard(samples.copy(),
                             targets=samples.copy(),
                             chunks=np.random.normal(size=(2, 10, 4, 2)))
    assert_equal(ds.nsamples, 2)
    assert_equal(ds.nfeatures, 12)
    assert_equal(ds.sa.targets.shape, (2, 3, 4))
    assert_equal(ds.sa.chunks.shape, (2, 10, 4, 2))

    # try slicing
    subds = ds[0]
    assert_equal(subds.nsamples, 1)
    assert_equal(subds.nfeatures, 12)
    assert_equal(subds.sa.targets.shape, (1, 3, 4))
    assert_equal(subds.sa.chunks.shape, (1, 10, 4, 2))

    # add multidim feature attr
    fattr = ds.mapper.forward(samples)
    assert_equal(fattr.shape, (2, 12))
    # should puke -- first axis is #samples
    assert_raises(ValueError, ds.fa.__setitem__, 'moresamples', fattr)
    # but that should be fine
    ds.fa['moresamples'] = fattr.T
    assert_equal(ds.fa.moresamples.shape, (12, 2))
def test_origmask_extraction():
    origdata = np.random.standard_normal((10, 2, 4, 3))
    data = Dataset.from_wizard(origdata, targets=2, chunks=2)

    # check with custom mask
    sel = data[:, 5]
    ok_(sel.samples.shape[1] == 1)
def _call(self, dataset):
    """Computes featurewise f-scores using compound comparisons."""

    targets_sa = dataset.sa[self._targets_attr]
    orig_labels = targets_sa.value
    labels = orig_labels.copy()

    # Let's create a very shallow copy of a dataset with just
    # samples and targets_attr
    dataset_mod = Dataset(dataset.samples,
                          sa={self._targets_attr: labels})
    results = []
    for ul in targets_sa.unique:
        labels[orig_labels == ul] = 1
        labels[orig_labels != ul] = 2
        f_ds = OneWayAnova._call(self, dataset_mod)
        if 'fprob' in f_ds.fa:
            # rename the fprob attribute to something label specific
            # to survive final aggregation stage
            f_ds.fa['fprob_' + str(ul)] = f_ds.fa.fprob
            del f_ds.fa['fprob']
        results.append(f_ds)

    results = vstack(results)
    results.sa[self._targets_attr] = targets_sa.unique
    return results
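# Plain-NumPy sketch of the one-vs-rest relabeling performed in the loop
# above: each unique target in turn becomes group 1, everything else group 2.
def _sketch_compound_relabel():
    import numpy as np
    orig_labels = np.array([0, 1, 2, 0, 1, 2])
    ul = 1
    labels = np.where(orig_labels == ul, 1, 2)
    assert list(labels) == [2, 1, 2, 2, 1, 2]
    return labels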
def _call(self, dataset):
    # XXX Hm... it might make sense to unify access functions
    # naming across our swig libsvm wrapper and sg access
    # functions for svm
    clf = self.clf
    sgsvm = clf.svm
    sens_labels = None
    if isinstance(sgsvm, shogun.Classifier.MultiClassSVM):
        sens, biases = [], []
        nsvms = sgsvm.get_num_svms()
        clabels = sorted(clf._attrmap.values())
        nclabels = len(clabels)
        sens_labels = []
        isvm = 0                        # index for svm among known

        for i in xrange(nclabels):
            for j in xrange(i + 1, nclabels):
                sgsvmi = sgsvm.get_svm(isvm)
                labels_tuple = (clabels[i], clabels[j])
                # Since we gave the labels in incremental order,
                # we always should be right -- but it does not
                # hurt to check if set of labels is the same
                if __debug__ and _shogun_exposes_slavesvm_labels:
                    if not sgsvmi.get_labels():
                        # We need to call classify() so labels get assigned
                        # to the multiclass SVM
                        sgsvm.classify()
                    assert(set([sgsvmi.get_label(int(x))
                                for x in sgsvmi.get_support_vectors()])
                           == set(labels_tuple))
                sens1, bias = self.__sg_helper(sgsvmi)
                sens.append(sens1)
                biases.append(bias)
                sens_labels += [labels_tuple[::-1]] # ??? positive first
                isvm += 1
        assert(len(sens) == nsvms)      # we should have covered all
    else:
        sens1, bias = self.__sg_helper(sgsvm)
        biases = np.atleast_1d(bias)
        sens = np.atleast_2d(sens1)
        if not clf.__is_regression__:
            assert(set(clf._attrmap.values()) == set([-1.0, 1.0]))
            assert(sens.shape[0] == 1)
            sens_labels = [(-1.0, 1.0)]

    ds = Dataset(np.atleast_2d(sens))
    if sens_labels is not None:
        if isinstance(sens_labels[0], tuple):
            # Need to have them in array of dtype object
            sens_labels = asobjarray(sens_labels)
        if len(clf._attrmap):
            sens_labels = clf._attrmap.to_literal(sens_labels, recurse=True)
        ds.sa[clf.params.targets_attr] = sens_labels

    self.ca.biases = biases

    return ds
def test_samples_shape():
    ds = Dataset.from_wizard(np.ones((10, 2, 3, 4)), targets=1, chunks=1)
    ok_(ds.samples.shape == (10, 24))

    # what happens to 1D samples
    ds = Dataset(np.arange(5))
    assert_equal(ds.shape, (5, 1))
    assert_equal(ds.nfeatures, 1)
def test_ex_from_masked():
    ds = Dataset.from_wizard(samples=np.atleast_2d(np.arange(5)).view(myarray),
                             targets=1, chunks=1)
    # simple sequence has to be a single pattern
    assert_equal(ds.nsamples, 1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    # check correct pattern layout (1x5)
    assert_array_equal(ds.samples, [[0, 1, 2, 3, 4]])

    # check for single label and origin
    assert_array_equal(ds.targets, [1])
    assert_array_equal(ds.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(DatasetError, ds.append,
                  Dataset.from_wizard(np.ones((2, 3)), targets=1, chunks=1))

    # now add two real patterns
    ds.append(Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                  targets=2, chunks=2))
    assert_equal(ds.nsamples, 3)
    assert_array_equal(ds.targets, [1, 2, 2])
    assert_array_equal(ds.chunks, [1, 2, 2])

    # test unique class labels
    ds.append(Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                  targets=3, chunks=5))
    assert_array_equal(ds.sa['targets'].unique, [1, 2, 3])

    # test wrong attributes length
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)), targets=[1, 2, 3],
                  chunks=2)
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)), targets=[1, 2, 3, 4],
                  chunks=[2, 2, 2])

    # now test one that is using from_masked
    ds = datasets['3dlarge']
    for a in ds.sa:
        assert_equal(len(ds.sa[a].value), len(ds))
    for a in ds.fa:
        assert_equal(len(ds.fa[a].value), ds.nfeatures)
def test_shape_conversion():
    ds = Dataset.from_wizard(np.arange(24).reshape((2, 3, 4)).view(myarray),
                             targets=1, chunks=1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    assert_equal(ds.nsamples, 2)
    assert_equal(ds.samples.shape, (2, 12))
    assert_array_equal(ds.samples, [range(12), range(12, 24)])
def test_llemapper():
    skip_if_no_external('mdp', min_version='2.4')

    ds = Dataset(np.array([[0., 0., 0.], [0., 0., 1.], [0., 1., 0.],
                           [1., 0., 0.], [0., 1., 1.], [1., 0., 1.],
                           [1., 1., 0.], [1., 1., 1.]]))
    pm = LLEMapper(3, output_dim=2)
    pm.train(ds)
    fmapped = pm(ds)
    assert_equal(fmapped.shape, (8, 2))
def test_icamapper():
    # data: 40 sample feature line in 2d space (40x2; samples x features)
    samples = np.vstack([np.arange(40.) for i in range(2)]).T
    samples -= samples.mean()
    samples += np.random.normal(size=samples.shape, scale=0.1)
    ndlin = Dataset(samples)

    pm = ICAMapper()
    try:
        pm.train(ndlin.copy())
        assert_equal(pm.proj.shape, (2, 2))

        p = pm.forward(ndlin.copy())
        assert_equal(p.shape, (40, 2))
        # check that the mapped data can be fully recovered by 'reverse()'
        assert_array_almost_equal(pm.reverse(p), ndlin)
    except mdp.NodeException:
        # do not puke if the ICA did not converge at all -- that is not our
        # fault but MDP's
        pass
def _call(self, dataset=None):
    """Extract weights from GLMNET classifier.

    GLMNET always has weights available, so nothing has to be computed here.
    """
    clf = self.clf
    weights = clf.weights

    if __debug__:
        debug('GLMNET',
              "Extracting weights for GLMNET - " +
              "Result: min=%f max=%f" %
              (np.min(weights), np.max(weights)))

    #return weights
    if clf.params.family == 'multinomial':
        return Dataset(weights.T,
                       sa={clf.params.targets_attr: clf._utargets})
    else:
        return Dataset(weights[np.newaxis])
def setUp(self):
    data = np.random.standard_normal((100, 3, 4, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    mask = np.ones((3, 4, 2), dtype='bool')
    mask[0, 0, 0] = 0
    mask[1, 3, 1] = 0
    self.dataset = Dataset.from_wizard(samples=data, targets=labels,
                                       chunks=chunks, mask=mask)
def test_pcamapper():
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    ndlin = Dataset(np.concatenate([np.arange(40)
                                    for i in range(20)]).reshape(20, -1).T)

    pm = PCAMapper()
    # train PCA
    assert_raises(mdp.NodeException, pm.train, ndlin)
    ndlin.samples = ndlin.samples.astype('float')
    ndlin_noise = ndlin.copy()
    ndlin_noise.samples += np.random.random(size=ndlin.samples.shape)
    # we have no variance for more than one PCA component, hence just one
    # actual non-zero eigenvalue
    assert_raises(mdp.NodeException, pm.train, ndlin)
    pm.train(ndlin_noise)
    assert_equal(pm.proj.shape, (20, 20))
    # now project data into PCA space
    p = pm.forward(ndlin.samples)
    assert_equal(p.shape, (40, 20))
    # check that the mapped data can be fully recovered by 'reverse()'
    assert_array_almost_equal(pm.reverse(p), ndlin)
def _predict(self, data):
    """Predict the class labels for the provided data.

    Returns a list of class labels (one for each data sample).
    """
    # make sure we're talking about arrays
    data = np.asarray(data)

    # checks only in debug mode
    if __debug__:
        if not data.ndim == 2:
            raise ValueError, "Data array must be two-dimensional."

        if not data.shape[1] == self.__data.nfeatures:
            raise ValueError, "Length of data samples (features) does " \
                              "not match the classifier."

    # compute the distance matrix between training and test data with
    # distances stored row-wise, i.e. distances between test sample [0]
    # and all training samples will end up in row 0
    dists = self.__dfx(self.__data.samples, data).T
    if self.ca.is_enabled('distances'):
        # TODO: theoretically we should have used deepcopy for sa
        #       here
        self.ca.distances = Dataset(dists, fa=self.__data.sa.copy())

    # determine the k nearest neighbors per test sample
    knns = dists.argsort(axis=1)[:, :self.__k]

    # predicted class labels will go here
    predicted = []

    if self.__voting == 'majority':
        vfx = self.get_majority_vote
    elif self.__voting == 'weighted':
        vfx = self.get_weighted_vote
    else:
        raise ValueError, "kNN told to perform unknown voting '%s'." \
              % self.__voting

    # perform voting
    results = [vfx(knn) for knn in knns]

    # extract predictions
    predicted = [r[0] for r in results]

    # store the predictions in the state. Relies on State._setitem to do
    # nothing if the relevant state member is not enabled
    self.ca.predictions = predicted
    self.ca.estimates = np.array([r[1] for r in results])

    return predicted
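# Plain-NumPy sketch of the neighbor lookup performed above: pairwise
# Euclidean distances with test samples row-wise, then argsort per row.
# Data and k are made up for illustration.
def _sketch_knn_lookup():
    import numpy as np
    rng = np.random.RandomState(1)
    train = rng.standard_normal((10, 3))
    test = rng.standard_normal((4, 3))
    diffs = test[:, None, :] - train[None, :, :]
    dists = np.sqrt((diffs ** 2).sum(axis=2))   # (ntest x ntrain)
    knns = dists.argsort(axis=1)[:, :3]         # 3 nearest training samples
    return knns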
def _call(self, dataset):
    # first cast to floating point dtype, because noise is most likely
    # floating point as well and '+=' on int would not do the right thing
    if not np.issubdtype(dataset.samples.dtype, np.float):
        ds = dataset.copy(deep=False)
        ds.samples = dataset.samples.astype('float32')
        dataset = ds

    if __debug__:
        nfeatures = dataset.nfeatures

    # using a list here, to be able to handle output of unknown
    # dimensionality
    sens_map = []

    # compute the datameasure on the original dataset
    # this is used as a baseline
    orig_measure = self.__datameasure(dataset)

    # do for every _single_ feature in the dataset
    for feature in xrange(dataset.nfeatures):
        if __debug__:
            debug('PSA', "Analyzing %i features: %i [%i%%]" \
                  % (nfeatures,
                     feature + 1,
                     float(feature + 1) / nfeatures * 100,), cr=True)

        # store current feature to restore it later on
        current_feature = dataset.samples[:, feature].copy()

        # add noise to current feature
        dataset.samples[:, feature] += self.__noise(size=len(dataset))

        # compute the datameasure on the perturbed dataset
        perturbed_measure = self.__datameasure(dataset)

        # restore the current feature
        dataset.samples[:, feature] = current_feature

        # difference from original datameasure is sensitivity
        sens_map.append(perturbed_measure.samples - orig_measure.samples)

    if __debug__:
        debug('PSA', '')

    # turn into an array and get rid of unnecessary axes -- ideally yielding
    # a 2D array
    sens_map = np.array(sens_map).squeeze()
    # swap the first two axes: we have nfeatures on first but want it as
    # second in a dataset
    sens_map = np.swapaxes(sens_map, 0, 1)
    return Dataset(sens_map)
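# Minimal stand-alone sketch of the perturbation loop above: add noise to
# one feature at a time and record the change of a scalar measure. The
# stand-in measure is made up for illustration.
def _sketch_perturbation():
    import numpy as np
    rng = np.random.RandomState(2)
    data = rng.standard_normal((30, 5))
    measure = lambda d: d.var(axis=0).sum()     # made-up datameasure
    baseline = measure(data)
    sens = []
    for f in range(data.shape[1]):
        saved = data[:, f].copy()
        data[:, f] += rng.normal(size=len(data))
        sens.append(measure(data) - baseline)
        data[:, f] = saved                      # restore, as above
    return sens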
def test_fxmapper():
    origdata = np.arange(24).reshape(3, 8)
    ds = Dataset(origdata.copy())
    ds.samples *= -1

    # test a mapper that doesn't change the shape
    # it shouldn't matter along which axis it is applied
    m_s = FxMapper('samples', np.absolute)
    m_f = FxMapper('features', np.absolute)
    a_m = absolute_features()
    assert_array_equal(m_s.forward(ds), origdata)
    assert_array_equal(a_m.forward(ds), origdata)
    assert_array_equal(m_s.forward(ds), m_f.forward(ds))
def aggregate_features(dataset, fx=np.mean):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in
    the way that NumPy uses it (e.g. np.mean, np.var).

    Returns
    -------
    a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=np.array(agg, ndmin=2).T, sa=dataset.sa)
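# Hypothetical usage sketch for aggregate_features() above (dataset content
# is made up; any NumPy reduction honouring `axis` works as `fx`):
#
#   ds = Dataset(np.arange(12).reshape(3, 4))
#   mean_ds = aggregate_features(ds)            # shape (3, 1)
#   max_ds = aggregate_features(ds, fx=np.max)  # np.max honours `axis` too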
def test_basic_datamapping():
    samples = np.arange(24).reshape((4, 3, 2)).view(myarray)

    ds = Dataset.from_wizard(samples)

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    # mapper should end up in the dataset
    ok_(ds.a.has_key('mapper'))

    # check correct mapping
    ok_(ds.nsamples == 4)
    ok_(ds.nfeatures == 6)
def _call(self, dataset=None):
    """Extract weights from LARS classifier.

    LARS always has weights available, so nothing has to be computed here.
    """
    clf = self.clf
    weights = clf.weights

    if __debug__:
        debug('LARS',
              "Extracting weights for LARS - " +
              "Result: min=%f max=%f" %
              (np.min(weights), np.max(weights)))

    return Dataset(np.atleast_2d(weights))
def test_anova(self):
    """Additional aspects of OnewayAnova
    """
    oa = OneWayAnova()
    oa_custom = OneWayAnova(targets_attr='custom')

    ds = datasets['uni4large']
    ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})

    r = oa(ds)
    self.failUnlessRaises(KeyError, oa_custom, ds)
    r_custom = oa_custom(ds_custom)

    self.failUnless(np.allclose(r.samples, r_custom.samples))

    # we should get the same results on subsequent runs
    r2 = oa(ds)
    r_custom2 = oa_custom(ds_custom)
    self.failUnless(np.allclose(r.samples, r2.samples))
    self.failUnless(np.allclose(r_custom.samples, r_custom2.samples))
def test_masked_featureselection():
    origdata = np.random.standard_normal((10, 2, 4, 3, 5)).view(myarray)
    data = Dataset.from_wizard(origdata, targets=2, chunks=2)

    unmasked = data.samples.copy()
    # array subclass survives
    ok_(isinstance(data.samples, myarray))

    # default must be no mask
    ok_(data.nfeatures == 120)
    ok_(data.a.mapper.forward1(origdata[0]).shape == (120,))

    # check that full mask uses all features
    # this uses auto-mapping of selection arrays in __getitem__
    sel = data[:, np.ones((2, 4, 3, 5), dtype='bool')]
    ok_(sel.nfeatures == data.samples.shape[1])
    ok_(data.nfeatures == 120)

    # check partial array mask
    partial_mask = np.zeros((2, 4, 3, 5), dtype='bool')
    partial_mask[0, 0, 2, 2] = 1
    partial_mask[1, 2, 2, 0] = 1

    sel = data[:, partial_mask]
    ok_(sel.nfeatures == 2)

    # check that feature selection does not change source data
    ok_(data.nfeatures == 120)
    ok_(data.a.mapper.forward1(origdata[0]).shape == (120,))

    # check selection with feature list
    sel = data[:, [0, 37, 119]]
    ok_(sel.nfeatures == 3)

    # check size of the masked samples
    ok_(sel.samples.shape == (10, 3))

    # check that the right features are selected
    assert_array_equal(unmasked[:, [0, 37, 119]], sel.samples)
def test_featuregroup_mapper():
    ds = Dataset(np.arange(24).reshape(3, 8))
    ds.fa['roi'] = [0, 1] * 4
    # just to check
    ds.sa['chunks'] = np.arange(3)

    # correct results
    csamples = [[3, 4], [11, 12], [19, 20]]
    croi = [0, 1]
    cchunks = np.arange(3)

    m = mean_group_feature(['roi'])
    mds = m.forward(ds)
    assert_equal(mds.shape, (3, 2))
    assert_array_equal(mds.samples, csamples)
    assert_array_equal(mds.fa.roi, np.unique([0, 1] * 4))
    # SAs should simply remain the same
    assert_array_equal(mds.sa.chunks, np.arange(3))

    # now without grouping
    m = mean_feature()
    # forwarding just the samples should yield the same result
    assert_array_equal(m.forward(ds.samples),
                       m.forward(ds).samples)

    # And when operating on a dataset with >1D samples, then operate
    # only across "features", i.e. 1st dimension
    ds = Dataset(np.arange(24).reshape(3, 2, 2, 2))
    mapped = ds.get_mapped(m)
    assert_array_equal(m.forward(ds.samples),
                       mapped.samples)
    assert_array_equal(mapped.samples.shape, (3, 2, 2))
    assert_array_equal(mapped.samples, np.mean(ds.samples, axis=1))
    # and still could map back? ;) not ATM, so just to ensure consistency
    assert_raises(NotImplementedError,
                  mapped.a.mapper.reverse, mapped.samples)

    # but it should also work with standard 2d sample arrays
    ds = Dataset(np.arange(24).reshape(3, 8))
    mapped = ds.get_mapped(m)
    assert_array_equal(mapped.samples.shape, (3, 1))
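# Plain-NumPy equivalent of the roi-wise feature averaging tested above,
# reproducing the expected `csamples` values:
def _sketch_group_means():
    import numpy as np
    samples = np.arange(24).reshape(3, 8)
    roi = np.array([0, 1] * 4)
    grouped = np.array([samples[:, roi == r].mean(axis=1)
                        for r in np.unique(roi)]).T
    assert (grouped == [[3, 4], [11, 12], [19, 20]]).all()
    return grouped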
def _call(self, dataset=None):
    """Extract weights from SMLR classifier.

    SMLR always has weights available, so nothing has to be computed here.
    """
    clf = self.clf
    # transpose to have the number of features on the second axis
    # (as usual)
    weights = clf.weights.T

    if clf.params.has_bias:
        self.ca.biases = clf.biases

    if __debug__:
        debug('SMLR',
              "Extracting weights for %d-class SMLR" %
              (len(weights) + 1) +
              " Result: min=%f max=%f" %
              (np.min(weights), np.max(weights)))

    # limit the labels to the number of sensitivity sets, to deal
    # with the case of `fit_all_weights=False`
    return Dataset(weights,
                   sa={clf.params.targets_attr: clf._ulabels[:len(weights)]})
def test_flatten():
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                             [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                             [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                             [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that only has a FlattenMapper as the one element
    for fm in [FlattenMapper(inspace='voxel'),
               ChainMapper([FlattenMapper(inspace='voxel'),
                            FeatureSliceMapper(slice(None))])]:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)

        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

    # try dataset mode, with some feature attribute
    fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
    ds = Dataset(data, fa={'awesome': fattr.copy()})
    assert_equal(ds.samples.shape, data_shape)

    fm.train(ds)
    dsflat = fm.forward(ds)
    ok_(isinstance(dsflat, Dataset))
    ok_(isinstance(dsflat.samples, myarray))
    assert_array_equal(dsflat.samples, target)
    assert_array_equal(dsflat.fa.awesome, np.arange(np.prod(samples_shape)))
    assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))

    # test index creation
    assert_array_equal(index_target, dsflat.fa.voxel)

    # and back
    revds = fm.reverse(dsflat)
    ok_(isinstance(revds, Dataset))
    ok_(isinstance(revds.samples, myarray))
    assert_array_equal(revds.samples, data)
    assert_array_equal(revds.fa.awesome, fattr)
    assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
    assert_false('voxel' in revds.fa)
def _call(self, dataset):
    """Train linear SVM on `dataset` and extract weights from classifier.
    """
    sens = self.__mult * (np.arange(dataset.nfeatures)
                          - int(dataset.nfeatures / 2))
    return Dataset(sens[np.newaxis])
def wrap_samples(obj, data, *args, **kwargs):
    if is_datasetlike(data):
        return fx(obj, data, *args, **kwargs)
    else:
        return fx(obj, Dataset(data), *args, **kwargs)
def _call(self, dataset, callables=[]):
    # local bindings
    clf = self.clf
    model = clf.model

    # Labels for sensitivities to be returned
    sens_labels = None

    if clf.__is_regression__:
        nr_class = None
        svm_labels = None     # shouldn't bother to provide "targets" for regressions
    else:
        nr_class = model.nr_class
        svm_labels = model.labels

    # No need to warn since by default we do not do anything evil and
    # provide labels -- so it is up to the user to decide whether he
    # wants to do something silly
    #if nr_class != 2:
    #    warning("You are estimating sensitivity for SVM %s trained on %d" %
    #            (str(clf), nr_class) +
    #            " classes. Make sure that it is what you intended to do" )

    svcoef = np.matrix(model.get_sv_coef())
    svs = np.matrix(model.get_sv())
    rhos = np.asarray(model.get_rho())

    self.ca.biases = rhos
    if self.params.split_weights:
        if nr_class != 2:
            raise NotImplementedError, \
                  "Cannot compute per-class weights for" \
                  " non-binary classification task"
        # libsvm might have different idea on the ordering
        # of labels, so we would need to map them back explicitly
        ds_labels = list(dataset.sa[clf.params.targets_attr].unique) # labels in the dataset
        senses = [None for i in ds_labels]
        # first label is given positive value
        for i, (c, l) in enumerate([(svcoef > 0, lambda x: x),
                                    (svcoef < 0, lambda x: x * -1)]):
            # convert to array, and just take the meaningful dimension
            c_ = c.A[0]
            # NOTE svm_labels are numerical; ds_labels are literal
            senses[ds_labels.index(
                        clf._attrmap.to_literal(svm_labels[i]))] = \
                (l(svcoef[:, c_] * svs[c_, :])).A[0]
        weights = np.array(senses)
        sens_labels = svm_labels
    else:
        # XXX yoh: .mean() effectively averages across "sensitivities"
        # of all paired classifiers (I think). See more info on this
        # topic in svm.py on how sv_coefs are stored
        #
        # First multiply SV coefficients with the actual SVs to get
        # weighted impact of SVs on decision, then for each feature
        # take mean across SVs to get a single weight value
        # per feature
        if nr_class is None or nr_class <= 2:
            # as simple as this
            weights = (svcoef * svs).A
            # and only in case of classification
            if nr_class:
                # ??? First label seems to correspond to positive
                sens_labels = [tuple(svm_labels[::-1])]
        else:
            # we need to compose correctly per each pair of classifiers.
            # See docstring for get_sv_coef for more details on internal
            # structure of bloody storage

            # total # of pairs
            npairs = nr_class * (nr_class - 1) / 2
            # # of SVs in each class
            NSVs_perclass = model.get_n_sv()
            # indices where each class starts in each row of SVs
            # name is after similar variable in libsvm internals
            nz_start = np.cumsum([0] + NSVs_perclass[:-1])
            nz_end = nz_start + NSVs_perclass
            # reserve storage
            weights = np.zeros((npairs, svs.shape[1]))
            ipair = 0               # index of the pair
            """
            // classifier (i,j): coefficients with
            // i are in sv_coef[j-1][nz_start[i]...],
            // j are in sv_coef[i][nz_start[j]...]
            """
            sens_labels = []
            for i in xrange(nr_class):
                for j in xrange(i + 1, nr_class):
                    weights[ipair, :] = np.asarray(
                        svcoef[j - 1, nz_start[i]:nz_end[i]]
                        * svs[nz_start[i]:nz_end[i]]
                        +
                        svcoef[i, nz_start[j]:nz_end[j]]
                        * svs[nz_start[j]:nz_end[j]])
                    # ??? First label corresponds to positive
                    #     that is why [j], [i]
                    sens_labels += [(svm_labels[j], svm_labels[i])]
                    ipair += 1      # go to the next pair

            assert(ipair == npairs)

    if __debug__ and 'SVM' in debug.active:
        if nr_class:
            nsvs = model.get_n_sv()
        else:
            nsvs = model.get_total_n_sv()
        debug('SVM',
              "Extracting weights for %s-class SVM: #SVs=%s, " %
              (nr_class, nsvs) +
              " SVcoefshape=%s SVs.shape=%s Rhos=%s." %
              (svcoef.shape, svs.shape, rhos) +
              " Result: min=%f max=%f" %
              (np.min(weights), np.max(weights)))

    ds_kwargs = {}
    if nr_class:          # for classification only
        # and we should have prepared the labels
        assert(sens_labels is not None)

        if len(clf._attrmap):
            if isinstance(sens_labels[0], tuple):
                # Need to have them in array of dtype object
                sens_labels = asobjarray(sens_labels)
            sens_labels = clf._attrmap.to_literal(sens_labels, recurse=True)

        # NOTE: `weights` is already and always 2D
        ds_kwargs = dict(sa={clf.params.targets_attr: sens_labels})

    weights_ds = Dataset(weights, **ds_kwargs)
    return weights_ds
def generate_testing_datasets(specs):
    # Let's permute upon each invocation of test, so we could possibly
    # trigger some funny cases
    nonbogus_pool = np.random.permutation([0, 1, 3, 5])

    datasets = {}

    # use a partitioner to flag odd/even samples as training and test
    ttp = OddEvenPartitioner(space='train', count=1)

    for kind, spec in specs.iteritems():
        # set of univariate datasets
        for nlabels in [2, 3, 4]:
            basename = 'uni%d%s' % (nlabels, kind)
            nonbogus_features = nonbogus_pool[:nlabels]

            dataset = normal_feature_dataset(
                nlabels=nlabels,
                nonbogus_features=nonbogus_features,
                **spec)

            # full dataset
            datasets[basename] = list(ttp.generate(dataset))[0]

        # sample 3D
        total = 2 * spec['perlabel']
        nchunks = spec['nchunks']
        data = np.random.standard_normal((total, 3, 6, 6))
        labels = np.concatenate((np.repeat(0, spec['perlabel']),
                                 np.repeat(1, spec['perlabel'])))
        data[:, 1, 0, 0] += 2 * labels           # add some signal
        chunks = np.asarray(range(nchunks) * (total / nchunks))
        mask = np.ones((3, 6, 6), dtype='bool')
        mask[0, 0, 0] = 0
        mask[1, 3, 2] = 0
        ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks,
                                 mask=mask, space='myspace')
        # and to stress tests on manipulating sa/fa possibly containing
        # attributes of dtype object
        ds.sa['test_object'] = [['a'], [1, 2]] * (ds.nsamples / 2)
        datasets['3d%s' % kind] = ds

    # some additional datasets
    datasets['dumb2'] = dumb_feature_binary_dataset()
    datasets['dumb'] = dumb_feature_dataset()
    # dataset with few invariant features
    _dsinv = dumb_feature_dataset()
    _dsinv.samples = np.hstack((_dsinv.samples,
                                np.zeros((_dsinv.nsamples, 1)),
                                np.ones((_dsinv.nsamples, 1))))
    datasets['dumbinv'] = _dsinv

    # Datasets for regressions testing
    datasets['sin_modulated'] = list(
        ttp.generate(multiple_chunks(sin_modulated, 4, 30, 1)))[0]
    # use the same full dataset for training
    datasets['sin_modulated_train'] = datasets['sin_modulated']
    datasets['sin_modulated_test'] = sin_modulated(30, 1, flat=True)

    # simple signal for linear regressors
    datasets['chirp_linear'] = multiple_chunks(chirp_linear,
                                               6, 50, 10, 2, 0.3, 0.1)
    datasets['chirp_linear_test'] = chirp_linear(20, 5, 2, 0.4, 0.1)

    datasets['wr1996'] = multiple_chunks(wr1996, 4, 50)
    datasets['wr1996_test'] = wr1996(50)

    datasets['hollow'] = Dataset(HollowSamples((40, 20)),
                                 sa={'targets': np.tile(['one', 'two'], 20)})

    return datasets
def _call(self, dataset):
    """Perform cross-validation on a dataset.

    'dataset' is passed to the splitter instance and serves as the source
    dataset to generate split for the single cross-validation folds.
    """
    # store the results of the splitprocessor
    results = []
    self.ca.splits = []

    # local bindings
    ca = self.ca
    clf = self.__transerror.clf
    expose_testdataset = self.__expose_testdataset

    # what ca to enable in terr
    terr_enable = []
    for state_var in ['confusion', 'training_confusion', 'samples_error']:
        if ca.is_enabled(state_var):
            terr_enable += [state_var]

    # charge ca with initial values
    summaryClass = clf.__summary_class__
    clf_hastestdataset = hasattr(clf, 'testdataset')

    self.ca.confusion = summaryClass()
    self.ca.training_confusion = summaryClass()
    self.ca.transerrors = []
    if ca.is_enabled('samples_error'):
        dataset.init_origids('samples',
                             attr=self.__samples_idattr, mode='existing')
        self.ca.samples_error = dict(
            [(id_, []) for id_ in dataset.sa[self.__samples_idattr].value])

    # enable requested ca in child TransferError instance (restored
    # again below)
    if len(terr_enable):
        self.__transerror.ca.change_temporarily(enable_ca=terr_enable)

    # We better ensure that underlying classifier is not trained if we
    # are going to deepcopy transerror
    if ca.is_enabled("transerrors"):
        self.__transerror.untrain()

    # collect summary info about the splits that were made, for the
    # resulting dataset
    splitinfo = []

    # splitter
    for split in self.__splitter(dataset):
        splitinfo.append(
            "%s->%s"
            % (','.join([str(c)
                         for c in split[0].sa[self.__splitter.splitattr].unique]),
               ','.join([str(c)
                         for c in split[1].sa[self.__splitter.splitattr].unique])))

        # only train classifier if splitter provides something in first
        # element of tuple -- that is the behavior of TransferError
        if ca.is_enabled("splits"):
            self.ca.splits.append(split)

        if ca.is_enabled("transerrors"):
            # copy first and then train, as some classifiers cannot be copied
            # when already trained, e.g. SWIG'ed stuff
            lastsplit = None
            for ds in split:
                if ds is not None:
                    lastsplit = ds.a.lastsplit
                    break
            if lastsplit:
                # only if we could deduce that it was last split
                # use the 'mother' transerror
                transerror = self.__transerror
            else:
                # otherwise -- deep copy
                transerror = deepcopy(self.__transerror)
        else:
            transerror = self.__transerror

        # assign testing dataset if given classifier can digest it
        if clf_hastestdataset and expose_testdataset:
            transerror.clf.testdataset = split[1]

        # run the beast
        result = transerror(split[1], split[0])

        # unbind the testdataset from the classifier
        if clf_hastestdataset and expose_testdataset:
            transerror.clf.testdataset = None

        # next line is important for 'self._harvest' call
        self._harvest(locals())

        # XXX Look below -- maybe we should have not auto added .?
        #     then transerrors also could be deprecated
        if ca.is_enabled("transerrors"):
            self.ca.transerrors.append(transerror)

        # XXX: could be merged with next for loop using a utility class
        # that can add dict elements into a list
        if ca.is_enabled("samples_error"):
            for k, v in transerror.ca.samples_error.iteritems():
                self.ca.samples_error[k].append(v)

        # pull in child ca
        for state_var in ['confusion', 'training_confusion']:
            if ca.is_enabled(state_var):
                ca[state_var].value.__iadd__(transerror.ca[state_var].value)

        if __debug__:
            debug("CROSSC", "Split #%d: result %s"
                  % (len(results), `result`))
        results.append(result)

    # Since we could have operated with a copy -- bind the last used one back
    self.__transerror = transerror

    # put ca of child TransferError back into original config
    if len(terr_enable):
        self.__transerror.ca.reset_changed_temporarily()

    self.ca.results = results
    """Store conditional attribute if it is enabled"""

    results = Dataset(results, sa={'cv_fold': splitinfo})
    return results
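# Minimal sketch of the leave-one-chunk-out logic that the splitter loop
# above orchestrates, in plain Python over a made-up chunks array:
def _sketch_chunk_splits():
    import numpy as np
    chunks = np.repeat([0, 1, 2], 4)
    folds = []
    for test_chunk in np.unique(chunks):
        train_idx = np.where(chunks != test_chunk)[0]
        test_idx = np.where(chunks == test_chunk)[0]
        folds.append((train_idx, test_idx))
    return folds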
def fmri_dataset(samples, targets=None, chunks=None, mask=None, sprefix='voxel', tprefix='time', add_fa=None,): """Create a dataset from an fMRI timeseries image. The timeseries image serves as the samples data, with each volume becoming a sample. All 3D volume samples are flattened into one-dimensional feature vectors, optionally being masked (i.e. subset of voxels corresponding to non-zero elements in a mask image). In addition to (optional) samples attributes for targets and chunks the returned dataset contains a number of additional attributes: Samples attributes (per each volume): * volume index (time_indices) * volume acquisition time (time_coord) Feature attributes (per each voxel): * voxel indices (voxel_indices), sometimes referred to as ijk Dataset attributes: * dump of the image (e.g. NIfTI) header data (imghdr) * class of the image (e.g. Nifti1Image) (imgtype) * volume extent (voxel_dim) * voxel extent (voxel_eldim) The default attribute name is listed in parenthesis, but may be altered by the corresponding prefix arguments. The validity of the attribute values relies on correct settings in the NIfTI image header. Parameters ---------- samples : str or NiftiImage or list fMRI timeseries, specified either as a filename (single file 4D image), an image instance (4D image), or a list of filenames or image instances (each list item corresponding to a 3D volume). targets : scalar or sequence Label attribute for each volume in the timeseries, or a scalar value that is assigned to all samples. chunks : scalar or sequence Chunk attribute for each volume in the timeseries, or a scalar value that is assigned to all samples. mask : str or NiftiImage Filename or image instance of a 3D volume mask. Voxels corresponding to non-zero elements in the mask will be selected. The mask has to be in the same space (orientation and dimensions) as the timeseries image sprefix : str or None Prefix for attribute names describing spatial properties of the timeseries. If None, no such attributes are stored in the dataset. tprefix : str or None Prefix for attribute names describing temporal properties of the timeseries. If None, no such attributes are stored in the dataset. add_fa : dict or None Optional dictionary with additional volumetric data that shall be stored as feature attributes in the dataset. The dictionary key serves as the feature attribute name. Each value might be of any type supported by the 'mask' argument of this function. 
    Returns
    -------
    Dataset
    """
    # load the samples
    imgdata, imghdr, imgtype = _load_anyimg(samples, ensure=True,
                                            enforce_dim=4)

    # figure out what the mask is, but only handle known cases; the rest
    # goes directly into the mapper, which may know more
    maskimg = _load_anyimg(mask)
    if maskimg is not None:
        # take just the data and ignore the header
        mask = maskimg[0]

    # compile the samples attributes
    sa = {}
    if targets is not None:
        sa['targets'] = _expand_attribute(targets, imgdata.shape[0], 'targets')
    if chunks is not None:
        sa['chunks'] = _expand_attribute(chunks, imgdata.shape[0], 'chunks')

    # create a dataset
    ds = Dataset(imgdata, sa=sa)
    if sprefix is None:
        space = None
    else:
        space = sprefix + '_indices'
    ds = ds.get_mapped(FlattenMapper(shape=imgdata.shape[1:], space=space))

    # now apply the mask if any
    if mask is not None:
        flatmask = ds.a.mapper.forward1(mask)
        # direct slicing is possible, and it is potentially more efficient,
        # so let's use it
        #mapper = StaticFeatureSelection(flatmask)
        #ds = ds.get_mapped(StaticFeatureSelection(flatmask))
        ds = ds[:, flatmask != 0]

    # load and store additional feature attributes
    if add_fa is not None:
        for fattr in add_fa:
            value = _load_anyimg(add_fa[fattr], ensure=True)[0]
            ds.fa[fattr] = ds.a.mapper.forward1(value)

    # store interesting props in the dataset
    ds.a['imghdr'] = imghdr
    ds.a['imgtype'] = imgtype
    # if there is a space assigned, store the extent of that space
    if sprefix is not None:
        ds.a[sprefix + '_dim'] = imgdata.shape[1:]
        # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
        ds.a[sprefix + '_eldim'] = _get_voxdim(imghdr)
        # TODO extend with the unit
    if tprefix is not None:
        ds.sa[tprefix + '_indices'] = np.arange(len(ds), dtype='int')
        ds.sa[tprefix + '_coords'] = np.arange(len(ds), dtype='float') \
                                     * _get_dt(imghdr)
        # TODO extend with the unit

    return ds

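# The example below is an illustrative sketch, not part of the original
# module: it shows how fmri_dataset() above is meant to be called and which
# attributes it populates. The filenames 'bold.nii.gz' and 'mask.nii.gz' are
# hypothetical placeholders (as are the 20 volumes implied by the targets),
# so this only runs where such images actually exist.
def _example_fmri_dataset_usage():
    ds = fmri_dataset('bold.nii.gz',                 # 4D timeseries image
                      targets=[0, 1] * 10,           # one label per volume
                      chunks=np.repeat([0, 1], 10),  # two runs of 10 volumes
                      mask='mask.nii.gz')            # 3D brain mask
    # per-volume samples attributes (from tprefix='time')
    assert 'time_indices' in ds.sa and 'time_coords' in ds.sa
    # per-voxel feature attribute (from sprefix='voxel')
    assert 'voxel_indices' in ds.fa
    # dataset attributes: header dump, image class, and spatial extents
    for attr in ('imghdr', 'imgtype', 'voxel_dim', 'voxel_eldim'):
        assert attr in ds.a
    return ds
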
def get_data(self):
    data = np.random.standard_normal((100, 2, 2, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    return Dataset.from_wizard(samples=data, targets=labels, chunks=chunks)

def test_from_wizard():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = range(4)
    chunks = [1, 1, 2, 2]

    ds = Dataset(samples, sa={'targets': labels, 'chunks': chunks})
    ds.init_origids('both')
    first = ds.sa.origids
    # now do it again and check that they get regenerated
    ds.init_origids('both')
    assert_false(first is ds.sa.origids)
    assert_array_equal(first, ds.sa.origids)

    ok_(is_datasetlike(ds))
    ok_(not is_datasetlike(labels))

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    ## XXX stuff that needs thought:

    # ds.sa (empty) has this in the public namespace:
    #   add, get, getvalue, has_key, is_set, items, listing, name, names
    #   owner, remove, reset, setvalue, which_set
    # maybe we need some form of lightweight collection?

    assert_array_equal(ds.samples, samples)
    assert_array_equal(ds.sa.targets, labels)
    assert_array_equal(ds.sa.chunks, chunks)

    # same should work for shortcuts
    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    ok_(sorted(ds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(ds.fa.keys()) == ['origids'])
    # add some more
    ds.a['random'] = 'blurb'

    # check stripping attributes from a copy
    cds = ds.copy()  # full copy
    ok_(sorted(cds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(cds.fa.keys()) == ['origids'])
    ok_(sorted(cds.a.keys()) == ['random'])
    cds = ds.copy(sa=[], fa=[], a=[])  # plain copy
    ok_(cds.sa.keys() == [])
    ok_(cds.fa.keys() == [])
    ok_(cds.a.keys() == [])
    cds = ds.copy(sa=['targets'], fa=None, a=['random'])  # partial copy
    ok_(cds.sa.keys() == ['targets'])
    ok_(cds.fa.keys() == ['origids'])
    ok_(cds.a.keys() == ['random'])

    # there is not necessarily a mapper present
    ok_(not ds.a.has_key('mapper'))

    # has to complain about misshaped samples attributes
    assert_raises(ValueError, Dataset.from_wizard, samples, labels + labels)

    # check that we actually have attributes of the expected type
    ok_(isinstance(ds.sa['targets'], ArrayCollectable))

    # the dataset will take care of not adding stupid stuff
    assert_raises(ValueError, ds.sa.__setitem__, 'stupid', np.arange(3))
    assert_raises(ValueError, ds.fa.__setitem__, 'stupid', np.arange(4))
    # or of changing proper attributes to stupid shapes
    try:
        ds.sa.targets = np.arange(3)
    except ValueError:
        pass
    else:
        ok_(False, msg="Assigning value with improper shape to attribute "
                       "did not raise exception.")

def fmri_dataset(samples, targets=None, chunks=None, mask=None,
                 sprefix="voxel", tprefix="time", add_fa=None):
    """Create a dataset from an fMRI timeseries image.

    The timeseries image serves as the samples data, with each volume becoming
    a sample. All 3D volume samples are flattened into one-dimensional feature
    vectors, optionally being masked (i.e. only the subset of voxels
    corresponding to non-zero elements in a mask image is kept).

    In addition to (optional) samples attributes for targets and chunks, the
    returned dataset contains a number of additional attributes:

    Samples attributes (one per volume):

      * volume index (time_indices)
      * volume acquisition time (time_coords)

    Feature attributes (one per voxel):

      * voxel indices (voxel_indices), sometimes referred to as ijk

    Dataset attributes:

      * dump of the NIfTI image header data (imghdr)
      * volume extent (voxel_dim)
      * voxel extent (voxel_eldim)

    The default attribute names are listed in parentheses, but may be altered
    by the corresponding prefix arguments. The validity of the attribute
    values relies on correct settings in the NIfTI image header.

    Parameters
    ----------
    samples : str or NiftiImage or list
      fMRI timeseries, specified either as a filename (single file 4D image),
      an image instance (4D image), or a list of filenames or image instances
      (each list item corresponding to a 3D volume).
    targets : scalar or sequence
      Label attribute for each volume in the timeseries, or a scalar value
      that is assigned to all samples.
    chunks : scalar or sequence
      Chunk attribute for each volume in the timeseries, or a scalar value
      that is assigned to all samples.
    mask : str or NiftiImage
      Filename or image instance of a 3D volume mask. Voxels corresponding to
      non-zero elements in the mask will be selected. The mask has to be in
      the same space (orientation and dimensions) as the timeseries image.
    sprefix : str or None
      Prefix for attribute names describing spatial properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    tprefix : str or None
      Prefix for attribute names describing temporal properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    add_fa : dict or None
      Optional dictionary with additional volumetric data that shall be
      stored as feature attributes in the dataset. The dictionary key serves
      as the feature attribute name. Each value may be of any type supported
      by the 'mask' argument of this function.
    Returns
    -------
    Dataset
    """
    # load the samples
    niftisamples = _load_anynifti(samples, ensure=True, enforce_dim=4)
    samples = niftisamples.data

    # figure out what the mask is, but only handle known cases; the rest
    # goes directly into the mapper, which may know more
    niftimask = _load_anynifti(mask)
    if niftimask is not None:
        if isinstance(niftimask, np.ndarray):
            mask = niftimask
        else:
            mask = _get_nifti_data(niftimask)

    # compile the samples attributes
    sa = {}
    if targets is not None:
        sa["targets"] = _expand_attribute(targets, samples.shape[0], "targets")
    if chunks is not None:
        sa["chunks"] = _expand_attribute(chunks, samples.shape[0], "chunks")

    # create a dataset
    ds = Dataset(samples, sa=sa)
    if sprefix is None:
        inspace = None
    else:
        inspace = sprefix + "_indices"
    ds = ds.get_mapped(FlattenMapper(shape=samples.shape[1:], inspace=inspace))

    # now apply the mask if any
    if mask is not None:
        flatmask = ds.a.mapper.forward1(mask)
        # direct slicing is possible, and it is potentially more efficient,
        # so let's use it
        #mapper = FeatureSliceMapper(flatmask)
        #ds = ds.get_mapped(FeatureSliceMapper(flatmask))
        ds = ds[:, flatmask != 0]

    # load and store additional feature attributes
    if add_fa is not None:
        for fattr in add_fa:
            value = _get_nifti_data(_load_anynifti(add_fa[fattr]))
            ds.fa[fattr] = ds.a.mapper.forward1(value)

    # Store interesting props in the dataset.
    # Do not put the whole NiftiImage in the dict, as it will most
    # likely be deepcopy'ed at some point, and ensuring data integrity
    # of the complex Python-C-SWIG hybrid might be tricky. Storing only
    # the header dict achieves the same, is more memory-efficient, and
    # even simpler.
    ds.a["imghdr"] = niftisamples.header

    # if there is a space assigned, store the extent of that space
    if sprefix is not None:
        ds.a[sprefix + "_dim"] = samples.shape[1:]
        # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
        ds.a[sprefix + "_eldim"] = tuple(reversed(niftisamples.voxdim))
        # TODO extend with the unit
    if tprefix is not None:
        ds.sa[tprefix + "_indices"] = np.arange(len(ds), dtype="int")
        ds.sa[tprefix + "_coords"] = np.arange(len(ds), dtype="float") \
                                     * niftisamples.header["pixdim"][4]
        # TODO extend with the unit

    return ds

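# An aside on the design choice documented above (storing only the header
# dict instead of the whole NiftiImage): a plain dict survives a deepcopy of
# the dataset cleanly, which is exactly what the comment in the function
# relies on. This is a minimal self-contained sketch, not original code; the
# 'pixdim' values are made up for illustration.
def _example_header_deepcopy():
    import copy
    hdr = {'pixdim': [1., 3., 3., 3., 2.5, 0., 0., 0.]}  # toy header dict
    attrs = {'imghdr': hdr}
    attrs_copy = copy.deepcopy(attrs)
    # the copy is fully independent of the original header
    attrs_copy['imghdr']['pixdim'][4] = 2.0
    assert hdr['pixdim'][4] == 2.5
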
def _sl_call(self, dataset, roi_ids, nproc):
    """Call to GNBSearchlight
    """
    # local bindings
    gnb = self._gnb
    params = gnb.params
    splitter = self._splitter
    errorfx = self._errorfx
    qe = self._qe

    ## if False:
    ##     class A(object):
    ##         pass
    ##     self = A()
    ##     import numpy as np
    ##     from mvpa.clfs.gnb import GNB
    ##     from mvpa.datasets.splitters import NFoldSplitter
    ##     from mvpa.misc.errorfx import MeanMismatchErrorFx
    ##     #from mvpa.testing.datasets import datasets
    ##     from mvpa.datasets import Dataset
    ##     from mvpa.misc.neighborhood import IndexQueryEngine, Sphere
    ##     from mvpa.clfs.distance import absmin_distance
    ##     import time
    ##     if __debug__:
    ##         from mvpa.base import debug
    ##         debug.active += ['SLC.*']
    ##         # XXX is it that ugly?
    ##         debug.active.pop(debug.active.index('SLC_'))
    ##         debug.metrics += ['reltime']
    ##     dataset = datasets['3dlarge']
    ##     sphere = Sphere(radius=1,
    ##                     distance_func=absmin_distance)
    ##     qe = IndexQueryEngine(myspace=sphere)
    ##     # Francisco's data
    ##     dataset = ds_fp
    ##     qe = IndexQueryEngine(voxel_indices=sphere)
    ##     qe.train(dataset)
    ##     roi_ids = np.arange(dataset.nfeatures)
    ##     gnb = GNB()
    ##     params = gnb.params
    ##     splitter = NFoldSplitter()
    ##     errorfx = MeanMismatchErrorFx()

    if __debug__:
        time_start = time.time()

    targets_sa_name = params.targets_attr
    targets_sa = dataset.sa[targets_sa_name]

    if __debug__:
        debug_slc_ = 'SLC_' in debug.active

    # get the dataset information into easy vars
    X = dataset.samples
    if len(X.shape) != 2:
        raise ValueError, \
              'Unlike GNB, GNBSearchlight (for now) operates on already ' \
              'flattened datasets'
    labels = targets_sa.value
    ulabels = targets_sa.unique
    nlabels = len(ulabels)
    label2index = dict((l, il) for il, l in enumerate(ulabels))
    labels_numeric = np.array([label2index[l] for l in labels])
    ulabels_numeric = [label2index[l] for l in ulabels]
    # set the feature dimensions
    nsamples = len(X)
    s_shape = X.shape[1:]               # shape of a single sample

    #
    # Everything toward optimization ;)
    #
    # Silly Yarik thinks that it might be worth pre-computing
    # statistics per each feature within a block of the samples
    # which always come together in splits -- most often it is a
    # (chunk, label) combination, but since we simply use a
    # splitter -- who knows!  Therefore lets figure out what those
    # blocks are and operate on them instead of the original samples.
    #
    # After additional thinking about this -- probably it would be
    # just a minor additional improvement (i.e. not worth it), but
    # since it is coded already -- let it be so

    # 1. Query splitter for the splits we will have
    if __debug__:
        debug('SLC',
              'Phase 1. Initializing splits using %s on %s'
              % (splitter, dataset))
    # check the splitter -- splitcfg isn't sufficient
    # TODO: RF splitters so we could reliably obtain the configuration;
    #       splitcfg just returns what goes into which split in terms
    #       of chunks... and we need actual indices
    if splitter.permute_attr is not None:
        raise NotImplementedError, \
              "Splitters which permute targets aren't supported here"

    # Lets just create a dummy ds which will store the actual sample
    # indices for us
    # XXX we could make it even more lightweight I guess...
    dataset_indices = Dataset(np.arange(nsamples), sa=dataset.sa)
    splits = list(splitter(dataset_indices))
    nsplits = len(splits)
    assert(len(splits[0]) == 2)         # assure that we have only 2
                                        # splits here for cvte

    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug('SLC',
              'Phase 2. Blocking data for %i splits and %i labels'
              % (nsplits, nlabels))
    # array of indices for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations
    combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
    # labels
    combinations[:, 0] = labels_numeric
    for isplit, (split1, split2) in enumerate(splits):
        combinations[split1.samples[:, 0], 1 + isplit] = 1
        combinations[split2.samples[:, 0], 1 + isplit] = 2
    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(list(set(descriptions)))
    nblocks = len(udescriptions)
    description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
    # indices for samples to point to their block
    sample2block = np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    if __debug__:
        debug('SLC',
              'Phase 3. Computing statistics for %i blocks' % (nblocks,))

    #
    # reusable containers which should stay of the same size
    #

    # sums and sums of squares per each block
    sums = np.zeros((nblocks,) + s_shape)
    # sums of squares
    sums2 = np.zeros((nblocks,) + s_shape)

    # per each label:
    means = np.zeros((nlabels,) + s_shape)
    # means of squares for stddev computation
    means2 = np.zeros((nlabels,) + s_shape)
    variances = np.zeros((nlabels,) + s_shape)
    # degenerate dimensions are added for easy broadcasting later on
    nsamples_per_class = np.zeros((nlabels,) + (1,) * len(s_shape))

    # results
    results = np.zeros((nsplits,) + s_shape)

    block_counts = np.zeros((nblocks,))
    block_labels = [None] * nblocks

    X2 = np.square(X)
    # silly way for now
    for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
        sums[ib] += s
        sums2[ib] += s2
        block_counts[ib] += 1
        if block_labels[ib] is None:
            block_labels[ib] = l
        else:
            assert(block_labels[ib] == l)
    block_labels = np.asanyarray(block_labels)
    # an additional silly test for the paranoid
    assert(block_labels.dtype.kind == 'i')

    # 4. Lets deduce all neighbors... might need to be RF'ed into the
    #    parallel part later on
    nrois = len(roi_ids)
    if __debug__:
        debug('SLC',
              'Phase 4. Deducing neighbors information for %i ROIs'
              % (nrois,))
    roi_fids = [qe.query_byid(f) for f in roi_ids]
    nroi_fids = len(roi_fids)
    # makes sense to waste precious ms only if ca is enabled
    if self.ca.is_enabled('roi_sizes'):
        roi_sizes = [len(x) for x in roi_fids]
    else:
        roi_sizes = []

    indexsum = self._indexsum
    if indexsum == 'sparse':
        if __debug__:
            debug('SLC',
                  'Phase 4b. Converting neighbors to sparse matrix '
                  'representation')
        # convert to "sparse representation" where column j contains
        # 1s only at the roi_fids[j] indices
        roi_fids = inds_to_coo(roi_fids,
                               shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError, \
              "Do not know how to deal with indexsum=%s" % indexsum

    # 5. Lets do the actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop')

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out, for the given split, the blocks we want to work with
        # sample indices
        training_sis = split[0].samples[:, 0]
        # convert to blocks of the training split
        training_bis = np.unique(sample2block[training_sis])

        # now lets do our GNB business
        training_nsamples = 0
        for il, l in enumerate(ulabels_numeric):
            bis_il = training_bis[block_labels[training_bis] == l]
            nsamples_per_class[il] = N_float = \
                float(np.sum(block_counts[bis_il]))
            training_nsamples += N_float
            if N_float == 0.0:
                variances[il] = means[il] = means2[il] = 0.
            else:
                means[il] = np.sum(sums[bis_il], axis=0) / N_float
                # not yet normed
                means2[il] = np.sum(sums2[bis_il], axis=0)

        ## Actually compute the non-0 variances
        non0labels = (nsamples_per_class.squeeze() != 0)
        if np.all(non0labels):
            # a possible tiny speed-up: avoid copying by
            # using (no) slicing
            non0labels = slice(None)

        if params.common_variance:
            variances[:] = \
                np.sum(means2 - nsamples_per_class * np.square(means),
                       axis=0) \
                / training_nsamples
        else:
            variances[non0labels] = \
                (means2 - nsamples_per_class * np.square(means))[non0labels] \
                / nsamples_per_class[non0labels]

        # assign priors
        priors = gnb._get_priors(nlabels, training_nsamples,
                                 nsamples_per_class)

        # proceed the way we do in the GNB code with logprob=True,
        # i.e. operating within the exponents -- should lead to some
        # performance advantage
        norm_weight = -0.5 * np.log(2 * np.pi * variances)
        # the last added dimension is for the ROIs
        logpriors = np.log(priors[:, np.newaxis, np.newaxis])

        if __debug__:
            debug('SLC', "  'Training' is done")

        # Now it is time to "classify" our samples, and for that we first
        # need to compute the corresponding probabilities (or rather their
        # unnormalized logarithms)
        data = X[split[1].samples[:, 0]]
        targets = labels_numeric[split[1].samples[:, 0]]

        # argument of the exponentiation
        scaled_distances = \
            -0.5 * (((data - means[:, np.newaxis, ...]) ** 2)
                    / variances[:, np.newaxis, ...])
        # incorporate the normalization from the normals
        lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances

        ## First we need to reshape to get class x samples x features
        lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1,))
        ## Now we come to the naive part which requires looping
        ## through all spheres
        if __debug__:
            debug('SLC', "  Doing 'Searchlight'")
        # resultant logprobs for each class x sample x roi
        lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids,))
        indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

        lprob_cs_sl += logpriors
        lprob_cs_cp_sl = lprob_cs_sl

        # for each of the ROIs take the class with maximal (log)probability
        predictions = lprob_cs_cp_sl.argmax(axis=0)
        # no need to map back [self.ulabels[c] for c in winners]
        #predictions = winners

        # assess the errors
        if __debug__:
            debug('SLC', "  Assessing accuracies")

        if isinstance(errorfx, MeanMismatchErrorFx):
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
        else:
            # somewhat silly, but a way which allows the use of pre-crafted
            # error functions without a chance to screw up
            for i, fpredictions in enumerate(predictions.T):
                results[isplit, i] = errorfx(fpredictions, targets)

    if __debug__:
        debug('SLC', "GNBSearchlight is done in %.3g sec"
              % (time.time() - time_start))

    return Dataset(results), roi_sizes

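# A toy illustration (not part of the original code) of the per-block
# optimization used in Phase 3 above: sums and sums-of-squares are
# accumulated once per block of samples that always travel together across
# splits, so the per-class means and variances for any training split reduce
# to summing a few block rows instead of re-iterating over all samples.
# All names here are local to the sketch.
def _example_block_stats():
    X = np.arange(12, dtype=float).reshape(6, 2)   # 6 samples, 2 features
    sample2block = np.array([0, 0, 1, 1, 2, 2])    # 3 blocks of 2 samples
    nblocks = 3
    sums = np.zeros((nblocks, 2))
    sums2 = np.zeros((nblocks, 2))
    counts = np.zeros(nblocks)
    # one pass over the data, exactly as in Phase 3
    for s, ib in zip(X, sample2block):
        sums[ib] += s
        sums2[ib] += s ** 2
        counts[ib] += 1
    # a hypothetical class consisting of blocks 0 and 2: its mean and
    # (population) variance follow from the block statistics alone
    bis = [0, 2]
    n = counts[bis].sum()
    mean = sums[bis].sum(axis=0) / n
    var = sums2[bis].sum(axis=0) / n - mean ** 2
    # matches a direct computation over the corresponding samples
    direct = X[np.in1d(sample2block, bis)]
    assert np.allclose(mean, direct.mean(axis=0))
    assert np.allclose(var, direct.var(axis=0))
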
def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 10)), targets=range(5), chunks=1)
    for i in xrange(1, 5):
        ds.append(Dataset.from_wizard(np.ones((5, 10)) + i,
                                      targets=range(5), chunks=i + 1))
    # assign some feature attributes
    ds.fa['roi'] = np.repeat(np.arange(5), 2)
    ds.fa['lucky'] = np.arange(10) % 2
    # use a subclass to test whether it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [2, 2, 2, 2, 2])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())

    # keep the orig labels
    orig_labels = ds.targets.copy()
    # also keep the orig dataset, but SHALLOW copy and leave everything
    # else as a view!
    ods = copy.copy(ds)

    ds.permute_attr()
    # by default, some permutation of targets should have happened
    assert_false((ds.targets == orig_labels).all())
    # but the original dataset should be unaffected
    assert_array_equal(ods.targets, orig_labels)
    # array subclass survives
    ok_(isinstance(ods.samples, myarray))

    # samples are really shared
    ds.samples[0, 0] = 123456
    assert_array_equal(ds.samples, ods.samples)
    # and other samples attributes too
    ds.chunks[0] = 9876
    assert_array_equal(ds.chunks, ods.chunks)

    # try to permute on a custom target
    ds = ods.copy()
    otargets = ods.sa.targets.copy()
    ds.sa['custom'] = ods.sa.targets.copy()
    assert_array_equal(ds.sa.custom, otargets)
    assert_array_equal(ds.sa.targets, otargets)

    ds.permute_attr(attr='custom')
    # original targets should still match
    assert_array_equal(ds.sa.targets, otargets)
    # but custom should get permuted
    assert_false((ds.sa.custom == otargets).all())

    #
    # Test permutation among features
    #
    assert_raises(KeyError, ds.permute_attr, attr='roi')  # wrong collection
    ds = ods.copy()
    ds.permute_attr(attr='lucky', chunks_attr='roi', col='fa')
    # we should not have touched the samples attributes
    for sa in ds.sa.keys():
        assert_array_equal(ds.sa[sa].value, ods.sa[sa].value)
    # but we should have permuted 'lucky' (while 'roi' stays intact)
    assert_false((ds.fa['lucky'].value == ods.fa['lucky'].value).all())
    assert_array_equal(ds.fa['roi'].value, ods.fa['roi'].value)

    # permute ROI as well, without chunking (??? should we make
    # chunks_attr=None by default?)
    ds.permute_attr(attr='roi', chunks_attr=None, col='fa')
    assert_false((ds.fa['roi'].value == ods.fa['roi'].value).all())