def test_cached_kernel(self):
    nchunks = 5
    n = 50 * nchunks
    d = Dataset(np.random.randn(n, 132))
    d.sa.chunks = np.random.randint(nchunks, size=n)

    # We'll compare against an Rbf just because it has a parameter to change
    rk = npK.RbfKernel(sigma=1.5)

    # Assure two kernels are independent for this test
    ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
    ck.compute(d)  # Initial cache of all data

    self.assertTrue(ck._recomputed,
                    'CachedKernel was not initially computed')

    # Try some splitting
    for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
        rk.compute(chunk)
        ck.compute(chunk)
        self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
        self.failIf(ck._recomputed,
                    "CachedKernel incorrectly recomputed its kernel")

    # Test what happens when a parameter changes
    ck.params.sigma = 3.5
    ck.compute(d)
    self.assertTrue(ck._recomputed,
                    "CachedKernel doesn't recompute on kernel change")
    rk.params.sigma = 3.5
    rk.compute(d)
    self.assertTrue(np.all(rk._k == ck._k),
                    'Cached and rbf kernels disagree after kernel change')

    # Now test handling new data
    d2 = Dataset(np.random.randn(32, 43))
    ck.compute(d2)
    self.assertTrue(ck._recomputed,
                    "CachedKernel did not automatically recompute new data")
    ck.compute(d)
    self.assertTrue(ck._recomputed,
                    "CachedKernel did not recompute old data which had\n"
                    "previously been computed, but had the cache overridden")
def test_zscore_withoutchunks():
    # just a smoke test to see if all issues of
    # https://github.com/PyMVPA/PyMVPA/issues/26
    # are fixed
    from mvpa2.datasets import Dataset
    ds = Dataset(np.arange(32).reshape((8, -1)),
                 sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)
    assert np.any(ds.samples != np.arange(32).reshape((8, -1)))
    ds_summary = ds.summary()
    assert ds_summary is not None
def _call(self, dataset):
    # For performance measures -- increase to 50-200
    # np.sum here is just to get some meaningful value in them
    #return np.ones(shape=(2, 2)) * np.sum(dataset)
    return Dataset(
        np.array([{'d': np.ones(shape=(5, 5)) * np.sum(dataset)}],
                 dtype=object))
def _call(self, ds):
    y = ds.sa[self.space].value
    if self.numeric or ((self.numeric is None) and y.dtype.char == 'S'):
        y = AttributeMap().to_numeric(y)
    # TODO:
    if not self.uni:
        out = self.fx(ds.samples, y)
    else:
        out = np.array([self.fx(feat, y) for feat in ds.samples.T])
    return Dataset(out[None], fa=ds.fa)
def _sl_call(self, dataset, roi_ids, nproc):
    """Classical generic searchlight implementation"""
    assert(self.results_backend in ('native', 'hdf5'))
    # compute
    if nproc is not None and nproc > 1:
        # split all target ROI centers into `nproc` equally sized blocks
        nproc_needed = min(len(roi_ids), nproc)
        nblocks = nproc_needed \
                  if self.nblocks is None else self.nblocks
        roi_blocks = np.array_split(roi_ids, nblocks)

        # the next block sets up the infrastructure for parallel computing
        # this can easily be changed into a ParallelPython loop, if we
        # decide to have a PP job server in PyMVPA
        import pprocess
        p_results = pprocess.Map(limit=nproc_needed)
        if __debug__:
            debug('SLC', "Starting off %s child processes for nblocks=%i"
                  % (nproc_needed, nblocks))
        compute = p_results.manage(
                    pprocess.MakeParallel(self._proc_block))
        for iblock, block in enumerate(roi_blocks):
            # should we maybe deepcopy the measure to have a unique and
            # independent one per process?
            seed = mvpa2.get_random_seed()
            compute(block, dataset, copy.copy(self.__datameasure),
                    seed=seed, iblock=iblock)
    else:
        # otherwise collect the results in a 1-item list
        p_results = [
            self._proc_block(roi_ids, dataset, self.__datameasure)]

    # Finally collect and possibly process results
    # p_results here is either a generator from pprocess.Map or a list.
    # In case of a generator it allows to process results as they become
    # available
    result_ds = self.results_fx(
        sl=self,
        dataset=dataset,
        roi_ids=roi_ids,
        results=self.__handle_all_results(p_results))

    # Assure having a dataset (for paranoid ones)
    if not is_datasetlike(result_ds):
        try:
            result_a = np.atleast_1d(result_ds)
        except ValueError, e:
            if 'setting an array element with a sequence' in str(e):
                # try forcing object array. Happens with
                # test_custom_results_fx_logic on numpy 1.4.1 on Debian
                # squeeze
                result_a = np.array(result_ds, dtype=object)
            else:
                raise
        result_ds = Dataset(result_a)

    return result_ds
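# Hedged usage sketch (standard PyMVPA pattern; the dataset `ds` and the
# radius are illustrative): _sl_call() above is what runs under the hood
# when a searchlight is invoked, with `nproc` controlling the
# pprocess-based parallelization over blocks of ROI centers.
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.measures.anova import OneWayAnova

sl = sphere_searchlight(OneWayAnova(), radius=3, nproc=2)
res = sl(ds)  # one result sample per ROI center / feature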
def resting_dmn(sub, ses, in_file=None,
                lh_ctx_file=None, rh_ctx_file=None, sc_file=None,
                schedule_file=None):
    import pipe_hbn_ssi  # needed below for pipe_hbn_ssi.proc_dir
    from pipe_hbn_ssi import wb_to_tss
    import os, sys
    import numpy as np
    sys.path.append('/home/bpinsard/data/projects/CoRe')
    import core.mvpa.dataset as cds
    from nipy.modalities.fmri.glm import GeneralLinearModel
    import scipy.ndimage
    from mvpa2.datasets import Dataset

    sched = np.loadtxt(
        schedule_file,
        converters={0: int, 1: int, 2: str, 3: int, 4: str, 5: str},
        dtype=np.object,
        skiprows=1)
    idx = sched[:, 1].tolist().index(ses)
    #scan_no = sched[idx, 2].split('-').index('Rest')
    if in_file is None:
        scan_no = [i for i, n in enumerate(lh_ctx_file) if 'RESTING' in n]
    else:
        scan_no = [i for i, n in enumerate(in_file) if 'RESTING' in n]
    scan_no = scan_no[0]

    if in_file is None:
        inf = lh_ctx_file[scan_no]
        print(inf)
        ds = Dataset(wb_to_tss(lh_ctx_file[scan_no], rh_ctx_file[scan_no],
                               sc_file[scan_no]))
    else:
        inf = in_file[scan_no]
        print(inf)
        ds = cds.ds_from_ts(in_file[scan_no])
    #cds.preproc_ds(ds, detrend=True)
    ds.samples -= scipy.ndimage.gaussian_filter1d(
        ds.samples, sigma=8, axis=0, truncate=2)

    seed_roi = 9
    cds.add_aparc_ba_fa(
        ds, sub,
        pproc_tpl=os.path.join(pipe_hbn_ssi.proc_dir, 'moco_multiband',
                               'surface_32k', '_sub_%d'))
    roi_mask = np.logical_or(ds.fa.aparc == seed_roi + 11100,
                             ds.fa.aparc == seed_roi + 12100)
    mean_roi_ts = ds.samples[:, roi_mask].mean(1)
    mean_roi_ts -= mean_roi_ts.mean()

    mtx = np.asarray([mean_roi_ts, np.ones(ds.nsamples)]).T
    glm = GeneralLinearModel(mtx)
    glm.fit(ds.samples, model='ols')
    contrast = glm.contrast([1, 0], contrast_type='t')

    out_file = os.path.abspath(
        'sub%d_ses%d_connectivity_results.npz' % (sub, ses))
    np.savez_compressed(out_file, contrast=contrast,
                        mean_roi_ts=mean_roi_ts)
    #return contrast
    return out_file, inf
def _call(self, dataset):
    """Computes featurewise I-RELIEF weights."""
    samples = dataset.samples
    NS, NF = samples.shape[:2]
    if self.w_guess is None:
        self.w = np.ones(NF, 'd')
    # do normalization in all cases to be safe :)
    self.w = self.w / (self.w ** 2).sum()

    M, H = self.compute_M_H(dataset.targets)

    while True:
        self.k = self.kernel(length_scale=self.kernel_width / self.w)
        d_w_k = self.k.computed(samples).as_raw_np()
        # set d_w_k to zero where distance=0 (i.e. kernel ==
        # 1.0), otherwise I-RELIEF could not converge.
        # XXX Note that kernel==1 for distance=0 only for
        #     exponential kernels!!  IMPROVE
        d_w_k[np.abs(d_w_k - 1.0) < 1.0e-15] = 0.0
        ni = np.zeros(NF, 'd')
        for n in range(NS):
            # d_w_k[n, n] could be omitted since == 0.0
            gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                            / (d_w_k[n, :].sum() - d_w_k[n, n]))
            alpha_n = np.nan_to_num(d_w_k[n, M[n]] / (d_w_k[n, M[n]].sum()))
            beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

            m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                   * alpha_n[:, None]).sum(0)
            h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                   * beta_n[:, None]).sum(0)
            ni += gamma_n * (m_n - h_n)
        ni = ni / NS

        ni_plus = np.clip(ni, 0.0, np.inf)  # set all negative elements to zero
        w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus ** 2).sum())))
        change = np.abs(w_new - self.w).sum()
        if __debug__ and 'IRELIEF' in debug.active:
            debug('IRELIEF',
                  "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d"
                  % (change, w_new.max(), w_new.min(), w_new.mean(),
                     w_new.std(), np.isnan(w_new).sum()))

        # update weights:
        self.w = w_new
        if change < self.threshold:
            break

    return Dataset(self.w[np.newaxis])
def prep_parcelwise_data(subject, parcel, datatype):
    from mvpa2.datasets import Dataset
    from mvpa2.mappers.zscore import zscore
    if datatype == 'sponpain':
        ds = Dataset(
            np.load(
                os.path.join(
                    sponpain_by_parcel,
                    subject + '_sponpain_connectome_parcel-' +
                    str(parcel) + '.npy')))
        ds.fa['voxel_indices'] = range(ds.shape[1])
        zscore(ds, chunks_attr=None)
    elif datatype == 'bladderpain':
        ds = Dataset(
            np.load(
                os.path.join(
                    bladderpain_by_parcel,
                    subject + '_bladderpain-cleaned-ts_parcel-' +
                    str(parcel) + '.npy')))
        ds.fa['voxel_indices'] = range(ds.shape[1])
        zscore(ds, chunks_attr=None)
    else:
        # return early, otherwise `ds` would be undefined below
        print('Must specify datatype as either sponpain or bladderpain')
        return None
    return ds
def _forward_dataset_helper(self, ds):
    # local binding
    num = self.__num

    pos = None
    if self.__position_attr is not None:
        # we know something about sample position
        pos = ds.sa[self.__position_attr].value
        rsamples, pos = resample(ds.samples, self.__num, t=pos,
                                 window=self.__window_args)
    else:
        # we know nothing about samples position
        rsamples = resample(ds.samples, self.__num, t=None,
                            window=self.__window_args)

    # new dataset that reuses the feature and dataset attributes of the
    # source
    mds = Dataset(rsamples, fa=ds.fa, a=ds.a)

    # the tricky part is what to do with the samples attributes, since their
    # number has changed
    if self.__attr_strategy == 'remove':
        # nothing to be done
        pass
    elif self.__attr_strategy == 'sample':
        step = int(len(ds) / num)
        sa = dict([(k, ds.sa[k].value[0::step][:num]) for k in ds.sa])
        mds.sa.update(sa)
    elif self.__attr_strategy == 'resample':
        # resample the attributes themselves
        sa = {}
        for k in ds.sa:
            v = ds.sa[k].value
            if pos is None:
                sa[k] = resample(v, self.__num, t=None,
                                 window=self.__window_args)
            else:
                if k == self.__position_attr:
                    # position attr will be handled separately at the end
                    continue
                sa[k] = resample(v, self.__num, t=pos,
                                 window=self.__window_args)[0]
        # inject them all
        mds.sa.update(sa)
    else:
        raise ValueError("Unknown attribute handling strategy '%s'."
                         % self.__attr_strategy)

    if pos is not None:
        # we got the new sample positions and can store them
        mds.sa[self.__position_attr] = pos

    return mds
def intersubject_correlation(dss, reference_ds=0):
    """
    Computes voxelwise inter-subject time series correlation
    in a pairwise fashion for a list of Datasets. Datasets must
    all be the same shape. The resulting dataset of pairwise
    correlations will inherit Dataset attributes from the reference
    data set [Default: first data set in list].
    """

    # Check if input list contains Datasets, ndarrays
    dss = [Dataset(ds) if not type(ds) == Dataset else ds for ds in dss]
    ds_shape = dss[reference_ds].shape
    n_features = ds_shape[1]
    for ds in dss:
        assert ds.shape == ds_shape

    # Compute time series correlation per voxel per subject pair
    correlations = []
    for pair in combinations(dss, 2):
        pair_map = []
        for feature in xrange(n_features):
            pair_map.append(
                pearson_correlation(pair[0].samples[:, feature],
                                    pair[1].samples[:, feature]))
        correlations.append(pair_map)

    # Resulting correlation map inherits attributes of reference data set
    correlations_ds = Dataset(correlations,
                              fa=dss[reference_ds].fa,
                              a=dss[reference_ds].a)
    correlations_ds.sa['pairs'] = list(combinations(range(len(dss)), 2))

    assert correlations_ds.shape[0] == len(dss) * (len(dss) - 1) / 2
    assert correlations_ds.shape[1] == n_features

    return correlations_ds
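# Hedged usage sketch for intersubject_correlation() above; the random data
# stands in for per-subject time series (samples x features) and is purely
# illustrative -- pearson_correlation() is assumed available in this module.
import numpy as np
from mvpa2.datasets import Dataset

dss = [Dataset(np.random.randn(40, 6)) for _ in range(3)]
isc = intersubject_correlation(dss)
assert isc.shape == (3, 6)   # 3 subject pairs x 6 features
print(isc.sa.pairs)          # [(0, 1), (0, 2), (1, 2)]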
def build_streamline_things(self):
    # Build a dataset having samples of different lengths. This is
    # trying to mimic a possible interface for streamlines
    # datasets, i.e., an iterable container of Mx3 points, where M
    # depends on each single streamline.

    # trying to pack it into an 'object' array to prevent conversion in the
    # Dataset
    self.streamline_samples = np.array([
        np.random.rand(3, 3),
        np.random.rand(5, 3),
        np.random.rand(7, 3)],
        dtype='object')
    self.dataset = Dataset(self.streamline_samples)
    self.similarities = [StreamlineSimilarity(distance=corouge)]
def test_strip_nibabel():
    # lots of implicit tests already, just make sure it doesn't ruin other
    # datasets
    ds = Dataset([range(5)])
    strip_nibabel(ds)
    assert_true('imgtype' not in ds.a)

    # can run multiple times: idempotent
    ds = fmri_dataset(
        pathjoin(pymvpa_dataroot, 'haxby2001', 'sub001', 'BOLD',
                 'task001_run001', 'bold_25mm.nii.gz'))
    strip_nibabel(ds)  # this is real
    strip_nibabel(ds)  # this is not a copy&paste error!
    assert_true('imgtype' in ds.a)
    assert_true('imgaffine' in ds.a)
    assert_equal(type(ds.a.imghdr), dict)
def _call(self, dataset):
    """Computes featurewise I-RELIEF weights."""
    samples = dataset.samples
    NS, NF = samples.shape[:2]
    if self.w_guess is None:
        w = np.ones(NF, 'd')
    # do normalization in all cases to be safe :)
    w /= (w ** 2).sum()

    M, H = self.compute_M_H(dataset.targets)

    while True:
        d_w_k = self.k(pnorm_w(data1=samples, weight=w, p=1))
        ni = np.zeros(NF, 'd')
        for n in range(NS):
            # d_w_k[n, n] could be omitted since == 0.0
            gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                            / (d_w_k[n, :].sum() - d_w_k[n, n]))
            alpha_n = np.nan_to_num(d_w_k[n, M[n]] / (d_w_k[n, M[n]].sum()))
            beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

            m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                   * alpha_n[:, None]).sum(0)
            h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                   * beta_n[:, None]).sum(0)
            ni += gamma_n * (m_n - h_n)
        ni = ni / NS

        ni_plus = np.clip(ni, 0.0, np.inf)  # set all negative elements to zero
        w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus ** 2).sum())))
        change = np.abs(w_new - w).sum()
        if __debug__ and 'IRELIEF' in debug.active:
            debug('IRELIEF',
                  "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" \
                  % (change, w_new.max(), w_new.min(), w_new.mean(),
                     w_new.std(), np.isnan(w_new).sum()))

        # update weights:
        w = w_new
        if change < self.threshold:
            break

    self.w = w
    return Dataset(self.w[np.newaxis])
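# For reference -- a sketch of the update implemented in the loop above,
# following the iterative RELIEF (I-RELIEF) scheme (cf. Y. Sun's iterative
# RELIEF for feature weighting): with kernel weights k(n, i) over the miss
# and hit sets M[n] and H[n],
#   gamma_n = 1 - sum_{i in M[n]} k(n, i) / sum_{i != n} k(n, i)
#   m_n = sum_{i in M[n]} alpha_i * |x_n - x_i|
#   h_n = sum_{i in H[n]} beta_i  * |x_n - x_i|
# and the new weight vector is the positive part of mean_n gamma_n * (m_n - h_n),
# rescaled to unit L2 norm. Iteration stops once the L1 change in w drops
# below `threshold`.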
def test_resample():
    time = np.linspace(0, 2 * np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa={'time': time,
                     'section': np.repeat(range(10), 10)})
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num, window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should
    # be very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num, attr_strategy='sample',
                           chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunk should be identical to the previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
def _call(self, ds):
    # extract samples and targets and pass them to the errorfx
    targets = ds.sa[self.get_space()].value
    # squeeze to remove bogus dimensions and prevent problems during
    # comparison later on
    values = np.atleast_1d(ds.samples.squeeze())
    if not values.shape == targets.shape:
        # if they have different shapes numpy's broadcasting might introduce
        # pointless stuff (compare individual features or yield a single
        # boolean value)
        raise ValueError("Trying to compute an error between data of "
                         "different shape (%s vs. %s)."
                         % (values.shape, targets.shape))
    err = self.fx(values, targets)
    if np.isscalar(err):
        err = np.array(err, ndmin=2)
    return Dataset(err)
def test_iirfilter():
    # dataset with one feature from two waves
    t = np.linspace(0, 1.0, 2001)
    xlow = np.sin(2 * np.pi * 5 * t)
    xhigh = np.sin(2 * np.pi * 250 * t)
    x = xlow + xhigh
    ds = Dataset(x, sa={'sid': np.arange(len(x))},
                 fa={'fid': ['theone']})

    # butterworth filter with a cutoff between the waves:
    # Wn=0.125 of the ~1000 Hz Nyquist frequency, i.e. ~125 Hz,
    # which sits between the 5 Hz and 250 Hz components
    from scipy import signal
    b, a = signal.butter(8, 0.125)
    mds = iir_filter(ds, b, a, padlen=150)

    # check we get just the slow wave out (compensate for edge artifacts)
    assert_false(np.sum(np.abs(mds.samples[100:-100, 0]
                               - xlow[100:-100]) > 0.001))
    assert_equal(len(ds.sa), len(mds.sa))
    assert_equal(len(ds.fa), len(mds.fa))
    assert_array_equal(ds.fa.fid, mds.fa.fid)
    assert_array_equal(ds.sa.sid, mds.sa.sid)
def __process_roi(self, ds, roi_feature_id, measure, assure_dataset):
    # retrieve the feature ids of all features in the ROI from the query
    # engine
    roi_specs = self._queryengine[roi_feature_id]
    if __debug__:
        debug('SLC_', 'For %r query returned roi_specs %r'
              % (roi_feature_id, roi_specs))

    if is_datasetlike(roi_specs):
        # TODO: unittest
        assert(len(roi_specs) == 1)
        roi_fids = roi_specs.samples[0]
    else:
        roi_fids = roi_specs

    # slice the dataset
    roi = ds[:, roi_fids]

    if is_datasetlike(roi_specs):
        for n, v in roi_specs.fa.iteritems():
            roi.fa[n] = v

    if self.__add_center_fa:
        # add fa to indicate ROI seed if requested
        roi_seed = np.zeros(roi.nfeatures, dtype='bool')
        if roi_feature_id in roi_fids:
            roi_seed[roi_fids.index(roi_feature_id)] = True
        else:
            warning("Center feature attribute id %s not found"
                    % roi_feature_id)
        roi.fa[self.__add_center_fa] = roi_seed

    # compute the datameasure and store in results
    res = measure(roi)
    if assure_dataset and not is_datasetlike(res):
        res = Dataset(np.atleast_1d(res))
    if self.ca.is_enabled('roi_feature_ids'):
        # add roi feature ids to intermediate result dataset for later
        # aggregation
        res.a['roi_feature_ids'] = roi_fids
    if self.ca.is_enabled('roi_sizes'):
        res.a['roi_sizes'] = roi.nfeatures
    if self.ca.is_enabled('roi_center_ids'):
        res.a['roi_center_ids'] = roi_feature_id
    return res, roi
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4, 2)),
                 sa={'chunks': [0, 1, 2, 3],
                     'targets': ['c', 'c', 'p', 'p']})
    for sift_targets_definition in (['c', 'p'],
                                    dict(uvalues=['c', 'p'])):
        par = ChainNode([NFoldPartitioner(cvtype=2, attr='chunks'),
                         Sifter([('partitions', 2),
                                 ('targets', sift_targets_definition)])
                         ])
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for ds_ in dss:
            testing = ds[ds_.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
            # and we still have both targets present in training
            training = ds[ds_.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
def _fill_in_scattered_results(sl, dataset, roi_ids, results):
    """this requires the searchlight conditional attribute 'roi_feature_ids'
    to be enabled"""
    import numpy as np
    from mvpa2.datasets import Dataset

    resmap = None
    probmap = None
    for resblock in results:
        for res in resblock:
            if resmap is None:
                # prepare the result container
                resmap = np.zeros((len(res), dataset.nfeatures),
                                  dtype=res.samples.dtype)
                if 'null_prob' in res.fa:
                    # initialize the prob map also with zeroes, as p=0
                    # can never happen as an empirical result
                    probmap = np.zeros(
                        (dataset.nfeatures,) + res.fa.null_prob.shape[1:],
                        dtype=res.samples.dtype)
                observ_counter = np.zeros(dataset.nfeatures, dtype=int)
            # project the result onto all features -- love broadcasting!
            resmap[:, res.a.roi_feature_ids] += res.samples
            if probmap is not None:
                probmap[res.a.roi_feature_ids] += res.fa.null_prob
            # increment observation counter for all relevant features
            observ_counter[res.a.roi_feature_ids] += 1
    # when all results have been added up, average them according to the
    # number of observations
    observ_mask = observ_counter > 0
    resmap[:, observ_mask] /= observ_counter[observ_mask]
    result_ds = Dataset(resmap,
                        fa={'observations': observ_counter})
    if probmap is not None:
        # transpose to make broadcasting work -- creates a view, so in-place
        # modification still does the job
        probmap.T[:, observ_mask] /= observ_counter[observ_mask]
        result_ds.fa['null_prob'] = probmap.squeeze()
    if 'mapper' in dataset.a:
        import copy
        result_ds.a['mapper'] = copy.copy(dataset.a.mapper)
    return result_ds
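# Hedged usage sketch: this helper is meant to be passed as the `results_fx`
# of a Searchlight whose ROIs may cover each feature several times (e.g.
# scattered centers), with the 'roi_feature_ids' conditional attribute
# enabled so each block result can be projected back onto the full feature
# space. `my_measure`, `my_qe`, `scattered_ids` and `dataset` are
# placeholders, not names from the original code.
from mvpa2.measures.searchlight import Searchlight

sl = Searchlight(my_measure, queryengine=my_qe,
                 roi_ids=scattered_ids,
                 results_fx=_fill_in_scattered_results,
                 enable_ca=['roi_feature_ids'])
res = sl(dataset)  # res.fa.observations counts ROI coverage per feature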
def _get_hypesvs(self, sl_connectomes, local_common_model=None):
    '''
    Hyperalign connectomes and return mappers and a trained SVDMapper of
    the common space.

    Parameters
    ----------
    sl_connectomes: a list of connectomes to hyperalign
    local_common_model: a reference common model to be used.

    Returns
    -------
    a tuple (sl_hmappers, svm, local_common_model)
    sl_hmappers: a list of mappers corresponding to the input list in that
        order.
    svm: an SVDMapper trained on the hyperaligned input data. If given a
        common model, this is None.
    local_common_model: If local_common_model is provided as input, this
        will be None. Otherwise, local_common_model will be computed here
        and returned.
    '''
    # TODO Should we z-score sl_connectomes?
    return_model = False if self.params.save_model is None else True
    if local_common_model is not None:
        ha = Hyperalignment(level2_niter=0)
        if not is_datasetlike(local_common_model):
            local_common_model = Dataset(samples=local_common_model)
        ha.train([local_common_model])
        sl_hmappers = ha(sl_connectomes)
        return sl_hmappers, None, None
    ha = Hyperalignment()
    sl_hmappers = ha(sl_connectomes)
    sl_connectomes = [slhm.forward(slc)
                      for slhm, slc in zip(sl_hmappers, sl_connectomes)]
    _ = [zscore(slc, chunks_attr=None) for slc in sl_connectomes]
    sl_connectomes = np.dstack(sl_connectomes).mean(axis=-1)
    svm = SVDMapper(force_train=True)
    svm.train(sl_connectomes)
    if return_model:
        local_common_model = svm.forward(sl_connectomes)
    else:
        local_common_model = None
    return sl_hmappers, svm, local_common_model
def test_1d_multispace_searchlight(self):
    ds = Dataset([np.arange(6)])
    ds.fa['coord1'] = np.repeat(np.arange(3), 2)
    # add a second space to the dataset
    ds.fa['coord2'] = np.tile(np.arange(2), 3)
    measure = lambda x: "+".join([str(x) for x in x.samples[0]])
    # simply select each feature once
    res = Searchlight(measure,
                      IndexQueryEngine(coord1=Sphere(0),
                                       coord2=Sphere(0)),
                      nproc=1)(ds)
    assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
    res = Searchlight(measure,
                      IndexQueryEngine(coord1=Sphere(0),
                                       coord2=Sphere(1)),
                      nproc=1)(ds)
    assert_array_equal(res.samples,
                       [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
    res = Searchlight(measure,
                      IndexQueryEngine(coord1=Sphere(1),
                                       coord2=Sphere(0)),
                      nproc=1)(ds)
    assert_array_equal(res.samples,
                       [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])
def testmodel(wts, des, ds, tc, use_corr=True):
    import numpy as np
    widx = wts.sa['chunks'].unique
    didx = ds.sa['chunks'].unique
    if len(widx) != len(didx):
        print "unequal number of chunks... exiting"
        return
    if 'word2vec' in tc:
        tc.remove('word2vec')
        for i in np.arange(0, 300):
            tc.append('word2vec' + str(i))
    corrs = []
    regidx = [des.names.index(i) for i in tc]
    for i in np.arange(len(widx)):
        pred = np.dot(
            des.matrix[:, regidx],
            wts[wts.sa['chunks'].value == widx[i]].samples[regidx, :])[
                ds.sa['chunks'].value == didx[i]]
        Presp = ds[ds.sa['chunks'].value == didx[i]].samples

        # Find prediction correlations
        nnpred = np.nan_to_num(pred)
        if use_corr:
            vcorrs = np.nan_to_num(
                np.array([
                    np.corrcoef(Presp[:, ii], nnpred[:, ii].ravel())[0, 1]
                    for ii in range(Presp.shape[1])]))
        else:
            resvar = (Presp - pred).var(0)
            Rsqs = 1 - (resvar / Presp.var(0))
            vcorrs = np.sqrt(np.abs(Rsqs)) * np.sign(Rsqs)
        corrs.append(vcorrs)

    from mvpa2.datasets import Dataset
    return Dataset(np.vstack(corrs),
                   sa={'chunks': ds.sa['chunks'].unique},
                   fa=ds.fa, a=ds.a)
def test_exclude_targets_combinations_subjectchunks():
    partitioner = ChainNode([
        NFoldPartitioner(attr='subjects'),
        ExcludeTargetsCombinationsPartitioner(
            k=1,
            targets_attr='chunks',
            space='partitions')],
        space='partitions')
    # targets do not even need to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so implicit test here
        # if not -- would be error
        testing_subj = np.asscalar(
            np.unique(ds_.sa.subjects[testing_partition]))
        testing_subjs.append(testing_subj)
        testing_chunk = np.asscalar(
            np.unique(ds_.sa.chunks[testing_partition]))
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(not testing_subj in ds_.sa.subjects[training_partition])
        ok_(not testing_chunk in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3, 3))))))
def _level3(self, datasets):
    params = self.params  # for quicker access ;)
    # create a mapper per dataset
    mappers = [deepcopy(params.alignment) for ds in datasets]

    # key difference from level-2: the common space is uniform
    #temp_commonspace = commonspace
    # Fixing nproc=0
    if params.nproc == 0:
        from mvpa2.base import warning
        warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
        params.nproc = 1
    # Checking for joblib; if absent, set nproc to 1
    if params.nproc != 1:
        from mvpa2.base import externals, warning
        if not externals.exists('joblib'):
            warning("Setting nproc different from 1 requires joblib package, "
                    "which does not seem to exist. Setting nproc to 1.")
            params.nproc = 1

    # start from original input datasets again
    if params.nproc == 1:
        residuals = []
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)
            m, residual = get_trained_mapper(
                ds_new, self.commonspace, m,
                self.ca['residual_errors'].enabled)
            if self.ca['residual_errors'].enabled:
                residuals.append(residual)
    else:
        if __debug__:
            debug('HPAL_', "Level 3: Using joblib with nproc = %d "
                  % params.nproc)
        verbose_level_parallel = 20 \
            if (__debug__ and 'HPAL' in debug.active) else 0
        from joblib import Parallel, delayed
        import sys

        # joblib's 'multiprocessing' backend has known issues of failure on
        # OSX. Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
        if params.joblib_backend is None:
            params.joblib_backend = 'threading' if sys.platform == 'darwin' \
                                    else 'multiprocessing'
        res = Parallel(
            n_jobs=params.nproc, pre_dispatch=params.nproc,
            backend=params.joblib_backend,
            verbose=verbose_level_parallel)(
                delayed(get_trained_mapper)(
                    ds, self.commonspace, mapper,
                    self.ca['residual_errors'].enabled)
                for ds, mapper in zip(datasets, mappers))
        mappers = [m for m, r in res]
        if self.ca['residual_errors'].enabled:
            residuals = [r for m, r in res]

    if self.ca['residual_errors'].enabled:
        self.ca.residual_errors = Dataset(
            samples=np.array(residuals)[None, :])

    return mappers
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params  # for quicker access ;)
    ca = self.ca
    # Check to make sure we get a list of datasets as input.
    if not isinstance(datasets, (list, tuple, np.ndarray)):
        raise TypeError("Input datasets should be a sequence "
                        "(of type list, tuple, or ndarray) of datasets.")

    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] + ['2:%i' % i
                                   for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    # Making sure that ref_ds is within range.
    # Parameter() already checks for it being a non-negative integer
    if ref_ds >= ndatasets:
        raise ValueError, "Requested reference dataset %i is out of " \
                          "bounds. We have only %i datasets provided" \
                          % (ref_ds, ndatasets)
    ca.chosen_ref_ds = ref_ds

    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float),
    #                    sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples,
    #                    sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)
    # If there is only one dataset in the training phase, there is nothing
    # to be done -- just use that data as the common space
    if len(datasets) < 2:
        self.commonspace = commonspace
    else:
        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                     mappers, residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata,
                                        mappers, residuals)
    if params.output_dim is not None:
        mappers = self._level3(datasets)
        self._svd_mapper = SVDMapper()
        self._svd_mapper.train(self._map_and_mean(datasets, mappers))
        self._svd_mapper = StaticProjectionMapper(
            proj=self._svd_mapper.proj[:, :params.output_dim])
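# Hedged usage sketch (standard PyMVPA pattern; the random datasets are
# illustrative and assume samples are in row-correspondence across
# subjects): calling a Hyperalignment instance trains it and returns one
# projection mapper per input dataset into the common space.
import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.algorithms.hyperalignment import Hyperalignment

dss = [Dataset(np.random.randn(50, 10)) for _ in range(4)]
ha = Hyperalignment()
mappers = ha(dss)  # calls train() internally
aligned = [m.forward(ds) for m, ds in zip(mappers, dss)]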
class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `Measure` or `TransferError` instance and a training and a validation
    dataset (in case of a `TransferError`). For a customizable amount of
    cycles the training data labels are permuted and the corresponding
    measure computed. In case of a `TransferError` this is the error when
    predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger
    than some critical value.

    This class also supports `FeaturewiseMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are
    expected to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = ConditionalAttribute(
        enabled=False,
        doc='Samples obtained for each permutation')

    skipped = ConditionalAttribute(
        enabled=True,
        doc='# of the samples which were skipped because '
            'the measure failed to evaluate at them')

    def __init__(self, permutator, dist_class=Nonparametric, measure=None,
                 **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        Parameters
        ----------
        permutator : Node
          Node instance that generates permuted datasets.
        dist_class : class
          This can be any class which provides parameters estimate
          using `fit()` method to initialize the instance, and
          provides `cdf(x)` method for estimating value of x in CDF.
          All distributions from SciPy's 'stats' module can be used.
        measure : Measure or None
          Optional measure that is used to compute results on permuted
          data. If None, a measure needs to be passed to ``fit()``.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []  # actual distributions
        self._measure = measure
        self.__permutator = permutator

    def __repr__(self, prefixes=[]):
        prefixes_ = ["%s" % self.__permutator]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%r' % (self._dist_class,))
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, ds):
        """Fit the distribution by performing multiple cycles which
        repeatedly permute labels in the training dataset.

        Parameters
        ----------
        measure: Measure or None
          A measure used to compute the results from shuffled data.
          Can be None if a measure instance has been provided to the
          constructor.
        ds: `Dataset` which gets permuted and used to compute the
          measure/transfer error multiple times.
        """
        # TODO: place exceptions separately so we could avoid circular imports
        from mvpa2.base.learner import LearnerError

        # prefer the already assigned measure over anything that was passed
        # to the function.
        # XXX that is a bit awkward but is necessary to keep the code changes
        # in the rest of PyMVPA minimal till this behavior becomes mandatory
        if self._measure is not None:
            measure = self._measure
        measure.untrain()

        dist_samples = []
        """Holds the values for randomized labels."""

        # estimate null-distribution
        # TODO this really needs to be more clever! If data samples are
        # shuffled within a class it really makes no difference for the
        # classifier, hence the number of permutations to estimate the
        # null-distribution of transfer errors can be reduced dramatically
        # when the *right* permutations (the ones that matter) are done.
        skipped = 0  # # of skipped permutations
        for p, permuted_ds in enumerate(self.__permutator.generate(ds)):
            # new permutation all the time
            # but only permute the training data and keep the testdata
            # constant
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i" \
                      % (self.__permutator.count, p + 1), cr=True)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                res = measure(permuted_ds)
                dist_samples.append(res.samples)
            except LearnerError, e:
                if __debug__:
                    debug('STATMC', " skipped", cr=True)
                warning('Failed to obtain value from %s due to %s. '
                        'Measurement was skipped, which could lead to '
                        'unstable and/or incorrect assessment of the '
                        'null_dist' % (measure, e))
                skipped += 1
                continue

        self.ca.skipped = skipped

        if __debug__:
            debug('STATMC', ' Skipped: %d permutations' % skipped)

        if not len(dist_samples) and skipped > 0:
            raise RuntimeError(
                'Failed to obtain any value from %s. %d measurements were '
                'skipped. Check above warnings, and your code/data'
                % (measure, skipped))

        # store samples as (npermutations x nsamples x nfeatures)
        dist_samples = np.asanyarray(dist_samples)
        # for the ca storage use a dataset with
        # (nsamples x nfeatures x npermutations) to make it compatible with
        # the result dataset of the measure
        self.ca.dist_samples = Dataset(
            np.rollaxis(dist_samples, 0, len(dist_samples.shape)))

        # fit distribution per each element

        # to decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, np.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use np.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT__' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist
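# Hedged usage sketch (standard PyMVPA pattern; classifier and dataset are
# illustrative): attach an MCNullDist to a measure so the probability of the
# observed result under the permutation null can be queried afterwards.
from mvpa2.clfs.smlr import SMLR
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.generators.permutation import AttributePermutator
from mvpa2.measures.base import CrossValidation

permutator = AttributePermutator('targets', count=100)
null_dist = MCNullDist(permutator, tail='left', enable_ca=['dist_samples'])
cv = CrossValidation(SMLR(), NFoldPartitioner(), null_dist=null_dist,
                     enable_ca=['null_prob'])
err = cv(ds)            # ds: a dataset with targets and chunks
p = cv.ca.null_prob     # fraction of permutations with a lower error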
def _call(self, ds):
    return CrossValidation._call(
        self,
        Dataset(np.hstack((ds, ds.sa.beh)),
                sa=ds.sa))
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,   # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0]
    #                                          for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]})

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart),
                       np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold),
                       np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the ' \
                  'same number of subordinate attributes. This could yield ' \
                  'to unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [p.sa.partitions
                               for p in factpart.generate(ds_unbalanced)]

    partitions_unbalanced = [np.array([2, 2, 2, 1]),
                             np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(partitions_factpart, partitions_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.superord.tolist(),
             ds_unbalanced[out_part == 2].sa.superord.tolist()),
            super_out)
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.subord.tolist(),
             ds_unbalanced[out_part == 2].sa.subord.tolist()),
            sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={'subord': range(4),
                           'superord': [1, 2] * 2})
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_dummy)]
    assert_array_equal(partitions_factpart,
                       [[2, 2, 1, 1], [2, 1, 1, 2],
                        [1, 2, 2, 1], [1, 1, 2, 2]])
def test_surf_ring_queryengine(self):
    s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)
    # add second layer
    s2 = surf.merge(s, (s + (.01, 0, 0)))
    ds = Dataset(samples=np.arange(20)[np.newaxis],
                 fa=dict(node_indices=np.arange(39, 0, -2)))
    # add more features (with shared node indices)
    ds3 = hstack((ds, ds, ds))
    radius = 2.5
    inner_radius = 1.0
    # Makes sure it raises an error if inner_radius >= radius
    assert_raises(ValueError,
                  lambda: queryengine.SurfaceRingQueryEngine(
                      surface=s2, inner_radius=2.5, radius=radius))
    distance_metrics = ('euclidean', 'dijkstra', 'euclidean', 'dijkstra')
    for distance_metric, include_center in zip(distance_metrics,
                                               [True, False] * 2):
        qe = queryengine.SurfaceRingQueryEngine(
            surface=s2, radius=radius,
            inner_radius=inner_radius,
            distance_metric=distance_metric,
            include_center=include_center)

        # untrained qe should give errors
        assert_raises(ValueError, lambda: qe.ids)
        assert_raises(ValueError, lambda: qe.query_byid(0))

        # node index out of bounds should give error
        ds_ = ds.copy()
        ds_.fa.node_indices[0] = 100
        assert_raises(ValueError, lambda: qe.train(ds_))

        # lack of node indices should give error
        ds_.fa.pop('node_indices')
        assert_raises(ValueError, lambda: qe.train(ds_))

        # train the qe
        qe.train(ds3)

        for node in np.arange(-1, s2.nvertices + 1):
            if node < 0 or node >= s2.nvertices:
                assert_raises(KeyError, lambda: qe.query_byid(node))
                continue

            feature_ids = np.asarray(qe.query_byid(node))

            # node indices relative to ds
            base_ids = feature_ids[feature_ids < 20]
            # should have multiples of 20
            assert_equal(set(feature_ids),
                         set((base_ids[np.newaxis].T +
                              [0, 20, 40]).ravel()))
            node_indices = s2.circlearound_n2d(
                node, radius, distance_metric or 'dijkstra')

            fa_indices = [fa_index
                          for fa_index, inode
                          in enumerate(ds3.fa.node_indices)
                          if inode in node_indices
                          and node_indices[inode] > inner_radius]
            if include_center and node in ds3.fa.node_indices:
                fa_indices += np.where(
                    ds3.fa.node_indices == node)[0].tolist()
            assert_equal(set(feature_ids), set(fa_indices))
def test_surf_queryengine(self, qefn):
    s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)
    # add second layer
    s2 = surf.merge(s, (s + (.01, 0, 0)))
    ds = Dataset(samples=np.arange(20)[np.newaxis],
                 fa=dict(node_indices=np.arange(39, 0, -2)))
    # add more features (with shared node indices)
    ds3 = hstack((ds, ds, ds))
    radius = 2.5

    # Note: sweepargs is not used to avoid re-generating the same
    # surface and dataset multiple times.
    for distance_metric in ('euclidean', 'dijkstra', '<illegal>', None):
        builder = lambda: queryengine.SurfaceQueryEngine(
            s2, radius, distance_metric)
        if distance_metric in ('<illegal>', None):
            assert_raises(ValueError, builder)
            continue

        qe = builder()

        # test i/o and ensure that the untrained instance is not trained
        if externals.exists('h5py'):
            h5save(qefn, qe)
            qe = h5load(qefn)

        # untrained qe should give errors
        assert_raises(ValueError, lambda: qe.ids)
        assert_raises(ValueError, lambda: qe.query_byid(0))

        # node index out of bounds should give error
        ds_ = ds.copy()
        ds_.fa.node_indices[0] = 100
        assert_raises(ValueError, lambda: qe.train(ds_))

        # lack of node indices should give error
        ds_.fa.pop('node_indices')
        assert_raises(ValueError, lambda: qe.train(ds_))

        # train the qe
        qe.train(ds3)

        # test i/o and ensure that the loaded instance is trained
        if externals.exists('h5py'):
            h5save(qefn, qe)
            qe = h5load(qefn)

        for node in np.arange(-1, s2.nvertices + 1):
            if node < 0 or node >= s2.nvertices:
                assert_raises(KeyError, lambda: qe.query_byid(node))
                continue

            feature_ids = np.asarray(qe.query_byid(node))

            # node indices relative to ds
            base_ids = feature_ids[feature_ids < 20]
            # should have multiples of 20
            assert_equal(set(feature_ids),
                         set((base_ids[np.newaxis].T +
                              [0, 20, 40]).ravel()))
            node_indices = list(s2.circlearound_n2d(
                node, radius, distance_metric or 'dijkstra'))
            # use `inode` to avoid shadowing the outer `node` loop variable
            fa_indices = [fa_index
                          for fa_index, inode
                          in enumerate(ds3.fa.node_indices)
                          if inode in node_indices]
            assert_equal(set(feature_ids), set(fa_indices))

    # smoke tests
    assert_true('SurfaceQueryEngine' in '%s' % qe)
    assert_true('SurfaceQueryEngine' in '%r' % qe)