예제 #1
    def test_cached_kernel(self):
        nchunks = 5
        n = 50 * nchunks
        d = Dataset(np.random.randn(n, 132))
        d.sa.chunks = np.random.randint(nchunks, size=n)

        # We'll compare against an Rbf just because it has a parameter to change
        rk = npK.RbfKernel(sigma=1.5)

        # Assure two kernels are independent for this test
        ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
        ck.compute(d)  # Initial cache of all data

                        'CachedKernel was not initially computed')

        # Try some splitting
        for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
            self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
                        "CachedKernel incorrectly recomputed it's kernel")

        # Test what happens when a parameter changes
        ck.params.sigma = 3.5
                        "CachedKernel doesn't recompute on kernel change")
        rk.params.sigma = 3.5
        self.assertTrue(np.all(rk._k == ck._k),
                        'Cached and rbf kernels disagree after kernel change')

        # Now test handling new data
        d2 = Dataset(np.random.randn(32, 43))
            "CachedKernel did not automatically recompute new data")
                        "CachedKernel did not recompute old data which had\n" +\
                        "previously been computed, but had the cache overriden")
예제 #2
def test_zscore_withoutchunks():
    """Smoke-test z-scoring of a dataset when no chunks attribute is used.

    Regression guard for https://github.com/PyMVPA/PyMVPA/issues/26:
    ``zscore`` is invoked with ``chunks_attr=None``; afterwards the samples
    must differ from the raw input and the dataset must still produce a
    summary.
    """
    from mvpa2.datasets import Dataset

    def raw_samples():
        # Build the untouched 8-row input anew on every call so that the
        # reference used for comparison never shares memory with the dataset.
        return np.arange(32).reshape((8, -1))

    ds = Dataset(raw_samples(), sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)
    # at least one value has to have been altered by the z-scoring
    assert np.any(ds.samples != raw_samples())
    # summary generation must succeed and give something back
    summary = ds.summary()
    assert summary is not None
예제 #3
 def _call(self, dataset):
     # For performance measures -- increase to 50-200
     # np.sum here is just to get some meaningful value in
     # them
     #return np.ones(shape=(2, 2))*np.sum(dataset)
     return Dataset(
             'd': np.ones(shape=(5, 5)) * np.sum(dataset)
예제 #4
 def _call(self, ds):
     """Apply ``self.fx`` to the samples of `ds` and one sample attribute.

     The sample attribute named by ``self.space`` is read from ``ds.sa``.
     It is converted to numeric codes through ``AttributeMap`` when
     ``self.numeric`` is true, or when ``self.numeric`` is None and the
     attribute holds byte strings (dtype char ``'S'``).

     With ``self.uni`` false, ``self.fx`` is called once on the full sample
     matrix; otherwise it is called once per feature (per column of
     ``ds.samples``) and the results are collected into an array.

     Returns a ``Dataset`` wrapping the result with one leading axis added,
     carrying the feature attributes of `ds`.
     """
     y = ds.sa[self.space].value
     if self.numeric or ((self.numeric is None) and y.dtype.char == 'S'):
         y = AttributeMap().to_numeric(y)
     # TODO:
     if not self.uni:
         # multivariate: hand the whole sample matrix to fx in one go
         out = self.fx(ds.samples, y)
     else:
         # univariate: evaluate fx separately for every feature.  Without
         # this ``else`` the multivariate result was overwritten and ``out``
         # stayed unbound whenever ``self.uni`` was true.
         out = np.array([self.fx(feat, y) for feat in ds.samples.T])
     return Dataset(out[None], fa=ds.fa)
예제 #5
    def _sl_call(self, dataset, roi_ids, nproc):
        """Classical generic searchlight implementation
        assert(self.results_backend in ('native', 'hdf5'))
        # compute
        if nproc is not None and nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            nproc_needed = min(len(roi_ids), nproc)
            nblocks = nproc_needed \
                      if self.nblocks is None else self.nblocks
            roi_blocks = np.array_split(roi_ids, nblocks)

            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc_needed)
            if __debug__:
                debug('SLC', "Starting off %s child processes for nblocks=%i"
                      % (nproc_needed, nblocks))
            compute = p_results.manage(
            for iblock, block in enumerate(roi_blocks):
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                seed = mvpa2.get_random_seed()
                compute(block, dataset, copy.copy(self.__datameasure),
                        seed=seed, iblock=iblock)
            # otherwise collect the results in an 1-item list
            p_results = [
                    self._proc_block(roi_ids, dataset, self.__datameasure)]

        # Finally collect and possibly process results
        # p_results here is either a generator from pprocess.Map or a list.
        # In case of a generator it allows to process results as they become
        # available
        result_ds = self.results_fx(sl=self,

        # Assure having a dataset (for paranoid ones)
        if not is_datasetlike(result_ds):
                result_a = np.atleast_1d(result_ds)
            except ValueError, e:
                if 'setting an array element with a sequence' in str(e):
                    # try forcing object array.  Happens with
                    # test_custom_results_fx_logic on numpy 1.4.1 on Debian
                    # squeeze
                    result_a = np.array(result_ds, dtype=object)
            result_ds = Dataset(result_a)
예제 #6
파일: moco_eval.py 프로젝트: bpinsard/misc
def resting_dmn(sub, ses, in_file=None,
                lh_ctx_file=None, rh_ctx_file=None, sc_file=None,
    from pipe_hbn_ssi import wb_to_tss
    import os, sys
    import numpy as np
    import core.mvpa.dataset as cds
    from nipy.modalities.fmri.glm import GeneralLinearModel
    import scipy.ndimage    
    from mvpa2.datasets import Dataset

    sched = np.loadtxt(
        converters = {0:int,1:int,2:str,3:int,4:str,5:str},
    idx = sched[:,1].tolist().index(ses)
    #scan_no = sched[idx,2].split('-').index('Rest')
    if in_file is None:
        scan_no = [i for i,n in enumerate(lh_ctx_file) if 'RESTING' in n]
        scan_no = [i for i,n in enumerate(in_file) if 'RESTING' in n]
    scan_no = scan_no[0]
    if in_file is None:
        inf = lh_ctx_file[scan_no]
        ds = Dataset(wb_to_tss(lh_ctx_file[scan_no], rh_ctx_file[scan_no], sc_file[scan_no]))
        inf = in_file[scan_no]
        ds = cds.ds_from_ts(in_file[scan_no])
    #cds.preproc_ds(ds, detrend=True)
    ds.samples -= scipy.ndimage.gaussian_filter1d(ds.samples,sigma=8,axis=0,truncate=2)
    seed_roi = 9
    cds.add_aparc_ba_fa(ds, sub, pproc_tpl=os.path.join(pipe_hbn_ssi.proc_dir,'moco_multiband','surface_32k','_sub_%d'))
    roi_mask = np.logical_or(ds.fa.aparc==seed_roi+11100, ds.fa.aparc==seed_roi+12100)
    mean_roi_ts = ds.samples[:,roi_mask].mean(1)
    mean_roi_ts -= mean_roi_ts.mean()
    mtx = np.asarray([mean_roi_ts, np.ones(ds.nsamples)]).T
    glm = GeneralLinearModel(mtx)
    contrast = glm.contrast([1,0], contrast_type='t')
    out_file = os.path.abspath('sub%d_ses%d_connectivity_results.npz'%(sub,ses))
    np.savez_compressed(out_file, contrast=contrast, mean_roi_ts=mean_roi_ts)
    #return contrast
    return out_file, inf
예제 #7
    def _call(self, dataset):
        """Computes featurewise I-RELIEF weights."""
        samples = dataset.samples
        NS, NF = samples.shape[:2]
        if self.w_guess is None:
            self.w = np.ones(NF, 'd')
        # do normalization in all cases to be safe :)
        self.w = self.w / (self.w**2).sum()

        M, H = self.compute_M_H(dataset.targets)

        while True:
            self.k = self.kernel(length_scale=self.kernel_width / self.w)
            d_w_k = self.k.computed(samples).as_raw_np()
            # set d_w_k to zero where distance=0 (i.e. kernel ==
            # 1.0), otherwise I-RELIEF could not converge.
            # XXX Note that kernel==1 for distance=0 only for
            # exponential kernels!!  IMPROVE
            d_w_k[np.abs(d_w_k - 1.0) < 1.0e-15] = 0.0
            ni = np.zeros(NF, 'd')
            for n in range(NS):
                # d_w_k[n,n] could be omitted since == 0.0
                gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                                / (d_w_k[n, :].sum()-d_w_k[n, n]))
                alpha_n = np.nan_to_num(d_w_k[n, M[n]] /
                                        (d_w_k[n, M[n]].sum()))
                beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

                m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                        * alpha_n[:, None]).sum(0)
                h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                        * beta_n[:, None]).sum(0)
                ni += gamma_n * (m_n - h_n)
            ni = ni / NS

            ni_plus = np.clip(ni, 0.0,
                              np.inf)  # set all negative elements to zero
            w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
            change = np.abs(w_new - self.w).sum()
            if __debug__ and 'IRELIEF' in debug.active:
                    "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" %
                    (change, w_new.max(), w_new.min(), w_new.mean(),
                     w_new.std(), np.isnan(w_new).sum()))

            # update weights:
            self.w = w_new
            if change < self.threshold:

        return Dataset(self.w[np.newaxis])
예제 #8
def prep_parcelwise_data(subject, parcel, datatype):
    from mvpa2.datasets import Dataset
    from mvpa2.mappers.zscore import zscore
    if datatype == 'sponpain':
        ds = Dataset(
                    sponpain_by_parcel, subject +
                    '_sponpain_connectome_parcel-' + str(parcel) + '.npy')))
        ds.fa['voxel_indices'] = range(ds.shape[1])
        zscore(ds, chunks_attr=None)
    elif datatype == 'bladderpain':
        ds = Dataset(
                    bladderpain_by_parcel, subject +
                    '_bladderpain-cleaned-ts_parcel-' + str(parcel) + '.npy')))
        ds.fa['voxel_indices'] = range(ds.shape[1])
        zscore(ds, chunks_attr=None)
        print('Must specify datatyp as either sponpain or bladderpain')
    return ds
예제 #9
    def _forward_dataset_helper(self, ds):
        # local binding
        num = self.__num

        pos = None
        if not self.__position_attr is None:
            # we know something about sample position
            pos = ds.sa[self.__position_attr].value
            rsamples, pos = resample(ds.samples, self.__num, t=pos,
            # we know nothing about samples position
            rsamples = resample(ds.samples, self.__num, t=None,
        # new dataset that reuses the feature and dataset attributes of the
        # source
        mds = Dataset(rsamples, fa=ds.fa, a=ds.a)

        # the tricky part is what to do with the samples attributes, since their
        # number has changed
        if self.__attr_strategy == 'remove':
            # nothing to be done
        elif self.__attr_strategy == 'sample':
            step = int(len(ds) / num)
            sa = dict([(k, ds.sa[k].value[0::step][:num]) for k in ds.sa])
        elif self.__attr_strategy == 'resample':
            # resample the attributes themselves
            sa = {}
            for k in ds.sa:
                v = ds.sa[k].value
                if pos is None:
                    sa[k] = resample(v, self.__num, t=None,
                    if k == self.__position_attr:
                        # position attr will be handled separately at the end
                    sa[k] = resample(v, self.__num, t=pos,
            # inject them all
            raise ValueError("Unkown attribute handling strategy '%s'."
                             % self.__attr_strategy)

        if not pos is None:
            # we got the new sample positions and can store them
            mds.sa[self.__position_attr] = pos
        return mds
def intersubject_correlation(dss, reference_ds=0):
    Computes voxelwise inter-subject time series correlation
    in a pairwise fashion for a list of Datasets. Datasets
    must all be the same shape. Resulting dataset of pairwise
    correlations will inherit Dataset attributes from
    reference data set [Default: first data set in list].

    # Check if input list contains Datasets, ndarrays
    dss = [Dataset(ds) if not type(ds) == Dataset else ds for ds in dss]

    ds_shape = dss[reference_ds].shape
    n_features = ds_shape[1]

    for ds in dss:
        assert ds.shape == ds_shape

    # Compute time series correlation per voxel per subject pair
    correlations = []
    for pair in combinations(dss, 2):
        pair_map = []
        for feature in xrange(n_features):
                pearson_correlation(pair[0].samples[:, feature],
                                    pair[1].samples[:, feature]))

    # Resulting correlation map inherits attributes of reference data set
    correlations_ds = Dataset(correlations,
    correlations_ds.sa['pairs'] = list(combinations(range(len(dss)), 2))

    assert correlations_ds.shape[0] == len(dss) * (len(dss) - 1) / 2
    assert correlations_ds.shape[1] == n_features

    return correlations_ds
예제 #11
    def build_streamline_things(self):
        # Build a dataset having samples of different lengths. This is
        # trying to mimic a possible interface for streamlines
        # datasets, i.e., an iterable container of Mx3 points, where M
        # depends on each single streamline.

        # trying to pack it into an 'object' array to prevent conversion in the
        # Dataset
        self.streamline_samples = np.array([
        self.dataset = Dataset(self.streamline_samples)
        self.similarities = [StreamlineSimilarity(distance=corouge)]
예제 #12
def test_strip_nibabel():
    """Check `strip_nibabel` on a plain dataset and on an fMRI dataset.

    Plenty of implicit coverage exists already; this test only makes sure
    that stripping leaves a non-image dataset alone and that it can be
    applied repeatedly to an fMRI dataset.
    """
    # lots of implicit test already, just make sure it doesn't ruin other
    # datasets
    ds = Dataset([range(5)])
    # actually run the function under test on the plain dataset -- otherwise
    # the assertion below says nothing about strip_nibabel
    strip_nibabel(ds)
    assert_true('imgtype' not in ds.a)
    # can run multiple times: idempotent
    ds = fmri_dataset(
        pathjoin(pymvpa_dataroot, 'haxby2001', 'sub001', 'BOLD',
                 'task001_run001', 'bold_25mm.nii.gz'))
    strip_nibabel(ds)  # this is real
    strip_nibabel(ds)  # this is not a copy&paste error!
    # image-related dataset attributes must survive both passes, with the
    # header stored as a plain dict
    assert_true('imgtype' in ds.a)
    assert_true('imgaffine' in ds.a)
    assert_equal(type(ds.a.imghdr), dict)
예제 #13
    def _call(self, dataset):
        """Computes featurewise I-RELIEF weights."""
        samples = dataset.samples
        NS, NF = samples.shape[:2]

        if self.w_guess is None:
            w = np.ones(NF, 'd')

        w /= (w**2).sum()  # do normalization in all cases to be safe :)

        M, H = self.compute_M_H(dataset.targets)

        while True:
            d_w_k = self.k(pnorm_w(data1=samples, weight=w, p=1))
            ni = np.zeros(NF, 'd')
            for n in range(NS):
                # d_w_k[n, n] could be omitted since == 0.0
                gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                                / (d_w_k[n, :].sum() - d_w_k[n, n]))
                alpha_n = np.nan_to_num(d_w_k[n, M[n]] /
                                        (d_w_k[n, M[n]].sum()))
                beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

                m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                       * alpha_n[:, None]).sum(0)
                h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                       * beta_n[:, None]).sum(0)
                ni += gamma_n * (m_n - h_n)

            ni = ni / NS

            ni_plus = np.clip(ni, 0.0,
                              np.inf)  # set all negative elements to zero
            w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
            change = np.abs(w_new - w).sum()
            if __debug__ and 'IRELIEF' in debug.active:
                      "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" \
                      % (change, w_new.max(), w_new.min(), w_new.mean(),
                         w_new.std(), np.isnan(w_new).sum()))

            # update weights:
            w = w_new
            if change < self.threshold:

        self.w = w
        return Dataset(self.w[np.newaxis])
예제 #14
def test_resample():
    time = np.linspace(0, 2 * np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                     'time': time,
                     'section': np.repeat(range(10), 10)
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num,
                           window=('gauss', 50),
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num,
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunk should be identical to the previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
예제 #15
파일: fx.py 프로젝트: reka-daniel/PyMVPA
 def _call(self, ds):
     """Evaluate ``self.fx`` on the dataset samples versus a sample attribute.

     The reference values are taken from the sample attribute named by
     ``self.get_space()``.  The samples are squeezed (and promoted to at
     least 1-d) before being compared.  A ``ValueError`` is raised when the
     two shapes differ.  A scalar result of ``self.fx`` is promoted to a
     2-d array; the outcome is returned wrapped in a ``Dataset``.
     """
     reference = ds.sa[self.get_space()].value
     # drop singleton dimensions so that the shape check and the later
     # comparison operate on like-shaped arrays
     observed = np.atleast_1d(ds.samples.squeeze())
     if observed.shape != reference.shape:
         # refuse to go on: numpy broadcasting of differently shaped inputs
         # could silently compare individual features or collapse to a
         # single boolean
         raise ValueError("Trying to compute an error between data of "
                          "different shape (%s vs. %s)." %
                          (observed.shape, reference.shape))
     outcome = self.fx(observed, reference)
     # a bare scalar cannot serve as dataset samples -> make it 2-d
     payload = np.array(outcome, ndmin=2) if np.isscalar(outcome) else outcome
     return Dataset(payload)
예제 #16
def test_iirfilter():
    """Check that `iir_filter` removes the fast component of a two-wave signal.

    A single-feature dataset is built from the sum of a slow and a fast
    sine wave.  After filtering with Butterworth coefficients only the slow
    wave should remain (away from the edges), and the sample and feature
    attributes must be carried over unchanged.
    """
    # one feature made of two superimposed sine waves (5 and 250 cycles
    # over the unit interval)
    timeline = np.linspace(0, 1.0, 2001)
    slow_wave = np.sin(2 * np.pi * 5 * timeline)
    fast_wave = np.sin(2 * np.pi * 250 * timeline)
    mixture = slow_wave + fast_wave
    sample_ids = np.arange(len(mixture))
    ds = Dataset(mixture, sa={'sid': sample_ids}, fa={'fid': ['theone']})

    # 8th-order butterworth design with its cutoff placed between the waves
    from scipy import signal
    numer, denom = signal.butter(8, 0.125)
    filtered = iir_filter(ds, numer, denom, padlen=150)

    # ignore the first and last 100 samples to compensate for edge
    # artifacts; everything in between has to match the slow wave
    interior = slice(100, -100)
    deviation = np.abs(filtered.samples[interior, 0] - slow_wave[interior])
    assert_false(np.sum(deviation > 0.001))

    # attribute collections keep their size and their content
    assert_equal(len(ds.sa), len(filtered.sa))
    assert_equal(len(ds.fa), len(filtered.fa))
    assert_array_equal(ds.fa.fid, filtered.fa.fid)
    assert_array_equal(ds.sa.sid, filtered.sa.sid)
예제 #17
    def __process_roi(self, ds, roi_feature_id, measure, assure_dataset):
        # retrieve the feature ids of all features in the ROI from the query
        # engine
        roi_specs = self._queryengine[roi_feature_id]
        if __debug__:
                'SLC_', 'For %r query returned roi_specs %r' %
                (roi_feature_id, roi_specs))
        if is_datasetlike(roi_specs):
            # TODO: unittest
            assert (len(roi_specs) == 1)
            roi_fids = roi_specs.samples[0]
            roi_fids = roi_specs

        # slice the dataset
        roi = ds[:, roi_fids]
        if is_datasetlike(roi_specs):
            for n, v in roi_specs.fa.iteritems():
                roi.fa[n] = v
        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            if roi_feature_id in roi_fids:
                roi_seed[roi_fids.index(roi_feature_id)] = True
                warning("Center feature attribute id %s not found" %
            roi.fa[self.__add_center_fa] = roi_seed

        # compute the datameasure and store in results
        res = measure(roi)
        if assure_dataset and not is_datasetlike(res):
            res = Dataset(np.atleast_1d(res))
        if self.ca.is_enabled('roi_feature_ids'):
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        if self.ca.is_enabled('roi_sizes'):
            res.a['roi_sizes'] = roi.nfeatures
        if self.ca.is_enabled('roi_center_ids'):
            res.a['roi_center_ids'] = roi_feature_id
        return res, roi
예제 #18
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4,2)),
                 sa={'chunks':   [ 0 ,  1 ,  2 ,  3 ],
                     'targets':  ['c', 'c', 'p', 'p']})
    for sift_targets_definition in (['c', 'p'],
                                    dict(uvalues=['c', 'p'])):
        par = ChainNode([NFoldPartitioner(cvtype=2, attr='chunks'),
                         Sifter([('partitions', 2),
                                 ('targets', sift_targets_definition)])
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for ds_ in dss:
            testing = ds[ds_.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
            # and we still have both targets  present in training
            training = ds[ds_.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
예제 #19
def _fill_in_scattered_results(sl, dataset, roi_ids, results):
    """this requires the searchlight conditional attribute 'roi_feature_ids'
    to be enabled"""
    import numpy as np
    from mvpa2.datasets import Dataset

    resmap = None
    probmap = None
    for resblock in results:
        for res in resblock:
            if resmap is None:
                # prepare the result container
                resmap = np.zeros((len(res), dataset.nfeatures),
                if 'null_prob' in res.fa:
                    # initialize the prob map also with zeroes, as p=0 can never
                    # happen as an empirical result
                    probmap = np.zeros(
                        (dataset.nfeatures, ) + res.fa.null_prob.shape[1:],
                observ_counter = np.zeros(dataset.nfeatures, dtype=int)
            #project the result onto all features -- love broadcasting!
            resmap[:, res.a.roi_feature_ids] += res.samples
            if not probmap is None:
                probmap[res.a.roi_feature_ids] += res.fa.null_prob
            # increment observation counter for all relevant features
            observ_counter[res.a.roi_feature_ids] += 1
    # when all results have been added up average them according to the number
    # of observations
    observ_mask = observ_counter > 0
    resmap[:, observ_mask] /= observ_counter[observ_mask]
    result_ds = Dataset(resmap, fa={'observations': observ_counter})
    if not probmap is None:
        # transpose to make broadcasting work -- creates a view, so in-place
        # modification still does the job
        probmap.T[:, observ_mask] /= observ_counter[observ_mask]
        result_ds.fa['null_prob'] = probmap.squeeze()
    if 'mapper' in dataset.a:
        import copy
        result_ds.a['mapper'] = copy.copy(dataset.a.mapper)
    return result_ds
예제 #20
    def _get_hypesvs(self, sl_connectomes, local_common_model=None):
        Hyperalign connectomes and return mappers
        and trained SVDMapper of common space.

        sl_connectomes: a list of connectomes to hyperalign
        local_common_model: a reference common model to be used.

        a tuple (sl_hmappers, svm, local_common_model)
        sl_hmappers: a list of mappers corresponding to input list in that order.
        svm: a svm mapper based on the input data. if given a common model, this is None.
        local_common_model: If local_common_model is provided as input, this will be None.
            Otherwise, local_common_model will be computed here and returned.
        # TODO Should we z-score sl_connectomes?
        return_model = False if self.params.save_model is None else True
        if local_common_model is not None:
            ha = Hyperalignment(level2_niter=0)
            if not is_datasetlike(local_common_model):
                local_common_model = Dataset(samples=local_common_model)
            sl_hmappers = ha(sl_connectomes)
            return sl_hmappers, None, None
        ha = Hyperalignment()
        sl_hmappers = ha(sl_connectomes)
        sl_connectomes = [
            slhm.forward(slc) for slhm, slc in zip(sl_hmappers, sl_connectomes)
        _ = [zscore(slc, chunks_attr=None) for slc in sl_connectomes]
        sl_connectomes = np.dstack(sl_connectomes).mean(axis=-1)
        svm = SVDMapper(force_train=True)
        if return_model:
            local_common_model = svm.forward(sl_connectomes)
            local_common_model = None
        return sl_hmappers, svm, local_common_model
예제 #21
 def test_1d_multispace_searchlight(self):
     ds = Dataset([np.arange(6)])
     ds.fa['coord1'] = np.repeat(np.arange(3), 2)
     # add a second space to the dataset
     ds.fa['coord2'] = np.tile(np.arange(2), 3)
     measure = lambda x: "+".join([str(x) for x in x.samples[0]])
     # simply select each feature once
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(0)),
     assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(1)),
                        [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(1), coord2=Sphere(0)),
                        [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])
예제 #22
def testmodel(wts, des, ds, tc, use_corr=True):
    import numpy as np
    widx = wts.sa['chunks'].unique
    didx = ds.sa['chunks'].unique
    if len(widx) != len(didx):
        print "unequal number of chunks... exiting"
    if 'word2vec' in tc:
        for i in np.arange(0, 300):
            tc.append('word2vec' + str(i))

    corrs = []
    regidx = [des.names.index(i) for i in tc]
    for i in np.arange(len(widx)):
        pred = np.dot(
            des.matrix[:, regidx],
            wts[wts.sa['chunks'].value == widx[i]].samples[regidx, :])[
                ds.sa['chunks'].value == didx[i]]
        Presp = ds[ds.sa['chunks'].value == didx[i]].samples
        # Find prediction correlations
        nnpred = np.nan_to_num(pred)
        if use_corr:
            vcorrs = np.nan_to_num(
                    np.corrcoef(Presp[:, ii], nnpred[:, ii].ravel())[0, 1]
                    for ii in range(Presp.shape[1])
            resvar = (Presp - pred).var(0)
            Rsqs = 1 - (resvar / Presp.var(0))
            vcorrs = np.sqrt(np.abs(Rsqs)) * np.sign(Rsqs)
    from mvpa2.datasets import Dataset
    return Dataset(np.vstack(corrs),
                   sa={'chunks': ds.sa['chunks'].unique},
예제 #23
def test_exclude_targets_combinations_subjectchunks():
    partitioner = ChainNode([
            k=1, targets_attr='chunks', space='partitions')
    # targets do not need even to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                     'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so implicit test here
        # if not -- would be error
        testing_subj = np.asscalar(
        testing_chunk = np.asscalar(np.unique(
        # and those must not appear for training
        ok_(not testing_subj in ds_.sa.subjects[training_partition])
        ok_(not testing_chunk in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3, 3))))))
예제 #24
    def _level3(self, datasets):
        params = self.params  # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key different from level-2; the common space is uniform
        #temp_commonspace = commonspace
        # Fixing nproc=0
        if params.nproc == 0:
            from mvpa2.base import warning
            warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
            params.nproc = 1
        # Checking for joblib, if not, set nproc to 1
        if params.nproc != 1:
            from mvpa2.base import externals, warning
            if not externals.exists('joblib'):
                    "Setting nproc different from 1 requires joblib package, which "
                    "does not seem to exist. Setting nproc to 1.")
                params.nproc = 1

        # start from original input datasets again
        if params.nproc == 1:
            residuals = []
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 3: ds #%i" % i)
                m, residual = get_trained_mapper(
                    ds_new, self.commonspace, m,
                if self.ca['residual_errors'].enabled:
            if __debug__:
                      "Level 3: Using joblib with nproc = %d " % params.nproc)
            verbose_level_parallel = 20 \
                if (__debug__ and 'HPAL' in debug.active) else 0
            from joblib import Parallel, delayed
            import sys
            # joblib's 'multiprocessing' backend has known issues of failure on OSX
            # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
            if params.joblib_backend is None:
                params.joblib_backend = 'threading' if sys.platform == 'darwin' \
                                        else 'multiprocessing'
            res = Parallel(n_jobs=params.nproc,
                                   ds, self.commonspace, mapper,
                               for ds, mapper in zip(datasets, mappers))
            mappers = [m for m, r in res]
            if self.ca['residual_errors'].enabled:
                residuals = [r for m, r in res]

        if self.ca['residual_errors'].enabled:
            self.ca.residual_errors = Dataset(
                samples=np.array(residuals)[None, :])

        return mappers
예제 #25
    def train(self, datasets):
        """Derive a common feature space from a series of datasets.

        datasets : sequence of datasets

        A list of trained Mappers matching the number of input datasets.
        params = self.params  # for quicker access ;)
        ca = self.ca
        # Check to make sure we get a list of datasets as input.
        if not isinstance(datasets, (list, tuple, np.ndarray)):
            raise TypeError("Input datasets should be a sequence "
                            "(of type list, tuple, or ndarray) of datasets.")

        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]
        alpha = params.alpha

        residuals = None
        if ca['training_residual_errors'].enabled:
            residuals = np.zeros((1 + params.level2_niter, ndatasets))
            ca.training_residual_errors = Dataset(
                    ['1'] + ['2:%i' % i for i in xrange(params.level2_niter)]

        if __debug__:
                  "Hyperalignment %s for %i datasets" % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
            ref_ds = params.ref_ds
            # Making sure that ref_ds is within range.
            #Parameter() already checks for it being a non-negative integer
            if ref_ds >= ndatasets:
                raise ValueError, "Requested reference dataset %i is out of " \
                      "bounds. We have only %i datasets provided" \
                      % (ref_ds, ndatasets)
        ca.chosen_ref_ds = ref_ds
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they turn out to be needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            for ids in xrange(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        # initial common space is the reference dataset
        commonspace = datasets[ref_ds].samples
        # the reference dataset might have been zscored already, don't do it
        # twice
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                    'HPAL_', "Creating copy of a commonspace and assuring "
                    "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)
        # If there is only one dataset in training phase, there is nothing to be done
        # just use that data as the common space
        if len(datasets) < 2:
            self.commonspace = commonspace
            # create a mapper per dataset
            # might prefer some other way to initialize... later
            mappers = [deepcopy(params.alignment) for ds in datasets]

            # Level 1 -- initial projection
            lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                         mappers, residuals)
            # Level 2 -- might iterate multiple times
            # this is the final common space
            self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
        if params.output_dim is not None:
            mappers = self._level3(datasets)
            self._svd_mapper = SVDMapper()
            self._svd_mapper.train(self._map_and_mean(datasets, mappers))
            self._svd_mapper = StaticProjectionMapper(
                proj=self._svd_mapper.proj[:, :params.output_dim])
예제 #26
파일: stats.py 프로젝트: Soletmons/PyMVPA
class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `Measure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    amount of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from `left` or `right` tail,
    i.e. fraction of the distribution that is lower or larger than some
    critical value.

    This class also supports `FeaturewiseMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- it is expected to
    them to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = ConditionalAttribute(enabled=False,
                                 doc='Samples obtained for each permutation')
    skipped = ConditionalAttribute(enabled=True,
                  doc='# of the samples which were skipped because '
                      'measure has failed to evaluated at them')

    def __init__(self, permutator, dist_class=Nonparametric, measure=None,
                 **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        Parameters
        ----------
        permutator : Node
          Node instance that generates permuted datasets.
        dist_class : class
          This can be any class which provides parameters estimate
          using `fit()` method to initialize the instance, and
          provides `cdf(x)` method for estimating value of x in CDF.
          All distributions from SciPy's 'stats' module can be used.
        measure : Measure or None
          Optional measure that is used to compute results on permuted
          data. If None, a measure needs to be passed to ``fit()``.
        **kwargs
          Passed on unchanged to ``NullDist.__init__``.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions
        self._measure = measure

        self.__permutator = permutator

    def __repr__(self, prefixes=[]):
        # `prefixes` is only read (concatenated), never mutated, so the shared
        # default list is harmless here.
        prefixes_ = ["%s" % self.__permutator]
        if self._dist_class != Nonparametric:
            # only mention dist_class when it deviates from the default
            prefixes_.insert(0, 'dist_class=%r' % (self._dist_class,))
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, ds):
        """Fit the distribution by performing multiple cycles which repeatedly
        permuted labels in the training dataset.

        Parameters
        ----------
        measure: Measure or None
          A measure used to compute the results from shuffled data. Can be None
          if a measure instance has been provided to the constructor.
        ds: `Dataset` which gets permuted and used to compute the
          measure/transfer error multiple times.

        Raises
        ------
        RuntimeError
          If no permutation produced a value and at least one was skipped.
        """
        # TODO: place exceptions separately so we could avoid circular imports
        from mvpa2.base.learner import LearnerError

        # prefer the already assigned measure over anything the was passed to
        # the function.
        # XXX that is a bit awkward but is necessary to keep the code changes
        # in the rest of PyMVPA minimal till this behavior become mandatory
        if self._measure is not None:
            measure = self._measure

        # Holds the values for randomized labels.
        dist_samples = []

        # estimate null-distribution
        # TODO this really needs to be more clever! If data samples are
        # shuffled within a class it really makes no difference for the
        # classifier, hence the number of permutations to estimate the
        # null-distribution of transfer errors can be reduced dramatically
        # when the *right* permutations (the ones that matter) are done.
        skipped = 0                     # # of skipped permutations
        for p, permuted_ds in enumerate(self.__permutator.generate(ds)):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i"
                      % (self.__permutator.count, p + 1), cr=True)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                res = measure(permuted_ds)
            except LearnerError as e:
                if __debug__:
                    debug('STATMC', " skipped", cr=True)
                warning('Failed to obtain value from %s due to %s.  Measurement'
                        ' was skipped, which could lead to unstable and/or'
                        ' incorrect assessment of the null_dist' % (measure, e))
                skipped += 1
                continue
            # NOTE(review): assumes the measure returns a dataset-like object
            # exposing `.samples` -- confirm against the measures in use
            dist_samples.append(res.samples)

        self.ca.skipped = skipped

        if __debug__:
            debug('STATMC', ' Skipped: %d permutations' % skipped)

        if not dist_samples and skipped > 0:
            raise RuntimeError(
                'Failed to obtain any value from %s. %d measurements were '
                'skipped. Check above warnings, and your code/data'
                % (measure, skipped))
        # store samples as (npermutations x nsamples x nfeatures)
        dist_samples = np.asanyarray(dist_samples)
        # for the ca storage use a dataset with
        # (nsamples x nfeatures x npermutations) to make it compatible with the
        # result dataset of the measure
        self.ca.dist_samples = Dataset(np.rollaxis(dist_samples,
                                       0, len(dist_samples.shape)))

        # fit distribution per each element

        # to decide either it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artif dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, np.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use np.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT__' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            # instantiate the distribution from its fitted parameters
            dist.append(self._dist_class(*params))
        self._dist = dist
예제 #27
 def _call(self, ds):
     """Run ``CrossValidation._call`` on `ds` with ``ds.sa.beh`` stacked on.

     The ``beh`` sample attribute is horizontally stacked next to the
     samples, and a new `Dataset` carrying the original sample attributes
     is handed to the parent implementation.
     """
     # glue the 'beh' sample attribute onto the data via np.hstack
     stacked = np.hstack((ds, ds.sa.beh))
     augmented = Dataset(stacked, sa=ds.sa)
     return CrossValidation._call(self, augmented)
예제 #28
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    # NOTE(review): only `snr` survived in this copy of the call; `nlabels=6`
    # is restored from the "6 categories" description above -- confirm whether
    # further sizing arguments were given originally.
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
    )
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={
                                'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]
                            })

    npart = ChainNode([
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
    ])

    # now the new implementation
    # NOTE(review): second argument restored as the superordinate attribute
    # name -- confirm the keyword against FactorialPartitioner's signature
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [
        p.sa.partitions for p in factpart.generate(ds_1super)
    ]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [
            p.sa.partitions for p in factpart.generate(ds_unbalanced)
        ]

    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(partitions_factpart, partitions_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        # compare (partition 1, partition 2) attribute values to expectation
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={
                           'subord': range(4),
                           'superord': [1, 2] * 2
                       })
    partitions_factpart = [
        p.sa.partitions for p in factpart.generate(ds_dummy)
    ]
    assert_array_equal(
        partitions_factpart,
        [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
예제 #29
    def test_surf_ring_queryengine(self):
        # Checks SurfaceRingQueryEngine: construction errors, untrained
        # errors, training errors, and that queried feature ids match the
        # ring (inner_radius < distance <= radius) computed from the surface.
        s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)
        # add second layer
        s2 = surf.merge(s, (s + (.01, 0, 0)))
        ds = Dataset(samples=np.arange(20)[np.newaxis],
                     fa=dict(node_indices=np.arange(39, 0, -2)))
        # add more features (with shared node indices)
        ds3 = hstack((ds, ds, ds))
        radius = 2.5
        inner_radius = 1.0
        # Makes sure it raises error if inner_radius is >= radius
        assert_raises(
            ValueError, lambda: queryengine.SurfaceRingQueryEngine(
                surface=s2, inner_radius=2.5, radius=radius))
        distance_metrics = ('euclidean', 'dijkstra', 'euclidean', 'dijkstra')
        for distance_metric, include_center in zip(distance_metrics,
                                                   [True, False] * 2):
            # NOTE(review): constructor arguments restored from the names used
            # in this test -- confirm the `distance_metric`/`include_center`
            # keywords against SurfaceRingQueryEngine's signature
            qe = queryengine.SurfaceRingQueryEngine(
                surface=s2,
                radius=radius,
                inner_radius=inner_radius,
                distance_metric=distance_metric,
                include_center=include_center)
            # untrained qe should give errors
            assert_raises(ValueError, lambda: qe.ids)
            assert_raises(ValueError, lambda: qe.query_byid(0))

            # node index out of bounds should give error
            ds_ = ds.copy()
            ds_.fa.node_indices[0] = 100
            assert_raises(ValueError, lambda: qe.train(ds_))

            # lack of node indices should give error
            ds_.fa.pop('node_indices')
            assert_raises(ValueError, lambda: qe.train(ds_))
            # train the qe
            qe.train(ds3)

            for node in np.arange(-1, s2.nvertices + 1):
                if node < 0 or node >= s2.nvertices:
                    assert_raises(KeyError, lambda: qe.query_byid(node))
                    # nothing further to check for an invalid node
                    continue

                feature_ids = np.asarray(qe.query_byid(node))
                # node indices relative to ds
                base_ids = feature_ids[feature_ids < 20]
                # should have multiples of 20
                assert_equal(set(feature_ids),
                             set((base_ids[np.newaxis].T + \
                                  [0, 20, 40]).ravel()))

                node_indices = s2.circlearound_n2d(
                    node, radius, distance_metric or 'dijkstra')

                # features whose node lies in the ring beyond inner_radius
                fa_indices = [
                    fa_index
                    for fa_index, inode in enumerate(ds3.fa.node_indices)
                    if inode in node_indices
                    and node_indices[inode] > inner_radius
                ]
                if include_center and node in ds3.fa.node_indices:
                    fa_indices += np.where(
                        ds3.fa.node_indices == node)[0].tolist()
                assert_equal(set(feature_ids), set(fa_indices))
예제 #30
    def test_surf_queryengine(self, qefn):
        # Checks SurfaceQueryEngine: rejection of invalid distance metrics,
        # untrained/training errors, optional h5py round-trips, and that
        # queried feature ids match the neighborhood computed from the surface.
        # NOTE(review): `qefn` (file name used for h5save/h5load) must be
        # supplied by a decorator/fixture that is not visible in this copy.
        s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)

        # add second layer
        s2 = surf.merge(s, (s + (.01, 0, 0)))

        ds = Dataset(samples=np.arange(20)[np.newaxis],
                     fa=dict(node_indices=np.arange(39, 0, -2)))

        # add more features (with shared node indices)
        ds3 = hstack((ds, ds, ds))

        radius = 2.5

        # Note: sweepargs it not used to avoid re-generating the same
        #       surface and dataset multiple times.
        for distance_metric in ('euclidean', 'dijkstra', '<illegal>', None):
            builder = lambda: queryengine.SurfaceQueryEngine(
                s2, radius, distance_metric)
            if distance_metric in ('<illegal>', None):
                assert_raises(ValueError, builder)
                # no engine can be built for an invalid metric
                continue

            qe = builder()

            # test i/o and ensure that the untrained instance is not trained
            if externals.exists('h5py'):
                h5save(qefn, qe)
                qe = h5load(qefn)

            # untrained qe should give errors
            assert_raises(ValueError, lambda: qe.ids)
            assert_raises(ValueError, lambda: qe.query_byid(0))

            # node index out of bounds should give error
            ds_ = ds.copy()
            ds_.fa.node_indices[0] = 100
            assert_raises(ValueError, lambda: qe.train(ds_))

            # lack of node indices should give error
            ds_.fa.pop('node_indices')
            assert_raises(ValueError, lambda: qe.train(ds_))

            # train the qe
            qe.train(ds3)

            # test i/o and ensure that the loaded instance is trained
            if externals.exists('h5py'):
                h5save(qefn, qe)
                qe = h5load(qefn)

            for node in np.arange(-1, s2.nvertices + 1):
                if node < 0 or node >= s2.nvertices:
                    assert_raises(KeyError, lambda: qe.query_byid(node))
                    # nothing further to check for an invalid node
                    continue

                feature_ids = np.asarray(qe.query_byid(node))

                # node indices relative to ds
                base_ids = feature_ids[feature_ids < 20]

                # should have multiples of 20
                assert_equal(set(feature_ids),
                             set((base_ids[np.newaxis].T + \
                                            [0, 20, 40]).ravel()))

                node_indices = list(
                    s2.circlearound_n2d(node, radius, distance_metric
                                        or 'dijkstra'))

                # `inode` (not `node`) so the outer loop variable is not
                # shadowed/overwritten by the comprehension
                fa_indices = [
                    fa_index
                    for fa_index, inode in enumerate(ds3.fa.node_indices)
                    if inode in node_indices
                ]

                assert_equal(set(feature_ids), set(fa_indices))

            # smoke tests
            assert_true('SurfaceQueryEngine' in '%s' % qe)
            assert_true('SurfaceQueryEngine' in '%r' % qe)