示例#1
0
文件: searchlight.py 项目: esc/PyMVPA
    def _call(self, dataset):
        """Perform the ROI search.
        """
        # local binding
        nproc = self.nproc

        if nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1"
                        % externals.versions['pprocess'])
                nproc = 1
        # train the queryengine
        self._queryengine.train(dataset)

        # decide whether to run on all possible center coords or just a provided
        # subset
        if isinstance(self.__roi_ids, str):
            roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0]
        elif self.__roi_ids is not None:
            roi_ids = self.__roi_ids
            # safeguard against stupidity
            if __debug__:
                if max(roi_ids) >= dataset.nfeatures:
                    raise IndexError, \
                          "Maximal center_id found is %s whenever given " \
                          "dataset has only %d features" \
                          % (max(roi_ids), dataset.nfeatures)
        else:
            roi_ids = np.arange(dataset.nfeatures)

        # pass to subclass
        results, roi_sizes = self._sl_call(dataset, roi_ids, nproc)

        if not roi_sizes is None:
            self.ca.roi_sizes = roi_sizes

        if 'mapper' in dataset.a:
            # since we know the space we can stick the original mapper into the
            # results as well
            if self.__roi_ids is None:
                results.a['mapper'] = copy.copy(dataset.a.mapper)
            else:
                # there is an additional selection step that needs to be
                # expressed by another mapper
                mapper = copy.copy(dataset.a.mapper)
                mapper.append(StaticFeatureSelection(roi_ids,
                                                     dshape=dataset.shape[1:]))
                results.a['mapper'] = mapper

        # charge state
        self.ca.raw_results = results

        # return raw results, base-class will take care of transformations
        return results
示例#2
0
    def _call(self, dataset):
        analyzers = []
        # create analyzers
        for clf in self.clf.clfs:
            if self.__analyzer is None:
                analyzer = clf.get_sensitivity_analyzer(**(self._slave_kwargs))
                if analyzer is None:
                    raise ValueError, \
                          "Wasn't able to figure basic analyzer for clf %r" % \
                          (clf,)
                if __debug__:
                    debug("SA", "Selected analyzer %r for clf %r" % \
                          (analyzer, clf))
            else:
                # XXX shallow copy should be enough...
                analyzer = copy.copy(self.__analyzer)

            # assign corresponding classifier
            analyzer.clf = clf
            # if clf was trained already - don't train again
            if clf.trained:
                analyzer._force_training = False
            analyzers.append(analyzer)

        self.__combined_analyzer.analyzers = analyzers

        # XXX not sure if we don't want to call directly ._call(dataset) to avoid
        # double application of transformers/combiners, after all we are just
        # 'proxying' here to combined_analyzer...
        # YOH: decided -- lets call ._call
        return self.__combined_analyzer._call(dataset)
示例#3
0
def test_subset():
    data = np.array(
            [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
            [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
            [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
            [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]])
    # float array doesn't work
    sm = FeatureSliceMapper(np.ones(16))
    assert_raises(IndexError, sm.forward, data)

    # full mask
    sm = FeatureSliceMapper(slice(None))
    # should not change single samples
    assert_array_equal(sm.forward(data[0:1].copy()), data[0:1])
    # or multi-samples
    assert_array_equal(sm.forward(data.copy()), data)
    sm.train(data)
    # same on reverse
    assert_array_equal(sm.reverse(data[0:1].copy()), data[0:1])
    # or multi-samples
    assert_array_equal(sm.reverse(data.copy()), data)

    # identical mappers
    sm_none = FeatureSliceMapper(slice(None))
    sm_int = FeatureSliceMapper(np.arange(16))
    sm_bool = FeatureSliceMapper(np.ones(16, dtype='bool'))
    sms = [sm_none, sm_int, sm_bool]

    # test subsets
    sids = [3,4,5,6]
    bsubset = np.zeros(16, dtype='bool')
    bsubset[sids] = True
    subsets = [sids, slice(3,7), bsubset, [3,3,4,4,6,6,6,5]]
    # all test subset result in equivalent masks, hence should do the same to
    # the mapper and result in identical behavior
    for st in sms:
        for i, sub in enumerate(subsets):
            # shallow copy
            orig = copy(st)
            subsm = FeatureSliceMapper(sub)
            # should do copy-on-write for all important stuff!!
            assert_true(orig.is_mergable(subsm))
            orig += subsm
            # test if selection did its job
            if i == 3:
                # special case of multiplying features
                assert_array_equal(orig.forward1(data[0].copy()), subsets[i])
            else:
                assert_array_equal(orig.forward1(data[0].copy()), sids)

    ## all of the above shouldn't change the original mapper
    #assert_array_equal(sm.get_mask(), np.arange(16))

    # check for some bug catcher
    # no 3D input
    #assert_raises(IndexError, sm.forward, np.ones((3,2,1)))
    # no input of wrong length
    if __debug__:
        # checked only in __debug__
        assert_raises(ValueError, sm.forward, np.ones(4))
示例#4
0
文件: base.py 项目: gorlins/PyMVPA
    def _postcall(self, dataset, result):
        """Some postprocessing on the result
        """
        self.raw_result = result
        if not self.__transformer is None:
            if __debug__:
                debug("SA_", "Applying transformer %s" % self.__transformer)
            result = self.__transformer(result)

        # estimate the NULL distribution when functor is given
        if not self.__null_dist is None:
            if __debug__:
                debug("SA_", "Estimating NULL distribution using %s"
                      % self.__null_dist)

            # we need a matching datameasure instance, but we have to disable
            # the estimation of the null distribution in that child to prevent
            # infinite looping.
            measure = copy.copy(self)
            measure.__null_dist = None
            self.__null_dist.fit(measure, dataset)

            if self.states.isEnabled('null_t'):
                # get probability under NULL hyp, but also request
                # either it belong to the right tail
                null_prob, null_right_tail = \
                           self.__null_dist.p(result, return_tails=True)
                self.null_prob = null_prob

                externals.exists('scipy', raiseException=True)
                from scipy.stats import norm

                # TODO: following logic should appear in NullDist,
                #       not here
                tail = self.null_dist.tail
                if tail == 'left':
                    acdf = N.abs(null_prob)
                elif tail == 'right':
                    acdf = 1.0 - N.abs(null_prob)
                elif tail in ['any', 'both']:
                    acdf = 1.0 - N.clip(N.abs(null_prob), 0, 0.5)
                else:
                    raise RuntimeError, 'Unhandled tail %s' % tail
                # We need to clip to avoid non-informative inf's ;-)
                # that happens due to lack of precision in mantissa
                # which is 11 bits in double. We could clip values
                # around 0 at as low as 1e-100 (correspond to z~=21),
                # but for consistency lets clip at 1e-16 which leads
                # to distinguishable value around p=1 and max z=8.2.
                # Should be sufficient range of z-values ;-)
                clip = 1e-16
                null_t = norm.ppf(N.clip(acdf, clip, 1.0 - clip))
                null_t[~null_right_tail] *= -1.0 # revert sign for negatives
                self.null_t = null_t                 # store
            else:
                # get probability of result under NULL hypothesis if available
                # and don't request tail information
                self.null_prob = self.__null_dist.p(result)

        return result
示例#5
0
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium']
        train = train[train.sa.train == 1]
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium']
        test3 = test3[test3.sa.train == 1]
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferMeasure(l_clf, Splitter('train', attr_values=[1,1]),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

        self.failUnlessRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        te = np.asscalar(te)
        self.failUnless(abs(e-te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
                "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        if 'multiclass' in l_clf.__tags__:
            self.failIf(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
示例#6
0
文件: base.py 项目: B-Rich/PyMVPA
    def _call(self, dataset):
        analyzers = []
        # create analyzers
        for clf in self.clf.clfs:
            if self.__analyzer is None:
                analyzer = clf.get_sensitivity_analyzer(**(self._slave_kwargs))
                if analyzer is None:
                    raise ValueError, \
                          "Wasn't able to figure basic analyzer for clf %r" % \
                          (clf,)
                if __debug__:
                    debug("SA", "Selected analyzer %r for clf %r" % \
                          (analyzer, clf))
            else:
                # XXX shallow copy should be enough...
                analyzer = copy.copy(self.__analyzer)

            # assign corresponding classifier
            analyzer.clf = clf
            # if clf was trained already - don't train again
            if clf.trained:
                analyzer._force_train = False
            analyzers.append(analyzer)

        self.__combined_analyzer.analyzers = analyzers

        # XXX not sure if we don't want to call directly ._call(dataset) to avoid
        # double application of transformers/combiners, after all we are just
        # 'proxying' here to combined_analyzer...
        # YOH: decided -- lets call ._call
        return self.__combined_analyzer._call(dataset)
示例#7
0
def test_subset():
    data = np.array(
        [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
         [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
         [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
         [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]])
    # float array doesn't work
    sm = FeatureSliceMapper(np.ones(16))
    assert_raises(IndexError, sm.forward, data)

    # full mask
    sm = FeatureSliceMapper(slice(None))
    # should not change single samples
    assert_array_equal(sm.forward(data[0:1].copy()), data[0:1])
    # or multi-samples
    assert_array_equal(sm.forward(data.copy()), data)
    sm.train(data)
    # same on reverse
    assert_array_equal(sm.reverse(data[0:1].copy()), data[0:1])
    # or multi-samples
    assert_array_equal(sm.reverse(data.copy()), data)

    # identical mappers
    sm_none = FeatureSliceMapper(slice(None))
    sm_int = FeatureSliceMapper(np.arange(16))
    sm_bool = FeatureSliceMapper(np.ones(16, dtype='bool'))
    sms = [sm_none, sm_int, sm_bool]

    # test subsets
    sids = [3, 4, 5, 6]
    bsubset = np.zeros(16, dtype='bool')
    bsubset[sids] = True
    subsets = [sids, slice(3, 7), bsubset, [3, 3, 4, 4, 6, 6, 6, 5]]
    # all test subset result in equivalent masks, hence should do the same to
    # the mapper and result in identical behavior
    for st in sms:
        for i, sub in enumerate(subsets):
            # shallow copy
            orig = copy(st)
            subsm = FeatureSliceMapper(sub)
            # should do copy-on-write for all important stuff!!
            assert_true(orig.is_mergable(subsm))
            orig += subsm
            # test if selection did its job
            if i == 3:
                # special case of multiplying features
                assert_array_equal(orig.forward1(data[0].copy()), subsets[i])
            else:
                assert_array_equal(orig.forward1(data[0].copy()), sids)

    ## all of the above shouldn't change the original mapper
    #assert_array_equal(sm.get_mask(), np.arange(16))

    # check for some bug catcher
    # no 3D input
    #assert_raises(IndexError, sm.forward, np.ones((3,2,1)))
    # no input of wrong length
    if __debug__:
        # checked only in __debug__
        assert_raises(ValueError, sm.forward, np.ones(4))
示例#8
0
def test_ds_shallowcopy():
    # lets use some instance of somewhat evolved dataset
    ds = normal_feature_dataset()
    ds.samples = ds.samples.view(myarray)

    # SHALLOW copy the beast
    ds_ = copy.copy(ds)
    # verify that we have the same data
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    # array subclass survives
    ok_(isinstance(ds_.samples, myarray))


    # modify and see that we actually DO change the data in both
    ds_.samples[0, 0] = 1234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.targets[0] = 'ab'
    ds_.sa.chunks[0] = 234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)
    ok_(ds.sa.targets[0] == 'ab')
    ok_(ds.sa.chunks[0] == 234)
示例#9
0
 def __iadd__(self, other):
     """Add the sets from `other` s `SummaryStatistics` to current one
     """
     #print "adding ", other, " to ", self
     # need to do shallow copy, or otherwise smth like "cm += cm"
     # would loop forever and exhaust memory eventually
     othersets = copy.copy(other.__sets)
     for set in othersets:
         self.add(*set)#[0], set[1])
     return self
示例#10
0
文件: node.py 项目: esc/PyMVPA
 def __getitem__(self, key):
     # if just one is requested return just one, otherwise return a
     # NodeChain again
     if isinstance(key, int):
         return self._nodes[key]
     else:
         # operate on shallow copy of self
         sliced = copy.copy(self)
         sliced._nodes = self._nodes[key]
         return sliced
示例#11
0
文件: searchlight.py 项目: esc/PyMVPA
    def _sl_call(self, dataset, roi_ids, nproc):
        """Classical generic searchlight implementation
        """
        # compute
        if nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            nproc_needed = min(len(roi_ids), nproc)
            roi_blocks = np.array_split(roi_ids, nproc_needed)

            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc_needed)
            if __debug__:
                debug('SLC', "Starting off child processes for nproc=%i"
                      % nproc_needed)
            compute = p_results.manage(
                        pprocess.MakeParallel(self._proc_block))
            for block in roi_blocks:
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                compute(block, dataset, copy.copy(self.__datameasure))

            # collect results
            results = []
            if self.ca.is_enabled('roi_sizes'):
                roi_sizes = []
            else:
                roi_sizes = None

            for r, rsizes in p_results:
                results += r
                if not roi_sizes is None:
                    roi_sizes += rsizes
        else:
            # otherwise collect the results in a list
            results, roi_sizes = \
                    self._proc_block(roi_ids, dataset, self.__datameasure)

        if __debug__ and 'SLC' in debug.active:
            debug('SLC', '')            # just newline
            resshape = len(results) and np.asanyarray(results[0]).shape or 'N/A'
            debug('SLC', ' hstacking %d results of shape %s'
                  % (len(results), resshape))

        # but be careful: this call also serves as conversion from parallel maps
        # to regular lists!
        # this uses the Dataset-hstack
        results = hstack(results)

        if __debug__:
            debug('SLC', " hstacked shape %s" % (results.shape,))

        return results, roi_sizes
示例#12
0
文件: support.py 项目: gorlins/PyMVPA
    def __call__(self, *args, **kwargs):
        """
        """
        # prepare complex result structure for all calls and their respective
        # attributes: calls x dict(attributes x loop iterations)
        results = [dict([('result', [])] + [(a, []) for a in c.attribs]) \
                        for c in self.__calls]

        # Lets do it!
        for (i, X) in enumerate(self.__source(*args, **kwargs)):
            for (c, call) in enumerate(self.__calls):
                # sanity check
                if i == 0 and call.expand_args and not isSequenceType(X):
                    raise RuntimeError, \
                          "Cannot expand non-sequence result from %s" % \
                          `self.__source`

                # apply argument filter (and reorder) if requested
                if call.argfilter:
                    filtered_args = [X[f] for f in call.argfilter]
                else:
                    filtered_args = X

                if call.expand_args:
                    result = call.call(*filtered_args)
                else:
                    result = call.call(filtered_args)

#                # XXX pylint doesn't like `` for some reason
#                if __debug__:
#                    debug("LOOP", "Iteration %i on call %s. Got result %s" %
#                          (i, `self.__call`, `result`))


                results[c]['result'].append(result)

                for attrib in call.attribs:
                    attrv = call.call.__getattribute__(attrib)

                    if call.copy_attribs:
                        attrv = copy(attrv)

                    results[c][attrib].append(attrv)

        # reduce results structure
        if self.__simplify_results:
            # get rid of dictionary if just the results are requested
            for (c, call) in enumerate(self.__calls):
                if not len(call.attribs):
                    results[c] = results[c]['result']

            if len(self.__calls) == 1:
                results = results[0]

        return results
示例#13
0
文件: base.py 项目: B-Rich/PyMVPA
    def _precall(self, ds):
        # estimate the NULL distribution when functor is given
        if not self.__null_dist is None:
            if __debug__:
                debug("SA_", "Estimating NULL distribution using %s"
                      % self.__null_dist)

            # we need a matching measure instance, but we have to disable
            # the estimation of the null distribution in that child to prevent
            # infinite looping.
            measure = copy.copy(self)
            measure.__null_dist = None
            self.__null_dist.fit(measure, ds)
示例#14
0
def test_labelpermutation_randomsampling():
    ds  = Dataset.from_wizard(np.ones((5, 1)),     targets=range(5), chunks=1)
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 1, targets=range(5), chunks=2))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 2, targets=range(5), chunks=3))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 3, targets=range(5), chunks=4))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 4, targets=range(5), chunks=5))
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0:5, 1:5, 2:5, 3:5, 4:5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [ 2, 2, 2, 2, 2 ])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())

    # keep the orig labels
    orig_labels = ds.targets[:]

    # also keep the orig dataset, but SHALLOW copy and leave everything
    # else as a view!
    ods = copy.copy(ds)

    ds.permute_targets()
    # some permutation should have happened
    assert_false((ds.targets == orig_labels).all())

    # but the original dataset should be uneffected
    assert_array_equal(ods.targets, orig_labels)
    # array subclass survives
    ok_(isinstance(ods.samples, myarray))

    # samples are really shared
    ds.samples[0, 0] = 123456
    assert_array_equal(ds.samples, ods.samples)

    # and other samples attributes too
    ds.chunks[0] = 9876
    assert_array_equal(ds.chunks, ods.chunks)

    # try to permute on custom target
    ds = ods.copy()
    otargets = ods.sa.targets.copy()
    ds.sa['custom'] = ods.sa.targets.copy()
    assert_array_equal(ds.sa.custom, otargets)
    assert_array_equal(ds.sa.targets, otargets)

    ds.permute_targets(targets_attr='custom')
    # original targets should still match
    assert_array_equal(ds.sa.targets, otargets)
    # but custom should get permuted
    assert_false((ds.sa.custom == otargets).all())
示例#15
0
文件: support.py 项目: arokem/PyMVPA
    def as_descrete_time(self, dt, storeoffset=False, offsetattr='offset'):
        """Convert `onset` and `duration` information into descrete timepoints.

        Parameters
        ----------
        dt : float
          Temporal distance between two timepoints in the same unit as `onset`
          and `duration`.
        storeoffset : bool
          If True, the temporal offset between original `onset` and
          descretized onset is stored as an additional item.
        offsetattr : str
          The name of the attribute that is used to store the computed offset
          in case the `storeoffset` is enabled.

        Returns
        -------
        A copy of the original `Event` with `onset` and optionally `duration`
        replaced by their corresponding descrete timepoint. The new onset will
        correspond to the timepoint just before or exactly at the original
        onset. The new duration will be the number of timepoints covering the
        event from the computed onset timepoint till the timepoint exactly at
        the end, or just after the event.

        Note again, that the new values are expressed as #timepoint and not
        in their original unit!
        """
        dt = float(dt)
        onset = self['onset']
        out = copy(self)

        # get the timepoint just prior the onset
        out['onset'] = int(np.floor(onset / dt))

        if storeoffset:
            # compute offset
            offset = onset - (out['onset'] * dt)
            out[offsetattr] = offset

        if out.has_key('duration'):
            # how many timepoint cover the event (from computed onset
            # to the one timepoint just after the end of the event
            out['duration'] = int(np.ceil((onset + out['duration']) / dt) \
                                  - out['onset'])

        return out
示例#16
0
    def as_descrete_time(self, dt, storeoffset=False, offsetattr='offset'):
        """Convert `onset` and `duration` information into descrete timepoints.

        Parameters
        ----------
        dt : float
          Temporal distance between two timepoints in the same unit as `onset`
          and `duration`.
        storeoffset : bool
          If True, the temporal offset between original `onset` and
          descretized onset is stored as an additional item.
        offsetattr : str
          The name of the attribute that is used to store the computed offset
          in case the `storeoffset` is enabled.

        Returns
        -------
        A copy of the original `Event` with `onset` and optionally `duration`
        replaced by their corresponding descrete timepoint. The new onset will
        correspond to the timepoint just before or exactly at the original
        onset. The new duration will be the number of timepoints covering the
        event from the computed onset timepoint till the timepoint exactly at
        the end, or just after the event.

        Note again, that the new values are expressed as #timepoint and not
        in their original unit!
        """
        dt = float(dt)
        onset = self['onset']
        out = copy(self)

        # get the timepoint just prior the onset
        out['onset'] = int(np.floor(onset / dt))

        if storeoffset:
            # compute offset
            offset = onset - (out['onset'] * dt)
            out[offsetattr] = offset

        if out.has_key('duration'):
            # how many timepoint cover the event (from computed onset
            # to the one timepoint just after the end of the event
            out['duration'] = int(np.ceil((onset + out['duration']) / dt) \
                                  - out['onset'])

        return out
示例#17
0
    def _sl_call(self, dataset, roi_ids, nproc):
        """Classical generic searchlight implementation
        """
        # compute
        if nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            roi_blocks = np.array_split(roi_ids, nproc)

            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess

            p_results = pprocess.Map(limit=nproc)
            if __debug__:
                debug("SLC", "Starting off child processes for nproc=%i" % nproc)
            compute = p_results.manage(pprocess.MakeParallel(self._proc_block))
            for block in roi_blocks:
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                compute(block, dataset, copy.copy(self.__datameasure))

            # collect results
            results = []
            if self.ca.is_enabled("roi_sizes"):
                roi_sizes = []
            else:
                roi_sizes = None

            for r, rsizes in p_results:
                results += r
                if not roi_sizes is None:
                    roi_sizes += rsizes
        else:
            # otherwise collect the results in a list
            results, roi_sizes = self._proc_block(roi_ids, dataset, self.__datameasure)

        if __debug__:
            debug("SLC", "")

        # but be careful: this call also serves as conversion from parallel maps
        # to regular lists!
        # this uses the Dataset-hstack
        results = hstack(results)

        return results, roi_sizes
示例#18
0
    def testConfusionBasedError(self, l_clf):
        train = datasets['uni2medium_train']
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium_train']
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferError(clf=l_clf)

        self.failUnlessRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        self.failUnlessEqual(err(None), terr(train),
            msg="ConfusionBasedError should be equal to TransferError on" +
                " traindataset")

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        self.failIf(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
示例#19
0
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium_train']
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium_train']
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferError(clf=l_clf)

        self.failUnlessRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        self.failUnless(abs(e-te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
                "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        self.failIf(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
示例#20
0
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium_train']
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium_train']
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferError(clf=l_clf)

        self.failUnlessRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        self.failUnless(
            abs(e - te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        self.failIf(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
示例#21
0
    def _call(self, dataset, testdataset):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        dataset : Dataset
          used to select features and train classifiers to determine the
          transfer error.
        testdataset : Dataset
          used to test the trained classifer on a certain feature set
          to determine the transfer error.

        Returns
        -------
        A tuple with the dataset containing the feature subset of
        `dataset` that had the lowest transfer error of all tested sets until
        the stopping criterion was reached. The tuple also contains a dataset
        with the corrsponding features from the `testdataset`.
        """
        errors = []
        """Computed error for each tested features set."""

        # feature candidate are all features in the pattern object
        candidates = range(dataset.nfeatures)

        # initially empty list of selected features
        selected = []

        # results in here please
        results = None

        # as long as there are candidates left
        # the loop will most likely get broken earlier if the stopping
        # criterion is reached
        while len(candidates):
            # measures for all candidates
            measures = []

            # for all possible candidates
            for i, candidate in enumerate(candidates):
                if __debug__:
                    debug('IFSC', "Tested %i" % i, cr=True)

                # take the new candidate and all already selected features
                # select a new temporay feature subset from the dataset
                # XXX assume MappedDataset and issue plain=True ??
                tmp_dataset = \
                        dataset[:, selected + [candidate]]

                # compute data measure on this feature set
                measures.append(self.__data_measure(tmp_dataset))

            measures = [np.asscalar(m) for m in measures]
            # Select promissing feature candidates (staging)
            # IDs are only applicable to the current set of feature candidates
            tmp_staging_ids = self.__feature_selector(measures)

            # translate into real candidate ids
            staging_ids = [candidates[i] for i in tmp_staging_ids]

            # mark them as selected and remove from candidates
            selected += staging_ids
            for i in staging_ids:
                candidates.remove(i)

            # compute transfer error for the new set
            # XXX assume MappedDataset and issue plain=True ??
            error = self.__transfer_error(testdataset[:, selected],
                                          dataset[:, selected])
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            if __debug__:
                debug('IFSC',
                      "nselected %i; error: %.4f " \
                      "best/stop=%d/%d\n" \
                      % (len(selected), errors[-1], isthebest, stop),
                      cr=True, lf=True)

            if isthebest:
                # do copy to survive later selections
                results = copy(selected)

            # leave the loop when the criterion is reached
            if stop:
                break

        # charge state
        self.ca.errors = errors

        # best dataset ever is returned
        return dataset[:, results], testdataset[:, results]
示例#22
0
文件: rfe.py 项目: B-Rich/PyMVPA
    def _train(self, ds):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        dataset : Dataset
          used to compute sensitivity maps and train a classifier
          to determine the transfer error
        testdataset : Dataset
          used to test the trained classifer to determine the
          transfer error

        Returns a tuple of two new datasets with the feature subset of
        `dataset` that had the lowest transfer error of all tested
        sets until the stopping criterion was reached. The first
        dataset is the feature subset of the training data and the
        second the selection of the test dataset.
        """
        # get the initial split into train and test
        dataset, testdataset = self._get_traintest_ds(ds)

        errors = []
        """Computed error for each tested features set."""

        ca = self.ca
        ca.nfeatures = []
        """Number of features at each step. Since it is not used by the
        algorithm it is stored directly in the conditional attribute"""

        ca.history = np.arange(dataset.nfeatures)
        """Store the last step # when the feature was still present
        """

        ca.sensitivities = []

        stop = False
        """Flag when RFE should be stopped."""

        results = None
        """Will hold the best feature set ever."""

        wdataset = dataset
        """Operate on working dataset initially identical."""

        wtestdataset = testdataset
        """Same feature selection has to be performs on test dataset as well.
        This will hold the current testdataset."""

        step = 0
        """Counter how many selection step where done."""

        orig_feature_ids = np.arange(dataset.nfeatures)
        """List of feature Ids as per original dataset remaining at any given
        step"""

        sensitivity = None
        """Contains the latest sensitivity map."""

        result_selected_ids = orig_feature_ids
        """Resultant ids of selected features. Since the best is not
        necessarily is the last - we better keep this one around. By
        default -- all features are there"""
        selected_ids = result_selected_ids

        while wdataset.nfeatures > 0:

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))

            # mark the features which are present at this step
            # if it brings anyb mentionable computational burden in the future,
            # only mark on removed features at each step
            ca.history[orig_feature_ids] = step

            # Compute sensitivity map
            if self.__update_sensitivity or sensitivity == None:
                sensitivity = self._fmeasure(wdataset)
                if len(sensitivity) > 1:
                    raise ValueError(
                            "RFE cannot handle multiple sensitivities at once. "
                            "'%s' returned %i sensitivities."
                            % (self._fmeasure.__class__.__name__,
                               len(sensitivity)))

            if ca.is_enabled("sensitivities"):
                ca.sensitivities.append(sensitivity)

            # get error for current feature set (handles optional retraining)
            error = self._evaluate_pmeasure(wdataset, wtestdataset)
            # Record the error
            errors.append(np.asscalar(error))

            # Check if it is time to stop and if we got
            # the best result
            stop = self._stopping_criterion(errors)
            isthebest = self._bestdetector(errors)

            nfeatures = wdataset.nfeatures

            if ca.is_enabled("nfeatures"):
                ca.nfeatures.append(wdataset.nfeatures)

            # store result
            if isthebest:
                result_selected_ids = orig_feature_ids

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d " %
                      (step, nfeatures, np.asscalar(error), isthebest, stop))

            # stop if it is time to finish
            if nfeatures == 1 or stop:
                break

            # Select features to preserve
            selected_ids = self._fselector(sensitivity)

            if __debug__:
                debug('RFEC_',
                      "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s" %
                      (sensitivity, len(selected_ids), selected_ids))


            # Create a dataset only with selected features
            wdataset = wdataset[:, selected_ids]

            # select corresponding sensitivity values if they are not
            # recomputed
            if not self.__update_sensitivity:
                sensitivity = sensitivity[selected_ids]

            # need to update the test dataset as well
            # XXX why should it ever become None?
            # yoh: because we can have __transfer_error computed
            #      using wdataset. See xia-generalization estimate
            #      in lightsvm. Or for god's sake leave-one-out
            #      on a wdataset
            # TODO: document these cases in this class
            if not testdataset is None:
                wtestdataset = wtestdataset[:, selected_ids]

            step += 1

            # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
            selected_ids.sort()
            if self.ca.is_enabled("history") \
                   or self.ca.is_enabled('selected_ids'):
                orig_feature_ids = orig_feature_ids[selected_ids]

            # we already have the initial sensitivities, so even for a shared
            # classifier we can cleanup here
            self._pmeasure.untrain()
        # charge conditional attributes
        self.ca.errors = errors
        self.ca.selected_ids = result_selected_ids
        # announce desired features to the underlying slice mapper
        # do copy to survive later selections
        self._safe_assign_slicearg(copy(result_selected_ids))
示例#23
0
文件: ifs.py 项目: B-Rich/PyMVPA
    def _train(self, ds):
        # local binding
        fmeasure = self._fmeasure
        fselector = self._fselector
        scriterion = self._stopping_criterion
        bestdetector = self._bestdetector

        # init
        # Computed error for each tested features set.
        errors = []
        # feature candidate are all features in the pattern object
        candidates = range(ds.nfeatures)
        # initially empty list of selected features
        selected = []
        # results in here please
        results = None

        # as long as there are candidates left
        # the loop will most likely get broken earlier if the stopping
        # criterion is reached
        while len(candidates):
            # measures for all candidates
            measures = []

            # for all possible candidates
            for i, candidate in enumerate(candidates):
                if __debug__:
                    debug('IFSC', "Tested %i" % i, cr=True)

                # take the new candidate and all already selected features
                # select a new temporay feature subset from the dataset
                # slice the full dataset, because for the initial iteration
                # steps this will be much mure effecient than splitting the
                # full ds into train and test at first
                fslm = StaticFeatureSelection(selected + [candidate])
                fslm.train(ds)
                candidate_ds = fslm(ds)
                # activate the dataset splitter
                dsgen = self._splitter.generate(candidate_ds)
                # and derived the dataset part that is used for computing the selection
                # criterion
                trainds = dsgen.next()
                # compute data measure on the training part of this feature set
                measures.append(fmeasure(trainds))

            # relies on ds.item() to work properly
            measures = [np.asscalar(m) for m in measures]

            # Select promissing feature candidates (staging)
            # IDs are only applicable to the current set of feature candidates
            tmp_staging_ids = fselector(measures)

            # translate into real candidate ids
            staging_ids = [candidates[i] for i in tmp_staging_ids]

            # mark them as selected and remove from candidates
            selected += staging_ids
            for i in staging_ids:
                candidates.remove(i)

            # actually run the performance measure to estimate "quality" of
            # selection
            fslm = StaticFeatureSelection(selected)
            fslm.train(ds)
            selectedds = fslm(ds)
            # split into train and test part
            trainds, testds = self._get_traintest_ds(selectedds)
            # evaluate and store
            error = self._evaluate_pmeasure(trainds, testds)
            errors.append(np.asscalar(error))
            # intermediate cleanup, so the datasets do not hand around while
            # the next candidate evaluation is computed
            del trainds
            del testds

            # Check if it is time to stop and if we got
            # the best result
            stop = scriterion(errors)
            isthebest = bestdetector(errors)

            if __debug__:
                debug('IFSC',
                      "nselected %i; error: %.4f " \
                      "best/stop=%d/%d\n" \
                      % (len(selected), errors[-1], isthebest, stop),
                      cr=True, lf=True)

            if isthebest:
                # announce desired features to the underlying slice mapper
                # do copy to survive later selections
                self._safe_assign_slicearg(copy(selected))

            # leave the loop when the criterion is reached
            if stop:
                break

        # charge state
        self.ca.errors = errors
示例#24
0
文件: node.py 项目: esc/PyMVPA
 def __copy__(self):
     # XXX how do we safely and exhaustively copy a node?
     return self.__class__([copy.copy(n) for n in self])
示例#25
0
 def __add__(self, other):
     """Add two `SummaryStatistics`s
     """
     result = copy.copy(self)
     result += other
     return result
示例#26
0
文件: ifs.py 项目: arokem/PyMVPA
    def _call(self, dataset, testdataset):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        dataset : Dataset
          used to select features and train classifiers to determine the
          transfer error.
        testdataset : Dataset
          used to test the trained classifer on a certain feature set
          to determine the transfer error.

        Returns
        -------
        A tuple with the dataset containing the feature subset of
        `dataset` that had the lowest transfer error of all tested sets until
        the stopping criterion was reached. The tuple also contains a dataset
        with the corrsponding features from the `testdataset`.
        """
        errors = []
        """Computed error for each tested features set."""

        # feature candidate are all features in the pattern object
        candidates = range( dataset.nfeatures )

        # initially empty list of selected features
        selected = []

        # results in here please
        results = None

        # as long as there are candidates left
        # the loop will most likely get broken earlier if the stopping
        # criterion is reached
        while len( candidates ):
            # measures for all candidates
            measures = []

            # for all possible candidates
            for i, candidate in enumerate(candidates):
                if __debug__:
                    debug('IFSC', "Tested %i" % i, cr=True)

                # take the new candidate and all already selected features
                # select a new temporay feature subset from the dataset
                # XXX assume MappedDataset and issue plain=True ??
                tmp_dataset = \
                        dataset[:, selected + [candidate]]

                # compute data measure on this feature set
                measures.append(self.__data_measure(tmp_dataset))

            measures = [np.asscalar(m) for m in measures]
            # Select promissing feature candidates (staging)
            # IDs are only applicable to the current set of feature candidates
            tmp_staging_ids = self.__feature_selector(measures)

            # translate into real candidate ids
            staging_ids = [ candidates[i] for i in tmp_staging_ids ]

            # mark them as selected and remove from candidates
            selected += staging_ids
            for i in staging_ids:
                candidates.remove(i)

            # compute transfer error for the new set
            # XXX assume MappedDataset and issue plain=True ??
            error = self.__transfer_error(testdataset[:, selected],
                                          dataset[:, selected])
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            if __debug__:
                debug('IFSC',
                      "nselected %i; error: %.4f " \
                      "best/stop=%d/%d\n" \
                      % (len(selected), errors[-1], isthebest, stop),
                      cr=True, lf=True)

            if isthebest:
                # do copy to survive later selections
                results = copy(selected)

            # leave the loop when the criterion is reached
            if stop:
                break

        # charge state
        self.ca.errors = errors

        # best dataset ever is returned
        return dataset[:, results], testdataset[:, results]
示例#27
0
def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 10)),     targets=range(5), chunks=1)
    for i in xrange(1, 5):
        ds.append(Dataset.from_wizard(np.ones((5, 10)) + i,
                                      targets=range(5), chunks=i+1))
    # assign some feature attributes
    ds.fa['roi'] = np.repeat(np.arange(5), 2)
    ds.fa['lucky'] = np.arange(10)%2
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0:5, 1:5, 2:5, 3:5, 4:5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [ 2, 2, 2, 2, 2 ])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())

    # keep the orig labels
    orig_labels = ds.targets.copy()

    # also keep the orig dataset, but SHALLOW copy and leave everything
    # else as a view!
    ods = copy.copy(ds)

    ds.permute_attr()
    # by default, some permutation of targets should have happened
    assert_false((ds.targets == orig_labels).all())

    # but the original dataset should be unaffected
    assert_array_equal(ods.targets, orig_labels)
    # array subclass survives
    ok_(isinstance(ods.samples, myarray))

    # samples are really shared
    ds.samples[0, 0] = 123456
    assert_array_equal(ds.samples, ods.samples)

    # and other samples attributes too
    ds.chunks[0] = 9876
    assert_array_equal(ds.chunks, ods.chunks)

    # try to permute on custom target
    ds = ods.copy()
    otargets = ods.sa.targets.copy()
    ds.sa['custom'] = ods.sa.targets.copy()
    assert_array_equal(ds.sa.custom, otargets)
    assert_array_equal(ds.sa.targets, otargets)

    ds.permute_attr(attr='custom')
    # original targets should still match
    assert_array_equal(ds.sa.targets, otargets)
    # but custom should get permuted
    assert_false((ds.sa.custom == otargets).all())

    #
    # Test permutation among features
    #
    assert_raises(KeyError, ds.permute_attr,
                  attr='roi') # wrong collection
    ds = ods.copy()
    ds.permute_attr(attr='lucky', chunks_attr='roi', col='fa')
    # we should have not touched samples attributes
    for sa in ds.sa.keys():
        assert_array_equal(ds.sa[sa].value, ods.sa[sa].value)
    # but we should have changed the roi
    assert_false((ds.fa['lucky'].value == ods.fa['lucky'].value).all())
    assert_array_equal(ds.fa['roi'].value, ods.fa['roi'].value)

    # permute ROI as well without chunking (??? should we make
    # chunks_attr=None by default?)
    ds.permute_attr(attr='roi', chunks_attr=None, col='fa')
    assert_false((ds.fa['roi'].value == ods.fa['roi'].value).all())
示例#28
0
    def _call(self, dataset):
        """Perform the ROI search.
        """
        # local binding
        nproc = self.__nproc

        if nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1" % externals.versions['pprocess'])
                nproc = 1
        # train the queryengine
        self.__qe.train(dataset)

        # decide whether to run on all possible center coords or just a provided
        # subset
        if self.__center_ids is not None:
            roi_ids = self.__center_ids
            # safeguard against stupidity
            if __debug__:
                if max(roi_ids) >= dataset.nfeatures:
                    raise IndexError, \
                          "Maximal center_id found is %s whenever given " \
                          "dataset has only %d features" \
                          % (max(roi_ids), dataset.nfeatures)
        else:
            roi_ids = np.arange(dataset.nfeatures)

        # compute
        if nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            roi_blocks = np.array_split(roi_ids, nproc)

            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc)
            compute = p_results.manage(
                        pprocess.MakeParallel(self._proc_block))
            for block in roi_blocks:
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                compute(block, dataset, copy.copy(self.__datameasure))

            # collect results
            results = []
            if self.ca.is_enabled('roisizes'):
                roisizes = []
            else:
                roisizes = None

            for r, rsizes in p_results:
                results += r
                if not roisizes is None:
                    roisizes += rsizes
        else:
            # otherwise collect the results in a list
            results, roisizes = \
                    self._proc_block(roi_ids, dataset, self.__datameasure)

        if not roisizes is None:
            self.ca.roisizes = roisizes

        if __debug__:
            debug('SLC', '')

        # but be careful: this call also serves as conversion from parallel maps
        # to regular lists!
        # this uses the Dataset-hstack
        results = hstack(results)

        if 'mapper' in dataset.a:
            # since we know the space we can stick the original mapper into the
            # results as well
            if self.__center_ids is None:
                results.a['mapper'] = copy.copy(dataset.a.mapper)
            else:
                # there is an additional selection step that needs to be
                # expressed by another mapper
                mapper = copy.copy(dataset.a.mapper)
                mapper.append(FeatureSliceMapper(self.__center_ids,
                                                 dshape=dataset.shape[1:]))
                results.a['mapper'] = mapper

        # charge state
        self.ca.raw_results = results

        # return raw results, base-class will take care of transformations
        return results
示例#29
0
    def _postcall(self, dataset, result):
        """Some postprocessing on the result
        """
        self.ca.raw_results = result

        # post-processing
        if not self.__postproc is None:
            if __debug__:
                debug("SA_", "Applying mapper %s" % self.__postproc)
            result = self.__postproc.forward(result)

        # estimate the NULL distribution when functor is given
        if not self.__null_dist is None:
            if __debug__:
                debug("SA_", "Estimating NULL distribution using %s"
                      % self.__null_dist)

            # we need a matching datameasure instance, but we have to disable
            # the estimation of the null distribution in that child to prevent
            # infinite looping.
            measure = copy.copy(self)
            measure.__null_dist = None
            self.__null_dist.fit(measure, dataset)

            if self.ca.is_enabled('null_t'):
                # get probability under NULL hyp, but also request
                # either it belong to the right tail
                null_prob, null_right_tail = \
                           self.__null_dist.p(result, return_tails=True)
                self.ca.null_prob = null_prob

                externals.exists('scipy', raise_=True)
                from scipy.stats import norm

                # TODO: following logic should appear in NullDist,
                #       not here
                tail = self.null_dist.tail
                if tail == 'left':
                    acdf = np.abs(null_prob)
                elif tail == 'right':
                    acdf = 1.0 - np.abs(null_prob)
                elif tail in ['any', 'both']:
                    acdf = 1.0 - np.clip(np.abs(null_prob), 0, 0.5)
                else:
                    raise RuntimeError, 'Unhandled tail %s' % tail
                # We need to clip to avoid non-informative inf's ;-)
                # that happens due to lack of precision in mantissa
                # which is 11 bits in double. We could clip values
                # around 0 at as low as 1e-100 (correspond to z~=21),
                # but for consistency lets clip at 1e-16 which leads
                # to distinguishable value around p=1 and max z=8.2.
                # Should be sufficient range of z-values ;-)
                clip = 1e-16
                null_t = norm.ppf(np.clip(acdf, clip, 1.0 - clip))
                # assure that we deal with arrays:
                null_t = np.array(null_t, ndmin=1, copy=False)
                null_t[~null_right_tail] *= -1.0 # revert sign for negatives
                self.ca.null_t = null_t          # store
            else:
                # get probability of result under NULL hypothesis if available
                # and don't request tail information
                self.ca.null_prob = self.__null_dist.p(result)

        return result