示例#1
0
文件: base.py 项目: gorlins/PyMVPA
    def clone(self):
        """Create full copy of the classifier.

        It might require classifier to be untrained first due to
        present SWIG bindings.

        TODO: think about proper re-implementation, without enrollment of deepcopy
        """
        try:
            return deepcopy(self)
        except:
            self.untrain()
            return deepcopy(self)
示例#2
0
    def clone(self):
        """Create full copy of the classifier.

        It might require classifier to be untrained first due to
        present SWIG bindings.

        TODO: think about proper re-implementation, without enrollment of deepcopy
        """
        if __debug__:
            debug("CLF", "Cloning %s#%s" % (self, id(self)))
        try:
            return deepcopy(self)
        except:
            self.untrain()
            return deepcopy(self)
示例#3
0
    def testFeatureSelection(self):
        """Testing feature selection: sorted/not sorted, feature groups
        """
        origdata = datasets["uni2large"].samples[:10, :20]
        data = Dataset(samples=origdata, labels=2, chunks=2)

        # define some feature groups
        data.defineFeatureGroups(N.repeat(range(4), 5))

        unmasked = data.samples.copy()

        # default must be no mask
        self.failUnless(data.nfeatures == 20)

        features_to_select = [3, 0, 17]
        features_to_select_copy = copy.deepcopy(features_to_select)
        features_to_select_sorted = copy.deepcopy(features_to_select)
        features_to_select_sorted.sort()

        bsel = N.array([False] * 20)
        bsel[features_to_select] = True
        # check selection with feature list
        for sel, issorted in [
            (data.selectFeatures(features_to_select, sort=False), False),
            (data.selectFeatures(features_to_select, sort=True), True),
            (data.select(slice(None), features_to_select), True),
            (data.select(slice(None), N.array(features_to_select)), True),
            (data.select(slice(None), bsel), True),
        ]:
            self.failUnless(sel.nfeatures == 3)

            # check size of the masked patterns
            self.failUnless(sel.samples.shape == (10, 3))

            # check that the right features are selected
            fts = (features_to_select, features_to_select_sorted)[int(issorted)]
            self.failUnless((unmasked[:, fts] == sel.samples).all())

            # check grouping information
            self.failUnless((sel._dsattr["featuregroups"] == [0, 0, 3]).all())

            # check side effect on features_to_select parameter:
            self.failUnless(features_to_select == features_to_select_copy)

        # check selection by feature group id
        gsel = data.selectFeatures(groups=[2, 3])
        self.failUnless(gsel.nfeatures == 10)
        self.failUnless(set(gsel._dsattr["featuregroups"]) == set([2, 3]))
示例#4
0
文件: zscore.py 项目: esc/PyMVPA
    def _forward_data(self, data):
        if self.__chunks_attr is not None:
            raise RuntimeError(
                "%s cannot do chunk-wise Z-scoring of plain data "
                "since it has to be parameterized with chunks_attr." % self
            )
        if self.__param_est is not None:
            raise RuntimeError(
                "%s cannot do Z-scoring with estimating " "parameters on some attributes of plain" "data." % self
            )

        params = self.__params_dict
        if params is None:
            raise RuntimeError, "ZScoreMapper needs to be trained before call to forward"

        # mappers should not modify the input data
        # cast the data to float, since in-place operations below to not upcast!
        if np.issubdtype(data.dtype, np.integer):
            if self._secret_inplace_zscore:
                raise TypeError(
                    "Cannot perform inplace z-scoring since data is of integer "
                    "type. Please convert to float before calling zscore"
                )
            mdata = data.astype(self.__dtype)
        elif self._secret_inplace_zscore:
            mdata = data
        else:
            # do not call .copy() directly, since it might not be an array
            mdata = copy.deepcopy(data)

        self._zscore(mdata, *params["__all__"])
        return mdata
示例#5
0
文件: mapped.py 项目: gorlins/PyMVPA
    def selectFeatures(self, ids, plain=False, sort=False):
        """Select features given their ids.

        The methods behaves similar to Dataset.selectFeatures(), but
        additionally takes care of adjusting the embedded mapper
        appropriately.

        :Parameters:
          ids: sequence
            Iterable container to select ids
          plain: boolean
            Flag whether to return MappedDataset (or just Dataset)
          sort: boolean
            Flag whether to sort Ids. Order matters and selectFeatures
            assumes incremental order. If not such, in non-optimized
            code selectFeatures would verify the order and sort
        """

        # call base method to get selected feature subset
        if plain:
            sdata = Dataset(self._data, self._dsattr, check_data=False,
                            copy_samples=False, copy_data=False,
                            copy_dsattr=False)
            return sdata.selectFeatures(ids=ids, sort=sort)
        else:
            sdata = Dataset.selectFeatures(self, ids=ids, sort=sort)
            # since we have new DataSet we better have a new mapper
            sdata._dsattr['mapper'] = copy.deepcopy(sdata._dsattr['mapper'])
            if sort:
                sdata._dsattr['mapper'].selectOut(sorted(ids))
            else:
                sdata._dsattr['mapper'].selectOut(ids)
            return sdata
示例#6
0
 def test_deep_copying_state_variable(self):
     for v in (True, False):
         sv = ConditionalAttribute(enabled=v, doc="Testing")
         sv.enabled = not v
         sv_dc = copy.deepcopy(sv)
         self.failUnlessEqual(sv.enabled, sv_dc.enabled)
         self.failUnlessEqual(sv.name, sv_dc.name)
         self.failUnlessEqual(sv._instance_index, sv_dc._instance_index)
示例#7
0
 def setUp(self):
     self.backup = []
     # paranoid check
     self.cfgstr = str(cfg)
     # clean up externals cfg for proper testing
     if cfg.has_section('externals'):
         self.backup = copy.deepcopy(cfg.items('externals'))
     cfg.remove_section('externals')
示例#8
0
 def setUp(self):
     self.backup = []
     # paranoid check
     self.cfgstr = str(cfg)
     # clean up externals cfg for proper testing
     if cfg.has_section("externals"):
         self.backup = copy.deepcopy(cfg.items("externals"))
     cfg.remove_section("externals")
示例#9
0
文件: base.py 项目: arokem/PyMVPA
    def select_samples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data
示例#10
0
    def select_samples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data
示例#11
0
    def testMoreSVD(self):
        pm = SVDMapper()
        # train SVD
        pm.train(self.largefeat)

        # mixing matrix cannot be square
        self.failUnlessEqual(pm.proj.shape, (40, 10))

        # only first singular value significant
        self.failUnless(pm.sv[:1] > 10)
        self.failUnless((pm.sv[1:] < 10).all())

        # now project data into SVD space
        p = pm.forward(self.largefeat.samples)

        # only variance of first component significant
        var = p.var(axis=0)

        # test that only one component has variance
        self.failUnless(var[:1] > 1.0)
        self.failUnless((var[1:] < 0.0001).all())

        # check that the mapped data can be fully recovered by 'reverse()'
        rp = pm.reverse(p)
        self.failUnlessEqual(rp.shape, self.largefeat.samples.shape)
        self.failUnless((N.round(rp) == self.largefeat.samples).all())

        self.failUnlessEqual(pm.getInSize(), 40)
        self.failUnlessEqual(pm.getOutSize(), 10)

        # copy mapper
        pm2 = deepcopy(pm)

        # now remove all but the first 2 components from the mapper
        pm2.selectOut([0, 1])

        # sanity check
        self.failUnlessEqual(pm2.getInSize(), 40)
        self.failUnlessEqual(pm2.getOutSize(), 2)

        # but orginal mapper must be left intact
        self.failUnlessEqual(pm.getInSize(), 40)
        self.failUnlessEqual(pm.getOutSize(), 10)

        # data should still be fully recoverable by 'reverse()'
        rp2 = pm2.reverse(p[:, [0, 1]])
        self.failUnlessEqual(rp2.shape, self.largefeat.samples.shape)
        self.failUnless(N.abs(rp2 - self.largefeat.samples).sum() < 0.0001)

        # now make new random data and do forward->reverse check
        data = N.random.normal(size=(98, 40))
        data_f = pm.forward(data)

        self.failUnlessEqual(data_f.shape, (98, 10))

        data_r = pm.reverse(data_f)
        self.failUnlessEqual(data_r.shape, (98, 40))
示例#12
0
 def test_deep_copying_state_variable(self):
     for v in (True, False):
         sv = ConditionalAttribute(enabled=v,
                            doc="Testing")
         sv.enabled = not v
         sv_dc = copy.deepcopy(sv)
         self.failUnlessEqual(sv.enabled, sv_dc.enabled)
         self.failUnlessEqual(sv.name, sv_dc.name)
         self.failUnlessEqual(sv._instance_index, sv_dc._instance_index)
示例#13
0
    def testCompareToZscore(self):
        """Test by comparing to results of elderly z-score function
        """
        for ds in self.dss:
            ds1 = deepcopy(ds)
            ds2 = deepcopy(ds)

            zsm = ZScoreMapper()
            zsm.train(ds1)
            ds1z = zsm.forward(ds1.samples)

            zscore(ds2, perchunk=False)
            self.failUnless(N.linalg.norm(ds1z - ds2.samples) < 1e-12)
            self.failUnless((ds1.samples == ds.samples).all(),
                            msg="It seems we modified original dataset!")

            ds0 = zsm.reverse(ds1z)
            self.failUnless(N.linalg.norm(ds0 - ds.samples) < 1e-12,
                            msg="Can't reconstruct from z-scores")
示例#14
0
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20,-1).T,
                targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        zsm.train(ds1)
        ds1z = zsm.forward(ds1.samples)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
示例#15
0
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20,-1).T,
                targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        zsm.train(ds1)
        ds1z = zsm.forward(ds1.samples)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
示例#16
0
文件: support.py 项目: gorlins/PyMVPA
def isSorted(items):
    """Check if listed items are in sorted order.

    :Parameters:
        `items`: iterable container

    :return: `True` if were sorted. Otherwise `False` + Warning
    """
    items_sorted = deepcopy(items)
    items_sorted.sort()
    equality = items_sorted == items
    # XXX yarik forgotten analog to isiterable
    if hasattr(equality, '__iter__'):
        equality = N.all(equality)
    return equality
示例#17
0
    def testAutoOptimizePCA(self):
        # train PCA
        self.pm.train(self.largefeat)

        # mixing matrix cannot be square
#        self.failUnlessEqual(self.pm.mix.shape, (10, 40))

        # only first eigenvalue significant
        self.failUnless(self.pm.sv[:1] > 10)
        self.failUnless((self.pm.sv[1:] < 10).all())

        # now project data into PCA space
        p = self.pm.forward(self.largefeat.samples)

        # only variance of first component significant
        var = p.var(axis=0)
        # test that only one component has variance
        self.failUnless(var[:1] > 1.0)
        self.failUnless((var[1:] < 0.0001).all())

        # check that the mapped data can be fully recovered by 'reverse()'
        rp = self.pm.reverse(p)
        self.failUnlessEqual(rp.shape, self.largefeat.samples.shape)
        self.failUnless((N.round(rp) == self.largefeat.samples).all())

        self.failUnlessEqual(self.pm.getInSize(), 40)
#        self.failUnlessEqual(self.pm.getOutSize(), 10)
        self.failUnlessEqual(self.pm.getOutSize(), 40)

        # copy mapper
        pm2 = deepcopy(self.pm)

        # now remove all but the first 2 components from the mapper
        pm2.selectOut([0,1])

        # sanity check
        self.failUnlessEqual(pm2.getInSize(), 40)
        self.failUnlessEqual(pm2.getOutSize(), 2)

        # but orginal mapper must be left intact
        self.failUnlessEqual(self.pm.getInSize(), 40)
#        self.failUnlessEqual(self.pm.getOutSize(), 10)
        self.failUnlessEqual(self.pm.getOutSize(), 40)

        # data should still be fully recoverable by 'reverse()'
        rp2 = pm2.reverse(p[:,[0,1]])
        self.failUnlessEqual(rp2.shape, self.largefeat.samples.shape)
        self.failUnless((N.round(rp2) == self.largefeat.samples).all())
示例#18
0
 def test_id_hash(self, pair):
     a, b = pair
     a1 = deepcopy(a)
     a_1 = idhash(a)
     self.failUnless(a_1 == idhash(a),  msg="Must be of the same idhash")
     self.failUnless(a_1 != idhash(b), msg="Must be of different idhash")
     if isinstance(a, np.ndarray):
         self.failUnless(a_1 != idhash(a.T), msg=".T must be of different idhash")
     if not isinstance(a, tuple):
         self.failUnless(a_1 != idhash(a1), msg="Must be of different idhash")
         a[2] += 1; a_2 = idhash(a)
         self.failUnless(a_1 != a_2, msg="Idhash must change")
     else:
         a_2 = a_1
     a = a[2:]; a_3 = idhash(a)
     self.failUnless(a_2 != a_3, msg="Idhash must change after slicing")
示例#19
0
def is_sorted(items):
    """Check if listed items are in sorted order.

    Parameters
    ----------
      `items`: iterable container

    :return: `True` if were sorted. Otherwise `False` + Warning
    """
    items_sorted = deepcopy(items)
    items_sorted.sort()
    equality = items_sorted == items
    # XXX yarik forgotten analog to isiterable
    if hasattr(equality, '__iter__'):
        equality = np.all(equality)
    return equality
示例#20
0
文件: state.py 项目: geeragh/PyMVPA
    def __new__(cls, *args, **kwargs):
        """Instantiate ClassWithCollections object
        """
        self = super(ClassWithCollections, cls).__new__(cls)

        s__dict__ = self.__dict__

        # init variable
        # XXX: Added as pylint complained (rightfully) -- not sure if false
        # is the proper default
        self.__params_set = False

        # need to check to avoid override of enabled ca in the case
        # of multiple inheritance, like both ClassWithCollectionsl and
        # Harvestable
        if not s__dict__.has_key('_collections'):
            s__class__ = self.__class__

            collections = copy.deepcopy(s__class__._collections_template)
            s__dict__['_collections'] = collections
            s__dict__['_known_attribs'] = {}
            """Dictionary to contain 'links' to the collections from each
            known attribute. Is used to gain some speed up in lookup within
            __getattribute__ and __setattr__
            """

            # Assign owner to all collections
            for col, collection in collections.iteritems():
                if col in s__dict__:
                    raise ValueError, \
                          "Object %s has already attribute %s" % \
                          (self, col)
                s__dict__[col] = collection
                collection.name = col
                collection.owner = self

            self.__params_set = False

        if __debug__:
            descr = kwargs.get('descr', None)
            debug("COL", "ClassWithCollections.__new__ was done "
                  "for %s#%s with descr=%s" \
                  % (s__class__.__name__, id(self), descr))

        return self
示例#21
0
文件: support.py 项目: gorlins/PyMVPA
    def asDescreteTime(self, dt, storeoffset=False):
        """Convert `onset` and `duration` information into descrete timepoints.

        :Parameters:
          dt: float
            Temporal distance between two timepoints in the same unit as `onset`
            and `duration`.
          storeoffset: bool
            If True, the temporal offset between original `onset` and
            descretized `onset` is stored as an additional item in `features`.

        :Return:
          A copy of the original `Event` with `onset` and optionally `duration`
          replaced by their corresponding descrete timepoint. The new onset will
          correspond to the timepoint just before or exactly at the original
          onset. The new duration will be the number of timepoints covering the
          event from the computed onset timepoint till the timepoint exactly at
          the end, or just after the event.

          Note again, that the new values are expressed as #timepoint and not
          in their original unit!
        """
        dt = float(dt)
        onset = self['onset']
        out = deepcopy(self)

        # get the timepoint just prior the onset
        out['onset'] = int(N.floor(onset / dt))

        if storeoffset:
            # compute offset
            offset = onset - (out['onset'] * dt)

            if out.has_key('features'):
                out['features'].append(offset)
            else:
                out['features'] = [offset]

        if out.has_key('duration'):
            # how many timepoint cover the event (from computed onset
            # to the one timepoint just after the end of the event
            out['duration'] = int(N.ceil((onset + out['duration']) / dt) \
                                  - out['onset'])

        return out
示例#22
0
    def __new__(cls, *args, **kwargs):
        """Instantiate ClassWithCollections object
        """
        self = super(ClassWithCollections, cls).__new__(cls)

        s__dict__ = self.__dict__

        # init variable
        # XXX: Added as pylint complained (rightfully) -- not sure if false
        # is the proper default
        self.__params_set = False

        # need to check to avoid override of enabled ca in the case
        # of multiple inheritance, like both ClassWithCollectionsl and
        # Harvestable
        if not s__dict__.has_key('_collections'):
            s__class__ = self.__class__

            collections = copy.deepcopy(s__class__._collections_template)
            s__dict__['_collections'] = collections
            s__dict__['_known_attribs'] = {}
            """Dictionary to contain 'links' to the collections from each
            known attribute. Is used to gain some speed up in lookup within
            __getattribute__ and __setattr__
            """

            # Assign owner to all collections
            for col, collection in collections.iteritems():
                if col in s__dict__:
                    raise ValueError, \
                          "Object %s has already attribute %s" % \
                          (self, col)
                s__dict__[col] = collection
                collection.name = col

            self.__params_set = False

        if __debug__:
            descr = kwargs.get('descr', None)
            debug("COL", "ClassWithCollections.__new__ was done "
                  "for %s#%s with descr=%s" \
                  % (s__class__.__name__, id(self), descr))

        return self
示例#23
0
    def _call(self, dataset):
        """Compute the sensitivity map.

        Returns a 1d array of sensitivities for all features in `dataset`.
        """
        # first cast to floating point dtype, because noise is most likely
        # floating point as well and '+=' on int would not do the right thing
        # XXX should we already deepcopy here to keep orig dtype?
        if not N.issubdtype(dataset.samples.dtype, N.float):
            dataset.setSamplesDType('float32')

        if __debug__:
            nfeatures = dataset.nfeatures

        sens_map = []

        # compute the datameasure on the original dataset
        # this is used as a baseline
        orig_measure = self.__datameasure(dataset)

        # do for every _single_ feature in the dataset
        for feature in xrange(dataset.nfeatures):
            if __debug__:
                debug('PSA', "Analyzing %i features: %i [%i%%]" \
                    % (nfeatures,
                       feature+1,
                       float(feature+1)/nfeatures*100,), cr=True)

            # make a copy of the dataset to preserve data integrity
            wdata = deepcopy(dataset)

            # add noise to current feature
            wdata.samples[:, feature] += self.__noise(size=wdata.nsamples)

            # compute the datameasure on the perturbed dataset
            perturbed_measure = self.__datameasure(wdata)

            # difference from original datameasure is sensitivity
            sens_map.append(perturbed_measure - orig_measure)

        if __debug__:
            debug('PSA', '')

        return N.array(sens_map)
示例#24
0
 def test_id_hash(self, pair):
     a, b = pair
     a1 = deepcopy(a)
     a_1 = idhash(a)
     self.failUnless(a_1 == idhash(a), msg="Must be of the same idhash")
     self.failUnless(a_1 != idhash(b), msg="Must be of different idhash")
     if isinstance(a, np.ndarray):
         self.failUnless(a_1 != idhash(a.T),
                         msg=".T must be of different idhash")
     if not isinstance(a, tuple):
         self.failUnless(a_1 != idhash(a1),
                         msg="Must be of different idhash")
         a[2] += 1
         a_2 = idhash(a)
         self.failUnless(a_1 != a_2, msg="Idhash must change")
     else:
         a_2 = a_1
     a = a[2:]
     a_3 = idhash(a)
     self.failUnless(a_2 != a_3, msg="Idhash must change after slicing")
示例#25
0
文件: test_clf.py 项目: esc/PyMVPA
    def test_generic_tests(self):
        """Test all classifiers for conformant behavior
        """
        for clf_, traindata in \
                [(clfswh['binary'], datasets['dumb2']),
                 (clfswh['multiclass'], datasets['dumb'])]:
            traindata_copy = deepcopy(traindata) # full copy of dataset
            for clf in clf_:
                clf.train(traindata)
                self.failUnless(
                   (traindata.samples == traindata_copy.samples).all(),
                   "Training of a classifier shouldn't change original dataset")

            # TODO: enforce uniform return from predict??
            #predicted = clf.predict(traindata.samples)
            #self.failUnless(isinstance(predicted, np.ndarray))

        # Just simple test that all of them are syntaxed correctly
        self.failUnless(str(clf) != "")
        self.failUnless(repr(clf) != "")
示例#26
0
    def add(self, targets, predictions, estimates=None):
        """Add new results to the set of known results"""
        if len(targets) != len(predictions):
            raise ValueError, \
                  "Targets[%d] and predictions[%d]" % (len(targets),
                                                       len(predictions)) + \
                  " have different number of samples"

        # extract value if necessary
        if isinstance(estimates, Collectable):
            estimates = estimates.value

        if estimates is not None and len(targets) != len(estimates):
            raise ValueError, \
                  "Targets[%d] and estimates[%d]" % (len(targets),
                                                  len(estimates)) + \
                  " have different number of samples"

        # enforce labels in predictions to be of the same datatype as in
        # targets, since otherwise we are getting doubles for unknown at a
        # given moment labels
        nonetype = type(None)
        for i in xrange(len(targets)):
            t1, t2 = type(targets[i]), type(predictions[i])
            # if there were no prediction made - leave None, otherwise
            # convert to appropriate type
            if t1 != t2 and t2 != nonetype:
                #warning("Obtained target %s and prediction %s are of " %
                #       (t1, t2) + "different datatypes.")
                if isinstance(predictions, tuple):
                    predictions = list(predictions)
                predictions[i] = t1(predictions[i])

        if estimates is not None:
            # assure that we have a copy, or otherwise further in-place
            # modifications might screw things up (some classifiers share
            # estimates and spit out results)
            estimates = copy.deepcopy(estimates)

        self.__sets.append( (targets, predictions, estimates) )
        self._computed = False
示例#27
0
    def test_generic_tests(self):
        """Test all classifiers for conformant behavior
        """
        for clf_, traindata in \
                [(clfswh['binary'], datasets['dumb2']),
                 (clfswh['multiclass'], datasets['dumb'])]:
            traindata_copy = deepcopy(traindata)  # full copy of dataset
            for clf in clf_:
                clf.train(traindata)
                self.failUnless(
                    (traindata.samples == traindata_copy.samples).all(),
                    "Training of a classifier shouldn't change original dataset"
                )

            # TODO: enforce uniform return from predict??
            #predicted = clf.predict(traindata.samples)
            #self.failUnless(isinstance(predicted, np.ndarray))

        # Just simple test that all of them are syntaxed correctly
        self.failUnless(str(clf) != "")
        self.failUnless(repr(clf) != "")
示例#28
0
    def test_more_svd(self):
        pm = SVDMapper()
        # train SVD
        pm.train(self.largefeat)

        # mixing matrix cannot be square
        self.failUnlessEqual(pm.proj.shape, (40, 10))

        # only first singular value significant
        self.failUnless(pm.sv[:1] > 10)
        self.failUnless((pm.sv[1:] < 10).all())

        # now project data into SVD space
        p = pm.forward(self.largefeat)

        # only variance of first component significant
        var = p.var(axis=0)

        # test that only one component has variance
        self.failUnless(var[:1] > 1.0)
        self.failUnless((var[1:] < 0.0001).all())

        # check that the mapped data can be fully recovered by 'reverse()'
        rp = pm.reverse(p)
        self.failUnlessEqual(rp.shape, self.largefeat.shape)
        self.failUnless((np.round(rp) == self.largefeat).all())

        # copy mapper
        pm2 = deepcopy(pm)

        # now make new random data and do forward->reverse check
        data = np.random.normal(size=(98, 40))
        data_f = pm.forward(data)

        self.failUnlessEqual(data_f.shape, (98, 10))

        data_r = pm.reverse(data_f)
        self.failUnlessEqual(data_r.shape, (98, 40))
示例#29
0
    def test_more_svd(self):
        pm = SVDMapper()
        # train SVD
        pm.train(self.largefeat)

        # mixing matrix cannot be square
        self.failUnlessEqual(pm.proj.shape, (40, 10))

        # only first singular value significant
        self.failUnless(pm.sv[:1] > 10)
        self.failUnless((pm.sv[1:] < 10).all())

        # now project data into SVD space
        p = pm.forward(self.largefeat)

        # only variance of first component significant
        var = p.var(axis=0)

        # test that only one component has variance
        self.failUnless(var[:1] > 1.0)
        self.failUnless((var[1:] < 0.0001).all())

        # check that the mapped data can be fully recovered by 'reverse()'
        rp = pm.reverse(p)
        self.failUnlessEqual(rp.shape, self.largefeat.shape)
        self.failUnless((np.round(rp) == self.largefeat).all())

        # copy mapper
        pm2 = deepcopy(pm)

        # now make new random data and do forward->reverse check
        data = np.random.normal(size=(98,40))
        data_f = pm.forward(data)

        self.failUnlessEqual(data_f.shape, (98,10))

        data_r = pm.reverse(data_f)
        self.failUnlessEqual(data_r.shape, (98,40))
示例#30
0
    def test_proper_state(self):
        proper   = TestClassProper()
        proper2  = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

        # disable_ca should override anything in enable_ca
        proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

        self.failUnlessEqual(len(proper3.ca.enabled), 0,
            msg="disable_ca should override anything in enable_ca")

        proper.ca.state2 = 1000
        value = proper.ca.state2
        self.failUnlessEqual(proper.ca.state2, 1000, msg="Simple assignment/retrieval")

        proper.ca.disable('state2')
        proper.ca.state2 = 10000
        self.failUnlessEqual(proper.ca.state2, 1000, msg="Simple assignment after being disabled")

        proper4 = copy.deepcopy(proper)

        proper.ca.reset('state2')
        self.failUnlessRaises(UnknownStateError, proper.ca.__getattribute__, 'state2')
        """Must be blank after being reset"""

        self.failUnlessEqual(proper4.ca.state2, 1000,
            msg="Simple assignment after being reset in original instance")


        proper.ca.enable(['state2'])
        self.failUnlessEqual(set(proper.ca.keys()), set(['state1', 'state2']))
        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
            # skip testing since all ca are on now
            return
        self.failUnless(proper.ca.enabled == ['state2'])

        self.failUnless(set(proper2.ca.enabled) == set(['state1']))

        self.failUnlessRaises(AttributeError, proper.__getattribute__, 'state12')

        # if documentary on the state is appropriate
        self.failUnlessEqual(proper2.ca.listing,
                             ['%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
                              '%sstate2%s: state2 doc' % (_def_sep, _def_sep)])

        # if __str__ lists correct number of ca
        str_ = str(proper2)
        self.failUnless(str_.find('2 ca:') != -1)

        # check if disable works
        self.failUnless(set(proper2.ca.enabled), set(['state1']))

        proper2.ca.disable("all")
        self.failUnlessEqual(set(proper2.ca.enabled), set())

        proper2.ca.enable("all")
        self.failUnlessEqual(len(proper2.ca.enabled), 2)

        proper2.ca.state1, proper2.ca.state2 = 1,2
        self.failUnlessEqual(proper2.ca.state1, 1)
        self.failUnlessEqual(proper2.ca.state2, 2)

        # now reset them
        proper2.ca.reset('all')
        self.failUnlessRaises(UnknownStateError, proper2.ca.__getattribute__, 'state1')
        self.failUnlessRaises(UnknownStateError, proper2.ca.__getattribute__, 'state2')
示例#31
0
    def testCombinedMapper(self):
        # simple case: two array of different shape combined
        m = CombinedMapper([DenseArrayMapper(mask=N.ones((2,3,4))),
                            MaskMapper(mask=N.array((1,1)))])

        self.failUnless(m.getInSize() == 26)
        self.failUnless(m.getOutSize() == 26)

        d1 = N.ones((5,2,3,4))
        d2_broken = N.ones((6,2)) + 1
        d2 = N.ones((5,2)) + 1

        # should not work for sample mismatch
        self.failUnlessRaises(ValueError, m.forward, (d1, d2_broken))

        # check forward mapping (size and identity)
        mf = m.forward((d1, d2))
        self.failUnless(mf.shape == (5, 26))
        self.failUnless((mf[:,:24] == 1).all())
        self.failUnless((mf[:,-2:] == 2).all())

        # check reverse mapping
        self.failUnlessRaises(ValueError, m.reverse, N.arange(12))
        mr = m.reverse(N.arange(26) + 1)
        self.failUnless(len(mr) == 2)
        self.failUnless((mr[0] == N.arange(24).reshape((2,3,4)) + 1).all())
        self.failUnless((mr[1] == N.array((25,26))).all())

        # check reverse mapping of multiple samples
        mr = m.reverse(N.array([N.arange(26) + 1 for i in range(4)]))
        self.failUnless(len(mr) == 2)
        self.failUnless(
            (mr[0] == N.array([N.arange(24).reshape((2,3,4)) + 1
                                    for i in range(4)])).all())
        self.failUnless(
            (mr[1] == N.array([N.array((25,26)) for i in range(4)])).all())


        # check dummy train
        m.train(Dataset(samples=N.random.rand(10,26), labels=range(10)))
        self.failUnlessRaises(ValueError, m.train,
            Dataset(samples=N.random.rand(10,25), labels=range(10)))

        # check neighbor information
        # fail if invalid id
        self.failUnlessRaises(ValueError, m.getNeighbor, 26)
        # neighbors for last feature of first mapper, ie.
        # close in out space but infinite/undefined distance in in-space
        self.failUnless([n for n in m.getNeighbor(23, radius=2)]
                        == [6, 7, 10, 11, 15, 18, 19, 21, 22, 23])

        # check feature selection
        m.selectOut((23,25))
        self.failUnless(m.getInSize() == 26)
        self.failUnless(m.getOutSize() == 2)

        # check reverse mapping of truncated mapper
        mr = m.reverse(N.array((99,88)))
        target1 = N.zeros((2,3,4))
        target1[1,2,3] = 99
        target2 = N.array((0, 88))
        self.failUnless(len(mr) == 2)
        self.failUnless((mr[0] == target1).all())
        self.failUnless((mr[1] == target2).all())

        # check forward mapping
        self.failUnless((m.forward((d1, d2))[0] == (1, 2)).all())

        # check copying
        mc = deepcopy(m)
        mc.selectOut([1])
        self.failUnless(m.getOutSize() == 2)
        self.failUnless(mc.getOutSize() == 1)
示例#32
0
    def test_proper_state(self):
        proper = TestClassProper()
        proper2 = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

        # disable_ca should override anything in enable_ca
        proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

        self.failUnlessEqual(
            len(proper3.ca.enabled),
            0,
            msg="disable_ca should override anything in enable_ca")

        proper.ca.state2 = 1000
        value = proper.ca.state2
        self.failUnlessEqual(proper.ca.state2,
                             1000,
                             msg="Simple assignment/retrieval")

        proper.ca.disable('state2')
        proper.ca.state2 = 10000
        self.failUnlessEqual(proper.ca.state2,
                             1000,
                             msg="Simple assignment after being disabled")

        proper4 = copy.deepcopy(proper)

        proper.ca.reset('state2')
        self.failUnlessRaises(UnknownStateError, proper.ca.__getattribute__,
                              'state2')
        """Must be blank after being reset"""

        self.failUnlessEqual(
            proper4.ca.state2,
            1000,
            msg="Simple assignment after being reset in original instance")

        proper.ca.enable(['state2'])
        self.failUnlessEqual(Set(proper.ca.keys()), Set(['state1', 'state2']))
        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
            # skip testing since all ca are on now
            return
        self.failUnless(proper.ca.enabled == ['state2'])

        self.failUnless(Set(proper2.ca.enabled) == Set(['state1']))

        self.failUnlessRaises(AttributeError, proper.__getattribute__,
                              'state12')

        # if documentary on the state is appropriate
        self.failUnlessEqual(proper2.ca.listing, [
            '%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
            '%sstate2%s: state2 doc' % (_def_sep, _def_sep)
        ])

        # if __str__ lists correct number of ca
        str_ = str(proper2)
        self.failUnless(str_.find('2 ca:') != -1)

        # check if disable works
        self.failUnless(Set(proper2.ca.enabled), Set(['state1']))

        proper2.ca.disable("all")
        self.failUnlessEqual(Set(proper2.ca.enabled), Set())

        proper2.ca.enable("all")
        self.failUnlessEqual(len(proper2.ca.enabled), 2)

        proper2.ca.state1, proper2.ca.state2 = 1, 2
        self.failUnlessEqual(proper2.ca.state1, 1)
        self.failUnlessEqual(proper2.ca.state2, 2)

        # now reset them
        proper2.ca.reset('all')
        self.failUnlessRaises(UnknownStateError, proper2.ca.__getattribute__,
                              'state1')
        self.failUnlessRaises(UnknownStateError, proper2.ca.__getattribute__,
                              'state2')
示例#33
0
文件: nifti.py 项目: gorlins/PyMVPA
    def __init__(self, samples=None, events=None, mask=None, evconv=False,
                 storeoffset=False, tr=None, enforce_dim=4, **kwargs):
        """
        :Paramaters:
          mask: str | NiftiImage | ndarray
            Filename of a NIfTI image or a `NiftiImage` instance or an ndarray
            of appropriate shape.
          evconv: bool
            Convert event definitions using `onset` and `duration` in some
            temporal unit into #sample notation.
          storeoffset: Bool
            Whether to store temproal offset information when converting
            Events into descrete time. Only considered when evconv == True.
          tr: float
            Temporal distance of two adjacent NIfTI volumes. This can be used
            to override the corresponding value in the NIfTI header.
          enforce_dim : int or None
            If not None, it is the dimensionality of the data to be enforced,
            commonly 4D for the data, and 3D for the mask in case of fMRI.
        """
        # check if we are in copy constructor mode
        if events is None:
            EventDataset.__init__(self, samples=samples, events=events,
                                  mask=mask, **kwargs)
            return

        nifti = getNiftiFromAnySource(samples, ensure=True,
                                      enforce_dim=enforce_dim)
        # no copying
        samples = nifti.data

        # do not put the whole NiftiImage in the dict as this will most
        # likely be deepcopy'ed at some point and ensuring data integrity
        # of the complex Python-C-Swig hybrid might be a tricky task.
        # Only storing the header dict should achieve the same and is more
        # memory efficient and even simpler
        dsattr = {'niftihdr': nifti.header}

        # determine TR, take from NIfTI header by default
        dt = nifti.rtime
        # override if necessary
        if not tr is None:
            dt = tr

        # NiftiDataset uses a DescreteMetric with cartesian
        # distance and element size from the NIfTI header
        # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
        elementsize = [dt] + [i for i in reversed(nifti.voxdim)]
        # XXX metric might be inappropriate if boxcar has length 1
        # might move metric setup after baseclass init and check what has
        # really happened
        metric = DescreteMetric(elementsize=elementsize,
                                distance_function=cartesianDistance)

        # convert EVs if necessary -- not altering original
        if evconv:
            if dt == 0:
                raise ValueError, "'dt' cannot be zero when converting Events"

            events = [ev.asDescreteTime(dt, storeoffset) for ev in events]
        else:
            # do not touch the original
            events = deepcopy(events)

            # forcefully convert onset and duration into integers, as expected
            # by the baseclass
            for ev in events:
                oldonset = ev['onset']
                oldduration = ev['duration']
                ev['onset'] = int(ev['onset'])
                ev['duration'] = int(ev['duration'])
                if not oldonset == ev['onset'] \
                   or not oldduration == ev['duration']:
                    warning("Loosing information during automatic integer "
                            "conversion of EVs. Consider an explicit conversion"
                            " by setting `evconv` in ERNiftiDataset().")

        # pull mask array from NIfTI (if present)
        if mask is None:
            pass
        elif isinstance(mask, N.ndarray):
            # plain array can be passed on to base class
            pass
        else:
            mask_nim = getNiftiFromAnySource(mask)
            if not mask_nim is None:
                mask = getNiftiData(mask_nim)
            else:
                raise ValueError, "Cannot load mask from '%s'" % mask

        # finally init baseclass
        EventDataset.__init__(self, samples=samples, events=events,
                              mask=mask, dametric=metric, dsattr=dsattr,
                              **kwargs)
示例#34
0
    def _call(self, dataset):
        """Perform cross-validation on a dataset.

        'dataset' is passed to the splitter instance and serves as the source
        dataset to generate split for the single cross-validation folds.
        """
        # store the results of the splitprocessor
        results = []
        self.ca.splits = []

        # local bindings
        ca = self.ca
        clf = self.__transerror.clf
        expose_testdataset = self.__expose_testdataset

        # what ca to enable in terr
        terr_enable = []
        for state_var in ['confusion', 'training_confusion', 'samples_error']:
            if ca.is_enabled(state_var):
                terr_enable += [state_var]

        # charge ca with initial values
        summaryClass = clf.__summary_class__
        clf_hastestdataset = hasattr(clf, 'testdataset')

        self.ca.confusion = summaryClass()
        self.ca.training_confusion = summaryClass()
        self.ca.transerrors = []
        if ca.is_enabled('samples_error'):
            dataset.init_origids('samples',
                                 attr=self.__samples_idattr, mode='existing')
            self.ca.samples_error = dict(
                [(id_, []) for id_ in dataset.sa[self.__samples_idattr].value])

        # enable requested ca in child TransferError instance (restored
        # again below)
        if len(terr_enable):
            self.__transerror.ca.change_temporarily(
                enable_ca=terr_enable)

        # We better ensure that underlying classifier is not trained if we
        # are going to deepcopy transerror
        if ca.is_enabled("transerrors"):
            self.__transerror.untrain()

        # collect sum info about the split that where made for the resulting
        # dataset
        splitinfo = []

        # splitter
        for split in self.__splitter(dataset):
            splitinfo.append(
                "%s->%s"
                % (','.join([str(c)
                    for c in split[0].sa[self.__splitter.splitattr].unique]),
                   ','.join([str(c)
                    for c in split[1].sa[self.__splitter.splitattr].unique])))

            # only train classifier if splitter provides something in first
            # element of tuple -- the is the behavior of TransferError
            if ca.is_enabled("splits"):
                self.ca.splits.append(split)

            if ca.is_enabled("transerrors"):
                # copy first and then train, as some classifiers cannot be copied
                # when already trained, e.g. SWIG'ed stuff
                lastsplit = None
                for ds in split:
                    if ds is not None:
                        lastsplit = ds.a.lastsplit
                        break
                if lastsplit:
                    # only if we could deduce that it was last split
                    # use the 'mother' transerror
                    transerror = self.__transerror
                else:
                    # otherwise -- deep copy
                    transerror = deepcopy(self.__transerror)
            else:
                transerror = self.__transerror

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = split[1]

            # run the beast
            result = transerror(split[1], split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = None

            # next line is important for 'self._harvest' call
            self._harvest(locals())

            # XXX Look below -- may be we should have not auto added .?
            #     then transerrors also could be deprecated
            if ca.is_enabled("transerrors"):
                self.ca.transerrors.append(transerror)

            # XXX: could be merged with next for loop using a utility class
            # that can add dict elements into a list
            if ca.is_enabled("samples_error"):
                for k, v in \
                  transerror.ca.samples_error.iteritems():
                    self.ca.samples_error[k].append(v)

            # pull in child ca
            for state_var in ['confusion', 'training_confusion']:
                if ca.is_enabled(state_var):
                    ca[state_var].value.__iadd__(
                        transerror.ca[state_var].value)

            if __debug__:
                debug("CROSSC", "Split #%d: result %s" \
                      % (len(results), `result`))
            results.append(result)

        # Since we could have operated with a copy -- bind the last used one back
        self.__transerror = transerror

        # put ca of child TransferError back into original config
        if len(terr_enable):
            self.__transerror.ca.reset_changed_temporarily()

        self.ca.results = results
        """Store conditional attribute if it is enabled"""
        results = Dataset(results, sa={'cv_fold': splitinfo})
        return results
示例#35
0
    def test_multivariate(self):
        mv_perf = []
        mv_lin_perf = []
        uv_perf = []

        l_clf = clfswh['linear', 'svm'][0]
        nl_clf = clfswh['non-linear', 'svm'][0]

        #orig_keys = nl_clf.param._params.keys()
        #nl_param_orig = nl_clf.param._params.copy()

        # l_clf = LinearNuSVMC()

        # XXX ??? not sure what below meant and it is obsolete if
        # using SG... commenting out for now
        # for some reason order is not preserved thus dictionaries are not
        # the same any longer -- lets compare values
        #self.failUnlessEqual([nl_clf.param._params[k] for k in orig_keys],
        #                     [nl_param_orig[k] for k in orig_keys],
        #   msg="New instance mustn't override values in previously created")
        ## and keys separately
        #self.failUnlessEqual(set(nl_clf.param._params.keys()),
        #                     set(orig_keys),
        #   msg="New instance doesn't change set of parameters in original")

        # We must be able to deepcopy not yet trained SVMs now
        import mvpa.support.copy as copy
        try:
            nl_clf.untrain()
            nl_clf_copy = copy.deepcopy(nl_clf)
        except:
            self.fail(msg="Failed to deepcopy not-yet trained SVM %s" % nl_clf)

        for i in xrange(20):
            train = pure_multivariate_signal( 20, 3 )
            test = pure_multivariate_signal( 20, 3 )

            # use non-linear CLF on 2d data
            nl_clf.train(train)
            p_mv = nl_clf.predict(test.samples)
            mv_perf.append(np.mean(p_mv==test.targets))

            # use linear CLF on 2d data
            l_clf.train(train)
            p_lin_mv = l_clf.predict(test.samples)
            mv_lin_perf.append(np.mean(p_lin_mv==test.targets))

            # use non-linear CLF on 1d data
            nl_clf.train(train[:, 0])
            p_uv = nl_clf.predict(test[:, 0].samples)
            uv_perf.append(np.mean(p_uv==test.targets))

        mean_mv_perf = np.mean(mv_perf)
        mean_mv_lin_perf = np.mean(mv_lin_perf)
        mean_uv_perf = np.mean(uv_perf)

        # non-linear CLF has to be close to perfect
        self.failUnless( mean_mv_perf > 0.9 )
        # linear CLF cannot learn this problem!
        self.failUnless( mean_mv_perf > mean_mv_lin_perf )
        # univariate has insufficient information
        self.failUnless( mean_uv_perf < mean_mv_perf )
示例#36
0
    def __call__(self, datasets):
        """Estimate mappers for each dataset

        Parameters
        ----------
          datasets : list or tuple of datasets

        Returns
        -------
        A list of trained Mappers of the same length as datasets
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]

        residuals = None
        if ca['residual_errors'].enabled:
            residuals = np.zeros((2 + params.level2_niter, ndatasets))
            ca.residual_errors = Dataset(
                samples = residuals,
                sa = {'levels' :
                       ['1'] +
                       ['2:%i' % i for i in xrange(params.level2_niter)] +
                       ['3']})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            if ref_ds < 0 and ref_ds >= ndatasets:
                raise ValueError, "Requested reference dataset %i is out of " \
                      "bounds. We have only %i datasets provided" \
                      % (ref_ds, ndatasets)
        ca.choosen_ref_ds = ref_ds
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # Level 1 (first)
        commonspace = np.asanyarray(datasets[ref_ds])
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)
        data_mapped = [np.asanyarray(ds) for ds in datasets]
        for i, (m, data) in enumerate(zip(mappers, data_mapped)):
            if __debug__:
                debug('HPAL_', "Level 1: ds #%i" % i)
            if i == ref_ds:
                continue
            #ZSC zscore(data, chunks_attr=None)
            ds = dataset_wizard(samples=data, targets=commonspace)
            #ZSC zscore(ds, chunks_attr=None)
            m.train(ds)
            data_temp = m.forward(data)
            #ZSC zscore(data_temp, chunks_attr=None)
            data_mapped[i] = data_temp

            if residuals is not None:
                residuals[0, i] = np.linalg.norm(data_temp - commonspace)

            ## if ds_mapped == []:
            ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
            ## else:
            ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

            # zscore before adding
            # TODO: make just a function so we dont' waste space
            commonspace = params.combiner1(data_mapped[i], commonspace)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)

        # update commonspace to mean of ds_mapped
        commonspace = params.combiner2(data_mapped)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

        # Level 2 -- might iterate multiple times
        for loop in xrange(params.level2_niter):
            for i, (m, ds) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 2 (%i-th iteration): ds #%i" % (loop, i))

                ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
                ##                   /(ndatasets-1), chunks_attr=None )
                ds_new = ds.copy()
                #ZSC zscore(ds_new, chunks_attr=None)
                #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
                #ZSC zscore(ds_temp, chunks_attr=None)
                ds_new.targets = commonspace #PRJ ds_temp
                m.train(ds_new) # ds_temp)
                data_mapped[i] = m.forward(np.asanyarray(ds))
                if residuals is not None:
                    residuals[1+loop, i] = np.linalg.norm(data_mapped - commonspace)

                #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

            commonspace = params.combiner2(data_mapped)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)

        # Level 3 (last) to params.levels
        for i, (m, ds) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
            ##                   /(ndatasets-1), chunks_attr=None )
            ds_new = ds.copy()     # shallow copy so we could assign new labels
            #ZSC zscore(ds_new, chunks_attr=None)
            #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
            #ZSC zscore(ds_temp, chunks_attr=None)
            ds_new.targets = commonspace #PRJ ds_temp#
            m.train(ds_new) #ds_temp)

            if residuals is not None:
                data_mapped = m.forward(ds_new)
                residuals[-1, i] = np.linalg.norm(data_mapped - commonspace)

        return mappers
示例#37
0
文件: test_clf.py 项目: esc/PyMVPA
    def test_retrainables(self, clf):
        # XXX we agreed to not worry about this for the initial 0.6 release
        raise SkipTest
        # we need a copy since will tune its internals later on
        clf = clf.clone()
        clf.ca.change_temporarily(enable_ca = ['estimates'],
                                      # ensure that it does do predictions
                                      # while training
                                      disable_ca=['training_stats'])
        clf_re = clf.clone()
        # TODO: .retrainable must have a callback to call smth like
        # _set_retrainable
        clf_re._set_retrainable(True)

        # need to have high snr so we don't 'cope' with problematic
        # datasets since otherwise unittests would fail.
        dsargs = {'perlabel':50, 'nlabels':2, 'nfeatures':5, 'nchunks':1,
                  'nonbogus_features':[2,4], 'snr': 5.0}

        ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # NB datasets will be changed by the end of testing, so if
        # are to change to use generic datasets - make sure to copy
        # them here
        ds = deepcopy(datasets['uni2large'])
        clf.untrain()
        clf_re.untrain()
        trerr = TransferMeasure(clf, Splitter('train'),
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        trerr_re =  TransferMeasure(clf_re, Splitter('train'),
                                    disable_ca=['training_stats'],
                                    postproc=BinaryFxNode(mean_mismatch_error,
                                                          'targets'))

        # Just check for correctness of retraining
        err_1 = np.asscalar(trerr(ds))
        self.failUnless(err_1<0.3,
            msg="We should test here on easy dataset. Got error of %s" % err_1)
        values_1 = clf.ca.estimates[:]
        # some times retraining gets into deeper optimization ;-)
        eps = 0.05
        corrcoef_eps = 0.85        # just to get no failures... usually > 0.95


        def batch_test(retrain=True, retest=True, closer=True):
            err = np.asscalar(trerr(ds))
            err_re = np.asscalar(trerr_re(ds))
            corr = np.corrcoef(
                clf.ca.estimates, clf_re.ca.estimates)[0, 1]
            corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
            if __debug__:
                debug('TEST', "Retraining stats: errors %g %g corr %g "
                      "with old error %g corr %g" %
                  (err, err_re, corr, err_1, corr_old))
            self.failUnless(clf_re.ca.retrained == retrain,
                            ("Must fully train",
                             "Must retrain instead of full training")[retrain])
            self.failUnless(clf_re.ca.repredicted == retest,
                            ("Must fully test",
                             "Must retest instead of full testing")[retest])
            self.failUnless(corr > corrcoef_eps,
              msg="Result must be close to the one without retraining."
                  " Got corrcoef=%s" % (corr))
            if closer:
                self.failUnless(
                    corr >= corr_old,
                    msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

        # Check sequential retraining/retesting
        for i in xrange(3):
            flag = bool(i!=0)
            # ok - on 1st call we should train/test, then retrain/retest
            # and we can't compare for closinest to old result since
            # we are working on the same data/classifier
            batch_test(retrain=flag, retest=flag, closer=False)

        # should retrain nicely if we change a parameter
        if 'C' in clf.params:
            clf.params.C *= 0.1
            clf_re.params.C *= 0.1
            batch_test()
        elif 'sigma_noise' in clf.params:
            clf.params.sigma_noise *= 100
            clf_re.params.sigma_noise *= 100
            batch_test()
        else:
            raise RuntimeError, \
                  'Please implement testing while changing some of the ' \
                  'params for clf %s' % clf

        # should retrain nicely if we change kernel parameter
        if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
            clf.kernel_params.gamma = 0.1
            clf_re.kernel_params.gamma = 0.1
            # retest is false since kernel got recomputed thus
            # can't expect to use the same kernel
            batch_test(retest=not('gamma' in clf.kernel_params))

        # should retrain nicely if we change labels
        permute = AttributePermutator('targets', assure=True)
        oldlabels = dstrain.targets[:]
        dstrain = permute(dstrain)
        self.failUnless((oldlabels != dstrain.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        ds = vstack((dstrain, dstest))
        batch_test()

        # Change labels in testing
        oldlabels = dstest.targets[:]
        dstest = permute(dstest)
        self.failUnless((oldlabels != dstest.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        ds = vstack((dstrain, dstest))
        batch_test()

        # should re-train if we change data
        # reuse trained SVM and its 'final' optimization point
        if not clf.__class__.__name__ in ['GPR']: # on GPR everything depends on the data ;-)
            oldsamples = dstrain.samples.copy()
            dstrain.samples[:] += dstrain.samples*0.05
            self.failUnless((oldsamples != dstrain.samples).any())
            ds = vstack((dstrain, dstest))
            batch_test(retest=False)
        clf.ca.reset_changed_temporarily()

        # test retrain()
        # TODO XXX  -- check validity
        clf_re.retrain(dstrain);
        self.failUnless(clf_re.ca.retrained)
        clf_re.retrain(dstrain, labels=True);
        self.failUnless(clf_re.ca.retrained)
        clf_re.retrain(dstrain, traindataset=True);
        self.failUnless(clf_re.ca.retrained)

        # test repredict()
        clf_re.repredict(dstest.samples);
        self.failUnless(clf_re.ca.repredicted)
        self.failUnlessRaises(RuntimeError, clf_re.repredict,
                              dstest.samples, labels=True)
        """for now retesting with anything changed makes no sense"""
        clf_re._set_retrainable(False)
示例#38
0
    def _call(self, dataset):
        """Perform cross-validation on a dataset.

        'dataset' is passed to the splitter instance and serves as the source
        dataset to generate split for the single cross-validation folds.
        """
        # store the results of the splitprocessor
        results = []
        self.splits = []

        # local bindings
        states = self.states
        clf = self.__transerror.clf
        expose_testdataset = self.__expose_testdataset

        # what states to enable in terr
        terr_enable = []
        for state_var in ['confusion', 'training_confusion', 'samples_error']:
            if states.isEnabled(state_var):
                terr_enable += [state_var]

        # charge states with initial values
        summaryClass = clf._summaryClass
        clf_hastestdataset = hasattr(clf, 'testdataset')

        self.confusion = summaryClass()
        self.training_confusion = summaryClass()
        self.transerrors = []
        self.samples_error = dict([(id, []) for id in dataset.origids])

        # enable requested states in child TransferError instance (restored
        # again below)
        if len(terr_enable):
            self.__transerror.states._changeTemporarily(
                enable_states=terr_enable)

        # splitter
        for split in self.__splitter(dataset):
            # only train classifier if splitter provides something in first
            # element of tuple -- the is the behavior of TransferError
            if states.isEnabled("splits"):
                self.splits.append(split)

            if states.isEnabled("transerrors"):
                # copy first and then train, as some classifiers cannot be copied
                # when already trained, e.g. SWIG'ed stuff
                transerror = deepcopy(self.__transerror)
            else:
                transerror = self.__transerror

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset and expose_testdataset:
                clf.testdataset = split[1]
                pass

            # run the beast
            result = transerror(split[1], split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset and expose_testdataset:
                clf.testdataset = None

            # next line is important for 'self._harvest' call
            self._harvest(locals())

            # XXX Look below -- may be we should have not auto added .?
            #     then transerrors also could be deprecated
            if states.isEnabled("transerrors"):
                self.transerrors.append(transerror)

            # XXX: could be merged with next for loop using a utility class
            # that can add dict elements into a list
            if states.isEnabled("samples_error"):
                for k, v in \
                  transerror.states.getvalue("samples_error").iteritems():
                    self.samples_error[k].append(v)

            # pull in child states
            for state_var in ['confusion', 'training_confusion']:
                if states.isEnabled(state_var):
                    states.getvalue(state_var).__iadd__(
                        transerror.states.getvalue(state_var))

            if __debug__:
                debug("CROSSC", "Split #%d: result %s" \
                      % (len(results), `result`))
            results.append(result)

        # Since we could have operated with a copy -- bind the last used one back
        self.__transerror = transerror

        # put states of child TransferError back into original config
        if len(terr_enable):
            self.__transerror.states._resetEnabledTemporarily()

        self.results = results
        """Store state variable if it is enabled"""

        # Provide those labels_map if appropriate
        try:
            if states.isEnabled("confusion"):
                states.confusion.labels_map = dataset.labels_map
            if states.isEnabled("training_confusion"):
                states.training_confusion.labels_map = dataset.labels_map
        except:
            pass

        return self.__combiner(results)
示例#39
0
文件: channel.py 项目: gorlins/PyMVPA
        def resample(self, nt=None, sr=None, dt=None, window='ham',
                     inplace=True, **kwargs):
            """Convenience method to resample data sample channel-wise.

            Resampling target can be specified by number of timepoint
            or temporal distance or sampling rate.

            Please note that this method only operates on
            `ChannelDataset` and always returns such.

            :Parameters:
              nt: int
                Number of timepoints to resample to.
              dt: float
                Temporal distance of samples after resampling.
              sr: float
                Target sampling rate.
              inplace : bool
                If inplace=False, it would create and return a new dataset
                with new samples
              **kwargs:
                All additional arguments are passed to resample() from
                scipy.signal

            :Return:
              ChannelDataset
            """
            if nt is None and sr is None and dt is None:
                raise ValueError, \
                      "Required argument missing. Either needs ntimepoints, sr or dt."

            # get data in original shape
            orig_data = self.O

            if len(orig_data.shape) != 3:
                raise ValueError, "resample() only works with data from ChannelDataset."

            orig_nt = orig_data.shape[2]
            orig_length = self.dt * orig_nt

            if nt is None:
                # translate dt or sr into nt
                if not dt is None:
                    nt = orig_nt * float(self.dt) / dt
                elif not sr is None:
                    nt = orig_nt * float(sr) / self.samplingrate
                else:
                    raise RuntimeError, 'This should not happen!'
            else:
                raise RuntimeError, 'This should not happen!'


            nt = N.round(nt)

            # downsample data
            data = signal.resample(orig_data, nt, axis=2, window=window, **kwargs)
            new_dt = float(orig_length) / nt

            dsattr = self._dsattr

            # would be needed for not inplace generation
            if inplace:
                dsattr['ch_dt'] = new_dt
                # XXX We could have resampled range(nsamples) and
                #     rounded  it. and adjust then mapper's mask
                #     accordingly instead of creating a new one.
                #     It would give us opportunity to assess what
                #     resampling did...
                mapper = MaskMapper(N.ones(data.shape[1:], dtype='bool'))
                # reassign a new mapper.
                dsattr['mapper'] = mapper
                self.samples = mapper.forward(data)
                return self
            else:
                # we have to pass dsattr inside to don't loose
                # some additional attributes such as
                # labels_map
                dsattr = copy.deepcopy(dsattr)
                return ChannelDataset(data=self._data,
                                      dsattr=dsattr,
                                      samples=data,
                                      t0=self.t0,
                                      dt=new_dt,
                                      channelids=self.channelids,
                                      copy_data=True,
                                      copy_dsattr=False)
示例#40
0
文件: test_svm.py 项目: esc/PyMVPA
    def test_multivariate(self):
        mv_perf = []
        mv_lin_perf = []
        uv_perf = []

        l_clf = clfswh['linear', 'svm'][0]
        nl_clf = clfswh['non-linear', 'svm'][0]

        #orig_keys = nl_clf.param._params.keys()
        #nl_param_orig = nl_clf.param._params.copy()

        # l_clf = LinearNuSVMC()

        # XXX ??? not sure what below meant and it is obsolete if
        # using SG... commenting out for now
        # for some reason order is not preserved thus dictionaries are not
        # the same any longer -- lets compare values
        #self.failUnlessEqual([nl_clf.param._params[k] for k in orig_keys],
        #                     [nl_param_orig[k] for k in orig_keys],
        #   msg="New instance mustn't override values in previously created")
        ## and keys separately
        #self.failUnlessEqual(set(nl_clf.param._params.keys()),
        #                     set(orig_keys),
        #   msg="New instance doesn't change set of parameters in original")

        # We must be able to deepcopy not yet trained SVMs now
        import mvpa.support.copy as copy
        try:
            nl_clf.untrain()
            nl_clf_copy = copy.deepcopy(nl_clf)
        except:
            self.fail(msg="Failed to deepcopy not-yet trained SVM %s" % nl_clf)

        for i in xrange(20):
            train = pure_multivariate_signal( 20, 3 )
            test = pure_multivariate_signal( 20, 3 )

            # use non-linear CLF on 2d data
            nl_clf.train(train)
            p_mv = nl_clf.predict(test.samples)
            mv_perf.append(np.mean(p_mv==test.targets))

            # use linear CLF on 2d data
            l_clf.train(train)
            p_lin_mv = l_clf.predict(test.samples)
            mv_lin_perf.append(np.mean(p_lin_mv==test.targets))

            # use non-linear CLF on 1d data
            nl_clf.train(train[:, 0])
            p_uv = nl_clf.predict(test[:, 0].samples)
            uv_perf.append(np.mean(p_uv==test.targets))

        mean_mv_perf = np.mean(mv_perf)
        mean_mv_lin_perf = np.mean(mv_lin_perf)
        mean_uv_perf = np.mean(uv_perf)

        # non-linear CLF has to be close to perfect
        self.failUnless( mean_mv_perf > 0.9 )
        # linear CLF cannot learn this problem!
        self.failUnless( mean_mv_perf > mean_mv_lin_perf )
        # univariate has insufficient information
        self.failUnless( mean_uv_perf < mean_mv_perf )
示例#41
0
class _SVM(Classifier):
    """Support Vector Machine Classifier.

    Base class for all external SVM implementations.
    """
    """
    Derived classes should define:

    * _KERNELS: map(dict) should define assignment to a tuple containing
      implementation kernel type, list of parameters adherent to the
      kernel, and sensitivity analyzer e.g.::

        _KERNELS = {
             'linear': (shogun.Kernel.LinearKernel, (), LinearSVMWeights),
             'rbf' :   (shogun.Kernel.GaussianKernel, ('gamma',), None),
             ...
             }

    * _KNOWN_IMPLEMENTATIONS: map(dict) should define assignment to a
      tuple containing implementation of the SVM, list of parameters
      adherent to the implementation, additional internals, and
      description e.g.::

        _KNOWN_IMPLEMENTATIONS = {
          'C_SVC' : (svm.svmc.C_SVC, ('C',),
                   ('binary', 'multiclass'), 'C-SVM classification'),
          ...
          }

    """

    _ATTRIBUTE_COLLECTIONS = ['params'
                              ]  # enforce presence of params collections

    # Placeholder: map kernel names to sensitivity classes, ie
    # 'linear':LinearSVMWeights, for each backend
    _KNOWN_SENSITIVITIES = {}
    kernel = Parameter(None, allowedtype=Kernel, doc='Kernel object', index=-1)

    _SVM_PARAMS = {
        'C':
        Parameter(-1.0,
                  doc='Trade-off parameter between width of the '
                  'margin and number of support vectors. Higher C -- '
                  'more rigid margin SVM. In linear kernel, negative '
                  'values provide automatic scaling of their value '
                  'according to the norm of the data'),
        'nu':
        Parameter(0.5,
                  min=0.0,
                  max=1.0,
                  doc='Fraction of datapoints within the margin'),
        'cache_size':
        Parameter(100, doc='Size of the kernel cache, specified in megabytes'),
        'tube_epsilon':
        Parameter(0.01,
                  doc='Epsilon in epsilon-insensitive loss function of '
                  'epsilon-SVM regression (SVR)'),
        'tau':
        Parameter(1e-6, doc='TAU parameter of KRR regression in shogun'),
        'probability':
        Parameter(0,
                  doc='Flag to signal either probability estimate is obtained '
                  'within LIBSVM'),
        'shrinking':
        Parameter(1, doc='Either shrinking is to be conducted'),
        'weight_label':
        Parameter([],
                  allowedtype='[int]',
                  doc='To be used in conjunction with weight for custom '
                  'per-label weight'),
        # TODO : merge them into a single dictionary
        'weight':
        Parameter([], allowedtype='[double]', doc='Custom weights per label'),
        # For some reason setting up epsilon to 1e-5 slowed things down a bit
        # in comparison to how it was before (in yoh/master) by up to 20%... not clear why
        # may be related to 1e-3 default within _svm.py?
        'epsilon':
        Parameter(
            5e-5,
            min=1e-10,
            doc=
            'Tolerance of termination criteria. (For nu-SVM default is 0.001)')
    }

    _KNOWN_PARAMS = ()  # just a placeholder to please lintian
    """Parameters which are specific to a given instantiation of SVM
    """

    __tags__ = ['svm', 'kernel-based', 'swig']

    def __init__(self, **kwargs):
        """Init base class of SVMs. *Not to be publicly used*

        TODO: handling of parameters might migrate to be generic for
        all classifiers. SVMs are chosen to be testbase for that
        functionality to see how well it would fit.
        """

        # Check if requested implementation is known
        svm_impl = kwargs.get('svm_impl', None)
        if not svm_impl in self._KNOWN_IMPLEMENTATIONS:
            raise ValueError, \
                  "Unknown SVM implementation '%s' is requested for %s." \
                  "Known are: %s" % (svm_impl, self.__class__,
                                     self._KNOWN_IMPLEMENTATIONS.keys())
        self._svm_impl = svm_impl

        impl, add_params, add_internals, descr = \
              self._KNOWN_IMPLEMENTATIONS[svm_impl]

        # Add corresponding parameters to 'known' depending on the
        # implementation chosen
        if add_params is not None:
            self._KNOWN_PARAMS = \
                 self._KNOWN_PARAMS[:] + list(add_params)

        # Assign per-instance __tags__
        self.__tags__ = self.__tags__[:]

        # Add corresponding internals
        if add_internals is not None:
            self.__tags__ += list(add_internals)
        self.__tags__.append(svm_impl)

        k = kwargs.get('kernel', None)
        if k is None:
            kwargs['kernel'] = self.__default_kernel_class__()
        if 'linear' in ('%s' %
                        kwargs['kernel']).lower():  # XXX not necessarily best
            self.__tags__ += ['linear', 'has_sensitivity']
        else:
            self.__tags__ += ['non-linear']

        # pop out all args from **kwargs which are known to be SVM parameters
        _args = {}
        for param in self._KNOWN_PARAMS + ['svm_impl'
                                           ]:  # Update to remove kp's?
            if param in kwargs:
                _args[param] = kwargs.pop(param)

        try:
            Classifier.__init__(self, **kwargs)

        except TypeError, e:
            if "__init__() got an unexpected keyword argument " in e.args[0]:
                # TODO: make it even more specific -- if that argument is listed
                # within _SVM_PARAMS
                e.args = tuple( [e.args[0] +
                                 "\n Given SVM instance of class %s knows following parameters: %s" %
                                 (self.__class__, self._KNOWN_PARAMS) + \
                                 list(e.args)[1:]])
            raise e

        # populate collections and add values from arguments
        for paramfamily, paramset in ((self._KNOWN_PARAMS, self.params), ):
            for paramname in paramfamily:
                if not (paramname in self._SVM_PARAMS):
                    raise ValueError, "Unknown parameter %s" % paramname + \
                          ". Known SVM params are: %s" % self._SVM_PARAMS.keys()
                param = deepcopy(self._SVM_PARAMS[paramname])
                if paramname in _args:
                    param.value = _args[paramname]
                    # XXX might want to set default to it -- not just value

                paramset[paramname] = param

        # TODO: Below commented out because kernel_type has been removed.
        # Find way to set default C as necessary

        # tune up C if it has one and non-linear classifier is used
        #if self.params.has_key('C') and kernel_type != "linear" \
        #and self.params['C'].is_default:
        #if __debug__:
        #debug("SVM_", "Assigning default C value to be 1.0 for SVM "
        #"%s with non-linear kernel" % self)
        #self.params['C'].default = 1.0

        # Some postchecks
        if self.params.has_key('weight') and self.params.has_key(
                'weight_label'):
            if not len(self.params.weight_label) == len(self.params.weight):
                raise ValueError, "Lenghts of 'weight' and 'weight_label' lists" \
                      "must be equal."

        if __debug__:
            debug("SVM",
                  "Initialized %s with kernel %s" % (self, self.params.kernel))
示例#42
0
    def _call(self, dataset):
        """Perform cross-validation on a dataset.

        'dataset' is passed to the splitter instance and serves as the source
        dataset to generate split for the single cross-validation folds.
        """
        # store the results of the splitprocessor
        results = []
        self.ca.splits = []

        # local bindings
        ca = self.ca
        clf = self.__transerror.clf
        expose_testdataset = self.__expose_testdataset

        # what ca to enable in terr
        terr_enable = []
        for state_var in ['confusion', 'training_confusion', 'samples_error']:
            if ca.is_enabled(state_var):
                terr_enable += [state_var]

        # charge ca with initial values
        summaryClass = clf.__summary_class__
        clf_hastestdataset = hasattr(clf, 'testdataset')

        self.ca.confusion = summaryClass()
        self.ca.training_confusion = summaryClass()
        self.ca.transerrors = []
        if ca.is_enabled('samples_error'):
            dataset.init_origids('samples',
                                 attr=self.__samples_idattr,
                                 mode='existing')
            self.ca.samples_error = dict([
                (id_, []) for id_ in dataset.sa[self.__samples_idattr].value
            ])

        # enable requested ca in child TransferError instance (restored
        # again below)
        if len(terr_enable):
            self.__transerror.ca.change_temporarily(enable_ca=terr_enable)

        # We better ensure that underlying classifier is not trained if we
        # are going to deepcopy transerror
        if ca.is_enabled("transerrors"):
            self.__transerror.untrain()

        # collect sum info about the split that where made for the resulting
        # dataset
        splitinfo = []

        # splitter
        for split in self.__splitter(dataset):
            splitinfo.append("%s->%s" % (','.join([
                str(c) for c in split[0].sa[self.__splitter.splitattr].unique
            ]), ','.join([
                str(c) for c in split[1].sa[self.__splitter.splitattr].unique
            ])))

            # only train classifier if splitter provides something in first
            # element of tuple -- the is the behavior of TransferError
            if ca.is_enabled("splits"):
                self.ca.splits.append(split)

            if ca.is_enabled("transerrors"):
                # copy first and then train, as some classifiers cannot be copied
                # when already trained, e.g. SWIG'ed stuff
                lastsplit = None
                for ds in split:
                    if ds is not None:
                        lastsplit = ds.a.lastsplit
                        break
                if lastsplit:
                    # only if we could deduce that it was last split
                    # use the 'mother' transerror
                    transerror = self.__transerror
                else:
                    # otherwise -- deep copy
                    transerror = deepcopy(self.__transerror)
            else:
                transerror = self.__transerror

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = split[1]

            # run the beast
            result = transerror(split[1], split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = None

            # next line is important for 'self._harvest' call
            self._harvest(locals())

            # XXX Look below -- may be we should have not auto added .?
            #     then transerrors also could be deprecated
            if ca.is_enabled("transerrors"):
                self.ca.transerrors.append(transerror)

            # XXX: could be merged with next for loop using a utility class
            # that can add dict elements into a list
            if ca.is_enabled("samples_error"):
                for k, v in \
                  transerror.ca.samples_error.iteritems():
                    self.ca.samples_error[k].append(v)

            # pull in child ca
            for state_var in ['confusion', 'training_confusion']:
                if ca.is_enabled(state_var):
                    ca[state_var].value.__iadd__(
                        transerror.ca[state_var].value)

            if __debug__:
                debug("CROSSC", "Split #%d: result %s" \
                      % (len(results), `result`))
            results.append(result)

        # Since we could have operated with a copy -- bind the last used one back
        self.__transerror = transerror

        # put ca of child TransferError back into original config
        if len(terr_enable):
            self.__transerror.ca.reset_changed_temporarily()

        self.ca.results = results
        """Store conditional attribute if it is enabled"""
        results = Dataset(results, sa={'cv_fold': splitinfo})
        return results
示例#43
0
    def test_retrainables(self, clf):
        # we need a copy since will tune its internals later on
        clf = clf.clone()
        clf.ca.change_temporarily(
            enable_ca=['estimates'],
            # ensure that it does do predictions
            # while training
            disable_ca=['training_confusion'])
        clf_re = clf.clone()
        # TODO: .retrainable must have a callback to call smth like
        # _set_retrainable
        clf_re._set_retrainable(True)

        # need to have high snr so we don't 'cope' with problematic
        # datasets since otherwise unittests would fail.
        dsargs = {
            'perlabel': 50,
            'nlabels': 2,
            'nfeatures': 5,
            'nchunks': 1,
            'nonbogus_features': [2, 4],
            'snr': 5.0
        }

        ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # NB datasets will be changed by the end of testing, so if
        # are to change to use generic datasets - make sure to copy
        # them here
        dstrain = deepcopy(datasets['uni2large_train'])
        dstest = deepcopy(datasets['uni2large_test'])

        clf.untrain()
        clf_re.untrain()
        trerr, trerr_re = TransferError(clf), \
                          TransferError(clf_re,
                                        disable_ca=['training_confusion'])

        # Just check for correctness of retraining
        err_1 = trerr(dstest, dstrain)
        self.failUnless(
            err_1 < 0.3,
            msg="We should test here on easy dataset. Got error of %s" % err_1)
        values_1 = clf.ca.estimates[:]
        # some times retraining gets into deeper optimization ;-)
        eps = 0.05
        corrcoef_eps = 0.85  # just to get no failures... usually > 0.95

        def batch_test(retrain=True, retest=True, closer=True):
            err = trerr(dstest, dstrain)
            err_re = trerr_re(dstest, dstrain)
            corr = np.corrcoef(clf.ca.estimates, clf_re.ca.estimates)[0, 1]
            corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
            if __debug__:
                debug(
                    'TEST', "Retraining stats: errors %g %g corr %g "
                    "with old error %g corr %g" %
                    (err, err_re, corr, err_1, corr_old))
            self.failUnless(clf_re.ca.retrained == retrain,
                            ("Must fully train",
                             "Must retrain instead of full training")[retrain])
            self.failUnless(clf_re.ca.repredicted == retest,
                            ("Must fully test",
                             "Must retest instead of full testing")[retest])
            self.failUnless(
                corr > corrcoef_eps,
                msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
            if closer:
                self.failUnless(
                    corr >= corr_old,
                    msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

        # Check sequential retraining/retesting
        for i in xrange(3):
            flag = bool(i != 0)
            # ok - on 1st call we should train/test, then retrain/retest
            # and we can't compare for closinest to old result since
            # we are working on the same data/classifier
            batch_test(retrain=flag, retest=flag, closer=False)

        # should retrain nicely if we change a parameter
        if 'C' in clf.params:
            clf.params.C *= 0.1
            clf_re.params.C *= 0.1
            batch_test()
        elif 'sigma_noise' in clf.params:
            clf.params.sigma_noise *= 100
            clf_re.params.sigma_noise *= 100
            batch_test()
        else:
            raise RuntimeError, \
                  'Please implement testing while changing some of the ' \
                  'params for clf %s' % clf

        # should retrain nicely if we change kernel parameter
        if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
            clf.kernel_params.gamma = 0.1
            clf_re.kernel_params.gamma = 0.1
            # retest is false since kernel got recomputed thus
            # can't expect to use the same kernel
            batch_test(retest=not ('gamma' in clf.kernel_params))

        # should retrain nicely if we change labels
        oldlabels = dstrain.targets[:]
        dstrain.permute_attr(assure_permute=True)
        self.failUnless(
            (oldlabels != dstrain.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        batch_test()

        # Change labels in testing
        oldlabels = dstest.targets[:]
        dstest.permute_attr(assure_permute=True)
        self.failUnless(
            (oldlabels != dstest.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        batch_test()

        # should re-train if we change data
        # reuse trained SVM and its 'final' optimization point
        if not clf.__class__.__name__ in [
                'GPR'
        ]:  # on GPR everything depends on the data ;-)
            oldsamples = dstrain.samples.copy()
            dstrain.samples[:] += dstrain.samples * 0.05
            self.failUnless((oldsamples != dstrain.samples).any())
            batch_test(retest=False)
        clf.ca.reset_changed_temporarily()

        # test retrain()
        # TODO XXX  -- check validity
        clf_re.retrain(dstrain)
        self.failUnless(clf_re.ca.retrained)
        clf_re.retrain(dstrain, labels=True)
        self.failUnless(clf_re.ca.retrained)
        clf_re.retrain(dstrain, traindataset=True)
        self.failUnless(clf_re.ca.retrained)

        # test repredict()
        clf_re.repredict(dstest.samples)
        self.failUnless(clf_re.ca.repredicted)
        self.failUnlessRaises(RuntimeError,
                              clf_re.repredict,
                              dstest.samples,
                              labels=True)
        """for now retesting with anything changed makes no sense"""
        clf_re._set_retrainable(False)