Example #1
def test_all_equal():
    # all these values are supposed to be different from each other
    # but equal to themselves
    a = np.random.normal(size=(10, 10)) + 1000.
    b = np.zeros((10, 10))
    c = np.zeros(10)
    d = np.zeros(11)
    e = 0
    f = None
    g = True
    h = ''
    i = 'a'
    j = dict(bummer=np.arange(5))

    values = [a, b, c, d, e, f, g, h, i, j]
    for ii, v in enumerate(values):
        for jj, w in enumerate(values):
            # make deepcopy so == operator cannot cheat by checking id()
            assert_equal(all_equal(copy.deepcopy(v),
                                   copy.deepcopy(w)),
                         ii == jj,
                         msg='cmp(%s, %s)' % (type(v), type(w)))

    # ensure that this function behaves like the 
    # standard python '==' comparator for singulars
    singulars = [0, None, True, False, '', 1, 'a']
    for v in singulars:
        for w in singulars:
            assert_equal(all_equal(v, w), v == w)
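
A minimal aside on why this test reaches for deepcopy at all, sketched with nothing but numpy and the standard library: a deep copy is a distinct object, so an == implementation cannot take a shortcut through object identity.

import copy
import numpy as np

a = np.arange(5)
b = copy.deepcopy(a)
assert a is not b              # distinct objects, so id()-based shortcuts can't fire
assert bool(np.all(a == b))    # yet they still compare equal element-wise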
Example #2
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20,-1).T,
                targets=1, chunks=1),
        ] + list(datasets.values())

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
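
The idhash checks above assert that neither train() nor forward() mutates its input. The same pattern in a self-contained sketch, with a hypothetical content_hash() standing in for PyMVPA's idhash() (which additionally mixes in object identity):

import hashlib
import numpy as np

def content_hash(arr):
    # hypothetical stand-in for idhash(): digest of the raw data buffer
    return hashlib.md5(arr.tobytes()).hexdigest()

data = np.random.normal(size=(40, 20))
before = content_hash(data)
z = (data - data.mean(axis=0)) / data.std(axis=0)   # out-of-place z-scoring
assert content_hash(data) == before                 # input left untouched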
Example #3
    def clone(self):
        """Create full copy of the classifier.

        It might require the classifier to be untrained first due to
        the present SWIG bindings.

        TODO: think about a proper re-implementation that does not
        rely on deepcopy
        """
        if __debug__:
            debug("CLF", "Cloning %s%s", (self, _strid(self)))
        try:
            return deepcopy(self)
        except Exception:
            self.untrain()
            return deepcopy(self)
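
A self-contained sketch of the fallback pattern above, under illustrative names (not PyMVPA API): try a plain deepcopy, and if unpicklable trained state (e.g. SWIG handles) gets in the way, drop that state and copy again.

import copy

class Cloneable(object):
    def __init__(self):
        self._trained_state = None   # stands in for SWIG-backed internals

    def untrain(self):
        self._trained_state = None

    def clone(self):
        try:
            return copy.deepcopy(self)
        except Exception:
            self.untrain()
            return copy.deepcopy(self)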
Example #4
    def _level3(self, datasets):
        params = self.params  # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key different from level-2; the common space is uniform
        #temp_commonspace = commonspace

        residuals = None
        if self.ca['residual_errors'].enabled:
            residuals = np.zeros((1, len(datasets)))
            self.ca.residual_errors = Dataset(samples=residuals)

        # start from original input datasets again
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            # retrain mapper on final common space
            ds_new.sa[m.get_space()] = self.commonspace
            m.train(ds_new)
            # remove common space attribute again to save on memory
            del ds_new.sa[m.get_space()]

            if residuals is not None:
                # obtain final projection
                data_mapped = m.forward(ds_new.samples)
                residuals[0,
                          i] = np.linalg.norm(data_mapped - self.commonspace)

        return mappers
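
The deepcopy idiom to note here is the first line of the method: every dataset gets its own private copy of the alignment template, so training one mapper cannot leak state into another. A toy sketch with a plain dict standing in for the template:

import copy

template = {'scale': 1.0, 'offset': 0.0}        # hypothetical alignment template
datasets = [None, None, None]                   # three stand-in datasets
mappers = [copy.deepcopy(template) for _ in datasets]
mappers[0]['scale'] = 2.0
assert mappers[1]['scale'] == 1.0               # copies share no state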
Example #5
    def _forward_data(self, data):
        if self.__chunks_attr is not None:
            raise RuntimeError(
                "%s cannot do chunk-wise Z-scoring of plain data "
                "since it has to be parameterized with chunks_attr." % self)
        if self.__param_est is not None:
            raise RuntimeError("%s cannot do Z-scoring with estimating "
                               "parameters on some attributes of plain"
                               "data." % self)

        params = self.__params_dict
        if params is None:
            raise RuntimeError(
                "ZScoreMapper needs to be trained before call to forward")

        # mappers should not modify the input data
        # cast the data to float, since the in-place operations below do not upcast!
        if np.issubdtype(data.dtype, np.integer):
            if self._secret_inplace_zscore:
                raise TypeError(
                    "Cannot perform inplace z-scoring since data is of integer "
                    "type. Please convert to float before calling zscore")
            mdata = data.astype(self.__dtype)
        elif self._secret_inplace_zscore:
            mdata = data
        else:
            # do not call .copy() directly, since it might not be an array
            mdata = copy.deepcopy(data)

        self._zscore(mdata, *params['__all__'])
        return mdata
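
A sketch of the dtype branch above, assuming the actual z-scoring uses in-place operators: integer input must be upcast first (astype() always returns a fresh array), and anything else is deep-copied so the caller's data survives.

import copy
import numpy as np

def as_mutable_float(data, dtype='float64'):
    if np.issubdtype(data.dtype, np.integer):
        return data.astype(dtype)    # upcast; astype never aliases the input
    return copy.deepcopy(data)       # works even if it is an array subclass

x = np.arange(6)
y = as_mutable_float(x)
y -= y.mean()
assert x[0] == 0                     # original integers untouched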
Example #6
    def _level3(self, datasets):
        params = self.params            # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key different from level-2; the common space is uniform
        #temp_commonspace = commonspace
        # Fixing nproc=0
        if params.nproc == 0:
            from mvpa2.base import warning
            warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
            params.nproc = 1
        # Checking for joblib, if not, set nproc to 1
        if params.nproc != 1:
            from mvpa2.base import externals, warning
            if not externals.exists('joblib'):
                warning("Setting nproc different from 1 requires joblib package, which "
                        "does not seem to exist. Setting nproc to 1.")
                params.nproc = 1

        # start from original input datasets again
        if params.nproc == 1:
            residuals = []
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 3: ds #%i" % i)
                m, residual = get_trained_mapper(ds_new, self.commonspace, m,
                                                 self.ca['residual_errors'].enabled)
                if self.ca['residual_errors'].enabled:
                    residuals.append(residual)
        else:
            if __debug__:
                debug('HPAL_', "Level 3: Using joblib with nproc = %d " % params.nproc)
            verbose_level_parallel = 20 \
                if (__debug__ and 'HPAL' in debug.active) else 0
            from joblib import Parallel, delayed
            import sys
            # joblib's 'multiprocessing' backend has known issues of failure on OSX
            # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
            if params.joblib_backend is None:
                params.joblib_backend = 'threading' if sys.platform == 'darwin' \
                                        else 'multiprocessing'
            res = Parallel(
                    n_jobs=params.nproc, pre_dispatch=params.nproc,
                    backend=params.joblib_backend,
                    verbose=verbose_level_parallel
                    )(
                        delayed(get_trained_mapper)
                        (ds, self.commonspace, mapper, self.ca['residual_errors'].enabled)
                        for ds, mapper in zip(datasets, mappers)
                    )
            mappers = [m for m, r in res]
            if self.ca['residual_errors'].enabled:
                residuals = [r for m, r in res]

        if self.ca['residual_errors'].enabled:
            self.ca.residual_errors = Dataset(samples=np.array(residuals)[None, :])

        return mappers
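
A stripped-down sketch of the joblib dispatch above (requires the joblib package; the backend choice mirrors the OSX workaround in the code):

import sys
from joblib import Parallel, delayed

def work(x):
    return x * x

backend = 'threading' if sys.platform == 'darwin' else 'multiprocessing'
results = Parallel(n_jobs=2, backend=backend)(delayed(work)(i) for i in range(4))
assert results == [0, 1, 4, 9]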
Example #7
 def setUp(self):
     self.backup = []
     # paranoid check
     self.cfgstr = str(cfg)
     # clean up externals cfg for proper testing
     if cfg.has_section('externals'):
         self.backup = copy.deepcopy(cfg.items('externals'))
     cfg.remove_section('externals')
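
The same backup-and-restore idea in a self-contained form, with a plain dict standing in for the real cfg object (names here are illustrative):

import copy
import unittest

class ConfigBackupTest(unittest.TestCase):
    cfg = {'externals': {'have joblib': 'yes'}}   # stand-in for the real cfg

    def setUp(self):
        # deep copy, so later mutation of cfg cannot corrupt the backup
        self.backup = copy.deepcopy(self.cfg.get('externals', {}))
        self.cfg.pop('externals', None)

    def tearDown(self):
        self.cfg['externals'] = copy.deepcopy(self.backup)

    def test_externals_absent(self):
        self.assertNotIn('externals', self.cfg)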
Example #8
 def test_deep_copying_state_variable(self):
     for v in (True, False):
         sv = ConditionalAttribute(enabled=v, doc="Testing")
         sv.enabled = not v
         sv_dc = copy.deepcopy(sv)
         if not (__debug__ and "ENFORCE_CA_ENABLED" in debug.active):
             self.assertEqual(sv.enabled, sv_dc.enabled)
         self.assertEqual(sv.name, sv_dc.name)
         self.assertEqual(sv._instance_index, sv_dc._instance_index)
Example #9
    def select_samples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.items():
            data[k] = [v[x] for x in selection]

        data._check()
        return data
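
The same copy-then-filter pattern on a plain dict of columns (a sketch; the real ColumnData is dict-like):

import copy

def select_samples(table, selection):
    out = copy.deepcopy(table)
    for k, v in out.items():
        out[k] = [v[i] for i in selection]
    return out

t = {'x': [1, 2, 3], 'y': ['a', 'b', 'c']}
assert select_samples(t, [0, 2]) == {'x': [1, 3], 'y': ['a', 'c']}
assert t['x'] == [1, 2, 3]           # original columns intact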
Example #10
 def test_deep_copying_state_variable(self):
     for v in (True, False):
         sv = ConditionalAttribute(enabled=v,
                            doc="Testing")
         sv.enabled = not v
         sv_dc = copy.deepcopy(sv)
         self.failUnlessEqual(sv.enabled, sv_dc.enabled)
         self.failUnlessEqual(sv.name, sv_dc.name)
         self.failUnlessEqual(sv._instance_index, sv_dc._instance_index)
Example #11
 def test_id_hash(self, pair):
     a, b = pair
     a1 = deepcopy(a)
     a_1 = idhash(a)
     self.assertTrue(a_1 == idhash(a), msg="Must be of the same idhash")
     self.assertTrue(a_1 != idhash(b), msg="Must be of different idhash")
     if isinstance(a, np.ndarray):
         self.assertTrue(a_1 != idhash(a.T), msg=".T must be of different idhash")
     if not isinstance(a, tuple):
         self.assertTrue(a_1 != idhash(a1), msg="Must be of different idhash")
         a[2] += 1; a_2 = idhash(a)
         self.assertTrue(a_1 != a_2, msg="Idhash must change")
     else:
         a_2 = a_1
     a = a[2:]; a_3 = idhash(a)
     self.assertTrue(a_2 != a_3, msg="Idhash must change after slicing")
Example #12
def is_sorted(items):
    """Check if listed items are in sorted order.

    Parameters
    ----------
    items : iterable container

    Returns
    -------
    True if the items are in sorted order, otherwise False.
    """
    items_sorted = deepcopy(items)
    items_sorted.sort()
    equality = items_sorted == items
    # XXX yarik forgotten analog to isiterable
    if hasattr(equality, '__iter__'):
        equality = np.all(equality)
    return equality
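
Usage, assuming the is_sorted() definition above (with numpy imported as np): the deepcopy matters because list.sort() works in place, so only the copy gets reordered.

items = [3, 1, 2]
assert not is_sorted(items)
assert items == [3, 1, 2]            # the argument keeps its original order
assert is_sorted([1, 2, 3])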
Example #13
    def __new__(cls, *args, **kwargs):
        """Instantiate ClassWithCollections object
        """
        self = super(ClassWithCollections, cls).__new__(cls)

        s__dict__ = self.__dict__

        # init variable
        # XXX: Added as pylint complained (rightfully) -- not sure if false
        # is the proper default
        self.__params_set = False

        # need to check to avoid override of enabled ca in the case
        # of multiple inheritance, like both ClassWithCollections and
        # Harvestable (note: Harvestable was refactored away)
        if '_collections' not in s__dict__:
            s__class__ = self.__class__

            collections = copy.deepcopy(s__class__._collections_template)
            s__dict__['_collections'] = collections
            s__dict__['_known_attribs'] = {}
            """Dictionary to contain 'links' to the collections from each
            known attribute. Is used to gain some speed up in lookup within
            __getattribute__ and __setattr__
            """

            # Assign owner to all collections
            for col, collection in collections.items():
                if col in s__dict__:
                    raise ValueError(
                        "Object %s already has attribute %s" % (self, col))
                s__dict__[col] = collection
                collection.name = col

            self.__params_set = False

        if __debug__:
            descr = kwargs.get('descr', None)
            debug(
                "COL", "ClassWithCollections.__new__ was done "
                "for %s%s with descr=%s",
                (s__class__.__name__, _strid(self), descr))

        return self
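
A compact sketch of the core move in this __new__: a class-level template is deep-copied into every instance, so instances can mutate their collections without corrupting the template or each other (names hypothetical):

import copy

class WithCollections(object):
    _collections_template = {'params': {}, 'ca': {}}

    def __new__(cls, *args, **kwargs):
        self = super(WithCollections, cls).__new__(cls)
        self._collections = copy.deepcopy(cls._collections_template)
        return self

a, b = WithCollections(), WithCollections()
a._collections['params']['C'] = 1.0
assert 'C' not in b._collections['params']
assert 'C' not in WithCollections._collections_template['params']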
Example #14
    def test_generic_tests(self):
        """Test all classifiers for conformant behavior
        """
        for clf_, traindata in \
                [(clfswh['binary'], datasets['dumb2']),
                 (clfswh['multiclass'], datasets['dumb'])]:
            traindata_copy = deepcopy(traindata) # full copy of dataset
            for clf in clf_:
                clf.train(traindata)
                self.assertTrue(
                   (traindata.samples == traindata_copy.samples).all(),
                   "Training of a classifier shouldn't change original dataset")

            # TODO: enforce uniform return from predict??
            #predicted = clf.predict(traindata.samples)
            #self.assertTrue(isinstance(predicted, np.ndarray))

        # Just a simple check that str() and repr() work for all of them
        self.assertTrue(str(clf) != "")
        self.assertTrue(repr(clf) != "")
Example #15
    def test_more_svd(self):
        pm = SVDMapper()
        # train SVD
        pm.train(self.largefeat)

        # mixing matrix cannot be square
        self.failUnlessEqual(pm.proj.shape, (40, 10))

        # only first singular value significant
        self.failUnless(pm.sv[:1] > 10)
        self.failUnless((pm.sv[1:] < 10).all())

        # now project data into SVD space
        p = pm.forward(self.largefeat)

        # only variance of first component significant
        var = p.var(axis=0)

        # test that only one component has variance
        self.failUnless(var[:1] > 1.0)
        self.failUnless((var[1:] < 0.0001).all())

        # check that the mapped data can be fully recovered by 'reverse()'
        rp = pm.reverse(p)
        self.failUnlessEqual(rp.shape, self.largefeat.shape)
        self.failUnless((np.round(rp) == self.largefeat).all())

        # copy mapper
        pm2 = deepcopy(pm)

        # now make new random data and do forward->reverse check
        data = np.random.normal(size=(98,40))
        data_f = pm.forward(data)

        self.failUnlessEqual(data_f.shape, (98,10))

        data_r = pm.reverse(data_f)
        self.failUnlessEqual(data_r.shape, (98,40))
Example #16
    def test_more_svd(self):
        pm = SVDMapper()
        # train SVD
        pm.train(self.largefeat)

        # mixing matrix cannot be square
        self.assertEqual(pm.proj.shape, (40, 10))

        # only first singular value significant
        self.assertTrue(pm.sv[:1] > 10)
        self.assertTrue((pm.sv[1:] < 10).all())

        # now project data into SVD space
        p = pm.forward(self.largefeat)

        # only variance of first component significant
        var = p.var(axis=0)

        # test that only one component has variance
        self.assertTrue(var[:1] > 1.0)
        self.assertTrue((var[1:] < 0.0001).all())

        # check that the mapped data can be fully recovered by 'reverse()'
        rp = pm.reverse(p)
        self.assertEqual(rp.shape, self.largefeat.shape)
        self.assertTrue((np.round(rp) == self.largefeat).all())

        # copy mapper
        pm2 = deepcopy(pm)

        # now make new random data and do forward->reverse check
        data = np.random.normal(size=(98, 40))
        data_f = pm.forward(data)

        self.assertEqual(data_f.shape, (98, 10))

        data_r = pm.reverse(data_f)
        self.assertEqual(data_r.shape, (98, 40))
Example #17
    def test_retrainables(self, clf):
        # XXX we agreed to not worry about this for the initial 0.6 release
        raise SkipTest
        # we need a copy since will tune its internals later on
        clf = clf.clone()
        clf.ca.change_temporarily(
            enable_ca=['estimates'],
            # ensure that it does do predictions
            # while training
            disable_ca=['training_stats'])
        clf_re = clf.clone()
        # TODO: .retrainable must have a callback to call smth like
        # _set_retrainable
        clf_re._set_retrainable(True)

        # need to have high snr so we don't 'cope' with problematic
        # datasets since otherwise unittests would fail.
        dsargs = {
            'perlabel': 50,
            'nlabels': 2,
            'nfeatures': 5,
            'nchunks': 1,
            'nonbogus_features': [2, 4],
            'snr': 5.0
        }

        ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # NB datasets will be changed by the end of testing, so if
        # are to change to use generic datasets - make sure to copy
        # them here
        ds = deepcopy(datasets['uni2large'])
        clf.untrain()
        clf_re.untrain()
        trerr = TransferMeasure(clf,
                                Splitter('train'),
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        trerr_re = TransferMeasure(clf_re,
                                   Splitter('train'),
                                   disable_ca=['training_stats'],
                                   postproc=BinaryFxNode(
                                       mean_mismatch_error, 'targets'))

        # Just check for correctness of retraining
        err_1 = np.asscalar(trerr(ds))
        self.assertTrue(
            err_1 < 0.3,
            msg="We should test here on easy dataset. Got error of %s" % err_1)
        values_1 = clf.ca.estimates[:]
        # some times retraining gets into deeper optimization ;-)
        eps = 0.05
        corrcoef_eps = 0.85  # just to get no failures... usually > 0.95

        def batch_test(retrain=True, retest=True, closer=True):
            err = np.asscalar(trerr(ds))
            err_re = np.asscalar(trerr_re(ds))
            corr = np.corrcoef(clf.ca.estimates, clf_re.ca.estimates)[0, 1]
            corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
            if __debug__:
                debug(
                    'TEST', "Retraining stats: errors %g %g corr %g "
                    "with old error %g corr %g" %
                    (err, err_re, corr, err_1, corr_old))
            self.assertTrue(clf_re.ca.retrained == retrain,
                            ("Must fully train",
                             "Must retrain instead of full training")[retrain])
            self.assertTrue(clf_re.ca.repredicted == retest,
                            ("Must fully test",
                             "Must retest instead of full testing")[retest])
            self.assertTrue(
                corr > corrcoef_eps,
                msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
            if closer:
                self.assertTrue(
                    corr >= corr_old,
                    msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

        # Check sequential retraining/retesting
        for i in range(3):
            flag = bool(i != 0)
            # ok - on 1st call we should train/test, then retrain/retest
            # and we can't compare for closeness to the old result since
            # we are working on the same data/classifier
            batch_test(retrain=flag, retest=flag, closer=False)

        # should retrain nicely if we change a parameter
        if 'C' in clf.params:
            clf.params.C *= 0.1
            clf_re.params.C *= 0.1
            batch_test()
        elif 'sigma_noise' in clf.params:
            clf.params.sigma_noise *= 100
            clf_re.params.sigma_noise *= 100
            batch_test()
        else:
            raise RuntimeError(
                'Please implement testing while changing some of the '
                'params for clf %s' % clf)

        # should retrain nicely if we change kernel parameter
        if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
            clf.kernel_params.gamma = 0.1
            clf_re.kernel_params.gamma = 0.1
            # retest is false since kernel got recomputed thus
            # can't expect to use the same kernel
            batch_test(retest=not ('gamma' in clf.kernel_params))

        # should retrain nicely if we change labels
        permute = AttributePermutator('targets', assure=True)
        oldlabels = dstrain.targets[:]
        dstrain = permute(dstrain)
        self.assertTrue(
            (oldlabels != dstrain.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        ds = vstack((dstrain, dstest))
        batch_test()

        # Change labels in testing
        oldlabels = dstest.targets[:]
        dstest = permute(dstest)
        self.assertTrue(
            (oldlabels != dstest.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        ds = vstack((dstrain, dstest))
        batch_test()

        # should re-train if we change data
        # reuse trained SVM and its 'final' optimization point
        if not clf.__class__.__name__ in [
                'GPR'
        ]:  # on GPR everything depends on the data ;-)
            oldsamples = dstrain.samples.copy()
            dstrain.samples[:] += dstrain.samples * 0.05
            self.assertTrue((oldsamples != dstrain.samples).any())
            ds = vstack((dstrain, dstest))
            batch_test(retest=False)
        clf.ca.reset_changed_temporarily()

        # test retrain()
        # TODO XXX  -- check validity
        clf_re.retrain(dstrain)
        self.assertTrue(clf_re.ca.retrained)
        clf_re.retrain(dstrain, labels=True)
        self.assertTrue(clf_re.ca.retrained)
        clf_re.retrain(dstrain, traindataset=True)
        self.assertTrue(clf_re.ca.retrained)

        # test repredict()
        clf_re.repredict(dstest.samples)
        self.assertTrue(clf_re.ca.repredicted)
        self.assertRaises(RuntimeError,
                          clf_re.repredict,
                          dstest.samples,
                          labels=True)
        """for now retesting with anything changed makes no sense"""
        clf_re._set_retrainable(False)
Example #18
    def train(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        Returns
        -------
        A list of trained Mappers matching the number of input datasets.
        """
        params = self.params  # for quicker access ;)
        ca = self.ca
        # Check to make sure we get a list of datasets as input.
        if not isinstance(datasets, (list, tuple, np.ndarray)):
            raise TypeError("Input datasets should be a sequence "
                            "(of type list, tuple, or ndarray) of datasets.")

        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]
        alpha = params.alpha

        residuals = None
        if ca['training_residual_errors'].enabled:
            residuals = np.zeros((1 + params.level2_niter, ndatasets))
            ca.training_residual_errors = Dataset(
                samples=residuals,
                sa={
                    'levels':
                    ['1'] + ['2:%i' % i for i in range(params.level2_niter)]
                })

        if __debug__:
            debug('HPAL',
                  "Hyperalignment %s for %i datasets" % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            # Making sure that ref_ds is within range.
            # Parameter() already checks for it being a non-negative integer
            if ref_ds >= ndatasets:
                raise ValueError("Requested reference dataset %i is out of "
                                 "bounds. We have only %i datasets provided"
                                 % (ref_ds, ndatasets))
        ca.chosen_ref_ds = ref_ds
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        # initial common space is the reference dataset
        commonspace = datasets[ref_ds].samples
        # the reference dataset might have been zscored already, don't do it
        # twice
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug(
                    'HPAL_', "Creating copy of a commonspace and assuring "
                    "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)
        # If there is only one dataset in training phase, there is nothing to be done
        # just use that data as the common space
        if len(datasets) < 2:
            self.commonspace = commonspace
        else:
            # create a mapper per dataset
            # might prefer some other way to initialize... later
            mappers = [deepcopy(params.alignment) for ds in datasets]

            #
            # Level 1 -- initial projection
            #
            lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                         mappers, residuals)
            #
            # Level 2 -- might iterate multiple times
            #
            # this is the final common space
            self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                            residuals)
        if params.output_dim is not None:
            mappers = self._level3(datasets)
            self._svd_mapper = SVDMapper()
            self._svd_mapper.train(self._map_and_mean(datasets, mappers))
            self._svd_mapper = StaticProjectionMapper(
                proj=self._svd_mapper.proj[:, :params.output_dim])
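
Note the ds.copy(deep=False) calls: they deliberately take shallow copies, i.e. a new dataset container whose samples may still alias the input. A toy illustration of the deep/shallow distinction (TinyDataset is hypothetical):

import copy
import numpy as np

class TinyDataset(object):
    def __init__(self, samples):
        self.samples = samples

    def copy(self, deep=True):
        new_samples = copy.deepcopy(self.samples) if deep else self.samples
        return TinyDataset(new_samples)

ds = TinyDataset(np.zeros((2, 2)))
assert ds.copy(deep=False).samples is ds.samples      # aliases the input
assert ds.copy(deep=True).samples is not ds.samples   # independent buffer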
Example #19
 def _get_transformer(self):
     if self._transformer is None:
         self._transformer = deepcopy(self._pristine_transformer)
     return self._transformer
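
The accessor above is a lazy copy-on-first-use: the pristine template stays untouched, and a private mutable copy is materialized only when first needed. A self-contained sketch:

import copy

class LazyTransformer(object):
    def __init__(self, pristine):
        self._pristine_transformer = pristine
        self._transformer = None

    def _get_transformer(self):
        if self._transformer is None:
            self._transformer = copy.deepcopy(self._pristine_transformer)
        return self._transformer

lt = LazyTransformer({'fitted': False})
lt._get_transformer()['fitted'] = True
assert lt._pristine_transformer == {'fitted': False}   # template preserved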
Example #20
    def __call__(self, datasets):
        """Estimate mappers for each dataset

        Parameters
        ----------
          datasets : list or tuple of datasets

        Returns
        -------
        A list of trained Mappers of the same length as datasets
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]

        residuals = None
        if ca['residual_errors'].enabled:
            residuals = np.zeros((2 + params.level2_niter, ndatasets))
            ca.residual_errors = Dataset(
                samples = residuals,
                sa = {'levels' :
                       ['1'] +
                       ['2:%i' % i for i in range(params.level2_niter)] +
                       ['3']})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            if ref_ds < 0 or ref_ds >= ndatasets:
                raise ValueError("Requested reference dataset %i is out of "
                                 "bounds. We have only %i datasets provided"
                                 % (ref_ds, ndatasets))
        ca.choosen_ref_ds = ref_ds
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # Level 1 (first)

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            # zscore them once while storing corresponding ZScoreMapper's
            zmappers = []
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmappers.append(zmapper)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        commonspace = np.asanyarray(datasets[ref_ds])
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug('HPAL_',
                      "Creating copy of a commonspace and assuring "
                      "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)

        data_mapped = [np.asanyarray(ds) for ds in datasets]
        #zscore(data_mapped[ref_ds],chunks_attr=None)
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 1: ds #%i" % i)
            if i == ref_ds:
                continue
            #ds_new = ds.copy()
            #zscore(ds_new, chunks_attr=None);
            ds_new.targets = commonspace
            m.train(ds_new)
            ds_ = m.forward(np.asanyarray(ds_new))
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            data_mapped[i] = ds_

            if residuals is not None:
                residuals[0, i] = np.linalg.norm(ds_ - commonspace)

            ## if ds_mapped == []:
            ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
            ## else:
            ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

            # zscore before adding
            # TODO: make just a function so we don't waste space
            commonspace = params.combiner1(data_mapped[i], commonspace)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)

        # update commonspace to mean of ds_mapped
        commonspace = params.combiner2(data_mapped)
        #if params.zscore_common:
        #zscore(commonspace, chunks_attr=None)
        # Level 2 -- might iterate multiple times
        for loop in range(params.level2_niter):
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 2 (%i-th iteration): ds #%i" % (loop, i))

                ds_temp = (commonspace*ndatasets - data_mapped[i])/(ndatasets-1)
                if params.zscore_common:
                    zscore(ds_temp, chunks_attr=None)
                #ds_new = ds.copy()
                #zscore(ds_new, chunks_attr=None)
                ds_new.targets = ds_temp #commonspace #PRJ ds_temp
                m.train(ds_new) # ds_temp)
                ds_ =  m.forward(np.asanyarray(ds_new))
                if params.zscore_common:
                    zscore(ds_, chunks_attr=None)
                data_mapped[i] = ds_
                if residuals is not None:
                    residuals[1+loop, i] = np.linalg.norm(ds_ - commonspace)

                #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

            commonspace = params.combiner2(data_mapped)
            #if params.zscore_common:
                #zscore(commonspace, chunks_attr=None)

        # Level 3 (last) to params.levels
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            #ds_new = ds.copy()     # shallow copy so we could assign new labels
            #zscore(ds_new, chunks_attr=None)
            ds_temp = (commonspace*ndatasets - data_mapped[i])/(ndatasets-1)
            if params.zscore_common:
                zscore(ds_temp, chunks_attr=None)
            ds_new.targets = ds_temp #commonspace #PRJ ds_temp#
            m.train(ds_new) #ds_temp)
            data_mapped[i] = m.forward(np.asanyarray(ds_new))
            if residuals is not None:
                residuals[-1, i] = np.linalg.norm(data_mapped[i] - commonspace)

        if params.zscore_all:
            # We need to construct new mappers which would chain
            # zscore and then final transformation
            return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
        else:
            return mappers
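
A small numeric check of the leave-one-out expression used in levels 2 and 3 above: given that commonspace is the mean of the mapped datasets, (commonspace*n - x_i)/(n-1) is the mean of all of them except the i-th.

import numpy as np

data_mapped = [np.random.normal(size=(4, 3)) for _ in range(5)]
commonspace = np.mean(data_mapped, axis=0)
n = len(data_mapped)
i = 2
loo = (commonspace * n - data_mapped[i]) / (n - 1)
others = np.mean([d for j, d in enumerate(data_mapped) if j != i], axis=0)
assert np.allclose(loo, others)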
Example #21
    def test_multivariate(self):
        mv_perf = []
        mv_lin_perf = []
        uv_perf = []

        l_clf = clfswh['linear', 'svm'][0]
        nl_clf = clfswh['non-linear', 'svm'][0]

        #orig_keys = nl_clf.param._params.keys()
        #nl_param_orig = nl_clf.param._params.copy()

        # l_clf = LinearNuSVMC()

        # XXX ??? not sure what below meant and it is obsolete if
        # using SG... commenting out for now
        # for some reason order is not preserved thus dictionaries are not
        # the same any longer -- lets compare values
        #self.assertEqual([nl_clf.param._params[k] for k in orig_keys],
        #                     [nl_param_orig[k] for k in orig_keys],
        #   msg="New instance mustn't override values in previously created")
        ## and keys separately
        #self.assertEqual(set(nl_clf.param._params.keys()),
        #                     set(orig_keys),
        #   msg="New instance doesn't change set of parameters in original")

        # We must be able to deepcopy not yet trained SVMs now
        import mvpa2.support.copy as copy
        try:
            nl_clf.untrain()
            nl_clf_copy = copy.deepcopy(nl_clf)
        except Exception:
            self.fail(msg="Failed to deepcopy not-yet trained SVM %s" % nl_clf)

        for i in range(20):
            train = pure_multivariate_signal( 20, 3 )
            test = pure_multivariate_signal( 20, 3 )

            # use non-linear CLF on 2d data
            nl_clf.train(train)
            p_mv = nl_clf.predict(test.samples)
            mv_perf.append(np.mean(p_mv==test.targets))

            # use linear CLF on 2d data
            l_clf.train(train)
            p_lin_mv = l_clf.predict(test.samples)
            mv_lin_perf.append(np.mean(p_lin_mv==test.targets))

            # use non-linear CLF on 1d data
            nl_clf.train(train[:, 0])
            p_uv = nl_clf.predict(test[:, 0].samples)
            uv_perf.append(np.mean(p_uv==test.targets))

        mean_mv_perf = np.mean(mv_perf)
        mean_mv_lin_perf = np.mean(mv_lin_perf)
        mean_uv_perf = np.mean(uv_perf)

        # non-linear CLF has to be close to perfect
        self.assertTrue( mean_mv_perf > 0.9 )
        # linear CLF cannot learn this problem!
        self.assertTrue( mean_mv_perf > mean_mv_lin_perf )
        # univariate has insufficient information
        self.assertTrue( mean_uv_perf < mean_mv_perf )
Example #22
    def __init__(self, **kwargs):
        """Init base class of SVMs. *Not to be publicly used*

        TODO: handling of parameters might migrate to be generic for
        all classifiers. SVMs are chosen to be testbase for that
        functionality to see how well it would fit.
        """

        # Check if requested implementation is known
        svm_impl = kwargs.get('svm_impl', None)
        if svm_impl not in self._KNOWN_IMPLEMENTATIONS:
            raise ValueError("Unknown SVM implementation '%s' is requested for %s. "
                             "Known are: %s" % (svm_impl, self.__class__,
                                                list(self._KNOWN_IMPLEMENTATIONS.keys())))
        self._svm_impl = svm_impl

        impl, add_params, add_internals, descr = \
              self._KNOWN_IMPLEMENTATIONS[svm_impl]

        # Add corresponding parameters to 'known' depending on the
        # implementation chosen
        if add_params is not None:
            self._KNOWN_PARAMS = \
                 self._KNOWN_PARAMS[:] + list(add_params)


        # Assign per-instance __tags__
        self.__tags__ = self.__tags__[:] + [svm_impl]

        # Add corresponding internals
        if add_internals is not None:
            self.__tags__ += list(add_internals)
        self.__tags__.append(svm_impl)

        k = kwargs.get('kernel', None)
        if k is None:
            kwargs['kernel'] = self.__default_kernel_class__()
        if 'linear' in ('%s'%kwargs['kernel']).lower(): # XXX not necessarily best
            self.__tags__ += [ 'linear', 'has_sensitivity' ]
        else:
            self.__tags__ += [ 'non-linear' ]

        # pop out all args from **kwargs which are known to be SVM parameters
        _args = {}
        for param in self._KNOWN_PARAMS + ['svm_impl']: # Update to remove kp's?
            if param in kwargs:
                _args[param] = kwargs.pop(param)

        try:
            Classifier.__init__(self, **kwargs)
            
        except TypeError as e:
            if "__init__() got an unexpected keyword argument " in e.args[0]:
                # TODO: make it even more specific -- if that argument is listed
                # within _SVM_PARAMS
                e.args = tuple([e.args[0] +
                                "\n Given SVM instance of class %s knows following parameters: %s" %
                                (self.__class__, self._KNOWN_PARAMS)] +
                               list(e.args)[1:])
            raise e

        # populate collections and add values from arguments
        for paramfamily, paramset in ( (self._KNOWN_PARAMS, self.params),):
            for paramname in paramfamily:
                if not (paramname in self._SVM_PARAMS):
                    raise ValueError("Unknown parameter %s" % paramname + \
                          ". Known SVM params are: %s" % list(self._SVM_PARAMS.keys()))
                param = deepcopy(self._SVM_PARAMS[paramname])
                if paramname in _args:
                    param.value = _args[paramname]
                    # XXX might want to set default to it -- not just value

                paramset[paramname] = param

        # TODO: Below commented out because kernel_type has been removed.  
        # Find way to set default C as necessary
        
        # tune up C if it has one and non-linear classifier is used
        #if self.params.has_key('C') and kernel_type != "linear" \
               #and self.params['C'].is_default:
            #if __debug__:
                #debug("SVM_", "Assigning default C value to be 1.0 for SVM "
                      #"%s with non-linear kernel" % self)
            #self.params['C'].default = 1.0

        # Some postchecks
        if 'weight' in self.params and 'weight_label' in self.params:
            if not len(self.params.weight_label) == len(self.params.weight):
                raise ValueError("Lenghts of 'weight' and 'weight_label' lists " \
                      "must be equal.")

            
        if __debug__:
            debug("SVM", "Initialized %s with kernel %s" % 
                  (self, self.params.kernel))
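
The parameter loop near the end relies on the same template idiom: each Parameter from the class-level _SVM_PARAMS table is deep-copied before a value is assigned, so one instance's settings never rewrite the shared defaults. A dict-based sketch (names hypothetical):

import copy

SVM_PARAMS = {'C': {'value': -1.0}, 'nu': {'value': 0.5}}   # shared defaults

def make_params(overrides):
    out = {}
    for name, template in SVM_PARAMS.items():
        param = copy.deepcopy(template)
        if name in overrides:
            param['value'] = overrides[name]
        out[name] = param
    return out

params = make_params({'C': 10.0})
assert params['C']['value'] == 10.0
assert SVM_PARAMS['C']['value'] == -1.0   # defaults untouched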
Example #23
class _SVM(Classifier):
    """Support Vector Machine Classifier.

    Base class for all external SVM implementations.
    """

    """
    Derived classes should define:

    * _KERNELS: map(dict) should define assignment to a tuple containing
      implementation kernel type, list of parameters adherent to the
      kernel, and sensitivity analyzer e.g.::

        _KERNELS = {
             'linear': (shogun.Kernel.LinearKernel, (), LinearSVMWeights),
             'rbf' :   (shogun.Kernel.GaussianKernel, ('gamma',), None),
             ...
             }

    * _KNOWN_IMPLEMENTATIONS: map(dict) should define assignment to a
      tuple containing implementation of the SVM, list of parameters
      adherent to the implementation, additional internals, and
      description e.g.::

        _KNOWN_IMPLEMENTATIONS = {
          'C_SVC' : (svm.svmc.C_SVC, ('C',),
                   ('binary', 'multiclass'), 'C-SVM classification'),
          ...
          }

    """

    
    _ATTRIBUTE_COLLECTIONS = ['params'] # enforce presence of params collections

    # Placeholder: map kernel names to sensitivity classes, ie
    # 'linear':LinearSVMWeights, for each backend
    _KNOWN_SENSITIVITIES={}
    kernel = Parameter(None,
                       # XXX: Currently, can't be ensured using constraints
                       # allowedtype=Kernel,
                       doc='Kernel object', index=-1)

    _SVM_PARAMS = {
        'C' : Parameter(-1.0,
                  doc='Trade-off parameter between width of the '
                      'margin and number of support vectors. Higher C -- '
                      'more rigid margin SVM. In linear kernel, negative '
                      'values provide automatic scaling of their value '
                      'according to the norm of the data'),
        'nu' : Parameter(0.5, min=0.0, max=1.0,
                  doc='Fraction of datapoints within the margin'),
        'cache_size': Parameter(100,
                  doc='Size of the kernel cache, specified in megabytes'),
        'tube_epsilon': Parameter(0.01,
                  doc='Epsilon in epsilon-insensitive loss function of '
                      'epsilon-SVM regression (SVR)'),
        'tau': Parameter(1e-6, doc='TAU parameter of KRR regression in shogun'),
        'probability': Parameter(0,
                  doc='Flag to signal whether a probability estimate is '
                      'obtained within LIBSVM'),
        'shrinking': Parameter(1, doc='Whether shrinking is to be conducted'),
        'weight_label': Parameter([], constraints=EnsureListOf(int),
                  doc='To be used in conjunction with weight for custom '
                      'per-label weight'),
        # TODO : merge them into a single dictionary
        'weight': Parameter([], constraints=EnsureListOf(float),
                  doc='Custom weights per label'),
        # For some reason setting up epsilon to 1e-5 slowed things down a bit
        # in comparison to how it was before (in yoh/master) by up to 20%... not clear why
        # may be related to 1e-3 default within _svm.py?
        'epsilon': Parameter(5e-5, min=1e-10,
                  doc='Tolerance of termination criteria. (For nu-SVM default is 0.001)')
        }

    _KNOWN_PARAMS = ()                  # just a placeholder to please lintian
    """Parameters which are specific to a given instantiation of SVM
    """

    __tags__ = [ 'svm', 'kernel-based', 'swig' ]

    def __init__(self, **kwargs):
        """Init base class of SVMs. *Not to be publicly used*

        TODO: handling of parameters might migrate to be generic for
        all classifiers. SVMs are chosen to be testbase for that
        functionality to see how well it would fit.
        """

        # Check if requested implementation is known
        svm_impl = kwargs.get('svm_impl', None)
        if svm_impl not in self._KNOWN_IMPLEMENTATIONS:
            raise ValueError(
                "Unknown SVM implementation '%s' is requested for %s. "
                "Known are: %s" % (svm_impl, self.__class__,
                                   list(self._KNOWN_IMPLEMENTATIONS.keys())))
        self._svm_impl = svm_impl

        impl, add_params, add_internals, descr = \
              self._KNOWN_IMPLEMENTATIONS[svm_impl]

        # Add corresponding parameters to 'known' depending on the
        # implementation chosen
        if add_params is not None:
            self._KNOWN_PARAMS = \
                 self._KNOWN_PARAMS[:] + list(add_params)


        # Assign per-instance __tags__
        self.__tags__ = self.__tags__[:] + [svm_impl]

        # Add corresponding internals
        if add_internals is not None:
            self.__tags__ += list(add_internals)
        self.__tags__.append(svm_impl)

        k = kwargs.get('kernel', None)
        if k is None:
            kwargs['kernel'] = self.__default_kernel_class__()
        if 'linear' in ('%s'%kwargs['kernel']).lower(): # XXX not necessarily best
            self.__tags__ += [ 'linear', 'has_sensitivity' ]
        else:
            self.__tags__ += [ 'non-linear' ]

        # pop out all args from **kwargs which are known to be SVM parameters
        _args = {}
        for param in self._KNOWN_PARAMS + ['svm_impl']: # Update to remove kp's?
            if param in kwargs:
                _args[param] = kwargs.pop(param)

        try:
            Classifier.__init__(self, **kwargs)
            
        except TypeError as e:
            if "__init__() got an unexpected keyword argument " in e.args[0]:
                # TODO: make it even more specific -- if that argument is listed
                # within _SVM_PARAMS
                e.args = tuple([e.args[0] +
                                "\n Given SVM instance of class %s knows following parameters: %s" %
                                (self.__class__, self._KNOWN_PARAMS)] +
                               list(e.args)[1:])
            raise e

        # populate collections and add values from arguments
        for paramfamily, paramset in ( (self._KNOWN_PARAMS, self.params),):
            for paramname in paramfamily:
                if not (paramname in self._SVM_PARAMS):
                    raise ValueError, "Unknown parameter %s" % paramname + \
                          ". Known SVM params are: %s" % self._SVM_PARAMS.keys()
                param = deepcopy(self._SVM_PARAMS[paramname])
                if paramname in _args:
                    param.value = _args[paramname]
                    # XXX might want to set default to it -- not just value

                paramset[paramname] = param

        # TODO: Below commented out because kernel_type has been removed.  
        # Find way to set default C as necessary
        
        # tune up C if it has one and non-linear classifier is used
        #if self.params.has_key('C') and kernel_type != "linear" \
               #and self.params['C'].is_default:
            #if __debug__:
                #debug("SVM_", "Assigning default C value to be 1.0 for SVM "
                      #"%s with non-linear kernel" % self)
            #self.params['C'].default = 1.0

        # Some postchecks
        if 'weight' in self.params and 'weight_label' in self.params:
            if len(self.params.weight_label) != len(self.params.weight):
                raise ValueError, "Lengths of 'weight' and 'weight_label' " \
                      "lists must be equal."

            
        if __debug__:
            debug("SVM", "Initialized %s with kernel %s" % 
                  (self, self.params.kernel))
Example #36
0
    def __call__(self, datasets):
        """Estimate mappers for each dataset

        Parameters
        ----------
        datasets : list or tuple of datasets

        Returns
        -------
        A list of trained Mappers of the same length as datasets
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]

        residuals = None
        if ca['residual_errors'].enabled:
            residuals = np.zeros((2 + params.level2_niter, ndatasets))
            ca.residual_errors = Dataset(
                samples = residuals,
                sa = {'levels' :
                       ['1'] +
                       ['2:%i' % i for i in xrange(params.level2_niter)] +
                       ['3']})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            if ref_ds < 0 or ref_ds >= ndatasets:
                raise ValueError, "Requested reference dataset %i is out of " \
                      "bounds. We have only %i datasets provided" \
                      % (ref_ds, ndatasets)
        ca.choosen_ref_ds = ref_ds
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # Level 1 (first)
        commonspace = np.asanyarray(datasets[ref_ds])
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)
        data_mapped = [np.asanyarray(ds) for ds in datasets]
        for i, (m, data) in enumerate(zip(mappers, data_mapped)):
            if __debug__:
                debug('HPAL_', "Level 1: ds #%i" % i)
            if i == ref_ds:
                continue
            #ZSC zscore(data, chunks_attr=None)
            ds = dataset_wizard(samples=data, targets=commonspace)
            #ZSC zscore(ds, chunks_attr=None)
            m.train(ds)
            data_temp = m.forward(data)
            #ZSC zscore(data_temp, chunks_attr=None)
            data_mapped[i] = data_temp

            if residuals is not None:
                residuals[0, i] = np.linalg.norm(data_temp - commonspace)

            ## if ds_mapped == []:
            ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
            ## else:
            ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

            # zscore before adding
            # TODO: make just a function so we don't waste space
            commonspace = params.combiner1(data_mapped[i], commonspace)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)

        # update commonspace to mean of ds_mapped
        commonspace = params.combiner2(data_mapped)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

        # Level 2 -- might iterate multiple times
        for loop in xrange(params.level2_niter):
            for i, (m, ds) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 2 (%i-th iteration): ds #%i" % (loop, i))

                ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
                ##                   /(ndatasets-1), chunks_attr=None )
                ds_new = ds.copy()
                #ZSC zscore(ds_new, chunks_attr=None)
                #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
                #ZSC zscore(ds_temp, chunks_attr=None)
                ds_new.targets = commonspace #PRJ ds_temp
                m.train(ds_new) # ds_temp)
                data_mapped[i] = m.forward(np.asanyarray(ds))
                if residuals is not None:
                    residuals[1+loop, i] = np.linalg.norm(data_mapped[i] - commonspace)

                #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

            commonspace = params.combiner2(data_mapped)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)

        # Level 3 (last) -- train each mapper against the final common space
        for i, (m, ds) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
            ##                   /(ndatasets-1), chunks_attr=None )
            ds_new = ds.copy()     # shallow copy so we could assign new labels
            #ZSC zscore(ds_new, chunks_attr=None)
            #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
            #ZSC zscore(ds_temp, chunks_attr=None)
            ds_new.targets = commonspace #PRJ ds_temp#
            m.train(ds_new) #ds_temp)

            if residuals is not None:
                data_mapped = m.forward(ds_new)
                residuals[-1, i] = np.linalg.norm(data_mapped - commonspace)

        return mappers
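
The three levels above hinge on two combiner callables: `combiner1` folds each newly aligned dataset into the evolving common space during level 1, while `combiner2` recomputes the common space from all mapped data at levels 2 and 3. A plain-numpy sketch with mean-style combiners (the actual defaults are an assumption here):

import numpy as np

# Assumed mean-like combiners; the real ones come from params.
combiner1 = lambda mapped, common: 0.5 * (mapped + common)    # running average
combiner2 = lambda mapped_list: np.mean(mapped_list, axis=0)  # grand mean

commonspace = np.random.normal(size=(40, 20))   # reference dataset samples
mapped = np.random.normal(size=(40, 20))        # another dataset, aligned
commonspace = combiner1(mapped, commonspace)    # level-1 style update
commonspace = combiner2([mapped, commonspace])  # level-2/3 style recompute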
Example #37
0
def timesegments_classification(dss,
                                hyper=None,
                                part1=HalfPartitioner(),
                                part2=NFoldPartitioner(attr='subjects'),
                                window_size=6,
                                overlapping_windows=True,
                                distance='correlation',
                                do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
       Datasets to benchmark on.  Usually a single dataset per subject.
    hyper : Hyperalignment-like, optional
       Beast which, if called on a list of datasets, should spit out trained
       mappers.  If not specified, `IdentityMapper`s will be used
    part1 : Partitioner, optional
       Partitioner to split data for hyperalignment "cross-validation"
    part2 : Partitioner, optional
       Partitioner for CV within the hyperalignment test split
    window_size : int, optional
       How many temporal points to consider for a classification sample
    overlapping_windows : bool, optional
       Strategy for how to create and classify "samples".  If True --
       `window_size` consecutive time points starting at each time point
       (except the trailing ones) constitute a sample, and upon "predict"
       the `window_size` samples around each test point are not considered.
       If False -- samples are simply taken (for both training and testing
       splits) at `window_size` steps from one another.
    do_zscore : bool, optional
       Perform zscoring (overall, not per-chunk) for each dataset upon
       partitioning with part1
    ...
    """
    # Generate outermost partitioning
    parts = [copy.deepcopy(part1).generate(ds) for ds in dss]

    iter = 1
    errors = []

    while True:
        try:
            dss_partitioned = [p.next() for p in parts]
        except StopIteration:
            # we are done -- no more partitions
            break
        if __debug__:
            debug("BM", "Iteration %d", iter)

        dss_train, dss_test = zip(*[
            list(Splitter("partitions").generate(ds)) for ds in dss_partitioned
        ])

        # TODO:  allow for doing feature selection

        if do_zscore:
            for ds in dss_train + dss_test:
                zscore(ds, chunks_attr=None)

        if hyper is not None:
            # Do hyperalignment on a copy in each loop iteration, since
            # otherwise it would remember the previous loop's dataset as
            # the "commonspace"
            hyper_ = copy.deepcopy(hyper)
            mappers = hyper_(dss_train)
        else:
            mappers = [IdentityMapper() for ds in dss_train]

        dss_test_aligned = [
            mapper.forward(ds) for mapper, ds in zip(mappers, dss_test)
        ]

        # assign .sa.subjects to those datasets
        for i, ds in enumerate(dss_test_aligned):
            # part2.attr is by default "subjects"
            ds.sa[part2.attr] = [i]

        dss_test_bc = []
        for ds in dss_test_aligned:
            if overlapping_windows:
                startpoints = range(len(ds) - window_size + 1)
            else:
                startpoints = _get_nonoverlapping_startpoints(
                    len(ds), window_size)
            bm = BoxcarMapper(startpoints, window_size)
            bm.train(ds)
            ds_ = bm.forward(ds)
            ds_.sa['startpoints'] = startpoints

            # reassign subjects so they are not arrays
            def assign_unique(ds, sa):
                ds.sa[sa] = [
                    np.asscalar(np.unique(x)) for x in ds.sa[sa].value
                ]

            assign_unique(ds_, part2.attr)

            fm = FlattenMapper()
            fm.train(ds_)
            dss_test_bc.append(ds_.get_mapped(fm))

        ds_test = vstack(dss_test_bc)
        # Perform classification across subjects comparing against mean
        # spatio-temporal pattern of other subjects
        errors_across_subjects = []
        for ds_test_part in part2.generate(ds_test):
            ds_train_, ds_test_ = list(
                Splitter("partitions").generate(ds_test_part))
            # average across subjects to get a representative pattern per timepoint
            ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
            assert (ds_train_.shape == ds_test_.shape)

            if distance == 'correlation':
                # TODO: redo more efficiently since now we are creating full
                # corrcoef matrix.  Also we might better just take a name for
                # the pdist measure but then implement them efficiently
                # (i.e. without hstacking both pieces together first)
                dist = 1 - np.corrcoef(
                    ds_train_, ds_test_)[len(ds_test_):, :len(ds_test_)]
            else:
                raise NotImplementedError

            if overlapping_windows:
                dist = wipe_out_offdiag(dist, window_size)

            winners = np.argmin(dist, axis=1)
            error = np.mean(winners != np.arange(len(winners)))
            errors_across_subjects.append(error)
        errors.append(errors_across_subjects)
        iter += 1

    errors = np.array(errors)
    if __debug__:
        debug(
            "BM", "Finished with %s array of errors. Mean error %.2f" %
            (errors.shape, np.mean(errors)))
    return errors
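
An invocation sketch for this benchmark; dataset construction is elided and the `Hyperalignment` import path is an assumption:

# Sketch only: dss is a list of per-subject datasets with matching timepoints.
from mvpa2.algorithms.hyperalignment import Hyperalignment

errors = timesegments_classification(
    dss,
    hyper=Hyperalignment(),   # or None to benchmark anatomical alignment only
    window_size=6,
    overlapping_windows=True)
# one row per part1 fold, one column per subject
print(errors.shape, errors.mean())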
Example #38
0
    def test_proper_state(self):
        proper   = TestClassProper()
        proper2  = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

        # disable_ca should override anything in enable_ca
        proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

        self.assertEqual(len(proper3.ca.enabled), 0,
            msg="disable_ca should override anything in enable_ca")

        proper.ca.state2 = 1000
        value = proper.ca.state2
        self.assertEqual(proper.ca.state2, 1000, msg="Simple assignment/retrieval")

        proper.ca.disable('state2')
        proper.ca.state2 = 10000
        self.assertEqual(proper.ca.state2, 1000, msg="Simple assignment after being disabled")

        proper4 = copy.deepcopy(proper)

        proper.ca.reset('state2')
        self.assertRaises(UnknownStateError, proper.ca.__getattribute__, 'state2')
        """Must be blank after being reset"""

        self.assertEqual(proper4.ca.state2, 1000,
            msg="Simple assignment after being reset in original instance")


        proper.ca.enable(['state2'])
        self.assertEqual(set(proper.ca.keys()), set(['state1', 'state2']))
        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
            # skip testing since all ca are on now
            return
        self.assertTrue(proper.ca.enabled == ['state2'])

        self.assertTrue(set(proper2.ca.enabled) == set(['state1']))

        self.assertRaises(AttributeError, proper.__getattribute__, 'state12')

        # check if the documentation on the state is appropriate
        self.assertEqual(proper2.ca.listing,
                             ['%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
                              '%sstate2%s: state2 doc' % (_def_sep, _def_sep)])

        # if __str__ lists correct number of ca
        str_ = str(proper2)
        self.assertTrue(str_.find('2 ca:') != -1)

        # check if disable works
        self.assertEqual(set(proper2.ca.enabled), set(['state1']))

        proper2.ca.disable("all")
        self.assertEqual(set(proper2.ca.enabled), set())

        proper2.ca.enable("all")
        self.assertEqual(len(proper2.ca.enabled), 2)

        proper2.ca.state1, proper2.ca.state2 = 1,2
        self.assertEqual(proper2.ca.state1, 1)
        self.assertEqual(proper2.ca.state2, 2)

        # now reset them
        proper2.ca.reset('all')
        self.assertRaises(UnknownStateError, proper2.ca.__getattribute__, 'state1')
        self.assertRaises(UnknownStateError, proper2.ca.__getattribute__, 'state2')
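
The enable/disable/reset semantics this test exercises, restated compactly (`TestClassProper` is the test fixture above):

# Compact restatement of the sequence under test.
obj = TestClassProper()
obj.ca.state2 = 1000       # plain assignment/retrieval
obj.ca.disable('state2')
obj.ca.state2 = 10000      # ignored while disabled -- value stays 1000
obj.ca.enable(['state2'])
obj.ca.reset('state2')     # reading it now raises UnknownStateError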
Example #39
0
def timesegments_classification(
        dss,
        hyper=None,
        part1=HalfPartitioner(),
        part2=NFoldPartitioner(attr='subjects'),
        window_size=6,
        overlapping_windows=True,
        distance='correlation',
        do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
       Datasets to benchmark on.  Usually a single dataset per subject.
    hyper : Hyperalignment-like, optional
       Beast which, if called on a list of datasets, should spit out trained
       mappers.  If not specified, `IdentityMapper`s will be used
    part1 : Partitioner, optional
       Partitioner to split data for hyperalignment "cross-validation"
    part2 : Partitioner, optional
       Partitioner for CV within the hyperalignment test split
    window_size : int, optional
       How many temporal points to consider for a classification sample
    overlapping_windows : bool, optional
       Strategy for how to create and classify "samples".  If True --
       `window_size` consecutive time points starting at each time point
       (except the trailing ones) constitute a sample, and upon "predict"
       the `window_size` samples around each test point are not considered.
       If False -- samples are simply taken (for both training and testing
       splits) at `window_size` steps from one another.
    do_zscore : bool, optional
       Perform zscoring (overall, not per-chunk) for each dataset upon
       partitioning with part1
    ...
    """
    # Generate outermost partitioning
    parts = [copy.deepcopy(part1).generate(ds) for ds in dss]

    iter = 1
    errors = []

    while True:
        try:
            dss_partitioned = [p.next() for p in parts]
        except StopIteration:
            # we are done -- no more partitions
            break
        if __debug__:
            debug("BM", "Iteration %d", iter)

        dss_train, dss_test = zip(*[list(Splitter("partitions").generate(ds))
                                    for ds in dss_partitioned])

        # TODO:  allow for doing feature selection

        if do_zscore:
            for ds in dss_train + dss_test:
                zscore(ds, chunks_attr=None)

        if hyper is not None:
            # Do hyperalignment on a copy in each loop iteration, since
            # otherwise it would remember the previous loop's dataset as
            # the "commonspace"
            hyper_ = copy.deepcopy(hyper)
            mappers = hyper_(dss_train)
        else:
            mappers = [IdentityMapper() for ds in dss_train]

        dss_test_aligned = [mapper.forward(ds) for mapper, ds in zip(mappers, dss_test)]

        # assign .sa.subjects to those datasets
        for i, ds in enumerate(dss_test_aligned):
            # part2.attr is by default "subjects"
            ds.sa[part2.attr] = [i]

        dss_test_bc = []
        for ds in dss_test_aligned:
            if overlapping_windows:
                startpoints = range(len(ds) - window_size + 1)
            else:
                startpoints = _get_nonoverlapping_startpoints(len(ds), window_size)
            bm = BoxcarMapper(startpoints, window_size)
            bm.train(ds)
            ds_ = bm.forward(ds)
            ds_.sa['startpoints'] = startpoints
            # reassign subjects so they are not arrays
            def assign_unique(ds, sa):
                ds.sa[sa] = [np.asscalar(np.unique(x)) for x in ds.sa[sa].value]
            assign_unique(ds_, part2.attr)

            fm = FlattenMapper()
            fm.train(ds_)
            dss_test_bc.append(ds_.get_mapped(fm))

        ds_test = vstack(dss_test_bc)
        # Perform classification across subjects comparing against mean
        # spatio-temporal pattern of other subjects
        errors_across_subjects = []
        for ds_test_part in part2.generate(ds_test):
            ds_train_, ds_test_ = list(Splitter("partitions").generate(ds_test_part))
            # average across subjects to get a representative pattern per timepoint
            ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
            assert(ds_train_.shape == ds_test_.shape)

            if distance == 'correlation':
                # TODO: redo more efficiently since now we are creating full
                # corrcoef matrix.  Also we might better just take a name for
                # the pdist measure but then implement them efficiently
                # (i.e. without hstacking both pieces together first)
                dist = 1 - np.corrcoef(ds_train_, ds_test_)[len(ds_test_):, :len(ds_test_)]
            else:
                raise NotImplementedError

            if overlapping_windows:
                dist = wipe_out_offdiag(dist, window_size)

            winners = np.argmin(dist, axis=1)
            error = np.mean(winners != np.arange(len(winners)))
            errors_across_subjects.append(error)
        errors.append(errors_across_subjects)
        iter += 1

    errors = np.array(errors)
    if __debug__:
        debug("BM", "Finished with %s array of errors. Mean error %.2f"
              % (errors.shape, np.mean(errors)))
    return errors
Example #40
0
    def train(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        Returns
        -------
        None
          The derived common space is stored in `self.commonspace`;
          trained mappers are obtained by subsequently calling the trained
          instance on the datasets.
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]
        alpha = params.alpha
        
        residuals = None
        if ca['training_residual_errors'].enabled:
            residuals = np.zeros((1 + params.level2_niter, ndatasets))
            ca.training_residual_errors = Dataset(
                samples = residuals,
                sa = {'levels' :
                       ['1'] +
                       ['2:%i' % i for i in xrange(params.level2_niter)]})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            if ref_ds < 0 or ref_ds >= ndatasets:
                raise ValueError, "Requested reference dataset %i is out of " \
                      "bounds. We have only %i datasets provided" \
                      % (ref_ds, ndatasets)
        ca.choosen_ref_ds = ref_ds
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            for ids in xrange(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        # initial common space is the reference dataset
        commonspace = datasets[ref_ds].samples
        # the reference dataset might have been zscored already, don't do it
        # twice
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug('HPAL_',
                      "Creating copy of a commonspace and assuring "
                      "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)

        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers,
                                     residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                        residuals)
Example #41
0
    def _level3(self, datasets):
        params = self.params  # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key different from level-2; the common space is uniform
        #temp_commonspace = commonspace
        # Fixing nproc=0
        if params.nproc == 0:
            from mvpa2.base import warning
            warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
            params.nproc = 1
        # Checking for joblib, if not, set nproc to 1
        if params.nproc != 1:
            from mvpa2.base import externals, warning
            if not externals.exists('joblib'):
                warning(
                    "Setting nproc different from 1 requires joblib package, which "
                    "does not seem to exist. Setting nproc to 1.")
                params.nproc = 1

        # start from original input datasets again
        if params.nproc == 1:
            residuals = []
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 3: ds #%i" % i)
                m, residual = get_trained_mapper(
                    ds_new, self.commonspace, m,
                    self.ca['residual_errors'].enabled)
                if self.ca['residual_errors'].enabled:
                    residuals.append(residual)
        else:
            if __debug__:
                debug('HPAL_',
                      "Level 3: Using joblib with nproc = %d " % params.nproc)
            verbose_level_parallel = 20 \
                if (__debug__ and 'HPAL' in debug.active) else 0
            from joblib import Parallel, delayed
            import sys
            # joblib's 'multiprocessing' backend has known issues of failure on OSX
            # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
            if params.joblib_backend is None:
                params.joblib_backend = 'threading' if sys.platform == 'darwin' \
                                        else 'multiprocessing'
            res = Parallel(n_jobs=params.nproc,
                           pre_dispatch=params.nproc,
                           backend=params.joblib_backend,
                           verbose=verbose_level_parallel)(
                               delayed(get_trained_mapper)(
                                   ds, self.commonspace, mapper,
                                   self.ca['residual_errors'].enabled)
                               for ds, mapper in zip(datasets, mappers))
            mappers = [m for m, r in res]
            if self.ca['residual_errors'].enabled:
                residuals = [r for m, r in res]

        if self.ca['residual_errors'].enabled:
            self.ca.residual_errors = Dataset(
                samples=np.array(residuals)[None, :])

        return mappers
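
The parallel branch is a straightforward joblib `Parallel`/`delayed` fan-out. A self-contained illustration of the same backend-selection logic, on a toy workload:

import sys
from joblib import Parallel, delayed

def square(x):
    return x * x

# Mirror the snippet's platform choice: joblib's 'multiprocessing' backend
# has known failures on OSX, so fall back to 'threading' there.
backend = 'threading' if sys.platform == 'darwin' else 'multiprocessing'
results = Parallel(n_jobs=2, backend=backend)(
    delayed(square)(x) for x in range(8))
print(results)   # [0, 1, 4, 9, 16, 25, 36, 49]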
Example #42
0
    def test_retrainables(self, clf):
        # XXX we agreed to not worry about this for the initial 0.6 release
        raise SkipTest
        # we need a copy since will tune its internals later on
        clf = clf.clone()
        clf.ca.change_temporarily(enable_ca = ['estimates'],
                                      # ensure that it does do predictions
                                      # while training
                                      disable_ca=['training_stats'])
        clf_re = clf.clone()
        # TODO: .retrainable must have a callback to call smth like
        # _set_retrainable
        clf_re._set_retrainable(True)

        # need to have high snr so we don't 'cope' with problematic
        # datasets since otherwise unittests would fail.
        dsargs = {'perlabel':50, 'nlabels':2, 'nfeatures':5, 'nchunks':1,
                  'nonbogus_features':[2,4], 'snr': 5.0}

        ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # NB datasets will be changed by the end of testing, so if
        # are to change to use generic datasets - make sure to copy
        # them here
        ds = deepcopy(datasets['uni2large'])
        clf.untrain()
        clf_re.untrain()
        trerr = TransferMeasure(clf, Splitter('train'),
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        trerr_re =  TransferMeasure(clf_re, Splitter('train'),
                                    disable_ca=['training_stats'],
                                    postproc=BinaryFxNode(mean_mismatch_error,
                                                          'targets'))

        # Just check for correctness of retraining
        err_1 = np.asscalar(trerr(ds))
        self.assertTrue(err_1<0.3,
            msg="We should test here on easy dataset. Got error of %s" % err_1)
        values_1 = clf.ca.estimates[:]
        # sometimes retraining gets into deeper optimization ;-)
        eps = 0.05
        corrcoef_eps = 0.85        # just to get no failures... usually > 0.95


        def batch_test(retrain=True, retest=True, closer=True):
            err = np.asscalar(trerr(ds))
            err_re = np.asscalar(trerr_re(ds))
            corr = np.corrcoef(
                clf.ca.estimates, clf_re.ca.estimates)[0, 1]
            corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
            if __debug__:
                debug('TEST', "Retraining stats: errors %g %g corr %g "
                      "with old error %g corr %g" %
                  (err, err_re, corr, err_1, corr_old))
            self.assertTrue(clf_re.ca.retrained == retrain,
                            ("Must fully train",
                             "Must retrain instead of full training")[retrain])
            self.assertTrue(clf_re.ca.repredicted == retest,
                            ("Must fully test",
                             "Must retest instead of full testing")[retest])
            self.assertTrue(corr > corrcoef_eps,
              msg="Result must be close to the one without retraining."
                  " Got corrcoef=%s" % (corr))
            if closer:
                self.assertTrue(
                    corr >= corr_old,
                    msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

        # Check sequential retraining/retesting
        for i in xrange(3):
            flag = bool(i!=0)
            # ok - on 1st call we should train/test, then retrain/retest
            # and we can't compare for closeness to old result since
            # we are working on the same data/classifier
            batch_test(retrain=flag, retest=flag, closer=False)

        # should retrain nicely if we change a parameter
        if 'C' in clf.params:
            clf.params.C *= 0.1
            clf_re.params.C *= 0.1
            batch_test()
        elif 'sigma_noise' in clf.params:
            clf.params.sigma_noise *= 100
            clf_re.params.sigma_noise *= 100
            batch_test()
        else:
            raise RuntimeError, \
                  'Please implement testing while changing some of the ' \
                  'params for clf %s' % clf

        # should retrain nicely if we change kernel parameter
        if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
            clf.kernel_params.gamma = 0.1
            clf_re.kernel_params.gamma = 0.1
            # retest is false since kernel got recomputed thus
            # can't expect to use the same kernel
            batch_test(retest=not('gamma' in clf.kernel_params))

        # should retrain nicely if we change labels
        permute = AttributePermutator('targets', assure=True)
        oldlabels = dstrain.targets[:]
        dstrain = permute(dstrain)
        self.assertTrue((oldlabels != dstrain.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        ds = vstack((dstrain, dstest))
        batch_test()

        # Change labels in testing
        oldlabels = dstest.targets[:]
        dstest = permute(dstest)
        self.assertTrue((oldlabels != dstest.targets).any(),
            msg="We should succeed at permutting -- now got the same targets")
        ds = vstack((dstrain, dstest))
        batch_test()

        # should re-train if we change data
        # reuse trained SVM and its 'final' optimization point
        if clf.__class__.__name__ not in ['GPR']: # on GPR everything depends on the data ;-)
            oldsamples = dstrain.samples.copy()
            dstrain.samples[:] += dstrain.samples*0.05
            self.assertTrue((oldsamples != dstrain.samples).any())
            ds = vstack((dstrain, dstest))
            batch_test(retest=False)
        clf.ca.reset_changed_temporarily()

        # test retrain()
        # TODO XXX  -- check validity
        clf_re.retrain(dstrain)
        self.assertTrue(clf_re.ca.retrained)
        clf_re.retrain(dstrain, labels=True)
        self.assertTrue(clf_re.ca.retrained)
        clf_re.retrain(dstrain, traindataset=True)
        self.assertTrue(clf_re.ca.retrained)

        # test repredict()
        clf_re.repredict(dstest.samples)
        self.assertTrue(clf_re.ca.repredicted)
        self.assertRaises(RuntimeError, clf_re.repredict,
                              dstest.samples, labels=True)
        """for now retesting with anything changed makes no sense"""
        clf_re._set_retrainable(False)
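
The retraining API the test exercises, reduced to its core calls; classifier and dataset names here are placeholders:

# Sketch only: clf is any retrainable classifier, ds_train/ds_test datasets.
clf._set_retrainable(True)
clf.train(ds_train)
# ... after changing only the labels of ds_train ...
clf.retrain(ds_train, labels=True)   # cheaper than a full train()
clf.repredict(ds_test.samples)       # reuses cached prediction state
clf._set_retrainable(False)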
Example #43
0
    def test_multivariate(self):
        mv_perf = []
        mv_lin_perf = []
        uv_perf = []

        l_clf = clfswh['linear', 'svm'][0]
        nl_clf = clfswh['non-linear', 'svm'][0]

        #orig_keys = nl_clf.param._params.keys()
        #nl_param_orig = nl_clf.param._params.copy()

        # l_clf = LinearNuSVMC()

        # XXX ??? not sure what below meant and it is obsolete if
        # using SG... commenting out for now
        # for some reason order is not preserved thus dictionaries are not
        # the same any longer -- lets compare values
        #self.assertEqual([nl_clf.param._params[k] for k in orig_keys],
        #                     [nl_param_orig[k] for k in orig_keys],
        #   msg="New instance mustn't override values in previously created")
        ## and keys separately
        #self.assertEqual(set(nl_clf.param._params.keys()),
        #                     set(orig_keys),
        #   msg="New instance doesn't change set of parameters in original")

        # We must be able to deepcopy not yet trained SVMs now
        import mvpa2.support.copy as copy
        try:
            nl_clf.untrain()
            nl_clf_copy_ = copy.copy(nl_clf)
            nl_clf_copy = copy.deepcopy(nl_clf)
        except:
            self.fail(msg="Failed to deepcopy not-yet trained SVM %s" % nl_clf)

        for i in range(20):
            train = pure_multivariate_signal(20, 3)
            test = pure_multivariate_signal(20, 3)

            # use non-linear CLF on 2d data
            nl_clf.train(train)
            p_mv = nl_clf.predict(test.samples)
            mv_perf.append(np.mean(p_mv == test.targets))

            # use linear CLF on 2d data
            l_clf.train(train)
            p_lin_mv = l_clf.predict(test.samples)
            mv_lin_perf.append(np.mean(p_lin_mv == test.targets))

            # use non-linear CLF on 1d data
            nl_clf.train(train[:, 0])
            p_uv = nl_clf.predict(test[:, 0].samples)
            uv_perf.append(np.mean(p_uv == test.targets))

        mean_mv_perf = np.mean(mv_perf)
        mean_mv_lin_perf = np.mean(mv_lin_perf)
        mean_uv_perf = np.mean(uv_perf)

        # non-linear CLF has to be close to perfect
        self.assertTrue(mean_mv_perf > 0.9)
        # linear CLF cannot learn this problem!
        self.assertTrue(mean_mv_perf > mean_mv_lin_perf)
        # univariate has insufficient information
        self.assertTrue(mean_uv_perf < mean_mv_perf)
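
Why the linear classifier is expected to lose here: `pure_multivariate_signal` produces XOR-like structure in which class membership depends on the features jointly, so no single feature and no single hyperplane carries the signal. A plain-numpy illustration of such data (not the actual generator):

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=(200, 2))
# XOR-style labels: determined by the sign of the feature *product*
y = (x[:, 0] * x[:, 1] > 0).astype(int)
for col in range(2):
    # each feature alone is ~uncorrelated with the label
    print(np.corrcoef(x[:, col], y)[0, 1])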
Example #44
0
    def test_proper_state(self):
        proper   = TestClassProper()
        proper2  = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

        # disable_ca should override anything in enable_ca
        proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

        self.assertEqual(len(proper3.ca.enabled), 0,
            msg="disable_ca should override anything in enable_ca")

        proper.ca.state2 = 1000
        value = proper.ca.state2
        self.assertEqual(proper.ca.state2, 1000, msg="Simple assignment/retrieval")

        proper.ca.disable('state2')
        proper.ca.state2 = 10000
        self.assertEqual(proper.ca.state2, 1000, msg="Simple assignment after being disabled")

        proper4 = copy.deepcopy(proper)

        proper.ca.reset('state2')
        self.assertRaises(UnknownStateError, proper.ca.__getattribute__, 'state2')
        """Must be blank after being reset"""

        self.assertEqual(proper4.ca.state2, 1000,
            msg="Simple assignment after being reset in original instance")


        proper.ca.enable(['state2'])
        self.assertEqual(set(proper.ca.keys()), set(['state1', 'state2']))
        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
            # skip testing since all ca are on now
            return
        self.assertTrue(proper.ca.enabled == ['state2'])

        self.assertTrue(set(proper2.ca.enabled) == set(['state1']))

        self.assertRaises(AttributeError, proper.__getattribute__, 'state12')

        # check if the documentation on the state is appropriate
        self.assertEqual(proper2.ca.listing,
                             ['%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
                              '%sstate2%s: state2 doc' % (_def_sep, _def_sep)])

        # if __str__ lists correct number of ca
        str_ = str(proper2)
        self.assertTrue(str_.find('2 ca:') != -1)

        # check if disable works
        self.assertEqual(set(proper2.ca.enabled), set(['state1']))

        proper2.ca.disable("all")
        self.assertEqual(set(proper2.ca.enabled), set())

        proper2.ca.enable("all")
        self.assertEqual(len(proper2.ca.enabled), 2)

        proper2.ca.state1, proper2.ca.state2 = 1,2
        self.assertEqual(proper2.ca.state1, 1)
        self.assertEqual(proper2.ca.state2, 2)

        # now reset them
        proper2.ca.reset('all')
        self.assertRaises(UnknownStateError, proper2.ca.__getattribute__, 'state1')
        self.assertRaises(UnknownStateError, proper2.ca.__getattribute__, 'state2')