def test_all_equal():
    # all these values are supposed to be different from each other
    # but equal to themselves
    a = np.random.normal(size=(10, 10)) + 1000.
    b = np.zeros((10, 10))
    c = np.zeros(10)
    d = np.zeros(11)
    e = 0
    f = None
    g = True
    h = ''
    i = 'a'
    j = dict(bummer=np.arange(5))

    values = [a, b, c, d, e, f, g, h, i, j]
    for ii, v in enumerate(values):
        for jj, w in enumerate(values):
            # make deepcopy so == operator cannot cheat by checking id()
            assert_equal(all_equal(copy.deepcopy(v), copy.deepcopy(w)),
                         ii == jj,
                         msg='cmp(%s, %s)' % (type(v), type(w)))

    # ensure that this function behaves like the
    # standard python '==' comparator for singulars
    singulars = [0, None, True, False, '', 1, 'a']
    for v in singulars:
        for w in singulars:
            assert_equal(all_equal(v, w), v == w)
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20, -1).T,
            targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
def clone(self):
    """Create full copy of the classifier.

    It might require classifier to be untrained first due to
    present SWIG bindings.

    TODO: think about proper re-implementation, without enrollment of deepcopy
    """
    if __debug__:
        debug("CLF", "Cloning %s%s", (self, _strid(self)))
    try:
        return deepcopy(self)
    except:
        self.untrain()
        return deepcopy(self)
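# A minimal, self-contained sketch of the same clone-with-fallback pattern as
# clone() above, outside of any classifier hierarchy. `SwigBacked`, `_handle`
# and `untrain` here are hypothetical stand-ins for an object holding state
# (e.g. SWIG-bound resources) that deepcopy cannot handle while "trained".
import copy

class SwigBacked(object):
    def __init__(self):
        self._handle = None             # placeholder for an uncopyable resource

    def train(self):
        self._handle = (x for x in ())  # generators cannot be deep-copied

    def untrain(self):
        self._handle = None             # drop the uncopyable state

    def clone(self):
        try:
            return copy.deepcopy(self)
        except Exception:
            # fall back: discard trained state, then copy the bare object
            self.untrain()
            return copy.deepcopy(self)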
def _level3(self, datasets):
    params = self.params            # for quicker access ;)
    # create a mapper per dataset
    mappers = [deepcopy(params.alignment) for ds in datasets]

    # key different from level-2; the common space is uniform
    #temp_commonspace = commonspace

    residuals = None
    if self.ca['residual_errors'].enabled:
        residuals = np.zeros((1, len(datasets)))
        self.ca.residual_errors = Dataset(samples=residuals)

    # start from original input datasets again
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)

        # retrain mapper on final common space
        ds_new.sa[m.get_space()] = self.commonspace
        m.train(ds_new)
        # remove common space attribute again to save on memory
        del ds_new.sa[m.get_space()]

        if residuals is not None:
            # obtain final projection
            data_mapped = m.forward(ds_new.samples)
            residuals[0, i] = np.linalg.norm(data_mapped - self.commonspace)

    return mappers
def _forward_data(self, data):
    if self.__chunks_attr is not None:
        raise RuntimeError(
            "%s cannot do chunk-wise Z-scoring of plain data "
            "since it has to be parameterized with chunks_attr." % self)
    if self.__param_est is not None:
        raise RuntimeError("%s cannot do Z-scoring with estimating "
                           "parameters on some attributes of plain "
                           "data." % self)

    params = self.__params_dict
    if params is None:
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    # mappers should not modify the input data
    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(data.dtype, np.integer):
        if self._secret_inplace_zscore:
            raise TypeError(
                "Cannot perform inplace z-scoring since data is of integer "
                "type. Please convert to float before calling zscore")
        mdata = data.astype(self.__dtype)
    elif self._secret_inplace_zscore:
        mdata = data
    else:
        # do not call .copy() directly, since it might not be an array
        mdata = copy.deepcopy(data)

    self._zscore(mdata, *params['__all__'])
    return mdata
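# A small standalone demonstration of why _forward_data() refuses integer
# input for in-place z-scoring: in-place arithmetic on an integer ndarray
# does not upcast, so subtracting the float mean either truncates silently
# (very old numpy) or raises a casting error (modern numpy). Casting up
# front, as the mapper does via astype(), sidesteps both.
import numpy as np

data = np.arange(10)                  # integer dtype
if np.issubdtype(data.dtype, np.integer):
    data = data.astype(float)         # upcast before any in-place ops
data -= data.mean()                   # safe now: result stays float
data /= data.std()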
def _level3(self, datasets):
    params = self.params            # for quicker access ;)
    # create a mapper per dataset
    mappers = [deepcopy(params.alignment) for ds in datasets]

    # key different from level-2; the common space is uniform
    #temp_commonspace = commonspace
    # Fixing nproc=0
    if params.nproc == 0:
        from mvpa2.base import warning
        warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
        params.nproc = 1
    # Checking for joblib, if not, set nproc to 1
    if params.nproc != 1:
        from mvpa2.base import externals, warning
        if not externals.exists('joblib'):
            warning("Setting nproc different from 1 requires joblib package, "
                    "which does not seem to exist. Setting nproc to 1.")
            params.nproc = 1

    # start from original input datasets again
    if params.nproc == 1:
        residuals = []
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)
            m, residual = get_trained_mapper(
                ds_new, self.commonspace, m,
                self.ca['residual_errors'].enabled)
            if self.ca['residual_errors'].enabled:
                residuals.append(residual)
    else:
        if __debug__:
            debug('HPAL_',
                  "Level 3: Using joblib with nproc = %d " % params.nproc)
        verbose_level_parallel = 20 \
            if (__debug__ and 'HPAL' in debug.active) else 0
        from joblib import Parallel, delayed
        import sys
        # joblib's 'multiprocessing' backend has known issues of failure on OSX
        # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
        if params.joblib_backend is None:
            params.joblib_backend = 'threading' if sys.platform == 'darwin' \
                                    else 'multiprocessing'
        res = Parallel(
            n_jobs=params.nproc,
            pre_dispatch=params.nproc,
            backend=params.joblib_backend,
            verbose=verbose_level_parallel
        )(
            delayed(get_trained_mapper)
            (ds, self.commonspace, mapper, self.ca['residual_errors'].enabled)
            for ds, mapper in zip(datasets, mappers)
        )
        mappers = [m for m, r in res]
        if self.ca['residual_errors'].enabled:
            residuals = [r for m, r in res]
    if self.ca['residual_errors'].enabled:
        self.ca.residual_errors = Dataset(
            samples=np.array(residuals)[None, :])
    return mappers
def setUp(self):
    self.backup = []
    # paranoid check
    self.cfgstr = str(cfg)
    # clean up externals cfg for proper testing
    if cfg.has_section('externals'):
        self.backup = copy.deepcopy(cfg.items('externals'))
        cfg.remove_section('externals')
def test_deep_copying_state_variable(self):
    for v in (True, False):
        sv = ConditionalAttribute(enabled=v, doc="Testing")
        sv.enabled = not v
        sv_dc = copy.deepcopy(sv)
        if not (__debug__ and "ENFORCE_CA_ENABLED" in debug.active):
            self.assertEqual(sv.enabled, sv_dc.enabled)
        self.assertEqual(sv.name, sv_dc.name)
        self.assertEqual(sv._instance_index, sv_dc._instance_index)
def select_samples(self, selection):
    """Return new ColumnData with selected samples"""
    data = copy.deepcopy(self)
    for k, v in data.iteritems():
        data[k] = [v[x] for x in selection]
    data._check()
    return data
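# Standalone sketch of the deepcopy-then-subset pattern used by
# select_samples() above, on a plain dict of columns (ColumnData itself is
# dict-like). `select_rows` and `table` are illustrative names only.
import copy

def select_rows(table, selection):
    """Return an independent copy of `table` keeping only `selection` rows."""
    out = copy.deepcopy(table)        # ensure row lists are not shared
    for k, v in out.items():
        out[k] = [v[x] for x in selection]
    return out

table = {'subj': ['s1', 's2', 's3'], 'rt': [0.51, 0.43, 0.62]}
print(select_rows(table, [0, 2]))     # {'subj': ['s1', 's3'], 'rt': [0.51, 0.62]}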
def test_id_hash(self, pair):
    a, b = pair
    a1 = deepcopy(a)
    a_1 = idhash(a)
    self.assertTrue(a_1 == idhash(a), msg="Must be of the same idhash")
    self.assertTrue(a_1 != idhash(b), msg="Must be of different idhash")
    if isinstance(a, np.ndarray):
        self.assertTrue(a_1 != idhash(a.T),
                        msg=".T must be of different idhash")
    if not isinstance(a, tuple):
        self.assertTrue(a_1 != idhash(a1), msg="Must be of different idhash")
        a[2] += 1
        a_2 = idhash(a)
        self.assertTrue(a_1 != a_2, msg="Idhash must change")
    else:
        a_2 = a_1
    a = a[2:]
    a_3 = idhash(a)
    self.assertTrue(a_2 != a_3, msg="Idhash must change after slicing")
def is_sorted(items):
    """Check if listed items are in sorted order.

    Parameters
    ----------
    items : iterable container

    Returns
    -------
    `True` if the items were sorted, otherwise `False`.
    """
    items_sorted = deepcopy(items)
    items_sorted.sort()
    equality = items_sorted == items
    # XXX yarik forgotten analog to isiterable
    if hasattr(equality, '__iter__'):
        equality = np.all(equality)
    return equality
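# Behavior sketch for is_sorted(), assuming it and numpy (as np) are in
# scope. The deepcopy keeps the caller's container untouched while the copy
# is sorted in place for the comparison; when `items` is a numpy array the
# element-wise comparison result is collapsed with np.all().
print(is_sorted([1, 2, 3]))            # True
print(is_sorted([3, 1, 2]))            # False
print(is_sorted(np.array([1, 2, 3]))) # True (via the np.all() branch)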
def __new__(cls, *args, **kwargs):
    """Instantiate ClassWithCollections object
    """
    self = super(ClassWithCollections, cls).__new__(cls)

    s__dict__ = self.__dict__
    s__class__ = self.__class__

    # init variable
    # XXX: Added as pylint complained (rightfully) -- not sure if false
    #      is the proper default
    self.__params_set = False

    # need to check to avoid override of enabled ca in the case
    # of multiple inheritance, like both ClassWithCollections and
    # Harvestable (note: Harvestable was refactored away)
    if '_collections' not in s__dict__:
        collections = copy.deepcopy(s__class__._collections_template)
        s__dict__['_collections'] = collections
        s__dict__['_known_attribs'] = {}
        """Dictionary to contain 'links' to the collections from each
        known attribute. Is used to gain some speed up in lookup within
        __getattribute__ and __setattr__
        """

        # Assign owner to all collections
        for col, collection in collections.iteritems():
            if col in s__dict__:
                raise ValueError, \
                      "Object %s already has attribute %s" % (self, col)
            s__dict__[col] = collection
            collection.name = col

        self.__params_set = False

    if __debug__:
        descr = kwargs.get('descr', None)
        debug("COL", "ClassWithCollections.__new__ was done "
              "for %s%s with descr=%s",
              (s__class__.__name__, _strid(self), descr))

    return self
def test_generic_tests(self):
    """Test all classifiers for conformant behavior
    """
    for clf_, traindata in \
            [(clfswh['binary'], datasets['dumb2']),
             (clfswh['multiclass'], datasets['dumb'])]:
        traindata_copy = deepcopy(traindata)  # full copy of dataset
        for clf in clf_:
            clf.train(traindata)
            self.assertTrue(
                (traindata.samples == traindata_copy.samples).all(),
                "Training of a classifier shouldn't change original dataset")

            # TODO: enforce uniform return from predict??
            #predicted = clf.predict(traindata.samples)
            #self.assertTrue(isinstance(predicted, np.ndarray))

            # Just simple test that all of them are syntaxed correctly
            self.assertTrue(str(clf) != "")
            self.assertTrue(repr(clf) != "")
def test_more_svd(self):
    pm = SVDMapper()
    # train SVD
    pm.train(self.largefeat)

    # mixing matrix cannot be square
    self.assertEqual(pm.proj.shape, (40, 10))

    # only first singular value significant
    self.assertTrue(pm.sv[:1] > 10)
    self.assertTrue((pm.sv[1:] < 10).all())

    # now project data into SVD space
    p = pm.forward(self.largefeat)

    # only variance of first component significant
    var = p.var(axis=0)

    # test that only one component has variance
    self.assertTrue(var[:1] > 1.0)
    self.assertTrue((var[1:] < 0.0001).all())

    # check that the mapped data can be fully recovered by 'reverse()'
    rp = pm.reverse(p)
    self.assertEqual(rp.shape, self.largefeat.shape)
    self.assertTrue((np.round(rp) == self.largefeat).all())

    # copy mapper
    pm2 = deepcopy(pm)

    # now make new random data and do forward->reverse check
    data = np.random.normal(size=(98, 40))
    data_f = pm.forward(data)
    self.assertEqual(data_f.shape, (98, 10))
    data_r = pm.reverse(data_f)
    self.assertEqual(data_r.shape, (98, 40))
def test_retrainables(self, clf):
    # XXX we agreed to not worry about this for the initial 0.6 release
    raise SkipTest
    # we need a copy since will tune its internals later on
    clf = clf.clone()
    clf.ca.change_temporarily(enable_ca=['estimates'],
                              # ensure that it does do predictions
                              # while training
                              disable_ca=['training_stats'])
    clf_re = clf.clone()
    # TODO: .retrainable must have a callback to call smth like
    # _set_retrainable
    clf_re._set_retrainable(True)

    # need to have high snr so we don't 'cope' with problematic
    # datasets since otherwise unittests would fail.
    dsargs = {'perlabel': 50, 'nlabels': 2, 'nfeatures': 5,
              'nchunks': 1, 'nonbogus_features': [2, 4], 'snr': 5.0}

    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # NB datasets will be changed by the end of testing, so if
    # are to change to use generic datasets - make sure to copy
    # them here
    ds = deepcopy(datasets['uni2large'])
    clf.untrain()
    clf_re.untrain()
    trerr = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    trerr_re = TransferMeasure(clf_re, Splitter('train'),
                               disable_ca=['training_stats'],
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

    # Just check for correctness of retraining
    err_1 = np.asscalar(trerr(ds))
    self.assertTrue(
        err_1 < 0.3,
        msg="We should test here on easy dataset. Got error of %s" % err_1)
    values_1 = clf.ca.estimates[:]
    # some times retraining gets into deeper optimization ;-)
    eps = 0.05
    corrcoef_eps = 0.85         # just to get no failures... usually > 0.95

    def batch_test(retrain=True, retest=True, closer=True):
        err = np.asscalar(trerr(ds))
        err_re = np.asscalar(trerr_re(ds))
        corr = np.corrcoef(clf.ca.estimates, clf_re.ca.estimates)[0, 1]
        corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
        if __debug__:
            debug('TEST', "Retraining stats: errors %g %g corr %g "
                          "with old error %g corr %g"
                          % (err, err_re, corr, err_1, corr_old))
        self.assertTrue(clf_re.ca.retrained == retrain,
                        ("Must fully train",
                         "Must retrain instead of full training")[retrain])
        self.assertTrue(clf_re.ca.repredicted == retest,
                        ("Must fully test",
                         "Must retest instead of full testing")[retest])
        self.assertTrue(
            corr > corrcoef_eps,
            msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
        if closer:
            self.assertTrue(
                corr >= corr_old,
                msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

    # Check sequential retraining/retesting
    for i in xrange(3):
        flag = bool(i != 0)
        # ok - on 1st call we should train/test, then retrain/retest
        # and we can't compare for closeness to old result since
        # we are working on the same data/classifier
        batch_test(retrain=flag, retest=flag, closer=False)

    # should retrain nicely if we change a parameter
    if 'C' in clf.params:
        clf.params.C *= 0.1
        clf_re.params.C *= 0.1
        batch_test()
    elif 'sigma_noise' in clf.params:
        clf.params.sigma_noise *= 100
        clf_re.params.sigma_noise *= 100
        batch_test()
    else:
        raise RuntimeError, \
              'Please implement testing while changing some of the ' \
              'params for clf %s' % clf

    # should retrain nicely if we change kernel parameter
    if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
        clf.kernel_params.gamma = 0.1
        clf_re.kernel_params.gamma = 0.1
        # retest is false since kernel got recomputed thus
        # can't expect to use the same kernel
        batch_test(retest=not ('gamma' in clf.kernel_params))

    # should retrain nicely if we change labels
    permute = AttributePermutator('targets', assure=True)
    oldlabels = dstrain.targets[:]
    dstrain = permute(dstrain)
    self.assertTrue(
        (oldlabels != dstrain.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # Change labels in testing
    oldlabels = dstest.targets[:]
    dstest = permute(dstest)
    self.assertTrue(
        (oldlabels != dstest.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # should re-train if we change data
    # reuse trained SVM and its 'final' optimization point
    if not clf.__class__.__name__ in ['GPR']:
        # on GPR everything depends on the data ;-)
        oldsamples = dstrain.samples.copy()
        dstrain.samples[:] += dstrain.samples * 0.05
        self.assertTrue((oldsamples != dstrain.samples).any())
        ds = vstack((dstrain, dstest))
        batch_test(retest=False)
    clf.ca.reset_changed_temporarily()

    # test retrain()
    # TODO XXX -- check validity
    clf_re.retrain(dstrain)
    self.assertTrue(clf_re.ca.retrained)
    clf_re.retrain(dstrain, labels=True)
    self.assertTrue(clf_re.ca.retrained)
    clf_re.retrain(dstrain, traindataset=True)
    self.assertTrue(clf_re.ca.retrained)

    # test repredict()
    clf_re.repredict(dstest.samples)
    self.assertTrue(clf_re.ca.repredicted)
    self.assertRaises(RuntimeError, clf_re.repredict,
                      dstest.samples, labels=True)
    """for now retesting with anything changed makes no sense"""
    clf_re._set_retrainable(False)
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    # Check to make sure we get a list of datasets as input.
    if not isinstance(datasets, (list, tuple, np.ndarray)):
        raise TypeError("Input datasets should be a sequence "
                        "(of type list, tuple, or ndarray) of datasets.")

    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i
                           for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
        # Making sure that ref_ds is within range.
        # Parameter() already checks for it being a non-negative integer
        if ref_ds >= ndatasets:
            raise ValueError("Requested reference dataset %i is out of "
                             "bounds. We have only %i datasets provided"
                             % (ref_ds, ndatasets))
    ca.chosen_ref_ds = ref_ds
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)
    # If there is only one dataset in training phase, there is nothing
    # to be done; just use that data as the common space
    if len(datasets) < 2:
        self.commonspace = commonspace
    else:
        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                     mappers, residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata,
                                        mappers, residuals)
    if params.output_dim is not None:
        mappers = self._level3(datasets)
        self._svd_mapper = SVDMapper()
        self._svd_mapper.train(self._map_and_mean(datasets, mappers))
        self._svd_mapper = StaticProjectionMapper(
            proj=self._svd_mapper.proj[:, :params.output_dim])
def _get_transformer(self):
    if self._transformer is None:
        self._transformer = deepcopy(self._pristine_transformer)
    return self._transformer
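# Minimal sketch of the pattern _get_transformer() implements: keep a
# pristine template object and hand out a deepcopy on first use, so that
# later mutations of the working copy never touch the template. Class and
# attribute names here are illustrative, not from the surrounding codebase.
from copy import deepcopy

class LazyClone(object):
    def __init__(self, pristine):
        self._pristine = pristine      # template; never mutated
        self._working = None           # materialized on demand

    def get(self):
        if self._working is None:
            self._working = deepcopy(self._pristine)
        return self._working

    def reset(self):
        # drop the working copy; the next get() re-clones the template
        self._working = None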
def __call__(self, datasets):
    """Estimate mappers for each dataset

    Parameters
    ----------
    datasets : list or tuple of datasets

    Returns
    -------
    A list of trained Mappers of the same length as datasets
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]

    residuals = None
    if ca['residual_errors'].enabled:
        residuals = np.zeros((2 + params.level2_niter, ndatasets))
        ca.residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i
                           for i in xrange(params.level2_niter)] +
                          ['3']})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError, "Requested reference dataset %i is out of " \
              "bounds. We have only %i datasets provided" \
              % (ref_ds, ndatasets)
    ca.choosen_ref_ds = ref_ds
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # Level 1 (first)

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        # zscore them once while storing corresponding ZScoreMapper's
        zmappers = []
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmappers.append(zmapper)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    commonspace = np.asanyarray(datasets[ref_ds])
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    data_mapped = [np.asanyarray(ds) for ds in datasets]
    #zscore(data_mapped[ref_ds], chunks_attr=None)
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        #ds_new = ds.copy()
        #zscore(ds_new, chunks_attr=None)
        ds_new.targets = commonspace
        m.train(ds_new)
        ds_ = m.forward(np.asanyarray(ds_new))
        if params.zscore_common:
            zscore(ds_, chunks_attr=None)
        data_mapped[i] = ds_

        if residuals is not None:
            residuals[0, i] = np.linalg.norm(ds_ - commonspace)

        ## if ds_mapped == []:
        ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
        ## else:
        ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

        # zscore before adding
        # TODO: make just a function so we dont' waste space
        commonspace = params.combiner1(data_mapped[i], commonspace)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # update commonspace to mean of ds_mapped
    commonspace = params.combiner2(data_mapped)
    #if params.zscore_common:
    #zscore(commonspace, chunks_attr=None)

    # Level 2 -- might iterate multiple times
    for loop in xrange(params.level2_niter):
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                               % (loop, i))
            ds_temp = (commonspace * ndatasets - data_mapped[i]) \
                      / (ndatasets - 1)
            if params.zscore_common:
                zscore(ds_temp, chunks_attr=None)
            #ds_new = ds.copy()
            #zscore(ds_new, chunks_attr=None)
            ds_new.targets = ds_temp    #commonspace #PRJ ds_temp
            m.train(ds_new)             # ds_temp)
            ds_ = m.forward(np.asanyarray(ds_new))
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            data_mapped[i] = ds_
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(ds_ - commonspace)
            #ds_mapped[i] = zscore(m.forward(ds_temp), chunks_attr=None)
        commonspace = params.combiner2(data_mapped)
        #if params.zscore_common:
        #zscore(commonspace, chunks_attr=None)

    # Level 3 (last) to params.levels
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)
        #ds_new = ds.copy()  # shallow copy so we could assign new labels
        #zscore(ds_new, chunks_attr=None)
        ds_temp = (commonspace * ndatasets - data_mapped[i]) \
                  / (ndatasets - 1)
        if params.zscore_common:
            zscore(ds_temp, chunks_attr=None)
        ds_new.targets = ds_temp    #commonspace #PRJ ds_temp#
        m.train(ds_new)             #ds_temp)
        data_mapped[i] = m.forward(np.asanyarray(ds_new))
        if residuals is not None:
            residuals[-1, i] = np.linalg.norm(data_mapped[i] - commonspace)

    if params.zscore_all:
        # We need to construct new mappers which would chain
        # zscore and then final transformation
        return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
    else:
        return mappers
def test_multivariate(self):
    mv_perf = []
    mv_lin_perf = []
    uv_perf = []

    l_clf = clfswh['linear', 'svm'][0]
    nl_clf = clfswh['non-linear', 'svm'][0]
    #orig_keys = nl_clf.param._params.keys()
    #nl_param_orig = nl_clf.param._params.copy()
    # l_clf = LinearNuSVMC()

    # XXX ??? not sure what below meant and it is obsolete if
    # using SG... commenting out for now
    # for some reason order is not preserved thus dictionaries are not
    # the same any longer -- lets compare values
    #self.assertEqual([nl_clf.param._params[k] for k in orig_keys],
    #                 [nl_param_orig[k] for k in orig_keys],
    #                 msg="New instance mustn't override values in previously created")
    ## and keys separately
    #self.assertEqual(set(nl_clf.param._params.keys()),
    #                 set(orig_keys),
    #                 msg="New instance doesn't change set of parameters in original")

    # We must be able to deepcopy not yet trained SVMs now
    import mvpa2.support.copy as copy
    try:
        nl_clf.untrain()
        nl_clf_copy = copy.deepcopy(nl_clf)
    except:
        self.fail(msg="Failed to deepcopy not-yet trained SVM %s" % nl_clf)

    for i in xrange(20):
        train = pure_multivariate_signal(20, 3)
        test = pure_multivariate_signal(20, 3)

        # use non-linear CLF on 2d data
        nl_clf.train(train)
        p_mv = nl_clf.predict(test.samples)
        mv_perf.append(np.mean(p_mv == test.targets))

        # use linear CLF on 2d data
        l_clf.train(train)
        p_lin_mv = l_clf.predict(test.samples)
        mv_lin_perf.append(np.mean(p_lin_mv == test.targets))

        # use non-linear CLF on 1d data
        nl_clf.train(train[:, 0])
        p_uv = nl_clf.predict(test[:, 0].samples)
        uv_perf.append(np.mean(p_uv == test.targets))

    mean_mv_perf = np.mean(mv_perf)
    mean_mv_lin_perf = np.mean(mv_lin_perf)
    mean_uv_perf = np.mean(uv_perf)

    # non-linear CLF has to be close to perfect
    self.assertTrue(mean_mv_perf > 0.9)
    # linear CLF cannot learn this problem!
    self.assertTrue(mean_mv_perf > mean_mv_lin_perf)
    # univariate has insufficient information
    self.assertTrue(mean_uv_perf < mean_mv_perf)
def __init__(self, **kwargs):
    """Init base class of SVMs. *Not to be publicly used*

    TODO: handling of parameters might migrate to be generic for all
    classifiers. SVMs are chosen to be testbase for that functionality
    to see how well it would fit.
    """
    # Check if requested implementation is known
    svm_impl = kwargs.get('svm_impl', None)
    if not svm_impl in self._KNOWN_IMPLEMENTATIONS:
        raise ValueError("Unknown SVM implementation '%s' is requested for %s. "
                         "Known are: %s"
                         % (svm_impl, self.__class__,
                            list(self._KNOWN_IMPLEMENTATIONS.keys())))
    self._svm_impl = svm_impl

    impl, add_params, add_internals, descr = \
        self._KNOWN_IMPLEMENTATIONS[svm_impl]

    # Add corresponding parameters to 'known' depending on the
    # implementation chosen
    if add_params is not None:
        self._KNOWN_PARAMS = \
            self._KNOWN_PARAMS[:] + list(add_params)

    # Assign per-instance __tags__
    self.__tags__ = self.__tags__[:] + [svm_impl]

    # Add corresponding internals
    if add_internals is not None:
        self.__tags__ += list(add_internals)
    self.__tags__.append(svm_impl)

    k = kwargs.get('kernel', None)
    if k is None:
        kwargs['kernel'] = self.__default_kernel_class__()
    if 'linear' in ('%s' % kwargs['kernel']).lower():  # XXX not necessarily best
        self.__tags__ += ['linear', 'has_sensitivity']
    else:
        self.__tags__ += ['non-linear']

    # pop out all args from **kwargs which are known to be SVM parameters
    _args = {}
    for param in self._KNOWN_PARAMS + ['svm_impl']:  # Update to remove kp's?
        if param in kwargs:
            _args[param] = kwargs.pop(param)

    try:
        Classifier.__init__(self, **kwargs)
    except TypeError as e:
        if "__init__() got an unexpected keyword argument " in e.args[0]:
            # TODO: make it even more specific -- if that argument is listed
            # within _SVM_PARAMS
            e.args = tuple(
                [e.args[0] +
                 "\n Given SVM instance of class %s knows following parameters: %s"
                 % (self.__class__, self._KNOWN_PARAMS)] + list(e.args)[1:])
        raise e

    # populate collections and add values from arguments
    for paramfamily, paramset in ((self._KNOWN_PARAMS, self.params),):
        for paramname in paramfamily:
            if not (paramname in self._SVM_PARAMS):
                raise ValueError("Unknown parameter %s" % paramname +
                                 ". Known SVM params are: %s"
                                 % list(self._SVM_PARAMS.keys()))
            param = deepcopy(self._SVM_PARAMS[paramname])
            if paramname in _args:
                param.value = _args[paramname]
                # XXX might want to set default to it -- not just value
            paramset[paramname] = param

    # TODO: Below commented out because kernel_type has been removed.
    # Find way to set default C as necessary
    # tune up C if it has one and non-linear classifier is used
    #if self.params.has_key('C') and kernel_type != "linear" \
    #       and self.params['C'].is_default:
    #    if __debug__:
    #        debug("SVM_", "Assigning default C value to be 1.0 for SVM "
    #              "%s with non-linear kernel" % self)
    #    self.params['C'].default = 1.0

    # Some postchecks
    if 'weight' in self.params and 'weight_label' in self.params:
        if not len(self.params.weight_label) == len(self.params.weight):
            raise ValueError("Lengths of 'weight' and 'weight_label' lists "
                             "must be equal.")

    if __debug__:
        debug("SVM", "Initialized %s with kernel %s"
              % (self, self.params.kernel))
class _SVM(Classifier):
    """Support Vector Machine Classifier.

    Base class for all external SVM implementations.
    """

    """
    Derived classes should define:

    * _KERNELS: map(dict) should define assignment to a tuple containing
      implementation kernel type, list of parameters adherent to the
      kernel, and sensitivity analyzer e.g.::

        _KERNELS = {
             'linear': (shogun.Kernel.LinearKernel, (), LinearSVMWeights),
             'rbf' :   (shogun.Kernel.GaussianKernel, ('gamma',), None),
             ...
             }

    * _KNOWN_IMPLEMENTATIONS: map(dict) should define assignment to a
      tuple containing implementation of the SVM, list of parameters
      adherent to the implementation, additional internals, and
      description e.g.::

        _KNOWN_IMPLEMENTATIONS = {
          'C_SVC' : (svm.svmc.C_SVC, ('C',),
                    ('binary', 'multiclass'), 'C-SVM classification'),
          ...
          }
    """

    _ATTRIBUTE_COLLECTIONS = ['params']  # enforce presence of params collections

    # Placeholder: map kernel names to sensitivity classes, ie
    # 'linear':LinearSVMWeights, for each backend
    _KNOWN_SENSITIVITIES = {}

    kernel = Parameter(None,
                       # XXX: Currently, can't be ensured using constraints
                       # allowedtype=Kernel,
                       doc='Kernel object', index=-1)

    _SVM_PARAMS = {
        'C': Parameter(-1.0,
                  doc='Trade-off parameter between width of the '
                      'margin and number of support vectors. Higher C -- '
                      'more rigid margin SVM. In linear kernel, negative '
                      'values provide automatic scaling of their value '
                      'according to the norm of the data'),
        'nu': Parameter(0.5, min=0.0, max=1.0,
                  doc='Fraction of datapoints within the margin'),
        'cache_size': Parameter(100,
                  doc='Size of the kernel cache, specified in megabytes'),
        'tube_epsilon': Parameter(0.01,
                  doc='Epsilon in epsilon-insensitive loss function of '
                      'epsilon-SVM regression (SVR)'),
        'tau': Parameter(1e-6,
                  doc='TAU parameter of KRR regression in shogun'),
        'probability': Parameter(0,
                  doc='Flag to signal either probability estimate is '
                      'obtained within LIBSVM'),
        'shrinking': Parameter(1,
                  doc='Either shrinking is to be conducted'),
        'weight_label': Parameter([], constraints=EnsureListOf(int),
                  doc='To be used in conjunction with weight for custom '
                      'per-label weight'),
        # TODO : merge them into a single dictionary
        'weight': Parameter([], constraints=EnsureListOf(float),
                  doc='Custom weights per label'),
        # For some reason setting up epsilon to 1e-5 slowed things down a bit
        # in comparison to how it was before (in yoh/master) by up to 20%... not clear why
        # may be related to 1e-3 default within _svm.py?
        'epsilon': Parameter(5e-5, min=1e-10,
                  doc='Tolerance of termination criteria. '
                      '(For nu-SVM default is 0.001)')
        }

    _KNOWN_PARAMS = ()          # just a placeholder to please lintian
    """Parameters which are specific to a given instantiation of SVM
    """

    __tags__ = ['svm', 'kernel-based', 'swig']

    def __init__(self, **kwargs):
        """Init base class of SVMs. *Not to be publicly used*

        TODO: handling of parameters might migrate to be generic for all
        classifiers. SVMs are chosen to be testbase for that functionality
        to see how well it would fit.
        """
        # Check if requested implementation is known
        svm_impl = kwargs.get('svm_impl', None)
        if not svm_impl in self._KNOWN_IMPLEMENTATIONS:
            raise ValueError, \
                  "Unknown SVM implementation '%s' is requested for %s. " \
                  "Known are: %s" % (svm_impl, self.__class__,
                                     self._KNOWN_IMPLEMENTATIONS.keys())
        self._svm_impl = svm_impl

        impl, add_params, add_internals, descr = \
            self._KNOWN_IMPLEMENTATIONS[svm_impl]

        # Add corresponding parameters to 'known' depending on the
        # implementation chosen
        if add_params is not None:
            self._KNOWN_PARAMS = \
                self._KNOWN_PARAMS[:] + list(add_params)

        # Assign per-instance __tags__
        self.__tags__ = self.__tags__[:] + [svm_impl]

        # Add corresponding internals
        if add_internals is not None:
            self.__tags__ += list(add_internals)
        self.__tags__.append(svm_impl)

        k = kwargs.get('kernel', None)
        if k is None:
            kwargs['kernel'] = self.__default_kernel_class__()
        if 'linear' in ('%s' % kwargs['kernel']).lower():  # XXX not necessarily best
            self.__tags__ += ['linear', 'has_sensitivity']
        else:
            self.__tags__ += ['non-linear']

        # pop out all args from **kwargs which are known to be SVM parameters
        _args = {}
        for param in self._KNOWN_PARAMS + ['svm_impl']:  # Update to remove kp's?
            if param in kwargs:
                _args[param] = kwargs.pop(param)

        try:
            Classifier.__init__(self, **kwargs)
        except TypeError, e:
            if "__init__() got an unexpected keyword argument " in e.args[0]:
                # TODO: make it even more specific -- if that argument is listed
                # within _SVM_PARAMS
                e.args = tuple(
                    [e.args[0] +
                     "\n Given SVM instance of class %s knows following parameters: %s"
                     % (self.__class__, self._KNOWN_PARAMS)] + list(e.args)[1:])
            raise e

        # populate collections and add values from arguments
        for paramfamily, paramset in ((self._KNOWN_PARAMS, self.params),):
            for paramname in paramfamily:
                if not (paramname in self._SVM_PARAMS):
                    raise ValueError, "Unknown parameter %s" % paramname + \
                          ". Known SVM params are: %s" % self._SVM_PARAMS.keys()
                param = deepcopy(self._SVM_PARAMS[paramname])
                if paramname in _args:
                    param.value = _args[paramname]
                    # XXX might want to set default to it -- not just value
                paramset[paramname] = param

        # TODO: Below commented out because kernel_type has been removed.
        # Find way to set default C as necessary
        # tune up C if it has one and non-linear classifier is used
        #if self.params.has_key('C') and kernel_type != "linear" \
        #       and self.params['C'].is_default:
        #    if __debug__:
        #        debug("SVM_", "Assigning default C value to be 1.0 for SVM "
        #              "%s with non-linear kernel" % self)
        #    self.params['C'].default = 1.0

        # Some postchecks
        if 'weight' in self.params and 'weight_label' in self.params:
            if not len(self.params.weight_label) == len(self.params.weight):
                raise ValueError, \
                      "Lengths of 'weight' and 'weight_label' lists " \
                      "must be equal."

        if __debug__:
            debug("SVM", "Initialized %s with kernel %s"
                  % (self, self.params.kernel))
def __call__(self, datasets):
    """Estimate mappers for each dataset

    Parameters
    ----------
    datasets : list or tuple of datasets

    Returns
    -------
    A list of trained Mappers of the same length as datasets
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]

    residuals = None
    if ca['residual_errors'].enabled:
        residuals = np.zeros((2 + params.level2_niter, ndatasets))
        ca.residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i
                           for i in xrange(params.level2_niter)] +
                          ['3']})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError, "Requested reference dataset %i is out of " \
              "bounds. We have only %i datasets provided" \
              % (ref_ds, ndatasets)
    ca.choosen_ref_ds = ref_ds
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # Level 1 (first)
    commonspace = np.asanyarray(datasets[ref_ds])
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)
    data_mapped = [np.asanyarray(ds) for ds in datasets]
    for i, (m, data) in enumerate(zip(mappers, data_mapped)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        #ZSC zscore(data, chunks_attr=None)
        ds = dataset_wizard(samples=data, targets=commonspace)
        #ZSC zscore(ds, chunks_attr=None)
        m.train(ds)
        data_temp = m.forward(data)
        #ZSC zscore(data_temp, chunks_attr=None)
        data_mapped[i] = data_temp

        if residuals is not None:
            residuals[0, i] = np.linalg.norm(data_temp - commonspace)

        ## if ds_mapped == []:
        ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
        ## else:
        ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

        # zscore before adding
        # TODO: make just a function so we dont' waste space
        commonspace = params.combiner1(data_mapped[i], commonspace)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # update commonspace to mean of ds_mapped
    commonspace = params.combiner2(data_mapped)
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)

    # Level 2 -- might iterate multiple times
    for loop in xrange(params.level2_niter):
        for i, (m, ds) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                               % (loop, i))
            ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
            ##                   /(ndatasets-1), chunks_attr=None )
            ds_new = ds.copy()
            #ZSC zscore(ds_new, chunks_attr=None)
            #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
            #ZSC zscore(ds_temp, chunks_attr=None)
            ds_new.targets = commonspace    #PRJ ds_temp
            m.train(ds_new)                 # ds_temp)
            data_mapped[i] = m.forward(np.asanyarray(ds))
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(data_mapped[i]
                                                        - commonspace)
            #ds_mapped[i] = zscore(m.forward(ds_temp), chunks_attr=None)
        commonspace = params.combiner2(data_mapped)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # Level 3 (last) to params.levels
    for i, (m, ds) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)
        ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
        ##                   /(ndatasets-1), chunks_attr=None )
        ds_new = ds.copy()  # shallow copy so we could assign new labels
        #ZSC zscore(ds_new, chunks_attr=None)
        #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
        #ZSC zscore(ds_temp, chunks_attr=None)
        ds_new.targets = commonspace    #PRJ ds_temp#
        m.train(ds_new)                 #ds_temp)
        if residuals is not None:
            data_mapped = m.forward(ds_new)
            residuals[-1, i] = np.linalg.norm(data_mapped - commonspace)

    return mappers
def timesegments_classification(dss,
                                hyper=None,
                                part1=HalfPartitioner(),
                                part2=NFoldPartitioner(attr='subjects'),
                                window_size=6,
                                overlapping_windows=True,
                                distance='correlation',
                                do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
       Datasets to benchmark on.  Usually a single dataset per subject.
    hyper : Hyperalignment-like, optional
       Beast which if called on a list of datasets should spit out trained
       mappers.  If not specified, `IdentityMapper`s will be used
    part1 : Partitioner, optional
       Partitioner to split data for hyperalignment "cross-validation"
    part2 : Partitioner, optional
       Partitioner for CV within the hyperalignment test split
    window_size : int, optional
       How many temporal points to consider for a classification sample
    overlapping_windows : bool, optional
       Strategy to how create and classify "samples" for classification.
       If True -- `window_size` samples from each time point (but trailing
       ones) constitute a sample, and upon "predict" `window_size` of
       samples around each test point is not considered.  If False --
       samples are just taken (with training and testing splits) at
       `window_size` step from one to another.
    do_zscore : bool, optional
       Perform zscoring (overall, not per-chunk) for each dataset upon
       partitioning with part1
    ...
    """
    # Generate outer-most partitioning ()
    parts = [copy.deepcopy(part1).generate(ds) for ds in dss]

    iter = 1
    errors = []

    while True:
        try:
            dss_partitioned = [p.next() for p in parts]
        except StopIteration:
            # we are done -- no more partitions
            break
        if __debug__:
            debug("BM", "Iteration %d", iter)

        dss_train, dss_test = zip(*[list(Splitter("partitions").generate(ds))
                                    for ds in dss_partitioned])

        # TODO: allow for doing feature selection

        if do_zscore:
            for ds in dss_train + dss_test:
                zscore(ds, chunks_attr=None)

        if hyper is not None:
            # since otherwise it would remember previous loop dataset as the "commonspace"
            # Now let's do hyperalignment but on a copy in each loop iteration
            hyper_ = copy.deepcopy(hyper)
            mappers = hyper_(dss_train)
        else:
            mappers = [IdentityMapper() for ds in dss_train]

        dss_test_aligned = [mapper.forward(ds)
                            for mapper, ds in zip(mappers, dss_test)]

        # assign .sa.subjects to those datasets
        for i, ds in enumerate(dss_test_aligned):
            # part2.attr is by default "subjects"
            ds.sa[part2.attr] = [i]

        dss_test_bc = []
        for ds in dss_test_aligned:
            if overlapping_windows:
                startpoints = range(len(ds) - window_size + 1)
            else:
                startpoints = _get_nonoverlapping_startpoints(len(ds),
                                                              window_size)
            bm = BoxcarMapper(startpoints, window_size)
            bm.train(ds)
            ds_ = bm.forward(ds)
            ds_.sa['startpoints'] = startpoints

            # reassign subjects so they are not arrays
            def assign_unique(ds, sa):
                ds.sa[sa] = [np.asscalar(np.unique(x))
                             for x in ds.sa[sa].value]
            assign_unique(ds_, part2.attr)

            fm = FlattenMapper()
            fm.train(ds_)
            dss_test_bc.append(ds_.get_mapped(fm))

        ds_test = vstack(dss_test_bc)
        # Perform classification across subjects comparing against mean
        # spatio-temporal pattern of other subjects
        errors_across_subjects = []
        for ds_test_part in part2.generate(ds_test):
            ds_train_, ds_test_ = list(
                Splitter("partitions").generate(ds_test_part))
            # average across subjects to get a representative pattern
            # per timepoint
            ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
            assert(ds_train_.shape == ds_test_.shape)

            if distance == 'correlation':
                # TODO: redo more efficiently since now we are creating full
                # corrcoef matrix.  Also we might better just take a name for
                # the pdist measure but then implement them efficiently
                # (i.e. without hstacking both pieces together first)
                dist = 1 - np.corrcoef(
                    ds_train_, ds_test_)[len(ds_test_):, :len(ds_test_)]
            else:
                raise NotImplementedError
            if overlapping_windows:
                dist = wipe_out_offdiag(dist, window_size)

            winners = np.argmin(dist, axis=1)
            error = np.mean(winners != np.arange(len(winners)))
            errors_across_subjects.append(error)
        errors.append(errors_across_subjects)
        iter += 1

    errors = np.array(errors)
    if __debug__:
        debug("BM", "Finished with %s array of errors. Mean error %.2f"
              % (errors.shape, np.mean(errors)))
    return errors
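# _get_nonoverlapping_startpoints() is called above but not shown in this
# section; a plausible minimal implementation, assuming windows are tiled
# back to back from index 0 and any trailing remainder is dropped:
def _get_nonoverlapping_startpoints(n, size):
    """Start indices of consecutive non-overlapping windows of `size`."""
    return range(0, n - size + 1, size)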
def test_proper_state(self):
    proper = TestClassProper()
    proper2 = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

    # disable_ca should override anything in enable_ca
    proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

    self.assertEqual(len(proper3.ca.enabled), 0,
        msg="disable_ca should override anything in enable_ca")

    proper.ca.state2 = 1000
    value = proper.ca.state2
    self.assertEqual(proper.ca.state2, 1000,
                     msg="Simple assignment/retrieval")

    proper.ca.disable('state2')
    proper.ca.state2 = 10000
    self.assertEqual(proper.ca.state2, 1000,
                     msg="Simple assignment after being disabled")

    proper4 = copy.deepcopy(proper)

    proper.ca.reset('state2')
    self.assertRaises(UnknownStateError,
                      proper.ca.__getattribute__, 'state2')
    """Must be blank after being reset"""

    self.assertEqual(proper4.ca.state2, 1000,
        msg="Simple assignment after being reset in original instance")

    proper.ca.enable(['state2'])
    self.assertEqual(set(proper.ca.keys()), set(['state1', 'state2']))
    if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
        # skip testing since all ca are on now
        return
    self.assertTrue(proper.ca.enabled == ['state2'])

    self.assertTrue(set(proper2.ca.enabled) == set(['state1']))

    self.assertRaises(AttributeError, proper.__getattribute__, 'state12')

    # if documentary on the state is appropriate
    self.assertEqual(proper2.ca.listing,
                     ['%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
                      '%sstate2%s: state2 doc' % (_def_sep, _def_sep)])

    # if __str__ lists correct number of ca
    str_ = str(proper2)
    self.assertTrue(str_.find('2 ca:') != -1)

    # check if disable works
    self.assertEqual(set(proper2.ca.enabled), set(['state1']))

    proper2.ca.disable("all")
    self.assertEqual(set(proper2.ca.enabled), set())

    proper2.ca.enable("all")
    self.assertEqual(len(proper2.ca.enabled), 2)

    proper2.ca.state1, proper2.ca.state2 = 1, 2
    self.assertEqual(proper2.ca.state1, 1)
    self.assertEqual(proper2.ca.state2, 2)

    # now reset them
    proper2.ca.reset('all')
    self.assertRaises(UnknownStateError,
                      proper2.ca.__getattribute__, 'state1')
    self.assertRaises(UnknownStateError,
                      proper2.ca.__getattribute__, 'state2')
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i
                           for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError, "Requested reference dataset %i is out of " \
              "bounds. We have only %i datasets provided" \
              % (ref_ds, ndatasets)
    ca.choosen_ref_ds = ref_ds
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    # create a mapper per dataset
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]

    #
    # Level 1 -- initial projection
    #
    lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                 mappers, residuals)
    #
    # Level 2 -- might iterate multiple times
    #
    # this is the final common space
    self.commonspace = self._level2(datasets, lvl1_projdata,
                                    mappers, residuals)
def test_retrainables(self, clf):
    # XXX we agreed to not worry about this for the initial 0.6 release
    raise SkipTest

    # we need a copy since we will tune its internals later on
    clf = clf.clone()
    clf.ca.change_temporarily(enable_ca=['estimates'],
                              # ensure that it does do predictions
                              # while training
                              disable_ca=['training_stats'])
    clf_re = clf.clone()
    # TODO: .retrainable must have a callback to call smth like
    # _set_retrainable
    clf_re._set_retrainable(True)

    # need to have high snr so we don't 'cope' with problematic
    # datasets since otherwise unittests would fail.
    dsargs = {'perlabel': 50, 'nlabels': 2, 'nfeatures': 5, 'nchunks': 1,
              'nonbogus_features': [2, 4], 'snr': 5.0}

    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # NB datasets will be changed by the end of testing, so if we
    # are to change to use generic datasets -- make sure to copy
    # them here
    ds = deepcopy(datasets['uni2large'])
    # train/test halves are needed for the label- and data-change tests
    # below; the original listing used them without defining them (these
    # keys exist in mvpa2.testing.datasets)
    dstrain = deepcopy(datasets['uni2large_train'])
    dstest = deepcopy(datasets['uni2large_test'])
    clf.untrain()
    clf_re.untrain()
    trerr = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    trerr_re = TransferMeasure(clf_re, Splitter('train'),
                               disable_ca=['training_stats'],
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

    # Just check for correctness of retraining
    err_1 = np.asscalar(trerr(ds))
    self.assertTrue(err_1 < 0.3,
        msg="We should test here on an easy dataset. Got error of %s"
            % err_1)
    values_1 = clf.ca.estimates[:]
    # sometimes retraining gets into deeper optimization ;-)
    eps = 0.05
    corrcoef_eps = 0.85         # just to get no failures... usually > 0.95

    def batch_test(retrain=True, retest=True, closer=True):
        err = np.asscalar(trerr(ds))
        err_re = np.asscalar(trerr_re(ds))
        corr = np.corrcoef(
            clf.ca.estimates, clf_re.ca.estimates)[0, 1]
        corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
        if __debug__:
            debug('TEST', "Retraining stats: errors %g %g corr %g "
                          "with old error %g corr %g"
                  % (err, err_re, corr, err_1, corr_old))
        self.assertTrue(clf_re.ca.retrained == retrain,
                        ("Must fully train",
                         "Must retrain instead of full training")[retrain])
        self.assertTrue(clf_re.ca.repredicted == retest,
                        ("Must fully test",
                         "Must retest instead of full testing")[retest])
        self.assertTrue(corr > corrcoef_eps,
            msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
        if closer:
            self.assertTrue(
                corr >= corr_old,
                msg="Result must be closer to current without retraining"
                    " than to old one. Got corrcoef=%s" % (corr_old))

    # Check sequential retraining/retesting
    for i in xrange(3):
        flag = bool(i != 0)
        # ok -- on 1st call we should train/test, then retrain/retest,
        # and we can't compare for closeness to the old result since
        # we are working on the same data/classifier
        batch_test(retrain=flag, retest=flag, closer=False)

    # should retrain nicely if we change a parameter
    if 'C' in clf.params:
        clf.params.C *= 0.1
        clf_re.params.C *= 0.1
        batch_test()
    elif 'sigma_noise' in clf.params:
        clf.params.sigma_noise *= 100
        clf_re.params.sigma_noise *= 100
        batch_test()
    else:
        raise RuntimeError(
            'Please implement testing while changing some of the '
            'params for clf %s' % clf)

    # should retrain nicely if we change a kernel parameter
    if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
        clf.kernel_params.gamma = 0.1
        clf_re.kernel_params.gamma = 0.1
        # retest is False since the kernel got recomputed, thus we
        # can't expect to use the same kernel
        batch_test(retest=('gamma' not in clf.kernel_params))

    # should retrain nicely if we change labels
    permute = AttributePermutator('targets', assure=True)
    oldlabels = dstrain.targets[:]
    dstrain = permute(dstrain)
    self.assertTrue((oldlabels != dstrain.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # change labels in testing
    oldlabels = dstest.targets[:]
    dstest = permute(dstest)
    self.assertTrue((oldlabels != dstest.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # should re-train if we change data
    # reuse trained SVM and its 'final' optimization point
    if clf.__class__.__name__ not in ['GPR']:
        # on GPR everything depends on the data ;-)
        oldsamples = dstrain.samples.copy()
        dstrain.samples[:] += dstrain.samples * 0.05
        self.assertTrue((oldsamples != dstrain.samples).any())
        ds = vstack((dstrain, dstest))
        batch_test(retest=False)
    clf.ca.reset_changed_temporarily()

    # test retrain()
    # TODO XXX -- check validity
    clf_re.retrain(dstrain)
    self.assertTrue(clf_re.ca.retrained)
    clf_re.retrain(dstrain, labels=True)
    self.assertTrue(clf_re.ca.retrained)
    clf_re.retrain(dstrain, traindataset=True)
    self.assertTrue(clf_re.ca.retrained)

    # test repredict()
    clf_re.repredict(dstest.samples)
    self.assertTrue(clf_re.ca.repredicted)
    # for now, retesting with anything changed makes no sense
    self.assertRaises(RuntimeError, clf_re.repredict,
                      dstest.samples, labels=True)
    clf_re._set_retrainable(False)
def test_multivariate(self):
    mv_perf = []
    mv_lin_perf = []
    uv_perf = []

    l_clf = clfswh['linear', 'svm'][0]
    nl_clf = clfswh['non-linear', 'svm'][0]
    #orig_keys = nl_clf.param._params.keys()
    #nl_param_orig = nl_clf.param._params.copy()
    # l_clf = LinearNuSVMC()

    # XXX ??? not sure what below meant and it is obsolete if
    # using SG... commenting out for now
    # for some reason order is not preserved thus dictionaries are not
    # the same any longer -- lets compare values
    #self.assertEqual([nl_clf.param._params[k] for k in orig_keys],
    #                 [nl_param_orig[k] for k in orig_keys],
    #                 msg="New instance mustn't override values in previously created")
    ## and keys separately
    #self.assertEqual(set(nl_clf.param._params.keys()),
    #                 set(orig_keys),
    #                 msg="New instance doesn't change set of parameters in original")

    # We must be able to deepcopy not-yet-trained SVMs now
    import mvpa2.support.copy as copy
    try:
        nl_clf.untrain()
        nl_clf_copy_ = copy.copy(nl_clf)
        nl_clf_copy = copy.deepcopy(nl_clf)
    except Exception:
        self.fail(msg="Failed to deepcopy not-yet-trained SVM %s" % nl_clf)

    for i in range(20):
        train = pure_multivariate_signal(20, 3)
        test = pure_multivariate_signal(20, 3)

        # use non-linear CLF on 2d data
        nl_clf.train(train)
        p_mv = nl_clf.predict(test.samples)
        mv_perf.append(np.mean(p_mv == test.targets))

        # use linear CLF on 2d data
        l_clf.train(train)
        p_lin_mv = l_clf.predict(test.samples)
        mv_lin_perf.append(np.mean(p_lin_mv == test.targets))

        # use non-linear CLF on 1d data
        nl_clf.train(train[:, 0])
        p_uv = nl_clf.predict(test[:, 0].samples)
        uv_perf.append(np.mean(p_uv == test.targets))

    mean_mv_perf = np.mean(mv_perf)
    mean_mv_lin_perf = np.mean(mv_lin_perf)
    mean_uv_perf = np.mean(uv_perf)

    # non-linear CLF has to be close to perfect
    self.assertTrue(mean_mv_perf > 0.9)
    # linear CLF cannot learn this problem!
    self.assertTrue(mean_mv_perf > mean_mv_lin_perf)
    # univariate has insufficient information
    self.assertTrue(mean_uv_perf < mean_mv_perf)
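# A numpy-only illustration (not the actual pure_multivariate_signal
# implementation) of why the assertions above must hold: the generator
# produces XOR-structured data, where class identity depends on the sign
# agreement of two features, so no linear boundary -- and no single
# feature -- can separate the classes.  All names here are hypothetical.
def _example_xor_signal(n=20, snr=3.0):
    import numpy as np
    data = np.random.normal(size=(n * 4, 2))
    # class 0 occupies quadrants (+,+) and (-,-): feature signs agree
    data[:2 * n, 0] += snr * np.sign(data[:2 * n, 1])
    # class 1 occupies quadrants (+,-) and (-,+): feature signs disagree
    data[2 * n:, 0] -= snr * np.sign(data[2 * n:, 1])
    targets = np.repeat([0, 1], 2 * n)
    # marginally, each feature is symmetric around 0 for both classes,
    # so a univariate look carries (almost) no class information
    return data, targets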