def test_ds_shallowcopy():
    """A copy.copy() of a dataset must SHARE its underlying data."""
    # start from a somewhat evolved dataset carrying an ndarray subclass
    orig = normal_feature_dataset()
    orig.samples = orig.samples.view(myarray)
    # SHALLOW copy the beast
    shallow = copy.copy(orig)

    def _assert_same_data():
        # samples/targets/chunks must look identical in both datasets
        assert_array_equal(orig.samples, shallow.samples)
        assert_array_equal(orig.targets, shallow.targets)
        assert_array_equal(orig.chunks, shallow.chunks)

    # right after copying both must expose the same data
    _assert_same_data()
    # the array subclass must survive the copy
    ok_(isinstance(shallow.samples, myarray))
    # mutating the copy must be visible in the original as well
    shallow.samples[0, 0] = 1234
    _assert_same_data()
    shallow.sa.targets[0] = 'ab'
    shallow.sa.chunks[0] = 234
    _assert_same_data()
    # the sample-attribute edits must have propagated to the original
    ok_(orig.sa.targets[0] == 'ab')
    ok_(orig.sa.chunks[0] == 234)
def test_ds_deepcopy():
    """Dataset.copy() must yield an INDEPENDENT deep copy."""
    # start from a somewhat evolved dataset carrying an ndarray subclass
    src = normal_feature_dataset()
    src.samples = src.samples.view(myarray)
    # clone the beast
    clone = src.copy()
    # the array subclass must survive cloning
    ok_(isinstance(clone.samples, myarray))
    # initially both datasets must carry identical data
    assert_array_equal(src.samples, clone.samples)
    assert_array_equal(src.targets, clone.targets)
    assert_array_equal(src.chunks, clone.chunks)
    # modifying the clone's samples must leave the original untouched
    clone.samples[0, 0] = 1234
    ok_(np.any(src.samples != clone.samples))
    assert_array_equal(src.targets, clone.targets)
    assert_array_equal(src.chunks, clone.chunks)
    # same independence for targets...
    clone.sa.targets = np.hstack(([123], clone.targets[1:]))
    ok_(np.any(src.samples != clone.samples))
    ok_(np.any(src.targets != clone.targets))
    assert_array_equal(src.chunks, clone.chunks)
    # ...and for chunks
    clone.sa.chunks = np.hstack(([1234], clone.chunks[1:]))
    ok_(np.any(src.samples != clone.samples))
    ok_(np.any(src.targets != clone.targets))
    ok_(np.any(src.chunks != clone.chunks))
def test_binds(self):
    """Check dataset-bound convenience methods.

    Mutating helpers (e.g. ``coarsen_chunks``) must modify the dataset
    in place, while analysis helpers must return a result and leave the
    dataset intact.
    """
    ds = normal_feature_dataset()
    ds_data = ds.samples.copy()
    ds_chunks = ds.chunks.copy()
    self.failUnless(np.all(ds.samples == ds_data))  # sanity check
    funcs = ['coarsen_chunks']
    for f in funcs:
        # getattr instead of eval('ds.%s()' % f): same dynamic dispatch
        # without evaluating constructed source strings
        getattr(ds, f)()
        self.failUnless(np.any(ds.samples != ds_data)
                        or np.any(ds.chunks != ds_chunks),
                        msg="We should have modified original dataset with %s" % f)
        # restore the dataset for whatever is tested next
        ds.samples = ds_data.copy()
        ds.sa['chunks'].value = ds_chunks.copy()
    # and some which should just return results
    for f in ['aggregate_features', 'remove_invariant_features',
              'get_samples_per_chunk_target']:
        res = getattr(ds, f)()
        self.failUnless(res is not None,
                        msg='We should have got result from function %s' % f)
        self.failUnless(np.all(ds.samples == ds_data),
                        msg="Function %s should have not modified original dataset" % f)
def test_smlr_sensitivities(self):
    """SMLR sensitivities (no retraining) have (nlabels-1, nfeatures) shape."""
    dataset = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    # use SMLR on a binary problem, but do not fit all weights
    clf = SMLR(fit_all_weights=False)
    clf.train(dataset)
    # sensitivities must be available WITHOUT passing the dataset again
    sens = clf.get_sensitivity_analyzer(force_training=False)()
    expected_shape = (len(dataset.UT) - 1, dataset.nfeatures)
    self.failUnless(sens.shape == expected_shape)
def test_mdpflowmapper():
    """Train a two-node MDP flow via MDPFlowMapper and forward a dataset."""
    flow = mdp.nodes.PCANode() + mdp.nodes.SFANode()
    mapper = MDPFlowMapper(flow)
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    mapper.train(ds)
    # training the mapper must have finalized both nodes of the flow
    for idx in (0, 1):
        assert_false(mapper.flow[idx].is_training())
    forwarded = mapper.forward(ds)
    assert_true(isinstance(forwarded, Dataset))
    assert_equal(forwarded.samples.shape, ds.samples.shape)
def test_glmnet_c_sensitivities():
    """GLMNET_C sensitivities must come out per-label over all features."""
    dataset = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    # use GLMNET on a binary problem
    clf = GLMNET_C()
    clf.train(dataset)
    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_train=False)(None)
    # one row of sensitivities per unique target
    assert_equal(sens.shape, (len(dataset.UT), dataset.nfeatures))
def test_glmnet_c_sensitivities():
    """GLMNET_C sensitivities must come out per-label over all features."""
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    # GLMNET on a binary problem
    classifier = GLMNET_C()
    classifier.train(data)
    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = classifier.get_sensitivity_analyzer(force_training=False)()
    # one row of sensitivities per unique target
    assert_equal(sens.shape, (len(data.UT), data.nfeatures))
def test_mdpnodemapper():
    """Exercise MDPNodeMapper: forward, exec-args, reverse and retraining."""
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    pcanode = mdp.nodes.PCANode()
    mapper = MDPNodeMapper(pcanode, nodeargs={'stoptrain': ((), {'debug': True})})
    mapper.train(ds)
    fds = mapper.forward(ds)
    if externals.versions['mdp'] >= '2.5':
        # the debug stop-training argument makes the node keep cov_mtx
        assert_true(hasattr(mapper.node, 'cov_mtx'))
    assert_true(isinstance(fds, Dataset))
    assert_equal(fds.samples.shape, ds.samples.shape)
    # set projection onto first 2 components
    mapper.nodeargs['exec'] = ((), {'n': 2})
    lfds = mapper.forward(ds.samples)  # should be different from above
    # output shape changes although the node still claims otherwise
    assert_equal(mapper.node.output_dim, 4)
    assert_equal(lfds.shape[0], fds.samples.shape[0])
    assert_equal(lfds.shape[1], 2)
    assert_array_equal(lfds, fds.samples[:, :2])
    # reverse mapping restores the original shape...
    rfds = mapper.reverse(fds)
    # ...and even the reduced-size output can be reversed
    rlfds = mapper.reverse(lfds)
    assert_equal(rfds.samples.shape, ds.samples.shape)
    # retraining has to work on a new dataset too, since we copy the node
    # internally
    dsbig = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=10)
    mapper.train(dsbig)
def test_cache_speedup(self):
    """Pre-computing a CachedKernel must make cross-validation faster."""
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
    from mvpa.datasets.splitters import NFoldSplitter
    from time import time
    from mvpa.clfs.transerror import TransferError
    from mvpa.kernels.base import CachedKernel
    from mvpa.kernels.sg import RbfSGKernel
    from mvpa.misc.data_generators import normal_feature_dataset

    cached_clf = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
    plain_clf = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)
    cv_cached = CrossValidatedTransferError(TransferError(cached_clf),
                                            splitter=NFoldSplitter())
    cv_plain = CrossValidatedTransferError(TransferError(plain_clf),
                                           splitter=NFoldSplitter())
    #data = datasets['uni4large']
    P = 5000
    data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                  means=np.random.randn(2, P), nfeatures=P)

    def timed(fx, *args):
        # run fx(*args) and report (result, elapsed seconds)
        start = time()
        return fx(*args), time() - start

    _, cachetime = timed(cached_clf.params.kernel.compute, data)
    cached_err, ccv_time = timed(cv_cached, data)
    norm_err, ncv_time = timed(cv_plain, data)
    # cached and plain CV must agree on the error
    assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
    # pre-computing the kernel and the cached CV must each beat plain CV
    ok_(cachetime < ncv_time)
    ok_(ccv_time < ncv_time)
    speedup = ncv_time / (ccv_time + cachetime)
    # Speedup ideally should be 10, though it's not purely linear
    self.failIf(speedup < 2, 'Problem caching data - too slow!')
def test_mdpflow_additional_arguments_nones():
    """MDPFlowMapper must accept per-node arguments, including Nones."""
    skip_if_no_external('mdp', min_version='2.5')
    # we have no IdentityNode yet... is there analog?
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    flow = mdp.nodes.PCANode() + mdp.nodes.IdentityNode() + mdp.nodes.FDANode()
    # this is what it would look like in MDP itself
    #flow.train([[ds.samples],
    #            [[ds.samples, ds.sa.targets]]])
    # a node_arguments list of the wrong length must be rejected
    assert_raises(ValueError, MDPFlowMapper, flow, node_arguments=[[], []])
    mapper = MDPFlowMapper(flow, node_arguments=(None, None, [ds.sa.targets]))
    mapper.train(ds)
    forwarded = mapper.forward(ds)
    assert_equal(ds.samples.shape, forwarded.samples.shape)
    # and a round-trip must reconstruct the input
    reconstructed = mapper.reverse(forwarded)
    assert_array_almost_equal(ds.samples, reconstructed.samples)
def test_mdpflow_additional_arguments_nones():
    """MDPFlowMapper must accept per-node arguments, including Nones."""
    skip_if_no_external('mdp', min_version='2.5')
    # we have no IdentityNode yet... is there analog?
    dataset = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    flow = (mdp.nodes.PCANode()
            + mdp.nodes.IdentityNode()
            + mdp.nodes.FDANode())
    # this is what it would look like in MDP itself
    #flow.train([[ds.samples],
    #            [[ds.samples, ds.sa.targets]]])
    # a node_arguments list of the wrong length must be rejected
    assert_raises(ValueError, MDPFlowMapper, flow, node_arguments=[[], []])
    fm = MDPFlowMapper(flow,
                       node_arguments=(None, None, [dataset.sa.targets]))
    fm.train(dataset)
    fds = fm.forward(dataset)
    assert_equal(dataset.samples.shape, fds.samples.shape)
    # and a round-trip must reconstruct the input
    rds = fm.reverse(fds)
    assert_array_almost_equal(dataset.samples, rds.samples)
def test_cache_speedup(self):
    """Pre-computing a CachedKernel must make cross-validation faster."""
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)
    cached_clf = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
    plain_clf = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)
    cv_cached = CrossValidatedTransferError(TransferError(cached_clf),
                                            splitter=NFoldSplitter())
    cv_plain = CrossValidatedTransferError(TransferError(plain_clf),
                                           splitter=NFoldSplitter())
    #data = datasets['uni4large']
    P = 5000
    data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                  means=np.random.randn(2, P), nfeatures=P)

    def timed(fx, *args):
        # run fx(*args) and report (result, elapsed seconds)
        start = time()
        return fx(*args), time() - start

    _, cachetime = timed(cached_clf.params.kernel.compute, data)
    cached_err, ccv_time = timed(cv_cached, data)
    norm_err, ncv_time = timed(cv_plain, data)
    # cached and plain CV must agree on the error
    assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
    # pre-computing the kernel and the cached CV must each beat plain CV
    ok_(cachetime < ncv_time)
    ok_(ccv_time < ncv_time)
    speedup = ncv_time / (ccv_time + cachetime)
    # Speedup ideally should be 10, though it's not purely linear
    self.failIf(speedup < 2, 'Problem caching data - too slow!')
def test_cache_speedup(self):
    """Pre-computing a CachedKernel must make cross-validation faster."""
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)
    cached_clf = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
    plain_clf = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)
    cv_cached = CrossValidation(cached_clf, NFoldPartitioner())
    cv_plain = CrossValidation(plain_clf, NFoldPartitioner())
    #data = datasets['uni4large']
    P = 5000
    data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                  means=np.random.randn(2, P), nfeatures=P)

    def timed(fx, *args):
        # run fx(*args) and report (result, elapsed seconds)
        start = time()
        return fx(*args), time() - start

    _, cachetime = timed(cached_clf.params.kernel.compute, data)
    cached_err, ccv_time = timed(cv_cached, data)
    norm_err, ncv_time = timed(cv_plain, data)
    # cached and plain CV must agree on the error
    assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
    # pre-computing the kernel and the cached CV must each beat plain CV
    ok_(cachetime < ncv_time)
    ok_(ccv_time < ncv_time)
    speedup = ncv_time / (ccv_time + cachetime)
    # Speedup ideally should be 10, though it's not purely linear
    self.failIf(speedup < 2, 'Problem caching data - too slow!')
def test_binds(self):
    """Check dataset-bound convenience methods.

    Mutating helpers (e.g. ``coarsen_chunks``) must modify the dataset
    in place, while analysis helpers must return a result and leave the
    dataset intact.
    """
    ds = normal_feature_dataset()
    ds_data = ds.samples.copy()
    ds_chunks = ds.chunks.copy()
    self.failUnless(np.all(ds.samples == ds_data))  # sanity check
    funcs = ['coarsen_chunks']
    for f in funcs:
        # getattr instead of eval('ds.%s()' % f): same dynamic dispatch
        # without evaluating constructed source strings
        getattr(ds, f)()
        self.failUnless(np.any(ds.samples != ds_data)
                        or np.any(ds.chunks != ds_chunks),
                        msg="We should have modified original dataset with %s" % f)
        # restore the dataset for whatever is tested next
        ds.samples = ds_data.copy()
        ds.sa['chunks'].value = ds_chunks.copy()
    # and some which should just return results
    for f in ['aggregate_features', 'remove_invariant_features',
              'get_samples_per_chunk_target']:
        res = getattr(ds, f)()
        self.failUnless(res is not None,
                        msg='We should have got result from function %s' % f)
        self.failUnless(np.all(ds.samples == ds_data),
                        msg="Function %s should have not modified original dataset" % f)
def test_rfe(self, clf):
    """Exercise RFE with several recipes of sensitivity/error measures.

    Each recipe pairs an RFE instance with a dataset, runs the feature
    elimination, and then checks bookkeeping of errors, nfeatures and
    history conditional attributes.
    """
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())
    # classifier split odd/even for the confusion-based recipe below
    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())
    # explore few recipes
    for rfe, data in [
        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        (RFE(sens_ana, pmeasure, Splitter('train'),
             fselector=FixedNElementTailSelector(1),
             train_pmeasure=False),
         self.get_data()),
        # use cross-validation within training to get error for the stopping point
        # but use full training data to derive sensitivity
        (RFE(sens_ana, cvmeasure, Repeater(2),
             # give the same full dataset to sens_ana and cvmeasure
             fselector=FractionTailSelector(0.70,
                                            mode='select', tail='upper'),
             train_pmeasure=True),
         normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5)),
        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        (RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([FxMapper('features', l2_normed),
                                      FxMapper('samples', np.mean),
                                      FxMapper('samples', np.abs)])),
             ConfusionBasedError(rfesvm_split, confusion_state='stats'),
             Repeater(2),  # we will use the same full cv-training dataset
             fselector=FractionTailSelector(0.50,
                                            mode='select', tail='upper'),
             stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
             train_pmeasure=False,  # we just extract it from existing confusion
             update_sensitivity=True),
         normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5))
        ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures
        rfe.train(data)
        resds = rfe(data)
        # fail if orig datasets are changed
        self.failUnless(data.nfeatures == data_nfeatures)
        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                # one feature removed per step -> argmin pins the result size
                self.failUnless(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                # in this case we can even check if we had actual
                # going down/up trend... although -- why up???
                imin = np.argmin(e)
                self.failUnless(1 < imin < len(e) - 1)
        else:
            # no errors recorded -> nothing should have been eliminated
            self.failUnless(resds.nfeatures == data_nfeatures)
        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.failUnless((nfeatures[::-1] == rfe.ca.nfeatures).all())
        # check if history has elements for every step
        self.failUnless(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.failUnless(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history == max(rfe.ca.history))[0]))
draft of a complete analysis. First import a necessary pieces of PyMVPA -- this time each bit individually. """ from mvpa.datasets.base import dataset_wizard from mvpa.datasets.splitters import OddEvenSplitter from mvpa.clfs.svm import LinearCSVMC from mvpa.clfs.transerror import TransferError from mvpa.algorithms.cvtranserror import CrossValidatedTransferError from mvpa.measures.searchlight import Searchlight from mvpa.misc.data_generators import normal_feature_dataset """For the sake of simplicity, let's use a small artificial dataset.""" # overcomplicated way to generate an example dataset ds = normal_feature_dataset(perlabel=10, nlabels=2, nchunks=2, nfeatures=10, nonbogus_features=[3, 7], snr=5.0) dataset = dataset_wizard(samples=ds.samples, targets=ds.targets, chunks=ds.chunks) """Now it only takes three lines for a searchlight analysis.""" # setup measure to be computed in each sphere (cross-validated # generalization error on odd/even splits) cv = CrossValidatedTransferError(TransferError(LinearCSVMC()), OddEvenSplitter()) # setup searchlight with 5 mm radius and measure configured above sl = Searchlight(cv, radius=5) # run searchlight on dataset sl_map = sl(dataset) print "Best performing sphere error:", min(sl_map)