def _call(self, dataset):
    """Computes featurewise f-scores using compound comparisons."""

    targets_sa = dataset.sa[self._targets_attr]
    orig_labels = targets_sa.value
    labels = orig_labels.copy()

    # Let's create a very shallow copy of a dataset with just
    # samples and targets_attr
    dataset_mod = Dataset(dataset.samples,
                          sa={self._targets_attr: labels})
    results = []
    for ul in targets_sa.unique:
        labels[orig_labels == ul] = 1
        labels[orig_labels != ul] = 2
        f_ds = OneWayAnova._call(self, dataset_mod)
        if 'fprob' in f_ds.fa:
            # rename the fprob attribute to something label specific
            # to survive final aggregation stage
            f_ds.fa['fprob_' + str(ul)] = f_ds.fa.fprob
            del f_ds.fa['fprob']
        results.append(f_ds)

    results = vstack(results)
    results.sa[self._targets_attr] = targets_sa.unique
    return results
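# Hedged usage sketch (not part of the original source): how a compound
# one-vs-rest ANOVA measure such as the _call above is typically applied.
# It assumes the measure is exposed as CompoundOneWayAnova in
# mvpa.measures.anova; treat the import path and the random data as
# illustrative only.
import numpy as np
from mvpa.datasets.base import Dataset
from mvpa.measures.anova import CompoundOneWayAnova

ds = Dataset.from_wizard(np.random.randn(12, 4),
                         targets=np.repeat(['a', 'b', 'c'], 4))
f_ds = CompoundOneWayAnova()(ds)
# one sample (row of F-scores) per unique target, plus per-label
# fprob_* feature attributes when fprob is available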
def test_mergeds():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['one'] = np.ones(5)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1, chunks=1)
    data1.fa['one'] = np.zeros(5)
    data2 = Dataset.from_wizard(np.ones((3, 5)), targets=2, chunks=1)
    data3 = Dataset.from_wizard(np.ones((4, 5)), targets=2)
    data4 = Dataset.from_wizard(np.ones((2, 5)), targets=3, chunks=2)
    data4.fa['test'] = np.arange(5)

    # cannot merge if there are attributes missing in one of the datasets
    assert_raises(DatasetError, data1.append, data0)

    merged = data1.copy()
    merged.append(data2)

    ok_(merged.nfeatures == 5)
    l12 = [1] * 5 + [2] * 3
    l1 = [1] * 8
    ok_((merged.targets == l12).all())
    ok_((merged.chunks == l1).all())

    data_append = data1.copy()
    data_append.append(data2)

    ok_(data_append.nfeatures == 5)
    ok_((data_append.targets == l12).all())
    ok_((data_append.chunks == l1).all())

    #
    # appending
    #

    # we need the same samples attributes in both datasets
    assert_raises(DatasetError, data2.append, data3)

    #
    # vstacking
    #
    if __debug__:
        # tested only in __debug__
        assert_raises(ValueError, vstack, (data0, data1, data2, data3))
    datasets = (data1, data2, data4)
    merged = vstack(datasets)
    assert_equal(merged.shape,
                 (np.sum([len(ds) for ds in datasets]), data1.nfeatures))
    assert_true('test' in merged.fa)
    assert_array_equal(merged.sa.targets, [1] * 5 + [2] * 3 + [3] * 2)

    #
    # hstacking
    #
    assert_raises(ValueError, hstack, datasets)
    datasets = (data0, data1)
    merged = hstack(datasets)
    assert_equal(merged.shape,
                 (len(data1), np.sum([ds.nfeatures for ds in datasets])))
    assert_true('chunks' in merged.sa)
    assert_array_equal(merged.fa.one, [1] * 5 + [0] * 5)
def test_ifs(self, svm):
    # measure for feature selection criterion and performance assessment
    # use the SAME clf!
    errorfx = mean_mismatch_error
    fmeasure = CrossValidation(svm, NFoldPartitioner(),
                               postproc=mean_sample())
    pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

    ifs = IFS(fmeasure,
              pmeasure,
              Splitter('purpose', attr_values=['train', 'test']),
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              fselector=FixedNElementTailSelector(1, tail='lower',
                                                  mode='select'),
              )
    wdata = self.get_data()
    wdata.sa['purpose'] = np.repeat('train', len(wdata))
    tdata = self.get_data()
    tdata.sa['purpose'] = np.repeat('test', len(tdata))
    ds = vstack((wdata, tdata))
    orig_nfeatures = ds.nfeatures

    ifs.train(ds)
    resds = ifs(ds)

    # fail if orig datasets are changed
    self.failUnless(ds.nfeatures == orig_nfeatures)

    # check that the feature set with the least error is selected
    self.failUnless(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.failUnless(resds.nfeatures == e.argmin() + 1)

    # repeat with dataset where selection order is known
    wsignal = datasets['dumb2'].copy()
    wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
    tsignal = datasets['dumb2'].copy()
    tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
    signal = vstack((wsignal, tsignal))
    ifs.train(signal)
    resds = ifs(signal)
    self.failUnless((resds.samples[:, 0] == signal.samples[:, 0]).all())
def _call(self, dataset):
    """Computes featurewise f-scores using compound comparisons."""

    orig_labels = dataset.targets
    labels = orig_labels.copy()

    results = []
    for ul in dataset.sa['targets'].unique:
        labels[orig_labels == ul] = 1
        labels[orig_labels != ul] = 2
        f_ds = OneWayAnova._call(self, dataset, labels)
        if 'fprob' in f_ds.fa:
            # rename the fprob attribute to something label specific
            # to survive final aggregation stage
            f_ds.fa['fprob_' + str(ul)] = f_ds.fa.fprob
            del f_ds.fa['fprob']
        results.append(f_ds)

    results = vstack(results)
    results.sa['targets'] = dataset.sa['targets'].unique
    return results
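# Illustrative sketch (plain numpy, hypothetical names): the one-vs-rest
# relabeling that drives the compound comparison in the two _call variants
# above: each unique target in turn becomes group 1, everything else group 2.
import numpy as np

orig_labels = np.array(['a', 'b', 'c', 'a', 'b', 'c'])
labels = np.empty(len(orig_labels), dtype=int)
for ul in np.unique(orig_labels):
    labels[orig_labels == ul] = 1   # samples of the current class
    labels[orig_labels != ul] = 2   # all remaining classes pooled together
    # a plain two-group one-way ANOVA on `labels` then yields the
    # per-feature F-score for "ul vs rest"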
def _forward_dataset(self, ds):
    if self.__chunks_attr is None:
        return self._forward_dataset_helper(ds)
    else:
        # strip down dataset to speedup local processing
        if self.__attr_strategy == 'remove':
            keep_sa = []
        else:
            keep_sa = None
        proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
        # process all chunks individually
        # use a custom splitter to speed up splitting
        spl = Splitter(self.__chunks_attr)
        dses = [self._forward_dataset_helper(d)
                for d in spl.generate(proc_ds)]
        # and merge them again
        mds = vstack(dses)
        # put back attributes
        mds.fa.update(ds.fa)
        mds.a.update(ds.a)
        return mds
def _forward_dataset(self, ds):
    if self.__chunks_attr is None:
        return self._forward_dataset_helper(ds)
    else:
        # strip down dataset to speedup local processing
        if self.__attr_strategy == 'remove':
            keep_sa = []
        else:
            keep_sa = None
        proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
        # process all chunks individually
        # use a custom splitter to speed up splitting
        spl = CustomSplitter([((i, ), ) for i in ds.sa[self.__chunks_attr].unique],
                             attr=self.__chunks_attr)
        dses = [self._forward_dataset_helper(d[0]) for d in spl(proc_ds)]
        # and merge them again
        mds = vstack(dses)
        # put back attributes
        mds.fa.update(ds.fa)
        mds.a.update(ds.a)
        return mds
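# Minimal sketch (hypothetical helper, plain numpy) of the chunk-wise pattern
# both _forward_dataset variants implement: split the samples by a chunk
# attribute, run the per-chunk helper, and vstack the pieces back together.
import numpy as np

def forward_by_chunks(samples, chunks, fx):
    """Apply `fx` to each chunk of rows separately and re-stack the results."""
    parts = [fx(samples[chunks == c]) for c in np.unique(chunks)]
    return np.vstack(parts)

# e.g. demean every chunk independently
samples = np.arange(12, dtype=float).reshape(6, 2)
chunks = np.array([0, 0, 0, 1, 1, 1])
out = forward_by_chunks(samples, chunks, lambda x: x - x.mean(axis=0))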
def test_vstack_and_origids_issue(self):
    # That is actually what swaroop hit
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    # Inspired by the problem Swaroop ran into
    k = LinearSGKernel(normalizer_cls=False)
    k_ = LinearSGKernel(normalizer_cls=False)    # to be cached
    ck = CachedKernel(k_)

    clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
    clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

    cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
    cvte_ = CrossValidatedTransferError(TransferError(clf_), NFoldSplitter())

    ds = datasets['uni2large_test'].copy(deep=True)
    ok_('origids' not in ds.sa)     # assure that there are none yet
    ck.compute(ds)                  # so we initialize origids
    ok_('origids' in ds.sa)

    ds2 = ds.copy(deep=True)
    ds2.samples = np.zeros(ds2.shape)

    from mvpa.base.dataset import vstack
    ds_vstacked = vstack((ds2, ds))

    # should complain now since there would not be unique
    # samples' origids
    if __debug__:
        assert_raises(ValueError, ck.compute, ds_vstacked)

    ds_vstacked.init_origids('samples')   # reset origids
    ck.compute(ds_vstacked)

    errs = cvte(ds_vstacked)
    errs_ = cvte_(ds_vstacked)
    # Following test would have failed since origids
    # were just ints, and then non-unique after vstack
    assert_array_equal(errs.samples, errs_.samples)
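# Illustrative sketch of the origids issue exercised by the test above:
# vstacking a dataset with a copy of itself duplicates every per-sample
# origid, so they must be regenerated before anything (like CachedKernel)
# tries to index samples by them.  Imports and sizes are assumptions.
import numpy as np
from mvpa.datasets.base import Dataset
from mvpa.base.dataset import vstack

ds = Dataset.from_wizard(np.random.randn(4, 3), targets=1)
ds.init_origids('samples')           # assign an id to every sample
ds2 = ds.copy(deep=True)
ds_vstacked = vstack((ds2, ds))      # every origid now occurs twice
ds_vstacked.init_origids('samples')  # regenerate unique ids for all samples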