def _call(self, dataset): """Perform the ROI search. """ # local binding nproc = self.nproc if nproc is None and externals.exists('pprocess'): import pprocess try: nproc = pprocess.get_number_of_cores() or 1 except AttributeError: warning("pprocess version %s has no API to figure out maximal " "number of cores. Using 1" % externals.versions['pprocess']) nproc = 1 # train the queryengine self._queryengine.train(dataset) # decide whether to run on all possible center coords or just a provided # subset if isinstance(self.__roi_ids, str): roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0] elif self.__roi_ids is not None: roi_ids = self.__roi_ids # safeguard against stupidity if __debug__: if max(roi_ids) >= dataset.nfeatures: raise IndexError, \ "Maximal center_id found is %s whenever given " \ "dataset has only %d features" \ % (max(roi_ids), dataset.nfeatures) else: roi_ids = np.arange(dataset.nfeatures) # pass to subclass results, roi_sizes = self._sl_call(dataset, roi_ids, nproc) if not roi_sizes is None: self.ca.roi_sizes = roi_sizes if 'mapper' in dataset.a: # since we know the space we can stick the original mapper into the # results as well if self.__roi_ids is None: results.a['mapper'] = copy.copy(dataset.a.mapper) else: # there is an additional selection step that needs to be # expressed by another mapper mapper = copy.copy(dataset.a.mapper) mapper.append(StaticFeatureSelection(roi_ids, dshape=dataset.shape[1:])) results.a['mapper'] = mapper # charge state self.ca.raw_results = results # return raw results, base-class will take care of transformations return results
def _call(self, dataset):
    analyzers = []
    # create analyzers
    for clf in self.clf.clfs:
        if self.__analyzer is None:
            analyzer = clf.get_sensitivity_analyzer(**self._slave_kwargs)
            if analyzer is None:
                raise ValueError(
                    "Wasn't able to figure out a basic analyzer for clf %r"
                    % (clf,))
            if __debug__:
                debug("SA", "Selected analyzer %r for clf %r"
                      % (analyzer, clf))
        else:
            # XXX shallow copy should be enough...
            analyzer = copy.copy(self.__analyzer)

        # assign corresponding classifier
        analyzer.clf = clf
        # if clf was trained already - don't train again
        if clf.trained:
            analyzer._force_training = False
        analyzers.append(analyzer)

    self.__combined_analyzer.analyzers = analyzers

    # XXX not sure if we don't want to call directly ._call(dataset) to avoid
    # double application of transformers/combiners, after all we are just
    # 'proxying' here to combined_analyzer...
    # YOH: decided -- lets call ._call
    return self.__combined_analyzer._call(dataset)

def test_subset():
    data = np.array(
        [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
         [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
         [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
         [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]])
    # float array doesn't work
    sm = FeatureSliceMapper(np.ones(16))
    assert_raises(IndexError, sm.forward, data)

    # full mask
    sm = FeatureSliceMapper(slice(None))
    # should not change single samples
    assert_array_equal(sm.forward(data[0:1].copy()), data[0:1])
    # or multi-samples
    assert_array_equal(sm.forward(data.copy()), data)
    sm.train(data)
    # same on reverse
    assert_array_equal(sm.reverse(data[0:1].copy()), data[0:1])
    # or multi-samples
    assert_array_equal(sm.reverse(data.copy()), data)

    # identical mappers
    sm_none = FeatureSliceMapper(slice(None))
    sm_int = FeatureSliceMapper(np.arange(16))
    sm_bool = FeatureSliceMapper(np.ones(16, dtype='bool'))
    sms = [sm_none, sm_int, sm_bool]

    # test subsets
    sids = [3, 4, 5, 6]
    bsubset = np.zeros(16, dtype='bool')
    bsubset[sids] = True
    subsets = [sids, slice(3, 7), bsubset, [3, 3, 4, 4, 6, 6, 6, 5]]
    # all test subsets result in equivalent masks, hence they should do the
    # same to the mapper and result in identical behavior
    for st in sms:
        for i, sub in enumerate(subsets):
            # shallow copy
            orig = copy(st)
            subsm = FeatureSliceMapper(sub)
            # should do copy-on-write for all important stuff!!
            assert_true(orig.is_mergable(subsm))
            orig += subsm
            # test if selection did its job
            if i == 3:
                # special case of multiplying features
                assert_array_equal(orig.forward1(data[0].copy()), subsets[i])
            else:
                assert_array_equal(orig.forward1(data[0].copy()), sids)

    ## all of the above shouldn't change the original mapper
    #assert_array_equal(sm.get_mask(), np.arange(16))

    # check for some bug catcher
    # no 3D input
    #assert_raises(IndexError, sm.forward, np.ones((3,2,1)))
    # no input of wrong length
    if __debug__:
        # checked only in __debug__
        assert_raises(ValueError, sm.forward, np.ones(4))

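# --- Illustrative aside (plain numpy, not part of the original test suite) ---
# The test above relies on the fact that an index list, a slice, and a boolean
# mask can all denote the same feature subset. A minimal sketch of that
# equivalence, assuming nothing beyond numpy itself:
import numpy as np

data = np.arange(16).reshape(1, 16)
sids = [3, 4, 5, 6]
bmask = np.zeros(16, dtype='bool')
bmask[sids] = True

# all three indexing forms pick the same columns
assert np.array_equal(data[:, sids], data[:, 3:7])
assert np.array_equal(data[:, sids], data[:, bmask])
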
def _postcall(self, dataset, result): """Some postprocessing on the result """ self.raw_result = result if not self.__transformer is None: if __debug__: debug("SA_", "Applying transformer %s" % self.__transformer) result = self.__transformer(result) # estimate the NULL distribution when functor is given if not self.__null_dist is None: if __debug__: debug("SA_", "Estimating NULL distribution using %s" % self.__null_dist) # we need a matching datameasure instance, but we have to disable # the estimation of the null distribution in that child to prevent # infinite looping. measure = copy.copy(self) measure.__null_dist = None self.__null_dist.fit(measure, dataset) if self.states.isEnabled('null_t'): # get probability under NULL hyp, but also request # either it belong to the right tail null_prob, null_right_tail = \ self.__null_dist.p(result, return_tails=True) self.null_prob = null_prob externals.exists('scipy', raiseException=True) from scipy.stats import norm # TODO: following logic should appear in NullDist, # not here tail = self.null_dist.tail if tail == 'left': acdf = N.abs(null_prob) elif tail == 'right': acdf = 1.0 - N.abs(null_prob) elif tail in ['any', 'both']: acdf = 1.0 - N.clip(N.abs(null_prob), 0, 0.5) else: raise RuntimeError, 'Unhandled tail %s' % tail # We need to clip to avoid non-informative inf's ;-) # that happens due to lack of precision in mantissa # which is 11 bits in double. We could clip values # around 0 at as low as 1e-100 (correspond to z~=21), # but for consistency lets clip at 1e-16 which leads # to distinguishable value around p=1 and max z=8.2. # Should be sufficient range of z-values ;-) clip = 1e-16 null_t = norm.ppf(N.clip(acdf, clip, 1.0 - clip)) null_t[~null_right_tail] *= -1.0 # revert sign for negatives self.null_t = null_t # store else: # get probability of result under NULL hypothesis if available # and don't request tail information self.null_prob = self.__null_dist.p(result) return result
def test_confusion_based_error(self, l_clf):
    train = datasets['uni2medium']
    train = train[train.sa.train == 1]
    # to check if we fail to classify for 3 labels
    test3 = datasets['uni3medium']
    test3 = test3[test3.sa.train == 1]

    err = ConfusionBasedError(clf=l_clf)
    terr = TransferMeasure(l_clf, Splitter('train', attr_values=[1, 1]),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'))

    self.failUnlessRaises(UnknownStateError, err, None)
    """Shouldn't be able to access the state yet"""

    l_clf.train(train)
    e, te = err(None), terr(train)
    te = np.asscalar(te)
    self.failUnless(abs(e - te) < 1e-10,
        msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

    # this will print nasty WARNING but it is ok -- it is just checking code
    # NB warnings are not printed while doing whole testing
    warning("Don't worry about the following warning.")
    if 'multiclass' in l_clf.__tags__:
        self.failIf(terr(test3) is None)

    # try copying the beast
    terr_copy = copy(terr)

def _call(self, dataset):
    analyzers = []
    # create analyzers
    for clf in self.clf.clfs:
        if self.__analyzer is None:
            analyzer = clf.get_sensitivity_analyzer(**self._slave_kwargs)
            if analyzer is None:
                raise ValueError(
                    "Wasn't able to figure out a basic analyzer for clf %r"
                    % (clf,))
            if __debug__:
                debug("SA", "Selected analyzer %r for clf %r"
                      % (analyzer, clf))
        else:
            # XXX shallow copy should be enough...
            analyzer = copy.copy(self.__analyzer)

        # assign corresponding classifier
        analyzer.clf = clf
        # if clf was trained already - don't train again
        if clf.trained:
            analyzer._force_train = False
        analyzers.append(analyzer)

    self.__combined_analyzer.analyzers = analyzers

    # XXX not sure if we don't want to call directly ._call(dataset) to avoid
    # double application of transformers/combiners, after all we are just
    # 'proxying' here to combined_analyzer...
    # YOH: decided -- lets call ._call
    return self.__combined_analyzer._call(dataset)

def test_ds_shallowcopy():
    # lets use some instance of somewhat evolved dataset
    ds = normal_feature_dataset()
    ds.samples = ds.samples.view(myarray)

    # SHALLOW copy the beast
    ds_ = copy.copy(ds)
    # verify that we have the same data
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)
    # array subclass survives
    ok_(isinstance(ds_.samples, myarray))

    # modify and see that we actually DO change the data in both
    ds_.samples[0, 0] = 1234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.targets[0] = 'ab'
    ds_.sa.chunks[0] = 234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)
    ok_(ds.sa.targets[0] == 'ab')
    ok_(ds.sa.chunks[0] == 234)

def __iadd__(self, other):
    """Add the sets from `other`'s `SummaryStatistics` to the current one
    """
    #print "adding ", other, " to ", self
    # need to do a shallow copy, or otherwise something like "cm += cm"
    # would loop forever and exhaust memory eventually
    othersets = copy.copy(other.__sets)
    for set_ in othersets:
        self.add(*set_)
    return self

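# --- Illustrative aside (hypothetical Stats class, not the original) ---
# Why the shallow copy above matters: if `other is self`, iterating over
# self's own set list while add() appends to it would never terminate.
# Copying the list first freezes the iteration snapshot:
import copy

class Stats(object):
    def __init__(self):
        self._sets = []

    def add(self, item):
        self._sets.append(item)

    def __iadd__(self, other):
        # snapshot of the list; safe even when other is self
        for item in copy.copy(other._sets):
            self.add(item)
        return self

s = Stats()
s.add(1)
s += s                       # would loop forever without the copy
assert s._sets == [1, 1]
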
def __getitem__(self, key):
    # if just one is requested return just one, otherwise return a
    # NodeChain again
    if isinstance(key, int):
        return self._nodes[key]
    else:
        # operate on shallow copy of self
        sliced = copy.copy(self)
        sliced._nodes = self._nodes[key]
        return sliced

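# --- Illustrative aside (hypothetical Chain class, not the original NodeChain) ---
# The pattern above in miniature: integer indexing unwraps a single node,
# while slicing returns a container of the same type that shares the node
# objects but owns its own node list:
import copy

class Chain(object):
    def __init__(self, nodes):
        self._nodes = nodes

    def __getitem__(self, key):
        if isinstance(key, int):
            return self._nodes[key]
        sliced = copy.copy(self)         # shallow: nodes themselves are shared
        sliced._nodes = self._nodes[key]
        return sliced

c = Chain(['a', 'b', 'c'])
assert c[1] == 'b'                       # int -> bare node
assert isinstance(c[0:2], Chain)         # slice -> new Chain
assert c[0:2]._nodes == ['a', 'b']
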
def _sl_call(self, dataset, roi_ids, nproc):
    """Classical generic searchlight implementation
    """
    # compute
    if nproc > 1:
        # split all target ROIs centers into `nproc` equally sized blocks
        nproc_needed = min(len(roi_ids), nproc)
        roi_blocks = np.array_split(roi_ids, nproc_needed)

        # the next block sets up the infrastructure for parallel computing
        # this can easily be changed into a ParallelPython loop, if we
        # decide to have a PP job server in PyMVPA
        import pprocess
        p_results = pprocess.Map(limit=nproc_needed)
        if __debug__:
            debug('SLC', "Starting off child processes for nproc=%i"
                  % nproc_needed)
        compute = p_results.manage(
                    pprocess.MakeParallel(self._proc_block))
        for block in roi_blocks:
            # should we maybe deepcopy the measure to have a unique and
            # independent one per process?
            compute(block, dataset, copy.copy(self.__datameasure))

        # collect results
        results = []
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = []
        else:
            roi_sizes = None

        for r, rsizes in p_results:
            results += r
            if roi_sizes is not None:
                roi_sizes += rsizes
    else:
        # otherwise collect the results in a list
        results, roi_sizes = \
            self._proc_block(roi_ids, dataset, self.__datameasure)

    if __debug__ and 'SLC' in debug.active:
        debug('SLC', '')            # just newline
        resshape = len(results) and np.asanyarray(results[0]).shape or 'N/A'
        debug('SLC', ' hstacking %d results of shape %s'
              % (len(results), resshape))

    # but be careful: this call also serves as conversion from parallel maps
    # to regular lists!
    # this uses the Dataset-hstack
    results = hstack(results)

    if __debug__:
        debug('SLC', " hstacked shape %s" % (results.shape,))

    return results, roi_sizes

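# --- Illustrative aside (numpy only, not part of the original class) ---
# np.array_split tolerates uneven divisions, but with more workers than ROIs
# it would produce empty blocks; the min(len(roi_ids), nproc) above avoids
# spawning child processes that receive no work:
import numpy as np

roi_ids = np.arange(5)
print([b.tolist() for b in np.array_split(roi_ids, 8)])
# -> [[0], [1], [2], [3], [4], [], [], []]  (three useless empty blocks)
nproc_needed = min(len(roi_ids), 8)
print([b.tolist() for b in np.array_split(roi_ids, nproc_needed)])
# -> [[0], [1], [2], [3], [4]]
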
def __call__(self, *args, **kwargs):
    """
    """
    # prepare complex result structure for all calls and their respective
    # attributes: calls x dict(attributes x loop iterations)
    results = [dict([('result', [])] + [(a, []) for a in c.attribs])
               for c in self.__calls]

    # Lets do it!
    for (i, X) in enumerate(self.__source(*args, **kwargs)):
        for (c, call) in enumerate(self.__calls):
            # sanity check
            if i == 0 and call.expand_args and not isSequenceType(X):
                raise RuntimeError(
                    "Cannot expand non-sequence result from %s"
                    % repr(self.__source))

            # apply argument filter (and reorder) if requested
            if call.argfilter:
                filtered_args = [X[f] for f in call.argfilter]
            else:
                filtered_args = X

            if call.expand_args:
                result = call.call(*filtered_args)
            else:
                result = call.call(filtered_args)

            # if __debug__:
            #     debug("LOOP", "Iteration %i on call %s. Got result %s"
            #           % (i, repr(self.__call), repr(result)))

            results[c]['result'].append(result)

            for attrib in call.attribs:
                attrv = call.call.__getattribute__(attrib)
                if call.copy_attribs:
                    attrv = copy(attrv)
                results[c][attrib].append(attrv)

    # reduce results structure
    if self.__simplify_results:
        # get rid of dictionary if just the results are requested
        for (c, call) in enumerate(self.__calls):
            if not len(call.attribs):
                results[c] = results[c]['result']
        if len(self.__calls) == 1:
            results = results[0]

    return results

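# --- Illustrative aside (plain Python, not the original harness) ---
# The argfilter step above just selects and reorders positional arguments
# before the call; with a hypothetical filter in isolation:
X = ('alpha', 'beta', 'gamma')
argfilter = [2, 0]                       # pick items 2 and 0, in that order
filtered = [X[f] for f in argfilter]
assert filtered == ['gamma', 'alpha']
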
def _precall(self, ds):
    # estimate the NULL distribution when functor is given
    if self.__null_dist is not None:
        if __debug__:
            debug("SA_", "Estimating NULL distribution using %s"
                  % self.__null_dist)

        # we need a matching measure instance, but we have to disable
        # the estimation of the null distribution in that child to prevent
        # infinite looping.
        measure = copy.copy(self)
        measure.__null_dist = None
        self.__null_dist.fit(measure, ds)

def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 1)), targets=range(5), chunks=1)
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 1, targets=range(5), chunks=2))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 2, targets=range(5), chunks=3))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 3, targets=range(5), chunks=4))
    ds.append(Dataset.from_wizard(np.ones((5, 1)) + 4, targets=range(5), chunks=5))
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [2, 2, 2, 2, 2])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())

    # keep the orig labels -- needs a real copy, since ds.targets[:] would
    # only be a view tracking the in-place permutation below
    orig_labels = ds.targets.copy()
    # also keep the orig dataset, but SHALLOW copy and leave everything
    # else as a view!
    ods = copy.copy(ds)

    ds.permute_targets()
    # some permutation should have happened
    assert_false((ds.targets == orig_labels).all())
    # but the original dataset should be unaffected
    assert_array_equal(ods.targets, orig_labels)
    # array subclass survives
    ok_(isinstance(ods.samples, myarray))

    # samples are really shared
    ds.samples[0, 0] = 123456
    assert_array_equal(ds.samples, ods.samples)
    # and other samples attributes too
    ds.chunks[0] = 9876
    assert_array_equal(ds.chunks, ods.chunks)

    # try to permute on custom target
    ds = ods.copy()
    otargets = ods.sa.targets.copy()
    ds.sa['custom'] = ods.sa.targets.copy()
    assert_array_equal(ds.sa.custom, otargets)
    assert_array_equal(ds.sa.targets, otargets)

    ds.permute_targets(targets_attr='custom')
    # original targets should still match
    assert_array_equal(ds.sa.targets, otargets)
    # but custom should get permuted
    assert_false((ds.sa.custom == otargets).all())

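# --- Illustrative aside (plain numpy, not part of the test module) ---
# The .copy() above is essential: for ndarrays, arr[:] returns a *view*, not
# an independent copy (unlike Python lists), so an in-place permutation would
# silently change the "saved" labels too:
import numpy as np

targets = np.array([0, 1, 2, 3, 4])
view = targets[:]
snapshot = targets.copy()
targets[0] = 99
assert view[0] == 99          # the view tracks in-place changes
assert snapshot[0] == 0       # only a real copy preserves the original
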
def as_descrete_time(self, dt, storeoffset=False, offsetattr='offset'):
    """Convert `onset` and `duration` information into discrete timepoints.

    Parameters
    ----------
    dt : float
      Temporal distance between two timepoints in the same unit as `onset`
      and `duration`.
    storeoffset : bool
      If True, the temporal offset between the original `onset` and the
      discretized `onset` is stored as an additional item.
    offsetattr : str
      The name of the attribute that is used to store the computed offset
      in case `storeoffset` is enabled.

    Returns
    -------
    A copy of the original `Event` with `onset` and optionally `duration`
    replaced by their corresponding discrete timepoint. The new onset will
    correspond to the timepoint just before or exactly at the original
    onset. The new duration will be the number of timepoints covering the
    event from the computed onset timepoint till the timepoint exactly at
    the end, or just after the event. Note again that the new values are
    expressed as #timepoints and not in their original unit!
    """
    dt = float(dt)
    onset = self['onset']
    out = copy(self)

    # get the timepoint just prior to the onset
    out['onset'] = int(np.floor(onset / dt))

    if storeoffset:
        # compute offset
        offset = onset - (out['onset'] * dt)
        out[offsetattr] = offset

    if 'duration' in out:
        # how many timepoints cover the event (from the computed onset
        # to the one timepoint just after the end of the event)
        out['duration'] = int(np.ceil((onset + out['duration']) / dt)
                              - out['onset'])

    return out

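# --- Illustrative aside (bare arithmetic, no Event class required) ---
# A worked instance of the conversion above, assuming dt=2.5 and a
# hypothetical event with onset=3.7 and duration=4.0 (same units):
import numpy as np

dt, onset, duration = 2.5, 3.7, 4.0
new_onset = int(np.floor(onset / dt))                             # -> 1
offset = onset - new_onset * dt                                   # -> 1.2
new_duration = int(np.ceil((onset + duration) / dt)) - new_onset  # -> 3
assert (new_onset, new_duration) == (1, 3)
# i.e. timepoints 1..3 (at 2.5, 5.0, 7.5) cover the event spanning
# [3.7, 7.7] on a 2.5-unit grid
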
def _sl_call(self, dataset, roi_ids, nproc):
    """Classical generic searchlight implementation
    """
    # compute
    if nproc > 1:
        # split all target ROIs centers into `nproc` equally sized blocks
        roi_blocks = np.array_split(roi_ids, nproc)

        # the next block sets up the infrastructure for parallel computing
        # this can easily be changed into a ParallelPython loop, if we
        # decide to have a PP job server in PyMVPA
        import pprocess
        p_results = pprocess.Map(limit=nproc)
        if __debug__:
            debug("SLC", "Starting off child processes for nproc=%i" % nproc)
        compute = p_results.manage(pprocess.MakeParallel(self._proc_block))
        for block in roi_blocks:
            # should we maybe deepcopy the measure to have a unique and
            # independent one per process?
            compute(block, dataset, copy.copy(self.__datameasure))

        # collect results
        results = []
        if self.ca.is_enabled("roi_sizes"):
            roi_sizes = []
        else:
            roi_sizes = None

        for r, rsizes in p_results:
            results += r
            if roi_sizes is not None:
                roi_sizes += rsizes
    else:
        # otherwise collect the results in a list
        results, roi_sizes = self._proc_block(roi_ids, dataset,
                                              self.__datameasure)

    if __debug__:
        debug("SLC", "")

    # but be careful: this call also serves as conversion from parallel maps
    # to regular lists!
    # this uses the Dataset-hstack
    results = hstack(results)

    return results, roi_sizes

def testConfusionBasedError(self, l_clf):
    train = datasets['uni2medium_train']
    # to check if we fail to classify for 3 labels
    test3 = datasets['uni3medium_train']

    err = ConfusionBasedError(clf=l_clf)
    terr = TransferError(clf=l_clf)

    self.failUnlessRaises(UnknownStateError, err, None)
    """Shouldn't be able to access the state yet"""

    l_clf.train(train)
    self.failUnlessEqual(err(None), terr(train),
        msg="ConfusionBasedError should be equal to TransferError on"
            " traindataset")

    # this will print nasty WARNING but it is ok -- it is just checking code
    # NB warnings are not printed while doing whole testing
    self.failIf(terr(test3) is None)

    # try copying the beast
    terr_copy = copy(terr)

def test_confusion_based_error(self, l_clf):
    train = datasets['uni2medium_train']
    # to check if we fail to classify for 3 labels
    test3 = datasets['uni3medium_train']

    err = ConfusionBasedError(clf=l_clf)
    terr = TransferError(clf=l_clf)

    self.failUnlessRaises(UnknownStateError, err, None)
    """Shouldn't be able to access the state yet"""

    l_clf.train(train)
    e, te = err(None), terr(train)
    self.failUnless(abs(e - te) < 1e-10,
        msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

    # this will print nasty WARNING but it is ok -- it is just checking code
    # NB warnings are not printed while doing whole testing
    warning("Don't worry about the following warning.")
    self.failIf(terr(test3) is None)

    # try copying the beast
    terr_copy = copy(terr)

def _call(self, dataset, testdataset):
    """Proceed and select the features recursively eliminating less
    important ones.

    Parameters
    ----------
    dataset : Dataset
      used to select features and train classifiers to determine the
      transfer error.
    testdataset : Dataset
      used to test the trained classifier on a certain feature set
      to determine the transfer error.

    Returns
    -------
    A tuple with the dataset containing the feature subset of `dataset`
    that had the lowest transfer error of all tested sets until the
    stopping criterion was reached. The tuple also contains a dataset
    with the corresponding features from the `testdataset`.
    """
    errors = []
    """Computed error for each tested feature set."""

    # feature candidates are all features in the pattern object
    candidates = range(dataset.nfeatures)

    # initially empty list of selected features
    selected = []

    # results in here please
    results = None

    # as long as there are candidates left
    # the loop will most likely get broken earlier if the stopping
    # criterion is reached
    while len(candidates):
        # measures for all candidates
        measures = []

        # for all possible candidates
        for i, candidate in enumerate(candidates):
            if __debug__:
                debug('IFSC', "Tested %i" % i, cr=True)

            # take the new candidate and all already selected features
            # select a new temporary feature subset from the dataset
            # XXX assume MappedDataset and issue plain=True ??
            tmp_dataset = dataset[:, selected + [candidate]]

            # compute data measure on this feature set
            measures.append(self.__data_measure(tmp_dataset))

        measures = [np.asscalar(m) for m in measures]

        # Select promising feature candidates (staging)
        # IDs are only applicable to the current set of feature candidates
        tmp_staging_ids = self.__feature_selector(measures)

        # translate into real candidate ids
        staging_ids = [candidates[i] for i in tmp_staging_ids]

        # mark them as selected and remove from candidates
        selected += staging_ids
        for i in staging_ids:
            candidates.remove(i)

        # compute transfer error for the new set
        # XXX assume MappedDataset and issue plain=True ??
        error = self.__transfer_error(testdataset[:, selected],
                                      dataset[:, selected])
        errors.append(error)

        # Check if it is time to stop and if we got
        # the best result
        stop = self.__stopping_criterion(errors)
        isthebest = self.__bestdetector(errors)

        if __debug__:
            debug('IFSC',
                  "nselected %i; error: %.4f best/stop=%d/%d\n"
                  % (len(selected), errors[-1], isthebest, stop),
                  cr=True, lf=True)

        if isthebest:
            # do copy to survive later selections
            results = copy(selected)

        # leave the loop when the criterion is reached
        if stop:
            break

    # charge state
    self.ca.errors = errors

    # best dataset ever is returned
    return dataset[:, results], testdataset[:, results]

def _train(self, ds):
    """Proceed and select the features recursively eliminating less
    important ones.

    Parameters
    ----------
    ds : Dataset
      used to compute sensitivity maps and train a classifier to
      determine the transfer error; it is split into train and test
      parts internally (see `_get_traintest_ds`).
    """
    # get the initial split into train and test
    dataset, testdataset = self._get_traintest_ds(ds)

    errors = []
    """Computed error for each tested feature set."""

    ca = self.ca
    ca.nfeatures = []
    """Number of features at each step. Since it is not used by the
    algorithm it is stored directly in the conditional attribute"""

    ca.history = np.arange(dataset.nfeatures)
    """Store the last step # when the feature was still present
    """

    ca.sensitivities = []

    stop = False
    """Flag for when RFE should be stopped."""

    results = None
    """Will hold the best feature set ever."""

    wdataset = dataset
    """Operate on a working dataset, initially identical."""

    wtestdataset = testdataset
    """The same feature selection has to be performed on the test dataset
    as well. This will hold the current test dataset."""

    step = 0
    """Counter of how many selection steps were done."""

    orig_feature_ids = np.arange(dataset.nfeatures)
    """List of feature ids as per the original dataset remaining at any
    given step"""

    sensitivity = None
    """Contains the latest sensitivity map."""

    result_selected_ids = orig_feature_ids
    """Resultant ids of selected features. Since the best set is not
    necessarily the last one, we better keep this one around. By
    default -- all features are there"""
    selected_ids = result_selected_ids

    while wdataset.nfeatures > 0:

        if __debug__:
            debug('RFEC', "Step %d: nfeatures=%d"
                  % (step, wdataset.nfeatures))

        # mark the features which are present at this step
        # if it brings any mentionable computational burden in the future,
        # only mark removed features at each step
        ca.history[orig_feature_ids] = step

        # Compute sensitivity map
        if self.__update_sensitivity or sensitivity is None:
            sensitivity = self._fmeasure(wdataset)
            if len(sensitivity) > 1:
                raise ValueError(
                    "RFE cannot handle multiple sensitivities at once. "
                    "'%s' returned %i sensitivities."
                    % (self._fmeasure.__class__.__name__, len(sensitivity)))

        if ca.is_enabled("sensitivities"):
            ca.sensitivities.append(sensitivity)

        # get error for current feature set (handles optional retraining)
        error = self._evaluate_pmeasure(wdataset, wtestdataset)
        # Record the error
        errors.append(np.asscalar(error))

        # Check if it is time to stop and if we got
        # the best result
        stop = self._stopping_criterion(errors)
        isthebest = self._bestdetector(errors)

        nfeatures = wdataset.nfeatures

        if ca.is_enabled("nfeatures"):
            ca.nfeatures.append(wdataset.nfeatures)

        # store result
        if isthebest:
            result_selected_ids = orig_feature_ids

        if __debug__:
            debug('RFEC',
                  "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d "
                  % (step, nfeatures, np.asscalar(error), isthebest, stop))

        # stop if it is time to finish
        if nfeatures == 1 or stop:
            break

        # Select features to preserve
        selected_ids = self._fselector(sensitivity)

        if __debug__:
            debug('RFEC_',
                  "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s"
                  % (sensitivity, len(selected_ids), selected_ids))

        # Create a dataset only with selected features
        wdataset = wdataset[:, selected_ids]

        # select corresponding sensitivity values if they are not
        # recomputed
        if not self.__update_sensitivity:
            sensitivity = sensitivity[selected_ids]

        # need to update the test dataset as well
        # XXX why should it ever become None?
        # yoh: because we can have __transfer_error computed
        #      using wdataset. See xia-generalization estimate
        #      in lightsvm. Or for god's sake leave-one-out
        #      on a wdataset
        # TODO: document these cases in this class
        if testdataset is not None:
            wtestdataset = wtestdataset[:, selected_ids]

        step += 1

        # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
        selected_ids.sort()
        if self.ca.is_enabled("history") \
               or self.ca.is_enabled('selected_ids'):
            orig_feature_ids = orig_feature_ids[selected_ids]

    # we already have the initial sensitivities, so even for a shared
    # classifier we can clean up here
    self._pmeasure.untrain()

    # charge conditional attributes
    self.ca.errors = errors
    self.ca.selected_ids = result_selected_ids

    # announce desired features to the underlying slice mapper
    # do copy to survive later selections
    self._safe_assign_slicearg(copy(result_selected_ids))

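# --- Illustrative aside (toy greedy elimination, not the RFE class above) ---
# The core control flow of _train in miniature: score the surviving features,
# record the error, keep the best set seen so far, and discard the weakest
# feature. The sensitivities and per-step errors here are hypothetical:
import numpy as np

sens = np.array([0.1, 0.9, 0.3, 0.7])            # hypothetical sensitivities
ids = np.arange(len(sens))
step_errors = iter([0.35, 0.22, 0.18, 0.25])     # hypothetical transfer errors
errors, best_ids = [], ids.copy()
while len(ids):
    errors.append(next(step_errors))
    if errors[-1] == min(errors):                # "best detector"
        best_ids = ids.copy()
    if len(ids) == 1:                            # stopping criterion
        break
    keep = sens[ids].argsort()[1:]               # eliminate weakest feature
    ids = ids[keep]
print(sorted(best_ids.tolist()))                 # -> [1, 3]
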
def _train(self, ds):
    # local binding
    fmeasure = self._fmeasure
    fselector = self._fselector
    scriterion = self._stopping_criterion
    bestdetector = self._bestdetector

    # init
    # Computed error for each tested feature set.
    errors = []
    # feature candidates are all features in the pattern object
    candidates = range(ds.nfeatures)
    # initially empty list of selected features
    selected = []
    # results in here please
    results = None

    # as long as there are candidates left
    # the loop will most likely get broken earlier if the stopping
    # criterion is reached
    while len(candidates):
        # measures for all candidates
        measures = []

        # for all possible candidates
        for i, candidate in enumerate(candidates):
            if __debug__:
                debug('IFSC', "Tested %i" % i, cr=True)

            # take the new candidate and all already selected features
            # select a new temporary feature subset from the dataset
            # slice the full dataset, because for the initial iteration
            # steps this will be much more efficient than splitting the
            # full ds into train and test at first
            fslm = StaticFeatureSelection(selected + [candidate])
            fslm.train(ds)
            candidate_ds = fslm(ds)
            # activate the dataset splitter
            dsgen = self._splitter.generate(candidate_ds)
            # and derive the dataset part that is used for computing the
            # selection criterion
            trainds = dsgen.next()
            # compute data measure on the training part of this feature set
            measures.append(fmeasure(trainds))

        # relies on ds.item() to work properly
        measures = [np.asscalar(m) for m in measures]

        # Select promising feature candidates (staging)
        # IDs are only applicable to the current set of feature candidates
        tmp_staging_ids = fselector(measures)

        # translate into real candidate ids
        staging_ids = [candidates[i] for i in tmp_staging_ids]

        # mark them as selected and remove from candidates
        selected += staging_ids
        for i in staging_ids:
            candidates.remove(i)

        # actually run the performance measure to estimate "quality" of
        # selection
        fslm = StaticFeatureSelection(selected)
        fslm.train(ds)
        selectedds = fslm(ds)
        # split into train and test part
        trainds, testds = self._get_traintest_ds(selectedds)
        # evaluate and store
        error = self._evaluate_pmeasure(trainds, testds)
        errors.append(np.asscalar(error))

        # intermediate cleanup, so the datasets do not hang around while
        # the next candidate evaluation is computed
        del trainds
        del testds

        # Check if it is time to stop and if we got
        # the best result
        stop = scriterion(errors)
        isthebest = bestdetector(errors)

        if __debug__:
            debug('IFSC',
                  "nselected %i; error: %.4f best/stop=%d/%d\n"
                  % (len(selected), errors[-1], isthebest, stop),
                  cr=True, lf=True)

        if isthebest:
            # announce desired features to the underlying slice mapper
            # do copy to survive later selections
            self._safe_assign_slicearg(copy(selected))

        # leave the loop when the criterion is reached
        if stop:
            break

    # charge state
    self.ca.errors = errors

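# --- Illustrative aside (toy forward selection, not the IFS class above) ---
# The complement of recursive elimination: start from an empty set and at
# each step admit the candidate whose addition scores best, until a stopping
# criterion fires. Scores and the two-feature cut-off are hypothetical:
candidates, selected = [0, 1, 2, 3], []
scores = {0: 0.6, 1: 0.9, 2: 0.7, 3: 0.5}        # hypothetical per-feature merit
while candidates:
    best = max(candidates, key=lambda c: scores[c])
    selected.append(best)
    candidates.remove(best)
    if len(selected) == 2:                       # stand-in stopping criterion
        break
assert selected == [1, 2]
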
def __copy__(self):
    # XXX how do we safely and exhaustively copy a node?
    return self.__class__([copy.copy(n) for n in self])

def __add__(self, other):
    """Add two `SummaryStatistics`s
    """
    result = copy.copy(self)
    result += other
    return result

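# --- Illustrative aside (generic idiom, not the SummaryStatistics class) ---
# __add__ above is the standard "copy, then delegate to +=" idiom: any class
# with a working __iadd__ gets a non-mutating + for free. One detail worth
# noting: after a shallow copy, mutable containers must not stay shared:
import copy

class Bag(object):
    def __init__(self, items=()):
        self.items = list(items)

    def __iadd__(self, other):
        self.items.extend(other.items)
        return self

    def __add__(self, other):
        result = copy.copy(self)
        result.items = list(self.items)   # don't share the mutable list
        return result.__iadd__(other)

a, b = Bag([1]), Bag([2])
c = a + b
assert c.items == [1, 2] and a.items == [1]   # a is untouched
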
def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 10)), targets=range(5), chunks=1)
    for i in xrange(1, 5):
        ds.append(Dataset.from_wizard(np.ones((5, 10)) + i,
                                      targets=range(5), chunks=i + 1))
    # assign some feature attributes
    ds.fa['roi'] = np.repeat(np.arange(5), 2)
    ds.fa['lucky'] = np.arange(10) % 2
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [2, 2, 2, 2, 2])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())

    # keep the orig labels
    orig_labels = ds.targets.copy()
    # also keep the orig dataset, but SHALLOW copy and leave everything
    # else as a view!
    ods = copy.copy(ds)

    ds.permute_attr()
    # by default, some permutation of targets should have happened
    assert_false((ds.targets == orig_labels).all())
    # but the original dataset should be unaffected
    assert_array_equal(ods.targets, orig_labels)
    # array subclass survives
    ok_(isinstance(ods.samples, myarray))

    # samples are really shared
    ds.samples[0, 0] = 123456
    assert_array_equal(ds.samples, ods.samples)
    # and other samples attributes too
    ds.chunks[0] = 9876
    assert_array_equal(ds.chunks, ods.chunks)

    # try to permute on custom target
    ds = ods.copy()
    otargets = ods.sa.targets.copy()
    ds.sa['custom'] = ods.sa.targets.copy()
    assert_array_equal(ds.sa.custom, otargets)
    assert_array_equal(ds.sa.targets, otargets)

    ds.permute_attr(attr='custom')
    # original targets should still match
    assert_array_equal(ds.sa.targets, otargets)
    # but custom should get permuted
    assert_false((ds.sa.custom == otargets).all())

    #
    # Test permutation among features
    #
    assert_raises(KeyError, ds.permute_attr, attr='roi')  # wrong collection
    ds = ods.copy()
    ds.permute_attr(attr='lucky', chunks_attr='roi', col='fa')
    # we should have not touched samples attributes
    for sa in ds.sa.keys():
        assert_array_equal(ds.sa[sa].value, ods.sa[sa].value)
    # but we should have changed 'lucky'
    assert_false((ds.fa['lucky'].value == ods.fa['lucky'].value).all())
    assert_array_equal(ds.fa['roi'].value, ods.fa['roi'].value)
    # permute 'roi' as well, without chunking (??? should we make
    # chunks_attr=None by default?)
    ds.permute_attr(attr='roi', chunks_attr=None, col='fa')
    assert_false((ds.fa['roi'].value == ods.fa['roi'].value).all())

def _call(self, dataset): """Perform the ROI search. """ # local binding nproc = self.__nproc if nproc is None and externals.exists('pprocess'): import pprocess try: nproc = pprocess.get_number_of_cores() or 1 except AttributeError: warning("pprocess version %s has no API to figure out maximal " "number of cores. Using 1" % externals.versions['pprocess']) nproc = 1 # train the queryengine self.__qe.train(dataset) # decide whether to run on all possible center coords or just a provided # subset if self.__center_ids is not None: roi_ids = self.__center_ids # safeguard against stupidity if __debug__: if max(roi_ids) >= dataset.nfeatures: raise IndexError, \ "Maximal center_id found is %s whenever given " \ "dataset has only %d features" \ % (max(roi_ids), dataset.nfeatures) else: roi_ids = np.arange(dataset.nfeatures) # compute if nproc > 1: # split all target ROIs centers into `nproc` equally sized blocks roi_blocks = np.array_split(roi_ids, nproc) # the next block sets up the infrastructure for parallel computing # this can easily be changed into a ParallelPython loop, if we # decide to have a PP job server in PyMVPA import pprocess p_results = pprocess.Map(limit=nproc) compute = p_results.manage( pprocess.MakeParallel(self._proc_block)) for block in roi_blocks: # should we maybe deepcopy the measure to have a unique and # independent one per process? compute(block, dataset, copy.copy(self.__datameasure)) # collect results results = [] if self.ca.is_enabled('roisizes'): roisizes = [] else: roisizes = None for r, rsizes in p_results: results += r if not roisizes is None: roisizes += rsizes else: # otherwise collect the results in a list results, roisizes = \ self._proc_block(roi_ids, dataset, self.__datameasure) if not roisizes is None: self.ca.roisizes = roisizes if __debug__: debug('SLC', '') # but be careful: this call also serves as conversion from parallel maps # to regular lists! # this uses the Dataset-hstack results = hstack(results) if 'mapper' in dataset.a: # since we know the space we can stick the original mapper into the # results as well if self.__center_ids is None: results.a['mapper'] = copy.copy(dataset.a.mapper) else: # there is an additional selection step that needs to be # expressed by another mapper mapper = copy.copy(dataset.a.mapper) mapper.append(FeatureSliceMapper(self.__center_ids, dshape=dataset.shape[1:])) results.a['mapper'] = mapper # charge state self.ca.raw_results = results # return raw results, base-class will take care of transformations return results
def _postcall(self, dataset, result): """Some postprocessing on the result """ self.ca.raw_results = result # post-processing if not self.__postproc is None: if __debug__: debug("SA_", "Applying mapper %s" % self.__postproc) result = self.__postproc.forward(result) # estimate the NULL distribution when functor is given if not self.__null_dist is None: if __debug__: debug("SA_", "Estimating NULL distribution using %s" % self.__null_dist) # we need a matching datameasure instance, but we have to disable # the estimation of the null distribution in that child to prevent # infinite looping. measure = copy.copy(self) measure.__null_dist = None self.__null_dist.fit(measure, dataset) if self.ca.is_enabled('null_t'): # get probability under NULL hyp, but also request # either it belong to the right tail null_prob, null_right_tail = \ self.__null_dist.p(result, return_tails=True) self.ca.null_prob = null_prob externals.exists('scipy', raise_=True) from scipy.stats import norm # TODO: following logic should appear in NullDist, # not here tail = self.null_dist.tail if tail == 'left': acdf = np.abs(null_prob) elif tail == 'right': acdf = 1.0 - np.abs(null_prob) elif tail in ['any', 'both']: acdf = 1.0 - np.clip(np.abs(null_prob), 0, 0.5) else: raise RuntimeError, 'Unhandled tail %s' % tail # We need to clip to avoid non-informative inf's ;-) # that happens due to lack of precision in mantissa # which is 11 bits in double. We could clip values # around 0 at as low as 1e-100 (correspond to z~=21), # but for consistency lets clip at 1e-16 which leads # to distinguishable value around p=1 and max z=8.2. # Should be sufficient range of z-values ;-) clip = 1e-16 null_t = norm.ppf(np.clip(acdf, clip, 1.0 - clip)) # assure that we deal with arrays: null_t = np.array(null_t, ndmin=1, copy=False) null_t[~null_right_tail] *= -1.0 # revert sign for negatives self.ca.null_t = null_t # store else: # get probability of result under NULL hypothesis if available # and don't request tail information self.ca.null_prob = self.__null_dist.p(result) return result