def __getitem__(self, args):
    """Slice the dataset and keep an attached mapper in sync with the slice.

    Parameters
    ----------
    args : slicing spec or tuple of slicing specs
      A single spec is wrapped into a 1-tuple. When a second spec is
      given it is treated as the feature selection.

    Returns
    -------
    The dataset produced by the base class ``__getitem__``. If a feature
    selection was given and the result carries a ``mapper`` attribute, a
    ``StaticFeatureSelection`` for that selection is appended to it.
    """
    # uniformize for the checks below; it is not a tuple if just a single
    # slicing spec is passed
    if not isinstance(args, tuple):
        args = (args,)

    has_feature_spec = len(args) > 1

    # if we get a slicing array for feature selection and it is *not* 1D,
    # try feeding it through the mapper (if there is any)
    # NOTE: membership is tested with `in` (as done for `ds.a` below)
    # instead of the Python-2-only `has_key()`
    if has_feature_spec \
       and isinstance(args[1], np.ndarray) \
       and len(args[1].shape) > 1 \
       and "mapper" in self.a:
        args = list(args)
        args[1] = self.a.mapper.forward1(args[1])
        args = tuple(args)

    # let the base do the work
    ds = super(Dataset, self).__getitem__(args)

    # and adjust the mapper (if any)
    if has_feature_spec and "mapper" in ds.a:
        # create matching mapper
        # the mapper is just appended to the dataset. It could also be
        # actually used to perform the slicing and prevent duplication of
        # functionality between the Dataset.__getitem__ and the mapper.
        # However, __getitem__ is sometimes more efficient, since it can
        # slice samples and feature axis at the same time. Moreover, the
        # mvpa.base.dataset.Dataset has no clue about mappers and should
        # be fully functional without them.
        subsetmapper = StaticFeatureSelection(
            args[1], dshape=self.samples.shape[1:])
        # do a no-op forward mapping of a single all-zero sample to charge
        # the output shape of the slice mapper without having to train it
        # on a full dataset (which is most likely more expensive)
        subsetmapper.forward(np.zeros((1,) + self.shape[1:], dtype="bool"))
        # mapper is ready to use -- simply store
        ds._append_mapper(subsetmapper)

    return ds
def test_subset_filler():
    """Check that ``filler`` only affects ``reverse()`` of a selection.

    Four mappers selecting the first three features are built: one without
    a filler and three with fillers 0, -1 and NaN. All of them must forward
    identically; the reversed output of the -1 and NaN variants must carry
    the filler in the columns that were not selected.
    """
    samples = np.arange(12).astype(float).reshape((2, -1))

    unfilled = StaticFeatureSelection(np.arange(3))
    zero_filled = StaticFeatureSelection(np.arange(3), filler=0)
    minus_one_filled = StaticFeatureSelection(np.arange(3), filler=-1)
    nan_filled = StaticFeatureSelection(np.arange(3), filler=np.nan)

    # reference output comes from the mapper without any filler
    unfilled.train(samples)
    expected = unfilled.forward(samples)

    # forward mapping must not depend on the filler
    for mapper in (unfilled, zero_filled, minus_one_filled, nan_filled):
        mapper.train(samples)
        assert_array_equal(expected, mapper.forward(samples))

    # columns beyond the selected ones get the filler on reverse
    restored = minus_one_filled.reverse(expected)
    ok_(np.all(restored[:, 3:] == -1))

    restored = nan_filled.reverse(expected)
    ok_(np.all(np.isnan(restored[:, 3:])))
def _train(self, ds):
    """Incrementally grow a feature selection and record per-step errors.

    In every round each remaining candidate feature is evaluated together
    with the already selected features using the feature measure, the
    feature selector picks the candidates to add, and the performance of
    the grown selection is evaluated on a train/test split. The selection
    is handed to ``_safe_assign_slicearg()`` whenever the best-detector
    flags the current error as the best one. The loop ends when the
    stopping criterion fires or no candidates are left.

    Parameters
    ----------
    ds : dataset
      Must provide ``nfeatures`` and be accepted by
      ``StaticFeatureSelection`` for training and mapping.

    Side effects
    ------------
    Stores the list of errors (one per round) in ``self.ca.errors``.
    """
    # local bindings
    fmeasure = self._fmeasure
    fselector = self._fselector
    scriterion = self._stopping_criterion
    bestdetector = self._bestdetector

    # computed error for each tested feature set
    errors = []
    # feature candidates are all features in the dataset; this must be a
    # real list (not a lazy range) because entries are removed below
    candidates = list(range(ds.nfeatures))
    # initially empty list of selected features
    selected = []

    # as long as there are candidates left
    # the loop will most likely get broken earlier if the stopping
    # criterion is reached
    while candidates:
        # measures for all candidates
        measures = []

        for i, candidate in enumerate(candidates):
            if __debug__:
                debug('IFSC', "Tested %i" % i, cr=True)

            # take the new candidate and all already selected features and
            # slice the full dataset; for the initial iteration steps this
            # is much more efficient than splitting the full ds into
            # train and test first
            fslm = StaticFeatureSelection(selected + [candidate])
            fslm.train(ds)
            candidate_ds = fslm(ds)

            # activate the dataset splitter and take the first part it
            # yields, which is used for computing the selection criterion
            dsgen = self._splitter.generate(candidate_ds)
            trainds = next(dsgen)

            # compute data measure on the training part of this feature set
            measures.append(fmeasure(trainds))

        # reduce every measure to a scalar; relies on .item() of the
        # measure's return value to work properly
        measures = [m.item() for m in measures]

        # select promising feature candidates (staging)
        # IDs are only applicable to the current set of feature candidates
        tmp_staging_ids = fselector(measures)

        # translate into real candidate ids
        staging_ids = [candidates[i] for i in tmp_staging_ids]

        # mark them as selected and remove from candidates
        selected += staging_ids
        for i in staging_ids:
            candidates.remove(i)

        # actually run the performance measure to estimate "quality" of
        # the selection
        fslm = StaticFeatureSelection(selected)
        fslm.train(ds)
        selectedds = fslm(ds)

        # split into train and test part
        trainds, testds = self._get_traintest_ds(selectedds)
        # evaluate and store
        error = self._evaluate_pmeasure(trainds, testds)
        errors.append(error.item())

        # intermediate cleanup, so the datasets do not hang around while
        # the next candidate evaluation is computed
        del trainds
        del testds

        # check if it is time to stop and if we got the best result
        stop = scriterion(errors)
        isthebest = bestdetector(errors)

        if __debug__:
            debug('IFSC',
                  "nselected %i; error: %.4f " \
                  "best/stop=%d/%d\n" \
                  % (len(selected), errors[-1], isthebest, stop),
                  cr=True, lf=True)

        if isthebest:
            # announce desired features to the underlying slice mapper
            # do copy to survive later selections
            self._safe_assign_slicearg(copy(selected))

        # leave the loop when the criterion is reached
        if stop:
            break

    # charge state
    self.ca.errors = errors
def test_subset():
    """Exercise ``StaticFeatureSelection`` forward/reverse and merging.

    Covers: rejection of a float selection array, identity behaviour of a
    full-slice selection, in-place merging (``+=``) of equivalent full
    selections with several equivalent subset specifications, a wrong-length
    input check, and ``__iadd__`` returning ``NotImplemented`` for
    unsupported operands.
    """
    # 4 samples x 16 features holding the values 0..63 in row-major order
    data = np.arange(64).reshape((4, 16))

    # float array doesn't work
    float_selection = StaticFeatureSelection(np.ones(16))
    assert_raises(IndexError, float_selection.forward, data)

    # full mask
    full = StaticFeatureSelection(slice(None))
    one_sample = data[0:1]
    # should not change single samples
    assert_array_equal(full.forward(one_sample.copy()), one_sample)
    # or multi-samples
    assert_array_equal(full.forward(data.copy()), data)
    full.train(data)
    # same on reverse
    assert_array_equal(full.reverse(one_sample.copy()), one_sample)
    # or multi-samples
    assert_array_equal(full.reverse(data.copy()), data)

    # identical mappers, specified as slice, integer ids and boolean mask
    base_mappers = [
        StaticFeatureSelection(slice(None)),
        StaticFeatureSelection(np.arange(16)),
        StaticFeatureSelection(np.ones(16, dtype='bool')),
    ]

    # test subsets
    sids = [3, 4, 5, 6]
    bsubset = np.zeros(16, dtype='bool')
    bsubset[sids] = True
    subsets = [sids, slice(3, 7), bsubset, [3, 3, 4, 4, 6, 6, 6, 5]]

    # the first three subsets are equivalent masks, hence they should do
    # the same to the mapper and result in identical behavior
    for base in base_mappers:
        for position, sub in enumerate(subsets):
            # shallow copy
            merged = copy(base)
            # should do copy-on-write for all important stuff!!
            merged += StaticFeatureSelection(sub)
            # test if selection did its job; the last subset is the
            # special case of multiplying features
            expected = sub if position == 3 else sids
            assert_array_equal(merged.forward1(data[0].copy()), expected)

    ## all of the above shouldn't change the original mapper
    #assert_array_equal(sm.get_mask(), np.arange(16))

    # check for some bug catcher
    # no 3D input
    #assert_raises(IndexError, sm.forward, np.ones((3,2,1)))
    # no input of wrong length
    if __debug__:
        # checked only in __debug__
        assert_raises(ValueError, full.forward, np.ones(4))
    # same on reverse
    #assert_raises(ValueError, sm.reverse, np.ones(16))
    # invalid ids
    #assert_false(subsm.is_valid_inid(-1))
    #assert_false(subsm.is_valid_inid(16))

    # intended merge failures
    fsm = StaticFeatureSelection(np.arange(16))
    for unsupported in (None, Dataset([2, 3, 4])):
        assert_equal(fsm.__iadd__(unsupported), NotImplemented)