def test_static_reverse_doesnt_work_after_feature_selection_tuneup_1(): ds_orig = datasets['uni2small'].copy() # doesn't matter which m = StaticFeatureSelection(np.arange(4)) m.train(ds_orig) ds = ds_orig.get_mapped(m) ds0_rev = ds.a.mapper.reverse1(ds.samples[0]) # should work assert_equal(ds0_rev.shape, (ds_orig.nfeatures,)) # direct feature selection ds_ = ds[:, [0, 2]] # should work but doesn't due to # RuntimeError: Cannot reverse-map data since the original data shape is unknown. Either set `dshape` in the constructor, or call train(). ds0_rev_ = ds_.a.mapper.reverse1(ds_.samples[0]) #ds0_rev_ = _verified_reverse1(ds_.a.mapper, ds_.samples[0]) assert_equal(ds0_rev_.shape, (ds_orig.nfeatures,))
def test_subset_filler(): sm = StaticFeatureSelection(np.arange(3)) sm_f0 = StaticFeatureSelection(np.arange(3), filler=0) sm_fm1 = StaticFeatureSelection(np.arange(3), filler=-1) sm_fnan = StaticFeatureSelection(np.arange(3), filler=np.nan) data = np.arange(12).astype(float).reshape((2, -1)) sm.train(data) data_forwarded = sm.forward(data) for m in (sm, sm_f0, sm_fm1, sm_fnan): m.train(data) assert_array_equal(data_forwarded, m.forward(data)) data_back_fm1 = sm_fm1.reverse(data_forwarded) ok_(np.all(data_back_fm1[:, 3:] == -1)) data_back_fnan = sm_fnan.reverse(data_forwarded) ok_(np.all(np.isnan(data_back_fnan[:, 3:])))
def test_subset_filler(): sm = StaticFeatureSelection(np.arange(3)) sm_f0 = StaticFeatureSelection(np.arange(3), filler=0) sm_fm1 = StaticFeatureSelection(np.arange(3), filler= -1) sm_fnan = StaticFeatureSelection(np.arange(3), filler=np.nan) data = np.arange(12).astype(float).reshape((2, -1)) sm.train(data) data_forwarded = sm.forward(data) for m in (sm, sm_f0, sm_fm1, sm_fnan): m.train(data) assert_array_equal(data_forwarded, m.forward(data)) data_back_fm1 = sm_fm1.reverse(data_forwarded) ok_(np.all(data_back_fm1[:, 3:] == -1)) data_back_fnan = sm_fnan.reverse(data_forwarded) ok_(np.all(np.isnan(data_back_fnan[:, 3:])))
def test_subset(): data = np.array( [[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]) # float array doesn't work sm = StaticFeatureSelection(np.ones(16)) assert_raises(IndexError, sm.forward, data) # full mask sm = StaticFeatureSelection(slice(None)) # should not change single samples assert_array_equal(sm.forward(data[0:1].copy()), data[0:1]) # or multi-samples assert_array_equal(sm.forward(data.copy()), data) sm.train(data) # same on reverse assert_array_equal(sm.reverse(data[0:1].copy()), data[0:1]) # or multi-samples assert_array_equal(sm.reverse(data.copy()), data) # identical mappers sm_none = StaticFeatureSelection(slice(None)) sm_int = StaticFeatureSelection(np.arange(16)) sm_bool = StaticFeatureSelection(np.ones(16, dtype='bool')) sms = [sm_none, sm_int, sm_bool] # test subsets sids = [3, 4, 5, 6] bsubset = np.zeros(16, dtype='bool') bsubset[sids] = True subsets = [sids, slice(3, 7), bsubset, [3, 3, 4, 4, 6, 6, 6, 5]] # all test subset result in equivalent masks, hence should do the same to # the mapper and result in identical behavior for st in sms: for i, sub in enumerate(subsets): # shallow copy orig = copy(st) subsm = StaticFeatureSelection(sub) # should do copy-on-write for all important stuff!! orig += subsm # test if selection did its job if i == 3: # special case of multiplying features assert_array_equal(orig.forward1(data[0].copy()), subsets[i]) else: assert_array_equal(orig.forward1(data[0].copy()), sids) ## all of the above shouldn't change the original mapper #assert_array_equal(sm.get_mask(), np.arange(16)) # check for some bug catcher # no 3D input #assert_raises(IndexError, sm.forward, np.ones((3,2,1))) # no input of wrong length if __debug__: # checked only in __debug__ assert_raises(ValueError, sm.forward, np.ones(4)) # same on reverse #assert_raises(ValueError, sm.reverse, np.ones(16)) # invalid ids #assert_false(subsm.is_valid_inid(-1)) #assert_false(subsm.is_valid_inid(16)) # intended merge failures fsm = StaticFeatureSelection(np.arange(16)) assert_equal(fsm.__iadd__(None), NotImplemented) assert_equal(fsm.__iadd__(Dataset([2, 3, 4])), NotImplemented)
def _train(self, ds): # local binding fmeasure = self._fmeasure fselector = self._fselector scriterion = self._stopping_criterion bestdetector = self._bestdetector # init # Computed error for each tested features set. errors = [] # feature candidate are all features in the pattern object candidates = list(range(ds.nfeatures)) # initially empty list of selected features selected = [] # results in here please results = None # as long as there are candidates left # the loop will most likely get broken earlier if the stopping # criterion is reached while len(candidates): # measures for all candidates measures = [] # for all possible candidates for i, candidate in enumerate(candidates): if __debug__: debug('IFSC', "Tested %i" % i, cr=True) # take the new candidate and all already selected features # select a new temporay feature subset from the dataset # slice the full dataset, because for the initial iteration # steps this will be much mure effecient than splitting the # full ds into train and test at first fslm = StaticFeatureSelection(selected + [candidate]) fslm.train(ds) candidate_ds = fslm(ds) # activate the dataset splitter dsgen = self._splitter.generate(candidate_ds) # and derived the dataset part that is used for computing the selection # criterion trainds = next(dsgen) # compute data measure on the training part of this feature set measures.append(fmeasure(trainds)) # relies on ds.item() to work properly measures = [np.asscalar(m) for m in measures] # Select promissing feature candidates (staging) # IDs are only applicable to the current set of feature candidates tmp_staging_ids = fselector(measures) # translate into real candidate ids staging_ids = [candidates[i] for i in tmp_staging_ids] # mark them as selected and remove from candidates selected += staging_ids for i in staging_ids: candidates.remove(i) # actually run the performance measure to estimate "quality" of # selection fslm = StaticFeatureSelection(selected) fslm.train(ds) selectedds = fslm(ds) # split into train and test part trainds, testds = self._get_traintest_ds(selectedds) # evaluate and store error = self._evaluate_pmeasure(trainds, testds) errors.append(np.asscalar(error)) # intermediate cleanup, so the datasets do not hand around while # the next candidate evaluation is computed del trainds del testds # Check if it is time to stop and if we got # the best result stop = scriterion(errors) isthebest = bestdetector(errors) if __debug__: debug('IFSC', "nselected %i; error: %.4f " \ "best/stop=%d/%d\n" \ % (len(selected), errors[-1], isthebest, stop), cr=True, lf=True) if isthebest: # announce desired features to the underlying slice mapper # do copy to survive later selections self._safe_assign_slicearg(copy(selected)) # leave the loop when the criterion is reached if stop: break # charge state self.ca.errors = errors
def _train(self, ds): # local binding fmeasure = self._fmeasure fselector = self._fselector scriterion = self._stopping_criterion bestdetector = self._bestdetector # init # Computed error for each tested features set. errors = [] # feature candidate are all features in the pattern object candidates = range(ds.nfeatures) # initially empty list of selected features selected = [] # results in here please results = None # as long as there are candidates left # the loop will most likely get broken earlier if the stopping # criterion is reached while len(candidates): # measures for all candidates measures = [] # for all possible candidates for i, candidate in enumerate(candidates): if __debug__: debug('IFSC', "Tested %i" % i, cr=True) # take the new candidate and all already selected features # select a new temporay feature subset from the dataset # slice the full dataset, because for the initial iteration # steps this will be much mure effecient than splitting the # full ds into train and test at first fslm = StaticFeatureSelection(selected + [candidate]) fslm.train(ds) candidate_ds = fslm(ds) # activate the dataset splitter dsgen = self._splitter.generate(candidate_ds) # and derived the dataset part that is used for computing the selection # criterion trainds = dsgen.next() # compute data measure on the training part of this feature set measures.append(fmeasure(trainds)) # relies on ds.item() to work properly measures = [np.asscalar(m) for m in measures] # Select promissing feature candidates (staging) # IDs are only applicable to the current set of feature candidates tmp_staging_ids = fselector(measures) # translate into real candidate ids staging_ids = [candidates[i] for i in tmp_staging_ids] # mark them as selected and remove from candidates selected += staging_ids for i in staging_ids: candidates.remove(i) # actually run the performance measure to estimate "quality" of # selection fslm = StaticFeatureSelection(selected) fslm.train(ds) selectedds = fslm(ds) # split into train and test part trainds, testds = self._get_traintest_ds(selectedds) # evaluate and store error = self._evaluate_pmeasure(trainds, testds) errors.append(np.asscalar(error)) # intermediate cleanup, so the datasets do not hand around while # the next candidate evaluation is computed del trainds del testds # Check if it is time to stop and if we got # the best result stop = scriterion(errors) isthebest = bestdetector(errors) if __debug__: debug('IFSC', "nselected %i; error: %.4f " \ "best/stop=%d/%d\n" \ % (len(selected), errors[-1], isthebest, stop), cr=True, lf=True) if isthebest: # announce desired features to the underlying slice mapper # do copy to survive later selections self._safe_assign_slicearg(copy(selected)) # leave the loop when the criterion is reached if stop: break # charge state self.ca.errors = errors