def selectFeatures(self, ids, plain=False, sort=False):
    """Select features given their ids.

    The method behaves similarly to Dataset.selectFeatures(), but
    additionally takes care of adjusting the embedded mapper
    appropriately.

    :Parameters:
      ids: sequence
        Iterable container of feature ids to select
      plain: boolean
        Flag whether to return a plain Dataset (True) instead of a
        MappedDataset
      sort: boolean
        Flag whether to sort the ids. Order matters: selectFeatures
        assumes incremental order. If the ids are not sorted,
        non-optimized code would have to verify the order and sort them.
    """
    # call base method to get selected feature subset
    if plain:
        sdata = Dataset(self._data, self._dsattr, check_data=False,
                        copy_samples=False, copy_data=False,
                        copy_dsattr=False)
        return sdata.selectFeatures(ids=ids, sort=sort)
    else:
        sdata = Dataset.selectFeatures(self, ids=ids, sort=sort)
        # since we have a new dataset we better have a new mapper
        sdata._dsattr['mapper'] = copy.deepcopy(sdata._dsattr['mapper'])
        if sort:
            sdata._dsattr['mapper'].selectOut(sorted(ids))
        else:
            sdata._dsattr['mapper'].selectOut(ids)
        return sdata
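# A minimal usage sketch for selectFeatures() above, assuming the PyMVPA
# 0.4-era API used throughout these snippets (Dataset, MappedDataset,
# MaskMapper); the import paths and the concrete values are illustrative
# assumptions, not taken from this file.
import numpy as N
from mvpa.mappers.mask import MaskMapper
from mvpa.datasets.mapped import MappedDataset

mds = MappedDataset(samples=N.arange(12).reshape((4, 3)),
                    mapper=MaskMapper(N.array([1, 1, 1])),
                    labels=1, chunks=1)
sub = mds.selectFeatures([0, 2])               # keeps the embedded mapper in sync
flat = mds.selectFeatures([0, 2], plain=True)  # plain Dataset, mapper untouched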
def testEvilSelects(self):
    """Test some obscure selections of samples via select() or __getitem__
    """
    origdata = datasets["uni2large"].samples[:100, :10].T
    data = Dataset(samples=origdata,
                   #       0  1  2  3  4  5  6  7  8  9
                   labels=[8, 9, 4, 3, 3, 3, 3, 2, 8, 9],
                   chunks=[1, 2, 3, 2, 3, 1, 5, 6, 3, 6])

    # malformed getitem
    if __debug__:
        # check is enforced only in __debug__
        self.failUnlessRaises(ValueError, data.__getitem__, "labels", "featu")

    # too many indices
    self.failUnlessRaises(ValueError, data.__getitem__, 1, 1, 1)

    # various getitems which should carry the same result
    for sel in [
        data.select("chunks", [2, 6], labels=[3, 2], features=slice(None)),
        data.select("all", "all", labels=[2, 3], chunks=[2, 6]),
        data["chunks", [2, 6], "labels", [3, 2]],
        data[:, :, "chunks", [2, 6], "labels", [3, 2]],
        # get warnings but should work as the rest for now
        data[3:8, "chunks", [2, 6, 2, 6], "labels", [3, 2]],
    ]:
        self.failUnless(N.all(sel.origids == [3, 7]))
        self.failUnless(sel.nfeatures == 100)
        self.failUnless(N.all(sel.samples == origdata[[3, 7]]))

    target = origdata[[3, 7]]
    target = target[:, [1, 3]]
    # various getitems which should carry the same result
    for sel in [
        data.select("all", [1, 3], "chunks", [2, 6], labels=[3, 2]),
        data[:, [1, 3], "chunks", [2, 6], "labels", [3, 2]],
        data[:, [1, 3], "chunks", [2, 6], "labels", [3, 2]],
        # get warnings but should work as the rest for now
        data[3:8, [1, 1, 3, 1], "chunks", [2, 6, 2, 6], "labels", [3, 2]],
    ]:
        self.failUnless(N.all(sel.origids == [3, 7]))
        self.failUnless(sel.nfeatures == 2)
        self.failUnless(N.all(sel.samples == target))

    # Check if we get empty selection if requesting impossible
    self.failUnless(data.select(chunks=[23]).nsamples == 0)

    # Check .where()
    self.failUnless(N.all(data.where(chunks=[2, 6]) == [1, 3, 7, 9]))
    self.failUnless(N.all(data.where(chunks=[2, 6], labels=[22, 3]) == [3]))
    # both samples and features
    idx = data.where("all", [1, 3, 10], labels=[2, 3, 4])
    self.failUnless(N.all(idx[1] == [1, 3, 10]))
    self.failUnless(N.all(idx[0] == range(2, 8)))
    # empty query
    self.failUnless(data.where() is None)
    # empty result
    self.failUnless(data.where(labels=[123]) == [])
def testCombinedPatternAndFeatureMasking(self): data = Dataset(samples=N.arange(20).reshape((4, 5)), labels=1, chunks=1) self.failUnless(data.nsamples == 4) self.failUnless(data.nfeatures == 5) fsel = data.selectFeatures([1, 2]) fpsel = fsel.selectSamples([0, 3]) self.failUnless(fpsel.nsamples == 2) self.failUnless(fpsel.nfeatures == 2) self.failUnless((fpsel.samples == [[1, 2], [16, 17]]).all())
def testFeatureMaskConversion(self): dataset = Dataset(samples=N.arange(12).reshape((4, 3)), labels=1, chunks=1) mask = dataset.convertFeatureIds2FeatureMask(range(dataset.nfeatures)) self.failUnless(len(mask) == dataset.nfeatures) self.failUnless((mask == True).all()) self.failUnless((dataset.convertFeatureMask2FeatureIds(mask) == range(3)).all()) mask[1] = False self.failUnless((dataset.convertFeatureMask2FeatureIds(mask) == [0, 2]).all())
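# In plain NumPy terms the two conversion helpers exercised above amount to
# the following round trip (a standalone sketch, independent of the Dataset
# API; `nfeatures` and `ids` are arbitrary example values):
import numpy as N

nfeatures = 3
ids = [0, 2]
mask = N.zeros(nfeatures, dtype=bool)
mask[ids] = True                  # feature ids -> boolean feature mask
recovered = mask.nonzero()[0]     # boolean feature mask -> feature ids
assert (recovered == ids).all()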
def testApplyMapper(self):
    """Test creation of new dataset by applying a mapper"""
    mapper = MaskMapper(N.array([1, 0, 1]))
    dataset = Dataset(samples=N.arange(12).reshape((4, 3)),
                      labels=1, chunks=1)
    seldataset = dataset.applyMapper(featuresmapper=mapper)
    self.failUnless((dataset.selectFeatures([0, 2]).samples
                     == seldataset.samples).all())

    # Let's do a simple test on MaskMapper reverse since it seems to
    # do evil things. Those checks are done only in __debug__
    if __debug__:
        # should fail since in the mask we have just 2 features now
        self.failUnlessRaises(ValueError, mapper.reverse, [10, 20, 30])
        self.failUnlessRaises(ValueError, mapper.forward, [10, 20])
def testLabelsMapping(self):
    """Test mapping of the labels from strings to numerical values
    """
    od = {"apple": 0, "orange": 1}
    samples = [[3], [2], [3]]
    labels_l = ["apple", "orange", "apple"]

    # test broadcasting of the label
    ds = Dataset(samples=samples, labels="orange")
    self.failUnless(N.all(ds.labels == ["orange"] * 3))

    # Test basic mapping of literal labels
    for ds in [
        Dataset(samples=samples, labels=labels_l, labels_map=od),
        # Figure out mapping
        Dataset(samples=samples, labels=labels_l, labels_map=True),
    ]:
        self.failUnless(N.all(ds.labels == [0, 1, 0]))
        self.failUnless(ds.labels_map == od)
        ds_ = ds[1]
        self.failUnless(ds_.labels_map == od,
                        msg="selectSamples should provide full mapping preserved")

    # We should complain about an insufficient mapping
    self.failUnlessRaises(ValueError, Dataset, samples=samples,
                          labels=labels_l, labels_map={"apple": 0})

    # Conformance to older behavior -- if labels are given as
    # strings, no mapping occurs by default
    ds2 = Dataset(samples=samples, labels=labels_l)
    self.failUnlessEqual(ds2.labels_map, None)

    # We should map numerical labels if it was requested:
    od3 = {1: 100, 2: 101, 3: 100}
    ds3 = Dataset(samples=samples, labels=[1, 2, 3], labels_map=od3)
    self.failUnlessEqual(ds3.labels_map, od3)
    self.failUnless(N.all(ds3.labels == [100, 101, 100]))

    ds3_ = ds3[1]
    self.failUnlessEqual(ds3.labels_map, od3)

    ds4 = Dataset(samples=samples, labels=labels_l)

    # Let's check setting the labels map
    ds = Dataset(samples=samples, labels=labels_l, labels_map=od)
    self.failUnlessRaises(ValueError, ds.setLabelsMap,
                          {"orange": 1, "nonorange": 3})
    new_map = {"tasty": 0, "crappy": 1}
    ds.labels_map = new_map.copy()
    self.failUnlessEqual(ds.labels_map, new_map)
def _call(self, dataset):
    """Extract weights from GPR
    """
    clf = self.clf
    kernel = clf.kernel
    train_fv = clf._train_fv
    if isinstance(kernel, LinearKernel):
        Sigma_p = 1.0
    else:
        Sigma_p = kernel.params.Sigma_p

    weights = Ndot(Sigma_p, Ndot(train_fv.T, clf._alpha))

    if self.ca.is_enabled('variances'):
        # super ugly formulas that can quite surely be improved:
        tmp = np.linalg.inv(clf._L)
        Kyinv = Ndot(tmp.T, tmp)
        # XXX in such lengthy matrix manipulations you might be better off
        # using np.matrix where * is a matrix product
        self.ca.variances = Ndiag(
            Sigma_p -
            Ndot(Sigma_p,
                 Ndot(train_fv.T,
                      Ndot(Kyinv,
                           Ndot(train_fv, Sigma_p)))))
    return Dataset(np.atleast_2d(weights))
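# For reference, the quantities computed in _call() above, with
# X = clf._train_fv, alpha = clf._alpha and L = clf._L (the Cholesky factor
# of the noisy train kernel K_y), are
#
#     weights   = Sigma_p * X.T * alpha
#     K_y^{-1}  = inv(L).T * inv(L)
#     variances = diag(Sigma_p - Sigma_p * X.T * K_y^{-1} * X * Sigma_p)
#
# which, under the usual weight-space view of GPR with a linear kernel, are
# the posterior mean and the marginal posterior variances of the linear
# weights (this interpretation is an assumption, not spelled out in the code
# itself).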
def _call(self, dataset): sensitivities = [] for ind, analyzer in enumerate(self.__analyzers): if __debug__: debug("SA", "Computing sensitivity for SA#%d:%s" % (ind, analyzer)) sensitivity = analyzer(dataset) sensitivities.append(sensitivity) if __debug__: debug("SA", "Returning %d sensitivities from %s" % (len(sensitivities), self.__class__.__name__)) sa_attr = self._sa_attr if isinstance(sensitivities[0], AttrDataset): smerged = None for i, s in enumerate(sensitivities): s.sa[sa_attr] = np.repeat(i, len(s)) if smerged is None: smerged = s else: smerged.append(s) sensitivities = smerged else: sensitivities = \ Dataset(sensitivities, sa={sa_attr: np.arange(len(sensitivities))}) self.ca.sensitivities = sensitivities return sensitivities
def _forward_dataset_helper(self, ds):
    # local binding
    num = self.__num

    pos = None
    if not self.__position_attr is None:
        # we know something about the sample positions
        pos = ds.sa[self.__position_attr].value
        rsamples, pos = resample(ds.samples, self.__num, t=pos,
                                 window=self.__window_args)
    else:
        # we know nothing about the sample positions
        rsamples = resample(ds.samples, self.__num, t=None,
                            window=self.__window_args)

    # new dataset that reuses the feature and dataset attributes of the
    # source
    mds = Dataset(rsamples, fa=ds.fa, a=ds.a)

    # the tricky part is what to do with the samples attributes, since their
    # number has changed
    if self.__attr_strategy == 'remove':
        # nothing to be done
        pass
    elif self.__attr_strategy == 'sample':
        step = int(len(ds) / num)
        sa = dict([(k, ds.sa[k].value[0::step][:num]) for k in ds.sa])
        mds.sa.update(sa)
    elif self.__attr_strategy == 'resample':
        # resample the attributes themselves
        sa = {}
        for k in ds.sa:
            v = ds.sa[k].value
            if pos is None:
                sa[k] = resample(v, self.__num, t=None,
                                 window=self.__window_args)
            else:
                if k == self.__position_attr:
                    # position attr will be handled separately at the end
                    continue
                sa[k] = resample(v, self.__num, t=pos,
                                 window=self.__window_args)[0]
        # inject them all
        mds.sa.update(sa)
    else:
        raise ValueError("Unknown attribute handling strategy '%s'."
                         % self.__attr_strategy)

    if not pos is None:
        # we got the new sample positions and can store them
        mds.sa[self.__position_attr] = pos

    return mds
def test_linear_kernel(self): """Simplistic testing of linear kernel""" d1 = Dataset(np.asarray([range(5)] * 10, dtype=float)) lk = npK.LinearKernel() lk.compute(d1) self.failUnless(lk._k.shape == (10, 10), "Failure computing LinearKernel (Size mismatch)") self.failUnless((lk._k == 30).all(), "Failure computing LinearKernel")
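# Why the expected value is 30: every sample in d1 is the vector
# [0, 1, 2, 3, 4], so each entry of the linear kernel matrix is the dot
# product 0**2 + 1**2 + 2**2 + 3**2 + 4**2 = 30. Standalone NumPy check:
import numpy as np
x = np.arange(5, dtype=float)
assert np.dot(x, x) == 30.0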
def test_resample(): time = np.linspace(0, 2 * np.pi, 100) ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T, sa={ 'time': time, 'section': np.repeat(range(10), 10) }) assert_equal(ds.shape, (100, 2)) # downsample num = 10 rm = FFTResampleMapper(num, window=('gauss', 50), position_attr='time', attr_strategy='sample') mds = rm(ds) assert_equal(mds.shape, (num, ds.nfeatures)) # didn't change the orig assert_equal(len(ds), 100) # check position-based resampling ds_partial = ds[0::10] mds_partial = rm(ds_partial) # despite different input sampling should yield the same output timepoints assert_array_almost_equal(mds.sa.time, mds_partial.sa.time) # exclude the first points to prevent edge effects, but the data should be # very similar too assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:], decimal=2) # simple sample of sa's should give meaningful stuff assert_array_equal(mds.sa.section, range(10)) # and now for a dataset with chunks cds = vstack([ds.copy(), ds.copy()]) cds.sa['chunks'] = np.repeat([0, 1], len(ds)) rm = FFTResampleMapper(num, attr_strategy='sample', chunks_attr='chunks', window=('gauss', 50)) mcds = rm(cds) assert_equal(mcds.shape, (20, 2)) assert_array_equal(mcds.sa.section, np.tile(range(10), 2)) # each individual chunks should be identical to previous dataset assert_array_almost_equal(mds.samples, mcds.samples[:10]) assert_array_almost_equal(mds.samples, mcds.samples[10:])
def testId(self):
    """Test that Dataset.idhash changes whenever labels, chunks or samples
    are modified
    """
    dataset = Dataset(samples=N.arange(12).reshape((4, 3)),
                      labels=1, chunks=1)
    origid = dataset.idhash
    dataset.labels = [3, 1, 2, 3]  # change all labels
    self.failUnless(origid != dataset.idhash,
                    msg="Changing all labels should alter dataset's idhash")

    origid = dataset.idhash

    z = dataset.labels[1]
    self.failUnlessEqual(origid, dataset.idhash,
                         msg="Accessing shouldn't change idhash")
    z = dataset.chunks
    self.failUnlessEqual(origid, dataset.idhash,
                         msg="Accessing shouldn't change idhash")
    z[2] = 333
    self.failUnless(origid != dataset.idhash,
                    msg="Changing value in attribute should change idhash")

    origid = dataset.idhash
    dataset.samples[1, 1] = 1000
    self.failUnless(origid != dataset.idhash,
                    msg="Changing value in data should change idhash")

    origid = dataset.idhash
    dataset.permuteLabels(True)
    self.failUnless(origid != dataset.idhash,
                    msg="Permutation also changes idhash")

    dataset.permuteLabels(False)
    self.failUnless(origid == dataset.idhash,
                    msg="idhash should be restored after "
                        "permuteLabels(False)")
def test_cached_kernel(self):
    nchunks = 5
    n = 50 * nchunks
    d = Dataset(np.random.randn(n, 132))
    d.sa.chunks = np.random.randint(nchunks, size=n)

    # We'll compare against an Rbf just because it has a parameter to change
    rk = npK.RbfKernel(sigma=1.5)

    # Assure two kernels are independent for this test
    ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
    ck.compute(d)  # Initial cache of all data

    self.failUnless(ck._recomputed,
                    'CachedKernel was not initially computed')

    # Try some splitting
    for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
        rk.compute(chunk)
        ck.compute(chunk)
        self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
        self.failIf(ck._recomputed,
                    "CachedKernel incorrectly recomputed its kernel")

    # Test what happens when a parameter changes
    ck.params.sigma = 3.5
    ck.compute(d)
    self.failUnless(ck._recomputed,
                    "CachedKernel doesn't recompute on kernel change")
    rk.params.sigma = 3.5
    rk.compute(d)
    self.failUnless(np.all(rk._k == ck._k),
                    'Cached and rbf kernels disagree after kernel change')

    # Now test handling new data
    d2 = Dataset(np.random.randn(32, 43))
    ck.compute(d2)
    self.failUnless(ck._recomputed,
                    "CachedKernel did not automatically recompute new data")
    ck.compute(d)
    self.failUnless(ck._recomputed,
                    "CachedKernel did not recompute old data which had\n" +
                    "previously been computed, but had the cache overridden")
def testFeatureSelection(self): """Testing feature selection: sorted/not sorted, feature groups """ origdata = datasets["uni2large"].samples[:10, :20] data = Dataset(samples=origdata, labels=2, chunks=2) # define some feature groups data.defineFeatureGroups(N.repeat(range(4), 5)) unmasked = data.samples.copy() # default must be no mask self.failUnless(data.nfeatures == 20) features_to_select = [3, 0, 17] features_to_select_copy = copy.deepcopy(features_to_select) features_to_select_sorted = copy.deepcopy(features_to_select) features_to_select_sorted.sort() bsel = N.array([False] * 20) bsel[features_to_select] = True # check selection with feature list for sel, issorted in [ (data.selectFeatures(features_to_select, sort=False), False), (data.selectFeatures(features_to_select, sort=True), True), (data.select(slice(None), features_to_select), True), (data.select(slice(None), N.array(features_to_select)), True), (data.select(slice(None), bsel), True), ]: self.failUnless(sel.nfeatures == 3) # check size of the masked patterns self.failUnless(sel.samples.shape == (10, 3)) # check that the right features are selected fts = (features_to_select, features_to_select_sorted)[int(issorted)] self.failUnless((unmasked[:, fts] == sel.samples).all()) # check grouping information self.failUnless((sel._dsattr["featuregroups"] == [0, 0, 3]).all()) # check side effect on features_to_select parameter: self.failUnless(features_to_select == features_to_select_copy) # check selection by feature group id gsel = data.selectFeatures(groups=[2, 3]) self.failUnless(gsel.nfeatures == 10) self.failUnless(set(gsel._dsattr["featuregroups"]) == set([2, 3]))
def testAttributes(self):
    """Test adding custom attributes to a dataset
    """
    # class BlobbyDataset(Dataset):
    #     pass
    # TODO: we can't assign attributes to those for now...
    ds = Dataset(samples=range(5), labels=1, chunks=1)
    self.failUnlessRaises(AttributeError, lambda x: x.blobs, ds)
    """Dataset.blobs should fail since .blobs wasn't yet registered"""

    # register new attribute but it would alter only new instances
    Dataset._registerAttribute("blobs", "_data", hasunique=True)
    ds = Dataset(samples=range(5), labels=1, chunks=1)
    self.failUnless(not ds.blobs != [0],
                    msg="By default new attributes supposed to get 0 as the value")

    try:
        ds.blobs = [1, 2]
        self.fail(msg="Dataset.blobs=[1,2] should fail since "
                      "there are 5 samples")
    except ValueError, e:
        pass
def _call(self, dataset): """Computes featurewise I-RELIEF weights.""" samples = dataset.samples NS, NF = samples.shape[:2] if self.w_guess == None: self.w = np.ones(NF, 'd') # do normalization in all cases to be safe :) self.w = self.w / (self.w**2).sum() M, H = self.compute_M_H(dataset.targets) while True: self.k = self.kernel(length_scale=self.kernel_width / self.w) d_w_k = self.k.computed(samples).as_raw_np() # set d_w_k to zero where distance=0 (i.e. kernel == # 1.0), otherwise I-RELIEF could not converge. # XXX Note that kernel==1 for distance=0 only for # exponential kernels!! IMPROVE d_w_k[np.abs(d_w_k - 1.0) < 1.0e-15] = 0.0 ni = np.zeros(NF, 'd') for n in range(NS): # d_w_k[n,n] could be omitted since == 0.0 gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \ / (d_w_k[n, :].sum()-d_w_k[n, n])) alpha_n = np.nan_to_num(d_w_k[n, M[n]] / (d_w_k[n, M[n]].sum())) beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum())) m_n = (np.abs(samples[n, :] - samples[M[n], :]) \ * alpha_n[:, None]).sum(0) h_n = (np.abs(samples[n, :] - samples[H[n], :]) \ * beta_n[:, None]).sum(0) ni += gamma_n * (m_n - h_n) ni = ni / NS ni_plus = np.clip(ni, 0.0, np.inf) # set all negative elements to zero w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum()))) change = np.abs(w_new - self.w).sum() if __debug__ and 'IRELIEF' in debug.active: debug( 'IRELIEF', "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" % (change, w_new.max(), w_new.min(), w_new.mean(), w_new.std(), np.isnan(w_new).sum())) # update weights: self.w = w_new if change < self.threshold: break return Dataset(self.w[np.newaxis])
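# The fixed-point iteration above follows the batch I-RELIEF update
# (transcribed from the code; M[n] and H[n] are presumably the "miss" and
# "hit" index sets returned by compute_M_H(), i.e. samples with a different
# and with the same target as sample n, and K_w denotes the kernel value
# between samples under the current weights w -- here realized via the
# per-feature length scales kernel_width / w):
#
#     gamma_n = 1 - sum_{j in M[n]} K_w(x_n, x_j) / sum_{j != n} K_w(x_n, x_j)
#     alpha_n = K_w(x_n, x_{M[n]}) / sum_{j in M[n]} K_w(x_n, x_j)
#     beta_n  = K_w(x_n, x_{H[n]}) / sum_{j in H[n]} K_w(x_n, x_j)
#     m_n = sum_{j in M[n]} alpha_nj * |x_n - x_j|
#     h_n = sum_{j in H[n]} beta_nj  * |x_n - x_j|
#     nu  = (1/NS) * sum_n gamma_n * (m_n - h_n)
#     w  <- max(nu, 0) / ||max(nu, 0)||_2
#
# iterating until the L1 change in w drops below self.threshold.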
def test_datasetmapping():
    # 6 samples, 4 features
    data = np.arange(24).reshape(6, 4)
    ds = Dataset(data,
                 sa={'timepoints': np.arange(6),
                     'multidim': data.copy()},
                 fa={'fid': np.arange(4)})
    # with overlapping and non-overlapping boxcars
    startpoints = [0, 1, 4]
    boxlength = 2
    bm = BoxcarMapper(startpoints, boxlength, inspace='boxy')
    # train is critical
    bm.train(ds)
    mds = bm.forward(ds)
    assert_equal(len(mds), len(startpoints))
    assert_equal(mds.nfeatures, boxlength)
    # all samples attributes remain, but they can be rotated/compressed into
    # multidimensional attributes
    assert_equal(sorted(mds.sa.keys()),
                 ['boxy_onsetidx'] + sorted(ds.sa.keys()))
    assert_equal(mds.sa.multidim.shape,
                 (len(startpoints), boxlength, ds.nfeatures))
    assert_equal(mds.sa.timepoints.shape, (len(startpoints), boxlength))
    assert_array_equal(mds.sa.timepoints.flatten(),
                       np.array([(s, s + 1) for s in startpoints]).flatten())
    assert_array_equal(mds.sa.boxy_onsetidx, startpoints)
    # feature attributes also get rotated and broadcasted
    assert_array_equal(mds.fa.fid, [ds.fa.fid, ds.fa.fid])
    # and finally there is a new one
    assert_array_equal(mds.fa.boxy_offsetidx,
                       np.repeat(np.arange(boxlength), 4).reshape(2, -1))

    # now see how it works on reverse()
    rds = bm.reverse(mds)
    # we got at least something of all original attributes back
    assert_equal(sorted(rds.sa.keys()), sorted(ds.sa.keys()))
    assert_equal(sorted(rds.fa.keys()), sorted(ds.fa.keys()))
    # it is not possible to reconstruct the full samples array
    # some samples even might show up multiple times (when there are
    # overlapping boxcars)
    assert_array_equal(rds.samples,
                       np.array([[0, 1, 2, 3],
                                 [4, 5, 6, 7],
                                 [4, 5, 6, 7],
                                 [8, 9, 10, 11],
                                 [16, 17, 18, 19],
                                 [20, 21, 22, 23]]))
    assert_array_equal(rds.sa.timepoints, [0, 1, 1, 2, 4, 5])
    assert_array_equal(rds.sa.multidim, ds.sa.multidim[rds.sa.timepoints])
    # but feature attributes should be fully recovered
    assert_array_equal(rds.fa.fid, ds.fa.fid)
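# Roughly what the forward mapping does to the samples array itself -- a
# plain NumPy sketch of boxcar extraction, ignoring the sample/feature
# attribute handling that BoxcarMapper performs in addition:
import numpy as np
data = np.arange(24).reshape(6, 4)
startpoints, boxlength = [0, 1, 4], 2
boxed = np.array([data[s:s + boxlength] for s in startpoints])
assert boxed.shape == (3, 2, 4)   # (n_boxcars, boxlength, n_original_features)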
def testSimple(self): data = N.arange(24).reshape(8,3) labels = [0, 1] * 4 chunks = N.repeat(N.array((0,1)),4) # correct results csamples = [[3, 4, 5], [6, 7, 8], [15, 16, 17], [18, 19, 20]] clabels = [0, 1, 0, 1] cchunks = [0, 0, 1, 1] ds = Dataset(samples=data, labels=labels, chunks=chunks) # default behavior m = SampleGroupMapper() # error if not trained self.failUnlessRaises(RuntimeError, m, data) # train mapper first m.train(ds) self.failUnless((m.forward(ds.samples) == csamples).all()) self.failUnless((m.forward(ds.labels) == clabels).all()) self.failUnless((m.forward(ds.chunks) == cchunks).all()) # directly apply to dataset # using untrained mapper! mapped = ds.applyMapper(samplesmapper=SampleGroupMapper()) self.failUnless(mapped.nsamples == 4) self.failUnless(mapped.nfeatures == 3) self.failUnless((mapped.samples == csamples).all()) self.failUnless((mapped.labels == clabels).all()) self.failUnless((mapped.chunks == cchunks).all()) # make sure origids get regenerated self.failUnless((mapped.origids == range(4)).all())
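# The expected values above are the per-group means: with labels [0, 1] * 4
# and chunks [0, 0, 0, 0, 1, 1, 1, 1], each unique (label, chunk) pair groups
# two samples, and (with its default averaging behaviour, as assumed here)
# SampleGroupMapper collapses each group into its mean. Quick NumPy check:
import numpy as N
data = N.arange(24).reshape(8, 3)
assert (data[[0, 2]].mean(axis=0) == [3, 4, 5]).all()     # label 0, chunk 0
assert (data[[1, 3]].mean(axis=0) == [6, 7, 8]).all()     # label 1, chunk 0
assert (data[[4, 6]].mean(axis=0) == [15, 16, 17]).all()  # label 0, chunk 1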
def test_1d_multispace_searchlight(self): ds = Dataset([np.arange(6)]) ds.fa['coord1'] = np.repeat(np.arange(3), 2) # add a second space to the dataset ds.fa['coord2'] = np.tile(np.arange(2), 3) measure = lambda x: "+".join([str(x) for x in x.samples[0]]) # simply select each feature once res = Searchlight(measure, IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(0)), nproc=1)(ds) assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']]) res = Searchlight(measure, IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(1)), nproc=1)(ds) assert_array_equal(res.samples, [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']]) res = Searchlight(measure, IndexQueryEngine(coord1=Sphere(1), coord2=Sphere(0)), nproc=1)(ds) assert_array_equal(res.samples, [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])
def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=1.0):
    """Generate a dataset where each label is a normally distributed beastie
    around the specified mean (0 if None).

    snr assumes that the signal has std 1.0, so the noise is simply divided
    by snr.

    Probably it is a generalization of pureMultivariateSignal where
    means=[ [0,1], [1,0] ].

    Specify either means or nonbogus_features, so that means get assigned
    accordingly.
    """
    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (not nonbogus_features is None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't assign simply a feature to a " + \
                  "class: more nonbogus_features than labels"
        means = N.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if not means is None:
        # add mean
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
    # bring it 'under 1', since otherwise some classifiers have difficulties
    # during optimization
    data = 1.0/(N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat('L%d' % i, perlabel)
                            for i in range(nlabels)])
    chunks = N.concatenate([N.repeat(range(nchunks),
                                     perlabel/nchunks) for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
    ds.nonbogus_features = nonbogus_features
    return ds
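# Example invocation (a sketch -- the keyword values are arbitrary but follow
# the signature above): two classes, one informative feature per class, and a
# moderately high SNR.
ds = normalFeatureDataset(perlabel=20, nlabels=2, nfeatures=4, nchunks=5,
                          nonbogus_features=[0, 1], snr=3.0)
# ds.samples.shape == (40, 4); the 'L0'/'L1' labels get mapped to integers
# since labels_map=True is passed to the Dataset constructor; features 0 and 1
# carry the class signal, the remaining features are bogus.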
def build_streamline_things(self): # Build a dataset having samples of different lengths. This is # trying to mimic a possible interface for streamlines # datasets, i.e., an iterable container of Mx3 points, where M # depends on each single streamline. # trying to pack it into an 'object' array to prevent conversion in the # Dataset self.streamline_samples = np.array([ np.random.rand(3,3), np.random.rand(5,3), np.random.rand(7,3)], dtype='object') self.dataset = Dataset(self.streamline_samples) self.similarities = [StreamlineSimilarity(distance=corouge)]
def _call(self, dataset): """Computes featurewise I-RELIEF weights.""" samples = dataset.samples NS, NF = samples.shape[:2] if self.w_guess == None: w = np.ones(NF, 'd') w /= (w**2).sum() # do normalization in all cases to be safe :) M, H = self.compute_M_H(dataset.targets) while True: d_w_k = self.k(pnorm_w(data1=samples, weight=w, p=1)) ni = np.zeros(NF, 'd') for n in range(NS): # d_w_k[n, n] could be omitted since == 0.0 gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \ / (d_w_k[n, :].sum() - d_w_k[n, n])) alpha_n = np.nan_to_num(d_w_k[n, M[n]] / (d_w_k[n, M[n]].sum())) beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum())) m_n = (np.abs(samples[n, :] - samples[M[n], :]) \ * alpha_n[:, None]).sum(0) h_n = (np.abs(samples[n, :] - samples[H[n], :]) \ * beta_n[:, None]).sum(0) ni += gamma_n * (m_n - h_n) ni = ni / NS ni_plus = np.clip(ni, 0.0, np.inf) # set all negative elements to zero w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum()))) change = np.abs(w_new - w).sum() if __debug__ and 'IRELIEF' in debug.active: debug('IRELIEF', "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" \ % (change, w_new.max(), w_new.min(), w_new.mean(), w_new.std(), np.isnan(w_new).sum())) # update weights: w = w_new if change < self.threshold: break self.w = w return Dataset(self.w[np.newaxis])
def testLabelRandomizationAndSampling(self): """ """ data = Dataset(samples=N.ones((5, 1)), labels=range(5), chunks=1) data += Dataset(samples=N.ones((5, 1)) + 1, labels=range(5), chunks=2) data += Dataset(samples=N.ones((5, 1)) + 2, labels=range(5), chunks=3) data += Dataset(samples=N.ones((5, 1)) + 3, labels=range(5), chunks=4) data += Dataset(samples=N.ones((5, 1)) + 4, labels=range(5), chunks=5) self.failUnless(data.samplesperlabel == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5}) sample = data.getRandomSamples(2) self.failUnless(sample.samplesperlabel.values() == [2, 2, 2, 2, 2]) self.failUnless((data.uniquechunks == range(1, 6)).all()) # store the old labels origlabels = data.labels.copy() data.permuteLabels(True) self.failIf((data.labels == origlabels).all()) data.permuteLabels(False) self.failUnless((data.labels == origlabels).all()) # now try another object with the same data data2 = Dataset(samples=data.samples, labels=data.labels, chunks=data.chunks) # labels are the same as the originals self.failUnless((data2.labels == origlabels).all()) # now permute in the new object data2.permuteLabels(True) # must not affect the old one self.failUnless((data.labels == origlabels).all()) # but only the new one self.failIf((data2.labels == origlabels).all())
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe*, a software package for analysing
    event-related potentials (ERPs), which was developed at the Max-Planck
    Institute for Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe

    Parameters
    ----------
    samples : str or EEPBin instance
      This is either a filename of an EEP file, or an EEPBin instance,
      providing the samples data in EEP format.
    targets, chunks : sequence or scalar or None
      Values are passed through to `Dataset.from_wizard()`. See its
      documentation for more information.

    Returns
    -------
    Dataset
      Besides its usual attributes (e.g. targets, chunks, and a mapper), the
      returned dataset also includes feature attributes associating each
      feature with a channel (by id) and a specific timepoint -- based on
      information read from the EEP data.
    """
    if isinstance(samples, str):
        # open the eep file
        eb = EEPBin(samples)
    elif isinstance(samples, EEPBin):
        # nothing special
        eb = samples
    else:
        raise ValueError("eep_dataset takes the filename of an "
                         "EEP file or an EEPBin object as 'samples' argument.")

    # init dataset
    ds = Dataset.from_channeltimeseries(
            eb.data, targets=targets, chunks=chunks, t0=eb.t0, dt=eb.dt,
            channelids=eb.channels)

    return ds
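# Hypothetical usage sketch (the filename and attribute values below are
# placeholders, not taken from the library's documentation):
#
#   ds = eep_dataset('subj1_erp.eep',
#                    targets=['congruent', 'incongruent'] * 20,
#                    chunks=np.repeat(range(4), 10))
#
# The resulting dataset then carries the usual targets/chunks sample
# attributes plus per-feature channel ids and timepoints, as described in the
# docstring above.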
def bench_pymvpa(X, Y): """ bench with pymvpa (by default uses a custom swig-generated wrapper around libsvm) """ from mvpa.datasets import Dataset from mvpa.clfs import svm gc.collect() # start time tstart = datetime.now() data = Dataset(samples=X, labels=Y) clf = svm.RbfCSVMC(C=1.) clf.train(data) Z = clf.predict(X) delta = (datetime.now() - tstart) # stop time mvpa_results.append(delta.seconds + delta.microseconds / mu_second)
def __init__(self, samples=None, mapper=None, dsattr=None, **kwargs):
    """
    If `samples` and `mapper` arguments are not `None` the mapper is used
    to forward-map the samples array and the result is passed to the
    `Dataset` constructor.

    :Parameters:
      mapper: Instance of `Mapper`
        This mapper will be embedded in the dataset and is used and updated
        by all subsequent mapping or feature selection procedures.
      **kwargs:
        All other arguments are simply passed to and handled by the
        constructor of `Dataset`.
    """
    # there are basically two modes for the constructor:
    # 1. internal mode - only data and dsattr dict
    # 2. user mode - samples != None # and mapper != None

    # see if dsattr is none, if so, set to empty dict
    if dsattr is None:
        dsattr = {}

    # if a mapper was passed, store it in dsattr dict that gets passed
    # to base Dataset
    if not mapper is None:
        # TODO: check mapper for compliance with dimensionality within _data
        #       may be only within __debug__
        dsattr['mapper'] = mapper

    # if the samples are passed to the special arg, use the mapper to
    # transform them.
    if not samples is None:
        if not dsattr.has_key('mapper') or dsattr['mapper'] is None:
            raise DatasetError, \
                  "Constructor of MappedDataset requires a mapper " \
                  "if unmapped samples are provided."
        Dataset.__init__(self,
                         samples=mapper.forward(samples),
                         dsattr=dsattr,
                         **(kwargs))
    else:
        Dataset._checkCopyConstructorArgs(samples=samples,
                                          dsattr=dsattr,
                                          **kwargs)
        Dataset.__init__(self, dsattr=dsattr, **(kwargs))
def testIdsonboundaries(self): """Test detection of transition points Shame on Yarik -- he didn't create unittests right away... damn me """ ds = Dataset( samples=N.array(range(10), ndmin=2).T, labels=[0, 0, 1, 1, 0, 0, 1, 1, 0, 0], chunks=[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], ) self.failUnless( ds.idsonboundaries() == [0, 2, 4, 5, 6, 8], "We should have got ids whenever either chunk or " "label changes", ) self.failUnless(ds.idsonboundaries(attributes_to_track=["chunks"]) == [0, 5]) # Preceding samples self.failUnless(ds.idsonboundaries(prior=1, post=-1, attributes_to_track=["chunks"]) == [4, 9]) self.failUnless(ds.idsonboundaries(prior=2, post=-1, attributes_to_track=["chunks"]) == [3, 4, 8, 9]) self.failUnless( ds.idsonboundaries(prior=2, post=-1, attributes_to_track=["chunks"], revert=True) == [0, 1, 2, 5, 6, 7] ) self.failUnless(ds.idsonboundaries(prior=1, post=1, attributes_to_track=["chunks"]) == [0, 1, 4, 5, 6, 9]) # all should be there self.failUnless(ds.idsonboundaries(prior=2) == range(10))
def test_polydetrend():
    samples_forwhole = np.array([[1.0, 2, 3, 4, 5, 6],
                                 [-2.0, -4, -6, -8, -10, -12]], ndmin=2).T
    samples_forchunks = np.array([[1.0, 2, 3, 3, 2, 1],
                                  [-2.0, -4, -6, -6, -4, -2]], ndmin=2).T
    chunks = [0, 0, 0, 1, 1, 1]
    chunks_bad = [0, 0, 1, 1, 1, 0]
    target_whole = np.array([[-3.0, -2, -1, 1, 2, 3],
                             [-6, -4, -2, 2, 4, 6]], ndmin=2).T
    target_chunked = np.array([[-1.0, 0, 1, 1, 0, -1],
                               [2, 0, -2, -2, 0, 2]], ndmin=2).T

    ds = Dataset(samples_forwhole)

    # this one will auto-train the mapper on first use
    dm = PolyDetrendMapper(polyord=1, inspace='police')
    mds = dm(ds)
    # features are linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials
    assert_array_equal(mds.sa.police, np.arange(len(ds)))

    # hackish way to get the previous regressors into a dataset
    ds.sa['opt_reg_const'] = dm._regs[:, 0]
    ds.sa['opt_reg_lin'] = dm._regs[:, 1]
    # using these precomputed regressors, we should get the same result as
    # before even if we do not generate a regressor for linear
    dm_optreg = PolyDetrendMapper(polyord=0,
                                  opt_regs=['opt_reg_const', 'opt_reg_lin'])
    mds_optreg = dm_optreg(ds)
    assert_array_almost_equal(mds_optreg, np.zeros(mds.shape))

    ds = Dataset(samples_forchunks)
    # 'constant' detrending removes the mean
    mds = PolyDetrendMapper(polyord=0)(ds)
    assert_array_almost_equal(
        mds.samples,
        samples_forchunks - np.mean(samples_forchunks, axis=0))
    # if there is no GLOBAL linear trend it should be identical to mean removal
    # even if trying to remove linear
    mds2 = PolyDetrendMapper(polyord=1)(ds)
    assert_array_almost_equal(mds, mds2)

    # chunk-wise detrending
    ds = dataset_wizard(samples_forchunks, chunks=chunks)
    dm = PolyDetrendMapper(chunks_attr='chunks', polyord=1, inspace='police')
    mds = dm(ds)
    # features are chunk-wise linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials, which is the identical linspace in both
    # chunks
    assert_array_equal(mds.sa.police, range(3) * 2)
    # non-matching number of samples cannot be mapped
    assert_raises(ValueError, dm, ds[:-1])
    # however, if the dataset knows about the space it is possible
    ds.sa['police'] = mds.sa.police
    # XXX this should be
    #mds2 = dm(ds[1:-1])
    #assert_array_equal(mds[1:-1], mds2)
    # XXX but right now is
    assert_raises(NotImplementedError, dm, ds[1:-1])

    # Detrend must preserve the size of dataset
    assert_equal(mds.shape, ds.shape)

    # small additional test for break points
    # although they are no longer there
    ds = dataset_wizard(np.array([[1.0, 2, 3, 1, 2, 3]], ndmin=2).T,
                        targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1)(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # test of different polyord on each chunk
    target_mixed = np.array([[-1.0, 0, 1, 0, 0, 0],
                             [2.0, 0, -2, 0, 0, 0]], ndmin=2).T
    ds = dataset_wizard(samples_forchunks.copy(), targets=chunks,
                        chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=[0, 1])(ds)
    assert_array_almost_equal(mds, target_mixed)

    # test irregular spacing of samples, but with corrective time info
    samples_forwhole = np.array([[1.0, 4, 6, 8, 2, 9],
                                 [-2.0, -8, -12, -16, -4, -18]], ndmin=2).T
    ds = Dataset(samples_forwhole, sa={'time': samples_forwhole[:, 0]})
    # linear detrending that makes use of temporal info from dataset
    dm = PolyDetrendMapper(polyord=1, inspace='time')
    mds = dm(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # and now the same stuff, but with chunking and ordered by time
    samples_forchunks = np.array([[1.0, 3, 3, 2, 2, 1],
                                  [-2.0, -6, -6, -4, -4, -2]], ndmin=2).T
    chunks = [0, 1, 0, 1, 0, 1]
    time = [4, 4, 12, 8, 8, 12]
    ds = Dataset(samples_forchunks.copy(), sa={'chunks': chunks, 'time': time})
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1, inspace='time')(ds)
    # the whole thing must not affect the source data
    assert_array_equal(ds, samples_forchunks)
    # but if done inplace that is no longer true
    poly_detrend(ds, chunks_attr='chunks', polyord=1, inspace='time')
    assert_array_equal(ds, mds)
def testSampleSelection(self): origdata = datasets["uni2large"].samples[:100, :10].T data = Dataset(samples=origdata, labels=2, chunks=2) self.failUnless(data.nsamples == 10) # set single pattern to enabled for sel in [data.selectSamples(5), data.select(5), data.select(slice(5, 6))]: self.failUnless(sel.nsamples == 1) self.failUnless(data.nfeatures == 100) self.failUnless(sel.origids == [5]) # check duplicate selections for sel in [ data.selectSamples([5, 5]), # Following ones would fail since select removes # repetitions (XXX) # data.select([5,5]), # data.select([5,5], 'all'), # data.select([5,5], slice(None)), ]: self.failUnless(sel.nsamples == 2) self.failUnless((sel.samples[0] == data.samples[5]).all()) self.failUnless((sel.samples[0] == sel.samples[1]).all()) self.failUnless(len(sel.labels) == 2) self.failUnless(len(sel.chunks) == 2) self.failUnless((sel.origids == [5, 5]).all()) self.failUnless(sel.samples.shape == (2, 100)) # check selection by labels for sel in [ data.selectSamples(data.idsbylabels(2)), data.select(labels=2), data.select("labels", 2), data.select("labels", [2]), data["labels", [2]], data["labels":[2], "labels":2], data["labels":[2]], ]: self.failUnless(sel.nsamples == data.nsamples) self.failUnless(N.all(sel.samples == data.samples)) # not present label for sel in [ data.selectSamples(data.idsbylabels(3)), data.select(labels=3), data.select("labels", 3), data.select("labels", [3]), ]: self.failUnless(sel.nsamples == 0) data = Dataset(samples=origdata, labels=[8, 9, 4, 3, 3, 3, 4, 2, 8, 9], chunks=2) for sel in [ data.selectSamples(data.idsbylabels([2, 3])), data.select("labels", [2, 3]), data.select("labels", [2, 3], labels=[1, 2, 3, 4]), data.select("labels", [2, 3], chunks=[1, 2, 3, 4]), data["labels":[2, 3], "chunks":[1, 2, 3, 4]], data["chunks":[1, 2, 3, 4], "labels":[2, 3]], ]: self.failUnless(N.all(sel.origids == [3.0, 4.0, 5.0, 7.0])) # lets cause it to compute unique labels self.failUnless((data.uniquelabels == [2, 3, 4, 8, 9]).all()) # select some samples removing some labels completely sel = data.selectSamples(data.idsbylabels([3, 4, 8, 9])) self.failUnlessEqual(Set(sel.uniquelabels), Set([3, 4, 8, 9])) self.failUnless((sel.origids == [0, 1, 2, 3, 4, 5, 6, 8, 9]).all())
datasets[basename] = dataset # sample 3D total = 2 * spec['perlabel'] nchunks = spec['nchunks'] data = np.random.standard_normal((total, 3, 6, 6)) labels = np.concatenate( (np.repeat(0, spec['perlabel']), np.repeat(1, spec['perlabel']))) data[:, 1, 0, 0] += 2 * labels # add some signal chunks = np.asarray(range(nchunks) * (total / nchunks)) mask = np.ones((3, 6, 6), dtype='bool') mask[0, 0, 0] = 0 mask[1, 3, 2] = 0 ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks, mask=mask, space='myspace') datasets['3d%s' % kind] = ds # some additional datasets datasets['dumb2'] = dumb_feature_binary_dataset() datasets['dumb'] = dumb_feature_dataset() # dataset with few invariant features _dsinv = dumb_feature_dataset() _dsinv.samples = np.hstack((_dsinv.samples, np.zeros( (_dsinv.nsamples, 1)), np.ones((_dsinv.nsamples, 1)))) datasets['dumbinv'] = _dsinv # Datasets for regressions testing datasets['sin_modulated'] = multiple_chunks(sin_modulated, 4, 30, 1)
def _call(self, dataset): """Computes featurewise I-RELIEF-2 weights. Online version.""" NS = dataset.samples.shape[0] NF = dataset.samples.shape[1] if self.w_guess == None: self.w = np.ones(NF, 'd') # do normalization in all cases to be safe :) self.w = self.w / (self.w**2).sum() M, H = self.compute_M_H(dataset.targets) ni = np.zeros(NF, 'd') pi = np.zeros(NF, 'd') if self.permute: # indices to go through samples in random order random_sequence = np.random.permutation(NS) else: random_sequence = np.arange(NS) change = self.threshold + 1.0 iteration = 0 counter = 0.0 while change > self.threshold and iteration < self.max_iter: if __debug__: debug('IRELIEF', "Iteration %d" % iteration) for t in range(NS): counter += 1.0 n = random_sequence[t] self.k = self.kernel(length_scale=self.kernel_width / self.w) d_w_k_xn_Mn = self.k.computed( dataset.samples[None, n, :], dataset.samples[M[n], :]).as_raw_np().squeeze() d_w_k_xn_Mn_sum = d_w_k_xn_Mn.sum() d_w_k_xn_x = self.k.computed( dataset.samples[None, n, :], dataset.samples).as_raw_np().squeeze() gamma_n = 1.0 - d_w_k_xn_Mn_sum / d_w_k_xn_x.sum() alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn_sum d_w_k_xn_Hn = self.k.computed( dataset.samples[None, n, :], dataset.samples[H[n], :]).as_raw_np().squeeze() beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum() m_n = (np.abs(dataset.samples[n, :] - dataset.samples[M[n], :]) \ * alpha_n[:, np.newaxis]).sum(0) h_n = (np.abs(dataset.samples[n, :] - dataset.samples[H[n], :]) \ * beta_n[:, np.newaxis]).sum(0) pi = gamma_n * (m_n - h_n) learning_rate = 1.0 / (counter * self.a + 1.0) ni_new = ni + learning_rate * (pi - ni) ni = ni_new # set all negative elements to zero ni_plus = np.clip(ni, 0.0, np.inf) w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum()))) change = np.abs(w_new - self.w).sum() if t % 10 == 0 and __debug__ and 'IRELIEF' in debug.active: debug( 'IRELIEF', "t=%d change=%.4f max=%f min=%.4f mean=%.4f std=%.4f" " #nan=%d" % (t, change, w_new.max(), w_new.min(), w_new.mean(), w_new.std(), np.isnan(w_new).sum())) self.w = w_new if change < self.threshold and iteration > 0: break iteration += 1 return Dataset(self.w[np.newaxis])
datasets["%s_%s" % (basename, replication)] = dataset_ # full dataset datasets[basename] = dataset # sample 3D total = 2*spec['perlabel'] nchunks = spec['nchunks'] data = np.random.standard_normal(( total, 3, 6, 6 )) labels = np.concatenate( ( np.repeat( 0, spec['perlabel'] ), np.repeat( 1, spec['perlabel'] ) ) ) chunks = np.asarray(range(nchunks)*(total/nchunks)) mask = np.ones((3, 6, 6), dtype='bool') mask[0, 0, 0] = 0 mask[1, 3, 2] = 0 ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks, mask=mask, space='myspace') datasets['3d%s' % kind] = ds # some additional datasets datasets['dumb2'] = dumb_feature_binary_dataset() datasets['dumb'] = dumb_feature_dataset() # dataset with few invariant features _dsinv = dumb_feature_dataset() _dsinv.samples = np.hstack((_dsinv.samples, np.zeros((_dsinv.nsamples, 1)), np.ones((_dsinv.nsamples, 1)))) datasets['dumbinv'] = _dsinv # Datasets for regressions testing datasets['sin_modulated'] = multiple_chunks(sin_modulated, 4, 30, 1)
def _call(self, dataset): """Computes featurewise I-RELIEF-2 weights. Online version.""" # local bindings samples = dataset.samples NS, NF = samples.shape[:2] threshold = self.threshold a = self.a if self.w_guess == None: w = np.ones(NF, 'd') # do normalization in all cases to be safe :) w /= (w**2).sum() M, H = self.compute_M_H(dataset.targets) ni = np.zeros(NF, 'd') pi = np.zeros(NF, 'd') if self.permute: # indices to go through x in random order random_sequence = np.random.permutation(NS) else: random_sequence = np.arange(NS) change = threshold + 1.0 iteration = 0 counter = 0.0 while change > threshold and iteration < self.max_iter: if __debug__: debug('IRELIEF', "Iteration %d" % iteration) for t in range(NS): counter += 1.0 n = random_sequence[t] d_xn_x = np.abs(samples[n, :] - samples) d_w_k_xn_x = self.k((d_xn_x * w).sum(1)) d_w_k_xn_Mn = d_w_k_xn_x[M[n]] d_w_k_xn_Mn_sum = d_w_k_xn_Mn.sum() gamma_n = 1.0 - d_w_k_xn_Mn_sum / d_w_k_xn_x.sum() alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn_sum d_w_k_xn_Hn = d_w_k_xn_x[H[n]] beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum() m_n = (d_xn_x[M[n], :] * alpha_n[:, None]).sum(0) h_n = (d_xn_x[H[n], :] * beta_n[:, None]).sum(0) pi = gamma_n * (m_n - h_n) learning_rate = 1.0 / (counter * a + 1.0) ni_new = ni + learning_rate * (pi - ni) ni = ni_new # set all negative elements to zero ni_plus = np.clip(ni, 0.0, np.inf) w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum()))) change = np.abs(w_new - w).sum() if t % 10 == 0 and __debug__ and 'IRELIEF' in debug.active: debug( 'IRELIEF', "t=%d change=%.4f max=%f min=%.4f mean=%.4f std=%.4f" " #nan=%d" % (t, change, w_new.max(), w_new.min(), w_new.mean(), w_new.std(), np.isnan(w_new).sum())) w = w_new if change < threshold and iteration > 0: break iteration += 1 self.w = w return Dataset(self.w[np.newaxis])
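# Both online variants above (the kernel-based one and this distance-based
# one) implement the same stochastic update, transcribed from the code, with
# gamma_n, m_n and h_n defined as in the batch version:
#
#     pi_n = gamma_n * (m_n - h_n)      # per-sample analogue of the batch nu
#     eta  = 1 / (counter * a + 1)      # decaying learning rate
#     ni  <- ni + eta * (pi_n - ni)
#     w    = max(ni, 0) / ||max(ni, 0)||_2
#
# with convergence declared once the L1 change in w falls below the threshold
# (not on the very first iteration) or max_iter passes are exhausted.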
lbp = np.asarray(lbp)
i3_histo = np.asarray(i3_histo)
rgb_histo = np.asarray(rgb_histo)

id_index = 15
lbp_predictdata = lbp[[id_index]]
i3_histo_predictdata = i3_histo[[id_index]]

print
#print predictdata
print classID[id_index]
#print "len lbp:", len(lbp)
#print "shape:", lbp.shape

#mvpa
lbp_training = Dataset(samples=lbp, labels=classID)
i3_histo_training = Dataset(samples=i3_histo, labels=classID)

clf = kNN(k=1, voting='majority')
print "clf = ", clf

clf.train(lbp_training)
lbp_predicted_classID = clf.predict(lbp_predictdata)

clf.train(i3_histo_training)
i3_histo_predicted_classID = clf.predict(i3_histo_predictdata)

print "lbp_predicted_classID: ", lbp_predicted_classID
print "i3_histo_predicted_classID: ", i3_histo_predicted_classID

#if predicted_classID[0] == 1.0: print "Image is of class: GRASS"
#if predicted_classID[0] == 2.0: print "Image is of class: DIRT/GRAVEL"
#if predicted_classID[0] == 3.0: print "Image is of class: CEMENT/ASPHALT"
from mvpa.datasets import Dataset #pymvpa stuff f_handle = open("classdatafile.txt", 'r') f_handle2 = open("classidfile.txt", 'r') f_handle3 = open("predictdata.txt", 'r') features = genfromtxt(f_handle, dtype=float) classes = genfromtxt(f_handle2, dtype=int) predictdata = genfromtxt(f_handle3, dtype=float) predictdata = np.expand_dims(predictdata, axis=0) print predictdata print np.shape(features), features.ndim, features.dtype print np.shape(classes), classes.ndim, classes.dtype print np.shape(predictdata), predictdata.ndim, predictdata.dtype f_handle.close() f_handle2.close() f_handle3.close() training = Dataset(samples=features, labels=classes) clf = kNN(k=2) print "clf = ", clf clf.train(training) #print np.mean(clf.predict(training.samples) == training.labels) classID = clf.predict(predictdata) print "classID = ", classID #print clf.trained_labels if classID[0] == 1: print "Image is of class: GRASS" if classID[0] == 2: print "Image is of class: DIRT/GRAVEL" if classID[0] == 3: print "Image is of class: CEMENT/ASPHALT"