def _forward_dataset(self, ds):
    """Apply the stored z-scoring parameters to the samples of a dataset.

    Parameters
    ----------
    ds
      Dataset-like object providing ``samples``, ``sa`` and
      ``copy(deep=False)``.

    Returns
    -------
    ``ds`` itself if ``self._secret_inplace_zscore`` is set, otherwise a
    shallow copy of it; in both cases with the transformed samples
    assigned.

    Raises
    ------
    RuntimeError
      If no parameters are stored yet, or if the stored per-chunk
      parameters lack an entry for a chunk found in the dataset.
    """
    # local bindings
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    if __debug__ and chunks_attr is not None:
        # list() so that the counts end up in a proper numeric array even
        # when .values() hands back a view instead of a list
        nsamples_per_chunk = np.array(
            list(get_nsamples_per_attr(ds, chunks_attr).values()))
        if nsamples_per_chunk.min() <= 2:
            warning(
                "Z-scoring chunk-wise having a chunk with less than three "
                "samples will set features in these samples to either zero "
                "(with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk).")

    params = self.__params_dict
    if params is None:
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if self._secret_inplace_zscore:
        mds = ds
    else:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)

    # cast integer data to the configured dtype, since in-place operations
    # below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if '__all__' in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params['__all__'])
    else:
        # per chunk z-scoring
        # NOTE(review): this branch writes into mds.samples by index;
        # whether a shallow copy shares that array with `ds` is not
        # visible from here -- confirm if the input must stay untouched.
        chunks = mds.sa[chunks_attr]
        chunk_values = chunks.value
        for c in chunks.unique:
            if c not in params:
                raise RuntimeError(
                    "%s has no parameters for chunk '%s'. It probably "
                    "wasn't present in the training dataset!?"
                    % (self.__class__.__name__, c))
            slicer = np.where(chunk_values == c)[0]
            mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                               *params[c])

    return mds
def _forward_dataset(self, ds):
    """Apply the stored z-scoring parameters to the samples of a dataset.

    Parameters
    ----------
    ds
      Dataset-like object providing ``samples``, ``sa`` and
      ``copy(deep=False)``.

    Returns
    -------
    ``ds`` itself if ``self._secret_inplace_zscore`` is set, otherwise a
    shallow copy of it; in both cases with the transformed samples
    assigned.

    Raises
    ------
    RuntimeError
      If no parameters are stored yet, or if the stored per-chunk
      parameters lack an entry for a chunk found in the dataset.
    """
    # local bindings
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    if __debug__ and chunks_attr is not None:
        # list() so that the counts end up in a proper numeric array even
        # when .values() hands back a view instead of a list
        nsamples_per_chunk = np.array(
            list(get_nsamples_per_attr(ds, chunks_attr).values()))
        if nsamples_per_chunk.min() <= 2:
            warning("Z-scoring chunk-wise having a chunk with less than three "
                    "samples will set features in these samples to either zero "
                    "(with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk).")

    params = self.__params_dict
    if params is None:
        # call form of raise, matching the per-chunk error further down
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if self._secret_inplace_zscore:
        mds = ds
    else:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)

    # cast integer data to the configured dtype, since in-place operations
    # below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if '__all__' in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params['__all__'])
    else:
        # per chunk z-scoring
        # NOTE(review): this branch writes into mds.samples by index;
        # whether a shallow copy shares that array with `ds` is not
        # visible from here -- confirm if the input must stay untouched.
        chunks = mds.sa[chunks_attr]
        chunk_values = chunks.value
        for c in chunks.unique:
            if c not in params:
                raise RuntimeError(
                    "%s has no parameters for chunk '%s'. It probably "
                    "wasn't present in the training dataset!?"
                    % (self.__class__.__name__, c))
            slicer = np.where(chunk_values == c)[0]
            mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                               *params[c])

    return mds
def __call__(self, dataset):
    """Splits the dataset.

    This method behaves like a generator. For every split configuration
    returned by ``self.splitcfg(dataset)`` the dataset is split once via
    ``self.split_dataset()``, and the resulting datasets are then
    post-processed and yielded once per configured run.

    Parameters
    ----------
    dataset
      Dataset to be split; handed to ``splitcfg()`` and
      ``split_dataset()``.

    Yields
    ------
    list
      The post-processed datasets of one run of one split, in reversed
      order if ``self._reverse`` is set. Entries that ``split_dataset()``
      returned as None are passed through as None.
    """
    cfgs = self.splitcfg(dataset)
    n_cfgs = len(cfgs)

    # Finally split the data
    for isplit, split in enumerate(cfgs):
        # determine sample sizes: a single setting (non-sequence or a
        # string) is used for every dataset of the split
        if not operator.isSequenceType(self.__npertarget) \
                or isinstance(self.__npertarget, str):
            npertargetsplit = [self.__npertarget] * len(split)
        else:
            npertargetsplit = self.__npertarget

        # get splitted datasets
        split_ds = self.split_dataset(dataset, split)

        # the flag only depends on the split, not on the dataset or run
        lastsplit = (isplit == n_cfgs - 1)

        # do multiple post-processing runs for this split
        for _run in xrange(self.__runspersplit):
            # post-process all datasets
            finalized_datasets = []

            for ds, npertarget in zip(split_ds, npertargetsplit):
                if ds is not None:
                    # Set flag of dataset either this was the last split
                    ds_a = ds.a
                    if not ds_a.has_key('lastsplit'):
                        # if not yet known -- add one
                        ds_a['lastsplit'] = lastsplit
                    else:
                        # otherwise just assign a new value
                        ds_a.lastsplit = lastsplit

                    # permute the labels; kept under the None guard since
                    # there is nothing to permute in a missing dataset
                    if self.__permute_attr is not None:
                        permute_attr(ds, attr=self.__permute_attr,
                                     chunks_attr='chunks', col='sa')

                # select subset of samples if requested
                if npertarget == 'all' or ds is None:
                    finalized_datasets.append(ds)
                else:
                    # We need to select a subset of samples
                    # TODO: move all this logic within random_sample
                    if npertarget == 'equal':
                        # go for the maximum number of samples every
                        # class can provide, i.e. the smallest class size
                        npl = np.array(
                            get_nsamples_per_attr(
                                ds, 'targets').values()).min()
                    elif isinstance(npertarget, float) or (
                            operator.isSequenceType(npertarget)
                            and len(npertarget) > 0
                            and isinstance(npertarget[0], float)):
                        # determine number of samples per class and take
                        # a ratio
                        counts = np.array(
                            get_nsamples_per_attr(ds, 'targets').values())
                        npl = (counts * npertarget).round().astype(int)
                    else:
                        npl = npertarget

                    # finally select the patterns
                    finalized_datasets.append(random_samples(ds, npl))

            if self._reverse:
                yield finalized_datasets[::-1]
            else:
                yield finalized_datasets
def test_cper_class(self, clf):
    """Compare TPR of the second label with and without per-class C.

    The classifier is cross-validated on a balanced dataset, on a
    dataset where the first half of the samples is heavily
    over-represented, and on that same dataset again with ``C`` set to
    a pair of values derived from the ratio of samples per target.
    Classifiers without a ``C`` parameter are skipped.  The TPR
    thresholds are only checked if the 'labile' tests are enabled.
    """
    if not clf.params.has_key('C'):
        # skip those without C
        return

    ds = datasets['uni2medium'].copy()
    ds__ = datasets['uni2medium'].copy()

    #
    # balanced set
    # Lets add a bit of noise to drive classifier nuts. same
    # should be done for disbalanced set
    ds__.samples = ds__.samples + \
        0.5 * np.random.normal(size=ds__.samples.shape)

    #
    # disbalanced set
    # lets overpopulate label 0 (presumably the first half of the
    # samples -- TODO confirm against the 'uni2medium' dataset)
    times = 20
    oversampled = list(range(ds.nsamples)) \
        + list(range(ds.nsamples // 2)) * times
    ds_ = ds[oversampled]
    ds_.samples = ds_.samples + \
        0.5 * np.random.normal(size=ds_.samples.shape)
    spl = get_nsamples_per_attr(ds_, 'targets')

    cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')

    # on balanced
    cve(ds__)
    tpr_1 = cve.ca.stats.stats["TPR"][1]

    # on disbalanced
    cve(ds_)
    tpr_2 = cve.ca.stats.stats["TPR"][1]

    # Set '1 C per label'
    # recreate cve since previous might have operated on copies
    cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
    oldC = clf.params.C
    # TODO: provide clf.params.C not with a tuple but dictionary
    #       with C per label (now order is deduced in a cruel way)
    ratio = np.sqrt(float(spl[ds_.UT[0]]) / spl[ds_.UT[1]])
    clf.params.C = (-1 / ratio, -1 * ratio)
    try:
        # on disbalanced but with balanced C
        cve(ds_)
    finally:
        # restore C whether or not the cross-validation succeeded; an
        # exception keeps propagating after the restore
        clf.params.C = oldC
    tpr_3 = cve.ca.stats.stats["TPR"][1]

    # Actual tests
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(tpr_1 > 0.25,
                        msg="Without disballance we should have some "
                        "hits, but got TPR=%.3f" % tpr_1)

        self.assertTrue(tpr_2 < 0.25,
                        msg="With disballance we should have almost no "
                        "hits for minor, but got TPR=%.3f" % tpr_2)

        self.assertTrue(tpr_3 > 0.25,
                        msg="With disballanced data but ratio-based Cs "
                        "we should have some hits for minor, but got "
                        "TPR=%.3f" % tpr_3)
def __call__(self, dataset):
    """Splits the dataset.

    This method behaves like a generator. For every split configuration
    returned by ``self.splitcfg(dataset)`` the dataset is split once via
    ``self.split_dataset()``, and the resulting datasets are then
    post-processed and yielded once per configured run.

    Parameters
    ----------
    dataset
      Dataset to be split; handed to ``splitcfg()`` and
      ``split_dataset()``.

    Yields
    ------
    list
      The post-processed datasets of one run of one split, in reversed
      order if ``self._reverse`` is set. Entries that ``split_dataset()``
      returned as None are passed through as None.
    """
    cfgs = self.splitcfg(dataset)
    n_cfgs = len(cfgs)

    # Finally split the data
    for isplit, split in enumerate(cfgs):
        # determine sample sizes: a single setting (non-sequence or a
        # string) is used for every dataset of the split
        if not operator.isSequenceType(self.__npertarget) \
                or isinstance(self.__npertarget, str):
            npertargetsplit = [self.__npertarget] * len(split)
        else:
            npertargetsplit = self.__npertarget

        # get splitted datasets
        split_ds = self.split_dataset(dataset, split)

        # the flag only depends on the split, not on the dataset or run
        lastsplit = (isplit == n_cfgs - 1)

        # do multiple post-processing runs for this split
        for _run in xrange(self.__runspersplit):
            # post-process all datasets
            finalized_datasets = []

            for ds, npertarget in zip(split_ds, npertargetsplit):
                if ds is not None:
                    # Set flag of dataset either this was the last split
                    ds_a = ds.a
                    if not ds_a.has_key('lastsplit'):
                        # if not yet known -- add one
                        ds_a['lastsplit'] = lastsplit
                    else:
                        # otherwise just assign a new value
                        ds_a.lastsplit = lastsplit

                    # permute the labels; kept under the None guard since
                    # there is nothing to permute in a missing dataset
                    if self.__permute_attr is not None:
                        permute_attr(ds, attr=self.__permute_attr,
                                     chunks_attr='chunks', col='sa')

                # select subset of samples if requested
                if npertarget == 'all' or ds is None:
                    finalized_datasets.append(ds)
                else:
                    # We need to select a subset of samples
                    # TODO: move all this logic within random_sample
                    if npertarget == 'equal':
                        # go for the maximum number of samples every
                        # class can provide, i.e. the smallest class size
                        npl = np.array(
                            get_nsamples_per_attr(
                                ds, 'targets').values()).min()
                    elif isinstance(npertarget, float) or (
                            operator.isSequenceType(npertarget)
                            and len(npertarget) > 0
                            and isinstance(npertarget[0], float)):
                        # determine number of samples per class and take
                        # a ratio
                        counts = np.array(
                            get_nsamples_per_attr(ds, 'targets').values())
                        npl = (counts * npertarget).round().astype(int)
                    else:
                        npl = npertarget

                    # finally select the patterns
                    finalized_datasets.append(random_samples(ds, npl))

            if self._reverse:
                yield finalized_datasets[::-1]
            else:
                yield finalized_datasets
def test_cper_class(self, clf):
    """Compare TPR of the second label with and without per-class C.

    The classifier is cross-validated on a balanced dataset, on a
    dataset where the first half of the samples is heavily
    over-represented, and on that same dataset again with ``C`` set to
    a pair of values derived from the ratio of samples per target.
    Classifiers without a ``C`` parameter are skipped.  The TPR
    thresholds are only checked if the 'labile' tests are enabled.
    """
    if not clf.params.has_key('C'):
        # skip those without C
        return

    ds = datasets['uni2medium'].copy()
    ds__ = datasets['uni2medium'].copy()

    #
    # balanced set
    # Lets add a bit of noise to drive classifier nuts. same
    # should be done for disbalanced set
    ds__.samples = ds__.samples + \
        0.5 * np.random.normal(size=ds__.samples.shape)

    #
    # disbalanced set
    # lets overpopulate label 0 (presumably the first half of the
    # samples -- TODO confirm against the 'uni2medium' dataset)
    times = 20
    oversampled = list(range(ds.nsamples)) \
        + list(range(ds.nsamples // 2)) * times
    ds_ = ds[oversampled]
    ds_.samples = ds_.samples + \
        0.5 * np.random.normal(size=ds_.samples.shape)
    spl = get_nsamples_per_attr(ds_, 'targets')

    cve = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                      enable_ca='confusion')

    # on balanced
    cve(ds__)
    tpr_1 = cve.ca.confusion.stats["TPR"][1]

    # on disbalanced
    cve(ds_)
    tpr_2 = cve.ca.confusion.stats["TPR"][1]

    # Set '1 C per label'
    # recreate cvte since previous might have operated on copies
    cve = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                      enable_ca='confusion')
    oldC = clf.params.C
    # TODO: provide clf.params.C not with a tuple but dictionary
    #       with C per label (now order is deduced in a cruel way)
    ratio = np.sqrt(float(spl[ds_.UT[0]]) / spl[ds_.UT[1]])
    clf.params.C = (-1 / ratio, -1 * ratio)
    try:
        # on disbalanced but with balanced C
        cve(ds_)
    finally:
        # restore C whether or not the cross-validation succeeded; an
        # exception keeps propagating after the restore
        clf.params.C = oldC
    tpr_3 = cve.ca.confusion.stats["TPR"][1]

    # Actual tests
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(tpr_1 > 0.25,
                        msg="Without disballance we should have some "
                        "hits, but got TPR=%.3f" % tpr_1)

        self.assertTrue(tpr_2 < 0.25,
                        msg="With disballance we should have almost no "
                        "hits for minor, but got TPR=%.3f" % tpr_2)

        self.assertTrue(tpr_3 > 0.25,
                        msg="With disballanced data but ratio-based Cs "
                        "we should have some hits for minor, but got "
                        "TPR=%.3f" % tpr_3)