def test_split_featurewise_dataset_measure(self):
    ds = datasets['uni3small']
    sana = SplitFeaturewiseDatasetMeasure(
        analyzer=SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        splitter=NFoldSplitter(),
        )

    sens = sana(ds)
    # a sensitivity for each chunk and each label combination
    assert_equal(sens.shape,
                 (len(ds.sa['chunks'].unique) * len(ds.sa['targets'].unique),
                  ds.nfeatures))

    # Let's try a more complex example with 'boosting'
    ds = datasets['uni3medium']
    ds.init_origids('samples')
    sana = SplitFeaturewiseDatasetMeasure(
        analyzer=SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        splitter=NoneSplitter(npertarget=0.25, mode='first',
                              nrunspersplit=2),
        enable_ca=['splits', 'sensitivities'])
    sens = sana(ds)

    assert_equal(sens.shape, (2 * len(ds.sa['targets'].unique),
                              ds.nfeatures))
    splits = sana.ca.splits
    self.failUnlessEqual(len(splits), 2)
    self.failUnless(
        np.all([s[0].nsamples == ds.nsamples / 4 for s in splits]))
    # should have used different samples
    self.failUnless(
        np.any([splits[0][0].sa.origids != splits[1][0].sa.origids]))
    # and should have got different sensitivities
    self.failUnless(np.any(sens[0] != sens[1]))
def test_smlr_sensitivities(self):
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # use SMLR on binary problem, but not fitting all weights
    clf = SMLR(fit_all_weights=False)
    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_training=False)()
    self.failUnless(sens.shape == (len(data.UT) - 1, data.nfeatures))
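# A minimal numpy-only sketch (hypothetical, independent of the fixtures
# above) of the shape convention the assertion relies on: with
# fit_all_weights=False, SMLR treats one class as an implicit reference
# with an all-zero weight vector, so a K-class problem needs only K-1
# learned weight rows while the multinomial posterior stays well defined.

import numpy as np

def smlr_like_predict(X, W):
    """X: (nsamples, nfeatures); W: (nclasses - 1, nfeatures)."""
    # append the implicit zero row for the reference class
    W_full = np.vstack([W, np.zeros((1, W.shape[1]))])
    scores = X.dot(W_full.T)
    # softmax over classes; argmax picks the predicted class index
    p = np.exp(scores - scores.max(axis=1, keepdims=True))
    p /= p.sum(axis=1, keepdims=True)
    return p.argmax(axis=1)

X = np.random.randn(10, 4)
W = np.random.randn(1, 4)      # binary problem -> a single weight row
preds = smlr_like_predict(X, W)
assert preds.shape == (10,) and set(preds) <= set([0, 1])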
def test_smlr_state(self):
    data = datasets['dumb']

    clf = SMLR()

    clf.train(data)

    clf.ca.enable('estimates')
    clf.ca.enable('predictions')

    p = np.asarray(clf.predict(data.samples))

    self.failUnless((p == clf.ca.predictions).all())
    self.failUnless(np.array(clf.ca.estimates).shape[0]
                    == np.array(p).shape[0])
def test_smlr(self):
    data = datasets['dumb']

    clf = SMLR()

    clf.train(data)

    # prediction has to be perfect
    #
    # XXX yoh: who said that?? ;-)
    #
    # There is always a tradeoff between learning and
    # generalization errors, so... but in this case the problem is
    # more interesting: without a bias term this data cannot be
    # learned -- there is no separating solution which passes
    # through (0,0)
    predictions = clf.predict(data.samples)
    self.failUnless((predictions == data.targets).all())
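# A tiny numpy illustration (not part of the test, just the reasoning in
# the comment above): without a bias term a 1-D linear decision function
# f(x) = w * x must pass through the origin, so two samples on the same
# side of zero can never be assigned different labels.

import numpy as np

x = np.array([1.0, 2.0])            # both on the positive side
for w in np.linspace(-10, 10, 101):
    labels = np.sign(w * x)
    assert labels[0] == labels[1]   # no w separates them without a bias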
def test_union_feature_selection(self):
    # two methods: 5% highest F-scores, non-zero SMLR weights
    fss = [SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FractionTailSelector(0.05, mode='select', tail='upper')),
           SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1, implementation="C"),
                            postproc=sumofabs_sample()),
                RangeElementSelector(mode='select'))]

    fs = CombinedFeatureSelection(fss, combiner='union',
                                  enable_ca=['selected_ids',
                                             'selections_ids'])

    od = fs(self.dataset)

    self.failUnless(fs.combiner == 'union')
    self.failUnless(len(fs.ca.selections_ids))
    self.failUnless(len(fs.ca.selections_ids) <= self.dataset.nfeatures)
    # should store one set per method
    self.failUnless(len(fs.ca.selections_ids) == len(fss))
    # no individual selection can be larger than the union
    for s in fs.ca.selections_ids:
        self.failUnless(len(s) <= len(fs.ca.selected_ids))
    # check output dataset
    self.failUnless(od.nfeatures == len(fs.ca.selected_ids))
    for i, id in enumerate(fs.ca.selected_ids):
        self.failUnless((od.samples[:, i]
                         == self.dataset.samples[:, id]).all())

    # again for intersection
    fs = CombinedFeatureSelection(fss, combiner='intersection',
                                  enable_ca=['selected_ids',
                                             'selections_ids'])
    # simply run it for now -- can't think of additional tests
    od = fs(self.dataset)
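# For clarity, the combiner semantics sketched with plain numpy and
# hypothetical feature-id sets (not taken from the dataset above):
# 'union' keeps features selected by any method, 'intersection' only
# those selected by every method.

import numpy as np

anova_ids = np.array([0, 3, 7])
smlr_ids = np.array([3, 5, 7, 9])
union = np.union1d(anova_ids, smlr_ids)             # [0 3 5 7 9]
intersection = np.intersect1d(anova_ids, smlr_ids)  # [3 7]
# matches the assertions above: no single selection exceeds the union
assert len(anova_ids) <= len(union) and len(smlr_ids) <= len(union)
assert len(intersection) <= min(len(anova_ids), len(smlr_ids))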
def compute(self):
    feat = self.feat
    lab = self.lab
    opt = self.opt
    trainLabels = opt.trainLabels
    thresh = opt.thresh
    if opt.checkNull:
        # permute labels to estimate the null distribution
        if opt.mode == "train":
            nrnd.shuffle(lab[0])
        elif opt.mode == "test":
            nrnd.shuffle(lab[2])
        else:
            raise ValueError("Null hypothesis checking is undefined "
                             "for this mode: %s" % opt.mode)
    maxLab = max([max(l) for l in lab])
    if "svm" in opt.method:
        if opt.kernel == "lin":
            svmFact = SvmShogLin.factory(C=opt.C)
        elif opt.kernel == "rbf":
            kernel = SparseGaussianKernel(100, opt.rbfWidth)
            # the kernel must know its lhs for classification, and
            # it must be the same as it was for training
            kernel.init(feat[0], feat[0])
            svmFact = SvmShogKern.factory(C=opt.C, kernel=kernel)
        if opt.method == "svm":
            modStore = SvmModFileStore(opt.modelRoot, svmFact=svmFact)
    if opt.mode == "train":
        if opt.method == "svm":
            svmMul = SvmOneVsAll(maxLabel=maxLab)
            svmMul.setLab(lab[0])
            svmMul.setFeat(feat[0])
            svmMul.setSvmStore(modStore)
            svmMul.trainMany(trainLabels=trainLabels)
        elif opt.method == "smlr":
            import mvpa.datasets
            from mvpa.clfs.smlr import SMLR
            mv_data = mvpa.datasets.Dataset(
                samples=feat[0].get_full_feature_matrix().transpose(),
                labels=lab[0])
            clf = SMLR(lm=opt.smlrLm,
                       convergence_tol=opt.smlrConvergenceTol)
            clf.train(mv_data)
            makedir(opt.modelRoot)
            dumpObj(clf, pjoin(opt.modelRoot, "smlr"))
    elif opt.mode in ("test", "predict"):
        if opt.method == "svm":
            if trainLabels is None:
                labLoad = None
                maxLabel = modStore.getMaxLabel()
            else:
                labLoad = trainLabels
                maxLabel = max(trainLabels)
            svms = SvmModMemStore(svmFact)
            svms.fromOther(modStore, labels=labLoad)
            svmMul = SvmOneVsAll(maxLabel=maxLabel)
            svmMul.setSvmStore(svms)
            if opt.useSrm:
                # estimate SRM weights on the held-out set first
                svmMul.setFeat(feat[1])
                svmMul.classifyBin()
                svmMul.setLab(lab[1])
                svmMul.computeSrm()
                srm = svmMul.getSrm()
                #srm[:] = 1.
                #svmMul.setSrm(srm)
                print "SRM = %s" % (svmMul.getSrm(),)
                svmMul.setLab(None)
            svmMul.setFeat(feat[2])
            svmMul.classifyBin()
            labPred = n.zeros((len(thresh), len(lab[2])), dtype='i4')
            for iThresh in xrange(len(thresh)):
                t = thresh[iThresh]
                labPred[iThresh] = svmMul.classify(thresh=t,
                                                   useSrm=opt.useSrm)
                print "Threshold %.3f" % t
            return Struct(labPred=labPred,
                          param=n.rec.fromarrays([thresh],
                                                 names="thresh"))
        elif opt.method == "smlr":
            clf = loadObj(pjoin(opt.modelRoot, "smlr"))
            labPred = n.asarray(
                clf.predict(feat[2].get_full_feature_matrix().transpose()),
                dtype='i4')
            labPred.shape = (1, len(labPred))
            return Struct(labPred=labPred,
                          param=n.rec.fromarrays([thresh[0:1]],
                                                 names="thresh"))
        elif opt.method == "knn":
            dist = SparseEuclidianDistance(feat[0], feat[1])
            mod = KNN(opt.knnK, dist, Labels(lab[0].astype('f8')))
            if opt.knnMaxDist is not None:
                mod.set_max_dist(opt.knnMaxDist)
            mod.train()
            labPred = mod.classify().get_labels()
            labUnclass = mod.get_unclass_label()
            labPred[labPred == labUnclass] = opt.labUnclass
            labPred.shape = (1, len(labPred))
            return Struct(labPred=labPred, param=None)
        elif opt.method == "knn-svm":
            assert len(thresh) == 1, \
                "multiple SVM decision thresholds not implemented for knn-svm"
            dist = SparseEuclidianDistance(feat[0], feat[2])
            knn = KNN(opt.knnK, dist, Labels(lab[0].astype('f8')))
            knn.train()
            n_test = feat[2].get_num_vectors()
            ind_neighb = numpy.zeros((n_test, opt.knnK), dtype='i4')
            dist_neighb = numpy.zeros((n_test, opt.knnK), dtype='f8')
            print "Computing KNN list..."
            knn.get_neighbours(ind_neighb, dist_neighb)
            labPred = numpy.zeros(n_test, dtype='i4')
            print "Training neighbours' SVMs..."
            for iTest in xrange(n_test):
                samp_ind_neighb = ind_neighb[iTest]
                samp_dist_neighb = dist_neighb[iTest]
                if opt.knnMaxDist is not None:
                    # keep only neighbours within the distance cutoff
                    samp_in_dist = samp_dist_neighb < opt.knnMaxDist
                    samp_ind_neighb = samp_ind_neighb[samp_in_dist]
                    samp_dist_neighb = samp_dist_neighb[samp_in_dist]
                if len(samp_ind_neighb) > 0:
                    svmTrFeat = feat[0].subsample(samp_ind_neighb)
                    svmTrLab = lab[0][samp_ind_neighb]
                    if (svmTrLab == svmTrLab[0]).all():
                        # all neighbours agree -- no need to train an SVM
                        labPred[iTest] = svmTrLab[0]
                        if iTest % 100 == 0:
                            print "All %s neighbours have one label %i " \
                                  "for samp %i" % (len(samp_ind_neighb),
                                                   labPred[iTest], iTest)
                    else:
                        svmTsFeat = feat[2].subsample(
                            numpy.asarray([iTest], dtype='i4'))
                        labPred[iTest] = svmOneVsAllOneStep(
                            feat=(svmTrFeat, svmTsFeat),
                            lab=(svmTrLab,),
                            opt=Struct(C=opt.C, thresh=thresh[0],
                                       useSrm=False))
                        if iTest % 100 == 0:
                            print "SVM selected label %i from %s " \
                                  "for samp %i" % (labPred[iTest],
                                                   svmTrLab, iTest)
                else:
                    labPred[iTest] = opt.labUnclass
                    if iTest % 100 == 0:
                        print "No training samples within cutoff " \
                              "distance found for samp %i" % (iTest,)
            labPred.shape = (1, len(labPred))
            return Struct(labPred=labPred, param=None)
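# A rough numpy-only sketch of the thresholded one-vs-all decision rule
# assumed above (illustrative semantics, not the actual SvmOneVsAll
# implementation): pick the class with the largest binary decision
# value, falling back to an 'unclassified' label when even the best
# value does not clear the threshold.

import numpy as np

def one_vs_all_classify(dec_vals, thresh, lab_unclass=-1):
    """dec_vals: (nsamples, nclasses) binary SVM decision values."""
    best = dec_vals.argmax(axis=1)
    return np.where(dec_vals.max(axis=1) >= thresh,
                    best, lab_unclass).astype('i4')

dec = np.array([[0.9, -0.2],
                [0.1, 0.05]])
assert list(one_vs_all_classify(dec, thresh=0.5)) == [0, -1]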
"""Registered items """ return self.__items clfswh = Warehouse(known_tags=_KNOWN_INTERNALS) # classifiers regrswh = Warehouse(known_tags=_KNOWN_INTERNALS) # regressions # NB: # - Nu-classifiers are turned off since for haxby DS default nu # is an 'infisible' one # - Python's SMLR is turned off for the duration of development # since it is slow and results should be the same as of C version # clfswh += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"), SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"), #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"), #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"), #SMLR(implementation="Python", descr="SMLR(Python)") ] clfswh += \ [ MulticlassClassifier(clfswh['smlr'][0], descr='Pairs+maxvote multiclass on ' + \ clfswh['smlr'][0].descr) ] if externals.exists('libsvm'): from mvpa.clfs import libsvmc as libsvm clfswh._known_tags.union_update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys()) clfswh += [
        FeaturewiseDatasetMeasure.__init__(self, **kwargs)
        self.__mult = mult

    def _call(self, dataset):
        """Return a deterministic mock 'sensitivity' for `dataset`
        (a signed ramp over the features; no training involved).
        """
        sens = self.__mult * (np.arange(dataset.nfeatures)
                              - int(dataset.nfeatures / 2))
        return Dataset(sens[np.newaxis])


# Sample universal classifiers (linear and non-linear) which should be
# used whenever it doesn't matter what classifier it is for testing
# some higher-level constructs -- chosen to be the fastest universal
# ones. Also they should not punch state.py in the face the way
# kNN does...
sample_clf_lin = SMLR(lm=0.1) #sg.svm.LinearCSVMC(svm_impl='libsvm')

#if externals.exists('shogun'):
#    sample_clf_nl = sg.SVM(kernel_type='RBF', svm_impl='libsvm')
#else:
# classical one which was used for a while,
# and surprisingly it is not bad at all for the unittests
sample_clf_nl = kNN(k=5)

# and also a regression-based classifier
r = clfswh['linear', 'regression_based', 'has_sensitivity']
if len(r) > 0:
    sample_clf_reg = r[0]
else:
    sample_clf_reg = None
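# A short sketch of the tag-query idiom used just above (hypothetical
# helper; assumes the warehouse was populated earlier in this module):
# indexing a Warehouse with several tags returns the classifiers
# matching all of them, so callers can pick a suitable classifier
# without hard-coding one.

def _pick_first_clf(warehouse, *tags):
    """Return the first classifier carrying all given tags, or None."""
    matches = warehouse[tags]
    return matches[0] if len(matches) > 0 else None

# e.g.: _pick_first_clf(clfswh, 'linear', 'has_sensitivity')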