Example #1
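Computes per-split SMLR sensitivities with SplitFeaturewiseDatasetMeasure, first over an NFoldSplitter and then, in 'boosting' style, over a NoneSplitter.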
    def test_split_featurewise_dataset_measure(self):
        ds = datasets['uni3small']
        sana = SplitFeaturewiseDatasetMeasure(
            analyzer=SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            splitter=NFoldSplitter(),
        )

        sens = sana(ds)
        # a sensitivity for each chunk and each label combination
        assert_equal(sens.shape, (len(ds.sa['chunks'].unique) *
                                  len(ds.sa['targets'].unique), ds.nfeatures))

        # Let's try a more complex example with 'boosting'
        ds = datasets['uni3medium']
        ds.init_origids('samples')
        sana = SplitFeaturewiseDatasetMeasure(
            analyzer=SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            splitter=NoneSplitter(npertarget=0.25,
                                  mode='first',
                                  nrunspersplit=2),
            enable_ca=['splits', 'sensitivities'])
        sens = sana(ds)

        assert_equal(sens.shape,
                     (2 * len(ds.sa['targets'].unique), ds.nfeatures))
        splits = sana.ca.splits
        self.failUnlessEqual(len(splits), 2)
        self.failUnless(
            np.all([s[0].nsamples == ds.nsamples / 4 for s in splits]))
        # should have used different samples
        self.failUnless(
            np.any([splits[0][0].sa.origids != splits[1][0].sa.origids]))
        # and should have got different sensitivities
        self.failUnless(np.any(sens[0] != sens[1]))
Example #2
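Extracts SMLR weights as sensitivities from an already trained classifier, without passing the dataset again.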
    def test_smlr_sensitivities(self):
        data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

        # use SMLR on binary problem, but not fitting all weights
        clf = SMLR(fit_all_weights=False)
        clf.train(data)

        # now ask for the sensitivities WITHOUT having to pass the dataset
        # again
        sens = clf.get_sensitivity_analyzer(force_training=False)()
        self.failUnless(sens.shape == (len(data.UT) - 1, data.nfeatures))
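For contrast, here is a minimal sketch of the default behaviour, assuming the same PyMVPA names used above (SMLR, normal_feature_dataset): when force_training is left at its default, the analyzer is expected to (re)train the classifier on whatever dataset it is called with.

    # a minimal sketch, assuming the PyMVPA API used above;
    # force_training is believed to default to True
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    clf = SMLR(fit_all_weights=False)
    sana = clf.get_sensitivity_analyzer()
    sens = sana(data)  # trains clf on data, then extracts its weights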
Example #3
    def test_smlr_state(self):
        data = datasets['dumb']

        clf = SMLR()

        clf.train(data)

        clf.ca.enable('estimates')
        clf.ca.enable('predictions')
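        # predict() below populates the enabled conditional attributes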

        p = np.asarray(clf.predict(data.samples))

        self.failUnless((p == clf.ca.predictions).all())
        self.failUnless(np.array(clf.ca.estimates).shape[0] == np.array(p).shape[0])
Example #4
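The same state test as Example #3, written against the pre-0.5 PyMVPA API, in which conditional attributes were exposed as "states" ("values" instead of "estimates") and NumPy was imported as N.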
    def testSMLRState(self):
        data = datasets['dumb']

        clf = SMLR()

        clf.train(data)

        clf.states.enable('values')
        clf.states.enable('predictions')

        p = N.asarray(clf.predict(data.samples))

        self.failUnless((p == clf.predictions).all())
        self.failUnless(N.array(clf.values).shape[0] == N.array(p).shape[0])
Example #5
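A basic training test; the comment thread inside discusses why perfect training-set predictions are not guaranteed in general.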
    def test_smlr(self):
        data = datasets['dumb']

        clf = SMLR()

        clf.train(data)

        # prediction has to be perfect
        #
        # XXX yoh: who said that?? ;-)
        #
        # There is always a tradeoff between learning and
        # generalization errors, so...  but in this case the problem is
        # more interesting: the absent bias term makes this data
        # unlearnable -- there is no solution which would pass through
        # (0,0)
        predictions = clf.predict(data.samples)
        self.failUnless((predictions == data.targets).all())
Example #6
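Combines two feature-selection methods (top 5% ANOVA F-scores and non-zero SMLR weights) by union and by intersection via CombinedFeatureSelection.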
    def test_union_feature_selection(self):
        # two methods: 5% highest F-scores, non-zero SMLR weights
        fss = [
            SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FractionTailSelector(0.05, mode='select', tail='upper')),
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1, implementation="C"),
                            postproc=sumofabs_sample()),
                RangeElementSelector(mode='select'))
        ]

        fs = CombinedFeatureSelection(
            fss,
            combiner='union',
            enable_ca=['selected_ids', 'selections_ids'])

        od = fs(self.dataset)

        self.failUnless(fs.combiner == 'union')
        self.failUnless(len(fs.ca.selections_ids))
        self.failUnless(len(fs.ca.selections_ids) <= self.dataset.nfeatures)
        # should store one set per method
        self.failUnless(len(fs.ca.selections_ids) == len(fss))
        # no individual can be larger than union
        for s in fs.ca.selections_ids:
            self.failUnless(len(s) <= len(fs.ca.selected_ids))
        # check output dataset
        self.failUnless(od.nfeatures == len(fs.ca.selected_ids))
        for i, id in enumerate(fs.ca.selected_ids):
            self.failUnless(
                (od.samples[:, i] == self.dataset.samples[:, id]).all())

        # again for intersection
        fs = CombinedFeatureSelection(
            fss,
            combiner='intersection',
            enable_ca=['selected_ids', 'selections_ids'])
        # simply run it for now -- can't think of additional tests
        od = fs(self.dataset)
Example #7
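A larger application-level example (Python 2): compute() dispatches between SVM, SMLR, and kNN back ends, using SMLR through PyMVPA.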
    def compute(self):
        feat = self.feat
        lab = self.lab
        opt = self.opt
        
        trainLabels = opt.trainLabels

        thresh = opt.thresh

        if opt.checkNull:
            if opt.mode == "train":
                nrnd.shuffle(lab[0])
            elif opt.mode == "test":
                nrnd.shuffle(lab[2])
            else:
                raise "Null hypothesis checking is undefined for this mode: ",opt.mode

        maxLab = max([ max(l) for l in lab ])
        if "svm" in opt.method:
            if opt.kernel == "lin":
                svmFact = SvmShogLin.factory(C=opt.C)
            elif opt.kernel == "rbf":
                kernel=SparseGaussianKernel(100,opt.rbfWidth)
                #kernel must know its lhs for classification, and
                #it must be the same as it was for training
                kernel.init(feat[0],feat[0])
                svmFact = SvmShogKern.factory(C=opt.C,kernel=kernel)
            if opt.method == "svm":
                modStore = SvmModFileStore(opt.modelRoot,svmFact=svmFact)
        if opt.mode == "train":
            if opt.method == "svm":
                svmMul = SvmOneVsAll(maxLabel=maxLab)
                svmMul.setLab(lab[0])
                svmMul.setFeat(feat[0])
                svmMul.setSvmStore(modStore)
                svmMul.trainMany(trainLabels=trainLabels)
            elif opt.method == "smlr":
                import mvpa.datasets
                from mvpa.clfs.smlr import SMLR
                mv_data = mvpa.datasets.Dataset(samples=feat[0].get_full_feature_matrix().transpose(),labels=lab[0])
                clf = SMLR(lm=opt.smlrLm,convergence_tol=opt.smlrConvergenceTol)
                clf.train(mv_data)
                makedir(opt.modelRoot)
                dumpObj(clf,pjoin(opt.modelRoot,"smlr"))

        elif opt.mode in ("test","predict"):
            if opt.method == "svm":
                if trainLabels is None:
                    labLoad = None
                    maxLabel = modStore.getMaxLabel()
                else:
                    labLoad = trainLabels
                    maxLabel = max(trainLabels)
                svms = SvmModMemStore(svmFact)
                svms.fromOther(modStore,labels=labLoad)
                svmMul = SvmOneVsAll(maxLabel=maxLabel)
                svmMul.setSvmStore(svms)
                if opt.useSrm:
                    svmMul.setFeat(feat[1])
                    svmMul.classifyBin()
                    svmMul.setLab(lab[1])
                    svmMul.computeSrm()
                    srm = svmMul.getSrm()
                    #srm[:] = 1.
                    #svmMul.setSrm(srm)
                    print "SRM = %s" % (svmMul.getSrm(),)
                svmMul.setLab(None)
                svmMul.setFeat(feat[2])
                svmMul.classifyBin()
                labPred = n.zeros((len(thresh),len(lab[2])),dtype='i4')
                for iThresh in xrange(len(thresh)):
                    t = thresh[iThresh]
                    labPred[iThresh] = svmMul.classify(thresh=t,useSrm=opt.useSrm)
                    print "Threshold %.3f" % t
                return Struct(labPred=labPred,param=n.rec.fromarrays([thresh],names="thresh"))
            elif opt.method == "smlr":
                clf = loadObj(pjoin(opt.modelRoot,"smlr"))
                labPred = n.asarray(clf.predict(feat[2].get_full_feature_matrix().transpose()),dtype='i4')
                labPred.shape = (1,len(labPred))
                return Struct(labPred=labPred,param=n.rec.fromarrays([thresh[0:1]],names="thresh"))
            elif opt.method == "knn":
                dist = SparseEuclidianDistance(feat[0],feat[1])
                mod = KNN(opt.knnK,dist,Labels(lab[0].astype('f8')))
                if opt.knnMaxDist is not None:
                    mod.set_max_dist(opt.knnMaxDist)
                mod.train()
                labPred = mod.classify().get_labels()
                labUnclass = mod.get_unclass_label()
                labPred[labPred==labUnclass] = opt.labUnclass
                labPred.shape = (1,len(labPred))
                return Struct(labPred=labPred,param=None)
            elif opt.method == "knn-svm":
                assert len(thresh) == 1,"multiple SVM decision thresholds not implemented for knn-svm"
                dist = SparseEuclidianDistance(feat[0],feat[2])
                knn = KNN(opt.knnK,dist,Labels(lab[0].astype('f8')))
                knn.train()
                n_test = feat[2].get_num_vectors()
                ind_neighb = numpy.zeros((n_test,opt.knnK),dtype='i4')
                dist_neighb = numpy.zeros((n_test,opt.knnK),dtype='f8')
                print "Computing KNN list..."
                knn.get_neighbours(ind_neighb,dist_neighb)
                labPred = numpy.zeros(n_test,dtype='i4')
                print "Training neighbours' SVMs..."
                for iTest in xrange(n_test):
                    samp_ind_neighb = ind_neighb[iTest]
                    samp_dist_neighb = dist_neighb[iTest]
                    if opt.knnMaxDist is not None:
                        samp_in_dist = samp_dist_neighb < opt.knnMaxDist
                        samp_ind_neighb = samp_ind_neighb[samp_in_dist]
                        samp_dist_neighb = samp_dist_neighb[samp_in_dist]
                    if len(samp_ind_neighb) > 0:
                        svmTrFeat = feat[0].subsample(samp_ind_neighb)
                        svmTrLab = lab[0][samp_ind_neighb]
                        if (svmTrLab == svmTrLab[0]).all():
                            labPred[iTest] = svmTrLab[0]
                            if iTest % 100 == 0:
                                print "All %s neighbours have one label %i for samp %i" % (len(samp_ind_neighb),labPred[iTest],iTest)
                        else:
                            svmTsFeat = feat[2].subsample(numpy.asarray([iTest],dtype='i4'))
                            labPred[iTest] = svmOneVsAllOneStep(feat=(svmTrFeat,svmTsFeat),
                                    lab=(svmTrLab,),
                                    opt=Struct(C=opt.C,thresh=thresh[0],useSrm=False))
                            if iTest % 100 == 0:
                                print "SVM selected label %i from %s for samp %i" % (labPred[iTest],svmTrLab,iTest)
                    else:
                        labPred[iTest] = opt.labUnclass
                        if iTest % 100 == 0:
                            print "No training samples are within cutoff distance found for samp %i" % (iTest,)

                labPred.shape = (1,len(labPred))
                return Struct(labPred=labPred,param=None)
Example #8
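Registers stock SMLR instances in PyMVPA's classifier warehouse (clfswh).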
        """Registered items
        """
        return self.__items


clfswh = Warehouse(known_tags=_KNOWN_INTERNALS)  # classifiers
regrswh = Warehouse(known_tags=_KNOWN_INTERNALS)  # regressions

# NB:
#  - Nu-classifiers are turned off since for the Haxby dataset the
#    default nu is an infeasible one
#  - Python's SMLR is turned off for the duration of development
#    since it is slow and its results should match those of the C
#    version
#
clfswh += [
    SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"),
    SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"),
    #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"),
    #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"),
    #SMLR(implementation="Python", descr="SMLR(Python)")
]

clfswh += \
     [ MulticlassClassifier(clfswh['smlr'][0],
                            descr='Pairs+maxvote multiclass on ' + \
                            clfswh['smlr'][0].descr) ]

if externals.exists('libsvm'):
    from mvpa.clfs import libsvmc as libsvm
    clfswh._known_tags.union_update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys())
    clfswh += [
Example #9
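Test-suite fixtures: an SMLR instance is picked as the fast "universal" linear classifier for the unit tests.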
        FeaturewiseDatasetMeasure.__init__(self, **kwargs)
        self.__mult = mult

    def _call(self, dataset):
        """Train linear SVM on `dataset` and extract weights from classifier.
        """
        sens = self.__mult * (np.arange(dataset.nfeatures) -
                              int(dataset.nfeatures / 2))
        return Dataset(sens[np.newaxis])


# Sample universal classifiers (linear and non-linear) to be used
# whenever it doesn't matter which classifier it is, when testing some
# higher-level construct -- chosen to be the fastest universal ones.
# They also should not punch state.py in the face the way kNN does...
sample_clf_lin = SMLR(lm=0.1)  #sg.svm.LinearCSVMC(svm_impl='libsvm')

#if externals.exists('shogun'):
#    sample_clf_nl = sg.SVM(kernel_type='RBF', svm_impl='libsvm')
#else:
#classical one which was used for a while
#and surprisingly it is not bad at all for the unittests
sample_clf_nl = kNN(k=5)

# and also a regression-based classifier
r = clfswh['linear', 'regression_based', 'has_sensitivity']
if len(r) > 0:
    sample_clf_reg = r[0]
else:
    sample_clf_reg = None