Example #1
    def test_SVM_Priors_D(self):
        """Test SVM with priors """
        # Train an SVM
        svm = AZorngCvSVM.CvSVMLearner(
            self.inDataD, priors={"Iris-setosa": 0.2, "Iris-versicolor": 0.3, "Iris-virginica": 0.5}
        )
        trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, svm)

        self.assertEqual(round(trainedAcc, 7), round(0.73333329999999997, 7))
        # Save model
        rc = svm.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(trainedAcc, loadedAcc)

        svmLearner = AZorngCvSVM.CvSVMLearner(
            scaleData=False, priors={"Iris-setosa": 0.2, "Iris-versicolor": 0.3, "Iris-virginica": 0.5}
        )

        svmLearner.name = "CvSVMLearner"
        svmLearner.shrinking = 1
        svmLearner.eps = 0.001
        svmLearner.p = 0.0
        svmLearner.nu = 0.6
        svmLearner.kernel_type = 2
        svmLearner.svm_type = 103
        svmLearner.gamma = 0.0033
        svmLearner.C = 47
        svmLearner.probability = 1
        svmLearner.scaleData = True
        svmLearner.scaleClass = False
        # svmLearner.for_nomogram=1

        Res = orngTest.crossValidation(
            [svmLearner], self.inDataD, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible
        )
        CA = evalUtilities.CA(Res)[0]
        self.assertEqual(round(CA, 2), round(0.940000000, 2))  # orange1.0: 0.93333333333333335

        svmLearner.priors = None
        Res = orngTest.crossValidation(
            [svmLearner], self.inDataD, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible
        )
        CA = evalUtilities.CA(Res)[0]
        self.assertEqual(round(CA, 2), round(0.94666666666666666, 2))

        newSVM = svmLearner(self.inDataD)
        trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, newSVM)
        # Save model
        rc = newSVM.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(round(trainedAcc, 7), round(0.95999999999999996, 7))  # Before in AZSVM: 0.953333300000
        self.assertEqual(round(trainedAcc, 1), round(loadedAcc, 1))
Example #2
def WrapperFSS(data, learner, verbose=0, folds=10):
  classVar = data.domain.classVar
  currentAtt = []
  freeAttributes = list(data.domain.attributes)

  newDomain = orange.Domain(currentAtt + [classVar])
  d = data.select(newDomain)
  results = orngTest.crossValidation([learner], d, folds=folds)
  maxStat = orngStat.CA(results)[0]
  if verbose>=2:
    print "start (%5.3f)" % maxStat

  while 1:
    stat = []
    for a in freeAttributes:
      newDomain = orange.Domain([a] + currentAtt + [classVar])
      d = data.select(newDomain)
      results = orngTest.crossValidation([learner], d, folds=folds)
      stat.append(orngStat.CA(results)[0])
      if verbose>=2:
        print "  %s gained %5.3f" % (a.name, orngStat.CA(results)[0])

    if (max(stat) > maxStat):
      oldMaxStat = maxStat
      maxStat = max(stat)
      bestVarIndx = stat.index(max(stat))
      if verbose:
        print "gain: %5.3f, attribute: %s" % (maxStat-oldMaxStat, freeAttributes[bestVarIndx].name)
      currentAtt = currentAtt + [freeAttributes[bestVarIndx]]
      del freeAttributes[bestVarIndx]
    else:
      if verbose:
        print "stopped (%5.3f)" % (max(stat) - maxStat)
      return orange.Domain(currentAtt + [classVar])
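
A minimal usage sketch for WrapperFSS above (not part of the original snippet), assuming legacy Orange 2.x under Python 2 and a reachable voting.tab data set; the Bayes learner is only an illustration.

import orange, orngTest, orngStat

data = orange.ExampleTable("voting")        # assumes voting.tab is reachable
bayes = orange.BayesLearner(name="bayes")

bestDomain = WrapperFSS(data, bayes, verbose=1, folds=10)   # forward wrapper selection
reduced = data.select(bestDomain)                           # keep only the selected attributes
results = orngTest.crossValidation([bayes], reduced, folds=10)
print "CA on selected attributes: %5.3f" % orngStat.CA(results)[0]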
Example #3
    def crossValidation(self, nu=0.6, gamma=4.0):
        """
        Perform a cross validation on the training data.
    
        @param nu:            The S{nu}-Parameter of the support vector machine.
        @type nu:             float

        @param gamma:         The S{gamma}-Parameter of the RBF-Kernel.
        @type gamma:          float

        @rtype:               float
        @return:              The cross validation accuracy.
        """
        if self.learner is None:
            raise ValueError("Learner has to be loaded before cross validation can be done.")

        self.nu = nu
        self.gamma = gamma

        svm = orange.SVMLearner()
        svm.svm_type = orange.SVMLearner.Nu_SVC
        svm.nu = nu
        svm.gamma = gamma
        svm.probability = True

        values = self.learner.histograms.domain.classVar.values
        self.values = values
        length = len(values)
        self.confusion = numpy.zeros((length, length), int)
        result = orngTest.crossValidation([svm], self.learner.histograms, folds=10)

        self.cvAccuracy = orngStat.CA(result)[0]
        recognosco.logger.info("Cross validation accuracy: %s", self.cvAccuracy)
        return self.cvAccuracy
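
A hedged call sketch for the crossValidation method above; the instance name rec and its already-loaded learner are assumptions, since the enclosing class is not shown in this excerpt.

# rec is a hypothetical instance of the enclosing class with self.learner already loaded
acc = rec.crossValidation(nu=0.5, gamma=2.0)
print "10-fold CV accuracy: %5.3f" % acc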
Example #4
File: cv.py Project: sloria/usv
def test_fss(learner, data, t=0.01):
    fss = orngFSS.FilterAttsAboveThresh(threshold=t)
    fLearner = orngFSS.FilteredLearner(learner, filter=fss,
        name='%s & fss' % (learner.name))
    learners = [learner, fLearner]
    results = orngTest.crossValidation(learners, data, folds=10, storeClassifiers=1)

    # how many attributes did each classifier use?
    natt = [0.] * len(learners)
    for fold in range(results.numberOfIterations):
        for lrn in range(len(learners)):
            natt[lrn] += len(results.classifiers[fold][lrn].domain.attributes)
    for lrn in range(len(learners)):
        natt[lrn] = natt[lrn] / 10.

    print "\nLearner         Accuracy  #Atts"
    for i in range(len(learners)):
        print "%-15s %5.3f     %5.2f" % (learners[i].name, orngStat.CA(results)[i], natt[i])

    # which attributes were used in filtered case?
    print '\nAttribute usage (how many folds attribute was used):'
    used = {}
    for fold in range(results.numberOfIterations):
        for att in results.classifiers[fold][1].domain.attributes:
            a = att.name
            if a in used.keys(): used[a] += 1
            else: used[a] = 1
    for a in used.keys():
        print '%2d x %s' % (used[a], a)
Example #5
File: cv.py Project: sloria/usv
def test_fss(learner, data, t=0.01):
    fss = orngFSS.FilterAttsAboveThresh(threshold=t)
    fLearner = orngFSS.FilteredLearner(learner,
                                       filter=fss,
                                       name='%s & fss' % (learner.name))
    learners = [learner, fLearner]
    results = orngTest.crossValidation(learners,
                                       data,
                                       folds=10,
                                       storeClassifiers=1)

    # how many attributes did each classifier use?
    natt = [0.] * len(learners)
    for fold in range(results.numberOfIterations):
        for lrn in range(len(learners)):
            natt[lrn] += len(results.classifiers[fold][lrn].domain.attributes)
    for lrn in range(len(learners)):
        natt[lrn] = natt[lrn] / 10.

    print "\nLearner         Accuracy  #Atts"
    for i in range(len(learners)):
        print "%-15s %5.3f     %5.2f" % (learners[i].name,
                                         orngStat.CA(results)[i], natt[i])

    # which attributes were used in filtered case?
    print '\nAttribute usage (how many folds attribute was used):'
    used = {}
    for fold in range(results.numberOfIterations):
        for att in results.classifiers[fold][1].domain.attributes:
            a = att.name
            if a in used.keys(): used[a] += 1
            else: used[a] = 1
    for a in used.keys():
        print '%2d x %s' % (used[a], a)
Example #6
    def evaluating(self, event):
        train_data = orange.ExampleTable("classification.tab")
        bayes2 = orange.BayesLearner()
        tree2 = orngTree.TreeLearner()
        knnLearner2 = orange.kNNLearner()
        knnLearner2.k = 10  # k == 18 seems to be best (at least for 2-3)
        #svm2 = svm.SVMLearner()
        bayes2.name = "bayes2"
        tree2.name = "tree2"
        knnLearner2.name = "knn2"
        learners = [bayes2, tree2, knnLearner2]

        results = orngTest.crossValidation(learners, train_data, folds=10)
        print "train_data:"
        print train_data
        print "k=="
        print knnLearner2.k
        print 'Learner  CA     IS     Brier    AUC'
        c = ''
        for i in range(len(learners)):
            print "%-8s %5.3f  %5.3f  %5.3f  %5.3f" % (learners[i].name,
                orngStat.CA(results)[i], orngStat.IS(results)[i],
                orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
            c = c + "%-8s" % learners[i].name + '\t' + "%5.3f" % orngStat.CA(results)[i] + '\t' + \
                "%5.3f" % orngStat.IS(results)[i] + '\t' + "%5.3f" % orngStat.BrierScore(results)[i] + '\t' + \
                "%5.3f" % orngStat.AUC(results)[i] + '\n'
        self.logger2.AppendText('Learner  CA     IS     Brier    AUC:\n %s\n' % c)
        return c
Example #7
def classifier(personalrecords, lowage, upage , agetype, numcrossfolds, estimation):
    domain = createAttributes()
    data = orange.ExampleTable(domain, getallfeatures(personalrecords,domain,
                                                                         lowage, upage, 'MONTHS'))
    
    bayes = orange.BayesLearner()
    bayesMEst = orange.BayesLearner()
    bayesMEst.estimatorConstructor = orange.ProbabilityEstimatorConstructor_m(m=estimation)

    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    bayesMEst.name = 'bayesWithMEstimation'
    tree.name = "tree"
    learners = [ bayesMEst]
    #learners = [tree]
    #print numcrossfolds, ' fold cross validation', 'for', learners
    results = orngTest.crossValidation(learners, data, folds=numcrossfolds, storeClassifiers = 1)
    # output the results
    #print "Learner  CA     IS     Brier    AUC" 
    #for i in range(len(learners)):
     #   print "%-8s %5.3f  %5.3f  %5.3f  %5.3f" % (learners[i].name, \
      #                                             orngStat.CA(results)[i], orngStat.IS(results)[i],
      #                                             orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
    
    #orngTree.printDot(tree, fileName='c:\\tree10.dot', internalNodeShape="ellipse", leafShape="box")
    #orngTree.printDot(bayes, fileName='c:\\bayes10.dot', internalNodeShape="ellipse", leafShape="box")
    TP = []
    FP = []
    TN = []
    FN = []
    for i in range(numcrossfolds):
        for j in range(len(learners)):
            classifier = (results.classifiers[i])[j]
            tp = 0
            fn = 0
            tn = 0
            fp = 0
            for dt  in data:
                if dt.getclass() == '1':
                    p = str(classifier(dt))
                    if p == '1':
                        tp = tp + 1
                    else:
                        fn  = fn +1
                if dt.getclass() == '0':
                    p = str(classifier(dt))
                    if p == '0':
                        tn = tn + 1
                    else:
                        fp  = fp +1
            #print 'Results for Learner:', learnersname[j], " for crossfold number ", i
            #print 'True positive: ', tp, 'False positive: ', fp, 'True Negative: ', tn, 'False Negative: ', fn
            TP.append(tp)
            FP.append(fp)
            TN.append(tn)
            FN.append(fn)
            #learners[j].dumpTree()
            #orngTree.printDot(learners[j], fileName=filename, internalNodeShape="ellipse", leafShape="box")
    #classiferProbabilites(results, len(learners) , numcrossfolds, data , attlist[:1] , '1')     
    return (results , TP ,FP , TN, FN)
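
The function returns raw per-fold confusion counts; a short sketch (under the same Orange 2.x assumptions) of aggregating them into sensitivity and specificity. The argument values passed to classifier() below are hypothetical stand-ins.

results, TP, FP, TN, FN = classifier(personalrecords, 0, 60, 'MONTHS', 10, 2.0)  # hypothetical arguments
tp, fp, tn, fn = sum(TP), sum(FP), sum(TN), sum(FN)
print "sensitivity: %5.3f" % (float(tp) / max(tp + fn, 1))  # TP / (TP + FN)
print "specificity: %5.3f" % (float(tn) / max(tn + fp, 1))  # TN / (TN + FP)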
Example #8
    def test_SVMC(self):

        # Train an SVM
        svmL = AZorngCvSVM.CvSVMLearner(scaleData=False,
                                        svm_type=103,
                                        gamma=0.01,
                                        C=1,
                                        nu=0.5,
                                        p=1,
                                        eps=0.001,
                                        coef0=0,
                                        degree=3)
        svm = svmL(self.inDataC)
        trainedAcc = evalUtilities.getRMSE(self.inDataC, svm)

        self.assertEqual(round(trainedAcc, 7), round(2.8525863999999999,
                                                     7))  # ver 0.3

        # Save model
        rc = svm.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getRMSE(self.inDataC, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(trainedAcc, loadedAcc)

        svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False)

        svmLearner.name = "CvSVMLearner"
        svmLearner.eps = 0.001
        svmLearner.p = 1
        svmLearner.nu = 0.6
        svmLearner.kernel_type = 2
        svmLearner.svm_type = 103
        svmLearner.gamma = 0.0033
        svmLearner.C = 47
        svmLearner.scaleData = True
        svmLearner.scaleClass = False

        Res = orngTest.crossValidation(
            [svmLearner],
            self.inDataC,
            folds=5,
            strat=orange.MakeRandomIndices.StratifiedIfPossible)
        RMSE = evalUtilities.RMSE(Res)[0]
        self.assertEqual(round(RMSE, 2), round(2.96, 2))  #Ver 0.3

        newSVM = svmLearner(self.inDataC)
        trainedAcc = evalUtilities.getRMSE(self.inDataC, newSVM)
        # Save model
        rc = newSVM.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getRMSE(self.inDataC, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(round(trainedAcc, 4), round(2.8289, 4))  #Ver 0.3
        self.assertEqual(round(trainedAcc, 4), round(loadedAcc, 4))
Example #9
    def testRMSEstdCalc(self):

        data = dataUtilities.DataTable(self.regDataPath)
        RFlearner = AZorngRF.RFLearner()
        learners = [RFlearner]
        nFolds = 5
        res = orngTest.crossValidation(learners, data, strat=orange.MakeRandomIndices.StratifiedIfPossible, folds = nFolds) 
        RMSEstd = evalUtilities.getRMSEstd(res, nFolds)[0]
        self.assertEqual(round(RMSEstd,3), round(0.141, 3))
Example #10
def cforange_cross_validation(input_dict):
    import orange, orngTest, orngStat
    learners = [input_dict['learner']]
    data = input_dict['dataset']
    folds = int(input_dict['folds'])
    results = orngTest.crossValidation(learners, data, folds=folds)
    output_dict = {}
    output_dict['results']=results
    return output_dict
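
A usage sketch for this widget-style wrapper, assuming Orange 2.x and a reachable iris.tab; note that 'folds' arrives as a string and is cast to int inside the function.

import orange, orngStat

input_dict = {
    'learner': orange.BayesLearner(name="bayes"),  # any Orange learner works here
    'dataset': orange.ExampleTable("iris"),        # assumes iris.tab is reachable
    'folds': '10',
}
output_dict = cforange_cross_validation(input_dict)
print "CA: %5.3f" % orngStat.CA(output_dict['results'])[0]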
Example #11
    def testRMSEstdCalc(self):

        data = dataUtilities.DataTable(self.regDataPath)
        RFlearner = AZorngRF.RFLearner()
        learners = [RFlearner]
        nFolds = 5
        res = orngTest.crossValidation(
            learners,
            data,
            strat=orange.MakeRandomIndices.StratifiedIfPossible,
            folds=nFolds)
        RMSEstd = evalUtilities.getRMSEstd(res, nFolds)[0]
        self.assertEqual(round(RMSEstd, 3), round(0.141, 3))
Example #12
    def test_SVMC(self):

        # Train an SVM
        svmL = AZorngCvSVM.CvSVMLearner(
            scaleData=False, svm_type=103, gamma=0.01, C=1, nu=0.5, p=1, eps=0.001, coef0=0, degree=3
        )
        svm = svmL(self.inDataC)
        trainedAcc = evalUtilities.getRMSE(self.inDataC, svm)

        self.assertEqual(round(trainedAcc, 7), round(2.8525863999999999, 7))  # ver 0.3

        # Save model
        rc = svm.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getRMSE(self.inDataC, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(trainedAcc, loadedAcc)

        svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False)

        svmLearner.name = "CvSVMLearner"
        svmLearner.eps = 0.001
        svmLearner.p = 1
        svmLearner.nu = 0.6
        svmLearner.kernel_type = 2
        svmLearner.svm_type = 103
        svmLearner.gamma = 0.0033
        svmLearner.C = 47
        svmLearner.scaleData = True
        svmLearner.scaleClass = False

        Res = orngTest.crossValidation(
            [svmLearner], self.inDataC, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible
        )
        RMSE = evalUtilities.RMSE(Res)[0]
        self.assertEqual(round(RMSE, 2), round(2.96, 2))  # Ver 0.3

        newSVM = svmLearner(self.inDataC)
        trainedAcc = evalUtilities.getRMSE(self.inDataC, newSVM)
        # Save model
        rc = newSVM.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getRMSE(self.inDataC, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(round(trainedAcc, 4), round(2.8289, 4))  # Ver 0.3
        self.assertEqual(round(trainedAcc, 4), round(loadedAcc, 4))
Example #13
def generalCVconfMat(data, learners, nFolds = 5):
    """
    General method for printing the X fold CV confusion matrix of an Orange data set (data)
    with any number of classes. learners is a list of AZorange learners.
    """

    res = orngTest.crossValidation(learners, data, strat=orange.MakeRandomIndices.StratifiedIfPossible, folds = nFolds)
    classes = data.domain.classVar.values

    for idx in range(len(learners)):
        cm = orngStat.computeConfusionMatrices(res)[idx]
        print "Results for "+learners[idx].name
        print "\t"+"\t".join(classes)
        for className, classConfusions in zip(classes, cm):
            print ("%s" + ("\t%i" * len(classes))) % ((className, ) + tuple(classConfusions))
Example #14
File: tweaked.py Project: laat/ex3
def tweaked(outfile, **kwargs):
    import orange, orngTest
    classes = []

    outfile = outfile.rsplit(".",1)[0]
    data = orange.ExampleTable(outfile)

    #c45 = orange.C45Learner(minObjs=100)
    c45 = orange.kNNLearner()  # note: minObjs is a C45Learner option, not valid for kNN

    results = orngTest.crossValidation([c45], data, folds=10)
    for i, example in enumerate(results.results, 1):
        p = example.probabilities[0]
        classes.append((i, p[1]))
    return classes
Example #15
    def test_SVMD(self):

        # Train an SVM
        svm = AZorngCvSVM.CvSVMLearner(
            self.inDataD, scaleData=False, gamma=4, C=1, nu=0.5, p=0.1, eps=0.001, coef0=0, degree=3
        )
        trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, svm)

        self.assertEqual(round(trainedAcc, 7), round(0.986666666667, 7))
        # Save model
        rc = svm.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(trainedAcc, loadedAcc)

        svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False)

        svmLearner.name = "CvSVMLearner"
        svmLearner.eps = 0.001
        svmLearner.p = 0.0
        svmLearner.nu = 0.6
        svmLearner.kernel_type = 2
        svmLearner.svm_type = 101
        svmLearner.gamma = 0.0033
        svmLearner.C = 47
        svmLearner.scaleData = True
        svmLearner.scaleClass = False

        Res = orngTest.crossValidation(
            [svmLearner], self.inDataD, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible
        )
        CA = evalUtilities.CA(Res)[0]
        self.assertEqual(round(CA, 2), round(0.96666666666666667, 2))  # Before in AZSVM: 0.95999999999999996

        newSVM = svmLearner(self.inDataD)
        trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, newSVM)
        # Save model
        rc = newSVM.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(round(trainedAcc, 7), round(0.96666669999999999, 7))  # Before in AZSVM: 0.953333300000
        self.assertEqual(round(trainedAcc, 1), round(loadedAcc, 1))
Example #16
def evaluation(data, folds):
    """
    Evaluates Bayes and DT and prints results for 4 different metrics: classification accuracy,
    information score, Brier score and area under the ROC curve.
    """
    bayes = orange.BayesLearner()
    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    tree.name = "tree"
    learners = [bayes, tree]
    print "Statistical measures per learner (using %d-fold cross-validation):" % (folds)
    results = orngTest.crossValidation(learners, data, folds)
    print "Learner   CA      IS      Brier     AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f  %5.3f  %5.3f  %5.3f" % (learners[i].name, orngStat.CA(results)[i], orngStat.IS(results)[i], orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
    print "-------"
    print
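
A hedged usage sketch for evaluation above, assuming the enclosing module already imports orange, orngTree, orngTest and orngStat, and that a discrete-class data set such as voting.tab is available.

import orange

data = orange.ExampleTable("voting")  # assumes voting.tab is reachable
evaluation(data, 10)                  # prints CA, IS, Brier and AUC for bayes and tree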
Example #17
    def cross_validation(self):
        
        data = self.data
        # set up the learners
        bayes = orange.BayesLearner()
        tree = orngTree.TreeLearner(mForPruning=2)
        bayes.name = "bayes"
        tree.name = "tree"
        
        l = orange.SVMLearner()
        l.name = "SVM"
        l.svm_type = orange.SVMLearner.Nu_SVC
        l.nu = 0.3
        l.probability = True

        learners = [bayes, tree, l]
        # compute accuracies on data

        res = orngTest.crossValidation(learners, data, folds=10)
        cm = orngStat.computeConfusionMatrices(res,
                classIndex=data.domain.classVar.values.index('-1'))
        
        stat = (('CA', 'CA(res)'),
                ('Sens', 'sens(cm)'),
                ('Spec', 'spec(cm)'),
                ('AUC', 'AUC(res)'),
                ('IS', 'IS(res)'),
                ('Brier', 'BrierScore(res)'),
                ('F1', 'F1(cm)'),
                ('F2', 'Falpha(cm, alpha=2.0)'),
                ('MCC', 'MCC(cm)'),
                ('sPi', 'scottsPi(cm)'),
                )

        scores = [eval("orngStat."+s[1]) for s in stat]
        print "Learner  " + "".join(["%-7s" % s[0] for s in stat])
        for (i, l) in enumerate(learners):
            print "%-8s " % l.name + "".join(["%5.3f  " % s[i] for s in scores])
    
        return None
Example #18
File: otests.py Project: bh0085/compbio
def ensemble2(data = None):
    import orange, orngTree, orngEnsemble

    if not data:
        data = orange.ExampleTable('bupa.tab')

    forest = orngEnsemble.RandomForestLearner(trees=50, name="forest")
    tree = orngTree.TreeLearner(minExamples=2, mForPruning=2, \
                            sameMajorityPruning=True, name='tree')
    learners = [tree, forest]

    import orngTest, orngStat
    #results = orngTest.leaveOneOut(learners, data)
    results = orngTest.crossValidation(learners, data, folds=2)
    print "Learner  CA     Brier  AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f  %5.3f  %5.3f" % (learners[i].name, \
                                                orngStat.CA(results)[i],
                                            
                                            orngStat.AUC(results)[i])
Example #19
def classifier(personalrecords, features , attlist ,lowage, upage ,estimation):
    data = orange.ExampleTable(createAttributes(attlist), getallfeatures(personalrecords,attlist,
                                                                         lowage, upage, 'MONTHS'))
    bayes = orange.BayesLearner()
    bayesWithEstimation = orange.BayesLearner(m=estimation)
    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    bayesWithEstimation.name = "bayesWithEstimation"
    tree.name = "tree"
    learners = [bayes, bayesWithEstimation]
    print '10 fold cross validation'
    results = orngTest.crossValidation(learners, data, folds=10)
    # output the results
    print "Learner  CA     IS     Brier    AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f  %5.3f  %5.3f  %5.3f" % (learners[i].name, \
                                                   orngStat.CA(results)[i], orngStat.IS(results)[i],
                                                   orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
        if learners[i].name == 'bayes' or learners[i].name == 'bayesWithEstimation':
            pass  # branch body missing in the original snippet

    orngTree.printDot(tree, fileName='c:\\tree10.dot', internalNodeShape="ellipse", leafShape="box")
    orngTree.printDot(bayes, fileName='c:\\bayes10.dot', internalNodeShape="ellipse", leafShape="box")
Example #20
def knn_classifier_xv(tree, outfile="dev.tab", **kwargs):
    import orange, orngTest, orngStat
    classes = []
    # TODO: skip writing to file
    outfile = outfile.rsplit(".",1)[0]
    data = orange.ExampleTable(outfile)

    knn = orange.kNNLearner(k=21, name="knn")

    results = orngTest.crossValidation([knn], data, folds=10)

    # output the results
    print "Learner  CA     IS     Brier    AUC"

    print "%-8s %5.3f  %5.3f  %5.3f  %5.3f" % (knn.name, \
        orngStat.CA(results)[0], orngStat.IS(results)[0],
        orngStat.BrierScore(results)[0], orngStat.AUC(results)[0])

    print results.results[0].probabilities
    for i, example in enumerate(results.results, 1):
        p = example.probabilities[0]
        classes.append((i, p[1]))
    return classes
Example #21
    #scores = Orange.feature.scoring.score_all(start_data)
    #data = Orange.feature.selection.select(start_data, scores, features)

    train_data, test_data = proj_utils.partition_data(start_data)

    #selection = orange.MakeRandomIndicesCV(data, cv_folds)

    #sen1 = 0.0
    #spe1 = 0.0
    #acc1 = 0.0
    #sen2 = 0.0
    #spe2 = 0.0
    #acc2 = 0.0

    model = train_classifier(train_data, features)
    train_results = orngTest.crossValidation([model], example_data, cv_folds)
    #test_results = orngTest.crossValidation([model], test_data, cv_folds)

    train_stats = proj_utils.get_stats(train_results)
    #test_stats = proj_utils.get_stats(test_results)

    print "Train:\n%s" % str(train_stats)
    #print "\nTest:\n%s" % str(test_stats)

    f = open(
        os.path.dirname(__file__) + '\\logisticRegressionFilteredCVResults_' +
        'V' + str(cv_folds) + '_F' + str(features) + '.txt', 'w+')
    f.write("Train:\n")
    f.write(str(train_stats) + "\n")
    #f.write("Test:\n")
    #f.write(str(test_stats))
    f.close()
Example #22
    def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
                
			parameters:
                algo - key for the structural feature generation algorithm (set dependent structural features that have to be calculated inside the crossvalidation)
                minsup - minimum support for the algorithm
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: " + str(algorithm))
            self.__log(" Minimum support parameter: " + str(minsup))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Vars for saving each fold's results
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check data in advance so that it will not, by chance, fail at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Vars for saving each fold's results
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []

                ### mods TG
                prediction_attribute = orange.FloatVariable("class_prob")
                new_domain = orange.Domain(list(self.data.domain.attributes) + [prediction_attribute],
                                           self.data.domain.classVar)
                data_new = orange.ExampleTable(new_domain)

                logTxt = ""
                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup)
                        trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)

                    testData = self.data.select(DataIdxs[foldN])
                    # print "IDX: ",
                    # print DataIdxs[foldN]
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        cut_off = orig_len - len(atts)
                        smarts = trainData.domain.attributes[cut_off:]
                        self.__log("  Number of structural features added: " + str(len(smarts)))
                        testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                        testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)

                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test if trainsets inside optimizer will respect dataSize criteria.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)

                    # save the prediction probabilities

                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and corresponding predicted value
                        exp_pred[ml] += local_exp_pred

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj()
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models
            #   ONLY if there is more than one stable model!
            #   When only one or no stable models, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Vars for saving each fold's results
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and corresponding predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
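
A sketch of consuming the return value described in the docstring; accGetter is a hypothetical, fully configured instance of the enclosing class (its name is not shown in this excerpt).

stats = accGetter.getProbabilitiesAsAttribute()
if stats and "RMSE" in stats:                                  # regression response
    print "RMSE: %s  Q2: %s" % (stats["RMSE"], stats["Q2"])
elif stats and "CA" in stats:                                  # classification response
    print "CA: %s  ConfMat: %s" % (stats["CA"], stats["CM"])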
Example #23
File: c45.py Project: stefie10/slu_hri
tree = orange.C45Learner(data, m=100)
for i in data[:5]:
    print tree(i), i.getclass()

print "\n\nC4.5 with minObjs=100"
tree = orange.C45Learner(data, minObjs=100)
for i in data[:5]:
    print tree(i), i.getclass()

print "\n\nC4.5 with -m 1 and -s"
lrn = orange.C45Learner()
lrn.commandline("-m 1 -s")
tree = lrn(data)
for i in data:
    if i.getclass() != tree(i):
        print i, tree(i)

import orngC45
tree = orange.C45Learner(data)
orngC45.printTree(tree)
print

import orngStat, orngTest
res = orngTest.crossValidation(
    [orange.C45Learner(),
     orange.C45Learner(convertToOrange=1)], data)
print "Classification accuracy: %5.3f (converted to tree: %5.3f)" % tuple(
    orngStat.CA(res))
print "Brier score: %5.3f (converted to tree: %5.3f)" % tuple(
    orngStat.BrierScore(res))
Example #24
import orange, orngTest, orngStat, orngBayes
data = orange.ExampleTable("lung-cancer")
bayes = orngBayes.BayesLearner()
bayes_m = orngBayes.BayesLearner(m=2)

res = orngTest.crossValidation([bayes, bayes_m], data)
CAs = orngStat.CA(res)
print
print "Without m: %5.3f" % CAs[0]
print "With m=2: %5.3f" % CAs[1]

data = orange.ExampleTable("voting")
model = orngBayes.BayesLearner(data)
orngBayes.printModel(model)
Example #25
#              to compare naive Bayesian learner when all or just the most
#              important attribute is used. Shows how to find out which (in
#              ten-fold cross validation) attributes was used the most.
# Category:    feature selection
# Uses:        voting
# Referenced:  Orange.feature.html#selection
# Classes:     Orange.feature.selection.FilteredLearner

import Orange, orngTest, orngStat
voting = Orange.data.Table("voting")

nb = Orange.classification.bayes.NaiveLearner()
fl = Orange.feature.selection.FilteredLearner(nb, 
     filter=Orange.feature.selection.FilterBestNAtts(n=1), name='filtered')
learners = (Orange.classification.bayes.NaiveLearner(name='bayes'), fl)
results = orngTest.crossValidation(learners, voting, storeClassifiers=1)

# output the results
print "Learner      CA"
for i in range(len(learners)):
    print "%-12s %5.3f" % (learners[i].name, orngStat.CA(results)[i])

# find out which attributes were retained by filtering

print "\nNumber of times attributes were used in cross-validation:"
attsUsed = {}
for i in range(10):
    for a in results.classifiers[i][1].atts():
        if a.name in attsUsed.keys():
            attsUsed[a.name] += 1
        else:
            attsUsed[a.name] = 1
Example #26
# Description: Shows how to add class noise to data
# Category:    preprocessing
# Uses:        imports-85
# Classes:     Preprocessor_addClassNoise, orngTest.crossValidation
# Referenced:  domain.htm

import orange, orngTest, orngStat

filename = "promoters.tab"
data = orange.ExampleTable(filename)
data.name = "unspoiled"
datasets = [data]

add_noise = orange.Preprocessor_addClassNoise()
for noiselevel in (0.2, 0.4, 0.6):
    add_noise.proportion = noiselevel
    add_noise.randomGenerator = 42
    d = add_noise(data)
    d.name = "class noise %4.2f" % noiselevel
    datasets.append(d)

learner = orange.BayesLearner()

for d in datasets:
    results = orngTest.crossValidation([learner], d, folds=10)
    print "%20s   %5.3f" % (d.name, orngStat.CA(results)[0])
Example #27
# Category:    preprocessing
# Uses:        crx.tab
# Referenced:  orngFSS.htm

import orange, orngDisc, orngTest, orngStat, orngFSS

data = orange.ExampleTable("../datasets/crx")

bayes = orange.BayesLearner()
dBayes = orngDisc.DiscretizedLearner(bayes, name='disc bayes')
fss = orngFSS.FilterAttsAboveThresh(threshold=0.05)
fBayes = orngFSS.FilteredLearner(dBayes, filter=fss, name='bayes & fss')

learners = [dBayes, fBayes]
results = orngTest.crossValidation(learners,
                                   data,
                                   folds=10,
                                   storeClassifiers=1)

# how many attributes did each classifier use?

natt = [0.] * len(learners)
for fold in range(results.numberOfIterations):
    for lrn in range(len(learners)):
        natt[lrn] += len(results.classifiers[fold][lrn].domain.attributes)
for lrn in range(len(learners)):
    natt[lrn] = natt[lrn] / 10.

print "\nLearner         Accuracy  #Atts"
for i in range(len(learners)):
    print "%-15s %5.3f     %5.2f" % (learners[i].name, orngStat.CA(results)[i],
                                     natt[i])
Example #28
    def score(self, ids):
        """compute scores for the list of learners"""
        if (not self.data):
            for id in ids:
                self.learners[id].results = None
            return
        # test which learners can accept the given data set
        # e.g., regressions can't deal with classification data
        learners = []
        n = len(self.data.domain.attributes) * 2
        indices = orange.MakeRandomIndices2(
            p0=min(n, len(self.data)),
            stratified=orange.MakeRandomIndices2.StratifiedIfPossible)
        new = self.data.selectref(indices(self.data))
        #        new = self.data.selectref([1]*min(n, len(self.data)) +
        #                                  [0]*(len(self.data) - min(n, len(self.data))))
        self.warning(0)
        for l in [self.learners[id] for id in ids]:
            learner = l.learner
            if self.preprocessor:
                learner = self.preprocessor.wrapLearner(learner)
            try:
                predictor = learner(new)
                if predictor(new[0]).varType == new.domain.classVar.varType:
                    learners.append(learner)
                else:
                    l.scores = []
            except Exception as ex:
                self.warning(
                    0,
                    "Learner %s ends with exception: %s" % (l.name, str(ex)))
                l.scores = []

        if not learners:
            return

        # computation of results (res, and cm if classification)
        pb = None
        if self.resampling == 0:
            pb = OWGUI.ProgressBar(self, iterations=self.nFolds)
            res = orngTest.crossValidation(
                learners,
                self.data,
                folds=self.nFolds,
                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                callback=pb.advance,
                storeExamples=True)
            pb.finish()
        elif self.resampling == 1:
            pb = OWGUI.ProgressBar(self, iterations=len(self.data))
            res = orngTest.leaveOneOut(learners,
                                       self.data,
                                       callback=pb.advance,
                                       storeExamples=True)
            pb.finish()
        elif self.resampling == 2:
            pb = OWGUI.ProgressBar(self, iterations=self.pRepeat)
            res = orngTest.proportionTest(learners,
                                          self.data,
                                          self.pLearning / 100.,
                                          times=self.pRepeat,
                                          callback=pb.advance,
                                          storeExamples=True)
            pb.finish()
        elif self.resampling == 3:
            pb = OWGUI.ProgressBar(self, iterations=len(learners))
            res = orngTest.learnAndTestOnLearnData(learners,
                                                   self.data,
                                                   storeExamples=True,
                                                   callback=pb.advance)
            pb.finish()

        elif self.resampling == 4:
            if not self.testdata:
                for l in self.learners.values():
                    l.scores = []
                return
            pb = OWGUI.ProgressBar(self, iterations=len(learners))
            res = orngTest.learnAndTestOnTestData(learners,
                                                  self.data,
                                                  self.testdata,
                                                  storeExamples=True,
                                                  callback=pb.advance)
            pb.finish()
        if self.isclassification():
            cm = orngStat.computeConfusionMatrices(res,
                                                   classIndex=self.targetClass)

        if self.preprocessor:  # Unwrap learners
            learners = [l.wrappedLearner for l in learners]

        res.learners = learners

        for l in [self.learners[id] for id in ids]:
            if l.learner in learners:
                l.results = res

        self.error(list(range(len(self.stat))))
        scores = []
        for i, s in enumerate(self.stat):
            try:
                scores.append(eval("orngStat." + s.f))

            except Exception as ex:
                self.error(i, "An error occurred while evaluating orngStat." + s.f + "on %s due to %s" % \
                           (" ".join([l.name for l in learners]), ex))
                scores.append([None] * len(self.learners))

        for (i, l) in enumerate(learners):
            self.learners[l.id].scores = [s[i] if s else None for s in scores]

        self.sendResults()
Example #29
#              to compare naive Bayesian learner when all or just the most
#              important attribute is used. Shows how to find out which (in
#              ten-fold cross validation) attributes was used the most.
# Category:    feature selection
# Uses:        voting
# Referenced:  Orange.feature.html#selection
# Classes:     Orange.feature.selection.FilteredLearner

import Orange, orngTest, orngStat
voting = Orange.data.Table("voting")

nb = Orange.classification.bayes.NaiveLearner()
fl = Orange.feature.selection.FilteredLearner(
    nb, filter=Orange.feature.selection.FilterBestNAtts(n=1), name='filtered')
learners = (Orange.classification.bayes.NaiveLearner(name='bayes'), fl)
results = orngTest.crossValidation(learners, voting, storeClassifiers=1)

# output the results
print "Learner      CA"
for i in range(len(learners)):
    print "%-12s %5.3f" % (learners[i].name, orngStat.CA(results)[i])

# find out which attributes were retained by filtering

print "\nNumber of times attributes were used in cross-validation:"
attsUsed = {}
for i in range(10):
    for a in results.classifiers[i][1].atts():
        if a.name in attsUsed.keys():
            attsUsed[a.name] += 1
        else:
            attsUsed[a.name] = 1
Example #30
import orange, orngTree, orngWrap, orngStat

learner = orngTree.TreeLearner()
data = orange.ExampleTable("voting")
tuner = orngWrap.Tune1Parameter(object=learner,
                                parameter="minSubset",
                                values=[1, 2, 3, 4, 5, 10, 15, 20],
                                evaluate = orngStat.AUC, verbose=2)
classifier = tuner(data)

print "Optimal setting: ", learner.minSubset

import orngTest
untuned = orngTree.TreeLearner()
res = orngTest.crossValidation([untuned, tuner], data)
AUCs = orngStat.AUC(res)

print "Untuned tree: %5.3f" % AUCs[0]
print "Tuned tree: %5.3f" % AUCs[1]


learner = orngTree.TreeLearner(minSubset=10).instance()
data = orange.ExampleTable("voting")
tuner = orngWrap.Tune1Parameter(object=learner,
                                parameter=["split.continuousSplitConstructor.minSubset", "split.discreteSplitConstructor.minSubset"],
                                values=[1, 2, 3, 4, 5, 10, 15, 20],
                                evaluate = orngStat.AUC, verbose=2)
classifier = tuner(data)

print "Optimal setting: ", learner.split.continuousSplitConstructor.minSubset
Example #31
# Description: Shows how to add class noise to data
# Category:    preprocessing
# Uses:        imports-85
# Classes:     Preprocessor_addClassNoise, orngTest.crossValidation
# Referenced:  domain.htm

import orange, orngTest, orngStat

filename = "promoters.tab"
data = orange.ExampleTable(filename)
data.name = "unspoiled"
datasets = [data]

add_noise = orange.Preprocessor_addClassNoise()
for noiselevel in (0.2, 0.4, 0.6):
  add_noise.proportion = noiselevel
  add_noise.randomGenerator = 42
  d = add_noise(data)
  d.name = "class noise %4.2f" % noiselevel
  datasets.append(d)

learner = orange.BayesLearner()

for d in datasets:
  results = orngTest.crossValidation([learner], d, folds=10)
  print "%20s   %5.3f" % (d.name, orngStat.CA(results)[0])
Example #32
    #data = Orange.feature.selection.select(start_data, scores, features)
    
    train_data, test_data = proj_utils.partition_data(start_data)
    
    #selection = orange.MakeRandomIndicesCV(data, cv_folds)

    #sen1 = 0.0
    #spe1 = 0.0
    #acc1 = 0.0
    #sen2 = 0.0
    #spe2 = 0.0
    #acc2 = 0.0
    
    
    model = train_classifier(train_data, features)    
    train_results = orngTest.crossValidation([model], example_data, cv_folds)
    #test_results = orngTest.crossValidation([model], test_data, cv_folds)    
    
    train_stats = proj_utils.get_stats(train_results)
    #test_stats = proj_utils.get_stats(test_results)       

    print "Train:\n%s" % str(train_stats)
    #print "\nTest:\n%s" % str(test_stats)
    
    f = open(os.path.dirname(__file__) + '\\logisticRegressionFilteredCVResults_' + 'V' + str(cv_folds) + '_F' + str(features) + '.txt', 'w+')
    f.write("Train:\n")
    f.write(str(train_stats) + "\n")
    #f.write("Test:\n")
    #f.write(str(test_stats))
    f.close()
Example #33
import orange, orngSVM
data = orange.ExampleTable("iris.tab")
lin = orngSVM.SVMLearner(kernel_type=orngSVM.SVMLearner.Linear, name="SVM - Linear")
poly = orngSVM.SVMLearner(kernel_type=orngSVM.SVMLearner.Polynomial, name="SVM - Poly")
rbf = orngSVM.SVMLearner(kernel_type=orngSVM.SVMLearner.RBF, name="SVM - RBF")

learners = [lin, poly, rbf]
import orngTest, orngStat
res = orngTest.crossValidation(learners, data)
print "%15s%8s%8s" % ("Name", "CA", "AUC")
for l, ca, auc in zip(learners, orngStat.CA(res), orngStat.AUC(res)):
  print "%-15s   %.3f   %.3f" % (l.name, ca, auc)
Example #34
    nPermutations = 0

    # Some arrays to hold values
    pvalue = np.zeros(len(t0))
    items = range(len(data))
    data_p = orange.ExampleTable(data)  # work on a copy so permuting labels leaves `data` intact

    # Just loop through randomly permuted labels ...
    for m in range(nPermutations):
        if m < nPermutations - 1:
            np.random.shuffle(items)

        for i in range(len(data)):
            j = items[i]
            data_p[i].setclass(data[j].getclass())
        results_p = orngTest.crossValidation(learners, data_p, folds=kFolds)
        cm = orngStat.computeConfusionMatrices(results_p, classIndex=data_p.domain.classVar.values.index(target))
        t = orngStat.CA(results_p)
        for p in range(len(learners)):
            if t[p] >= t0[p]:
                pvalue[p] += 1.0

    if nPermutations > 0:
        pvalue /= (nPermutations * 1.0)
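    # The empirical p-value for each learner is the fraction of permutations in
    # which the permuted-label accuracy reaches the accuracy t0 observed on the
    # original labels.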

    scores = [eval("orngStat." + s[1]) for s in stat] + [pvalue]

    # Write out the empirical p-values
    Headers = "Learner   " + "".join(["%-7s" % s[0] for s in stat]) + 'p-value'

    print ""
Example #35
import orange, orngSVM
data=orange.ExampleTable("iris.tab")
l1=orngSVM.SVMLearner()
l1.kernelFunc=orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5)
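# RBFKernelWrapper turns the given distance measure into an RBF-style kernel,
# presumably K(x, y) = exp(-gamma * dist(x, y)**2).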
l1.kernel_type=orange.SVMLearner.Custom
l1.probability=True
c1=l1(data)
l1.name="SVM - RBF(Euclidean)"

l2=orngSVM.SVMLearner()
l2.kernelFunc=orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Hamming(data), gamma=0.5)
l2.kernel_type=orange.SVMLearner.Custom
l2.probability=True
c2=l2(data)
l2.name="SVM - RBF(Hamming)"

l3=orngSVM.SVMLearner()
l3.kernelFunc=orngSVM.CompositeKernelWrapper(orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5),orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Hamming(data), gamma=0.5), l=0.5)
l3.kernel_type=orange.SVMLearner.Custom
l3.probability=True
c3=l3(data)
l3.name="SVM - Composite"


import orngTest, orngStat
tests=orngTest.crossValidation([l1, l2, l3], data, folds=5)
[ca1, ca2, ca3]=orngStat.CA(tests)
print l1.name, "CA:", ca1
print l2.name, "CA:", ca2
print l3.name, "CA:", ca3
Example #36
import orange, orngImpute, orngTest, orngStat

data = orange.ExampleTable("voting")
ba = orange.BayesLearner()
imba = orngImpute.ImputeLearner(
    baseLearner=ba, imputerConstructor=orange.ImputerConstructor_minimal)
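# Wrapping the learner means the imputer is (re)built on each training fold;
# ImputerConstructor_minimal fills unknown values with the attribute's minimum.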
res = orngTest.crossValidation([ba, imba], data)
CAs = orngStat.CA(res)

print "Without imputation: %5.3f" % CAs[0]
print "With imputation: %5.3f" % CAs[1]
Example #37
# Description: Demonstrates the use of discretization
# Category:    discretization
# Classes:     entropyDiscretization, DiscretizedLearner
# Uses:        iris.tab

import orange
import orngDisc

data = orange.ExampleTable("iris.tab")

disc_data = orngDisc.entropyDiscretization(data)
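# entropyDiscretization applies entropy-based (MDL) discretization to all
# continuous attributes, dropping those reduced to a single interval.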

disc_learner = orngDisc.DiscretizedLearner(orange.BayesLearner(),
                                           name="disc-bayes")
learner = orange.BayesLearner(name="bayes")

learners = [learner, disc_learner]

import orngTest, orngStat

results = orngTest.crossValidation(learners, data)
print "Classification Accuracy:"
for i in range(len(learners)):
    print("%15s: %5.3f") % (learners[i].name, orngStat.CA(results)[i])
Example #38
resList = [
    str(CM[0][0]),
    str(CM[0][1]),
    str(CM[1][0]),
    str(CM[1][1]),
    str(CA),
    str(MCC)
]
wrtStr = string.join(resList, "\t")
print "nonIID test set results"
print wrtStr

# CV accuracy
res = orngTest.crossValidation(
    [learner],
    data,
    strat=orange.MakeRandomIndices.StratifiedIfPossible,
    folds=10)
CM = evalUtilities.ConfMat(res)[0]
CA = round(orngStat.CA(res)[0], 3)
MCC = round(evalUtilities.calcMCC(CM), 3)
# TH, FL, FH, TL
resList = [
    str(CM[0][0]),
    str(CM[0][1]),
    str(CM[1][0]),
    str(CM[1][1]),
    str(CA),
    str(MCC)
]
wrtStr = string.join(resList, "\t")
Example #39
import orange, orngWrap, orngTest, orngStat

data = orange.ExampleTable("bupa")

learner = orange.BayesLearner()
thresh = orngWrap.ThresholdLearner(learner=learner)
thresh80 = orngWrap.ThresholdLearner_fixed(learner=learner, threshold=.8)
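# ThresholdLearner searches for the classification-probability threshold that
# maximizes accuracy; ThresholdLearner_fixed simply uses the given threshold
# (0.8 here). Both assume a binary class.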
res = orngTest.crossValidation([learner, thresh, thresh80], data)
CAs = orngStat.CA(res)

print "W/out threshold adjustement: %5.3f" % CAs[0]
print "With adjusted thredhold: %5.3f" % CAs[1]
print "With threshold at 0.80: %5.3f" % CAs[2]
Example #40
def buildModel(trainData,
               MLMethod,
               queueType="NoSGE",
               verbose=0,
               logFile=None):
    """
        Buld the method passed in MLMethod and optimize ( "IndividualStatistics"  not in MLMethod)
        if MLMethod is a Consensus ("individualStatistics"  in MLMethod) , build each and optimize first all models and after build the consensus!
        """
    log(logFile,
        "Building and optimizing learner: " + MLMethod["MLMethod"] + "...")
    learners = {}
    MLMethods = {}
    if "IndividualStatistics" in MLMethod:  #It is a consensus and will certaily not contain any
        #special model as it was filtered in the getUnbiasedAcc
        for ML in MLMethod["IndividualStatistics"]:
            MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML])
    else:
        ML = MLMethod["MLMethod"]
        if MLMETHODS[ML](
                name=ML
        ).specialType == 1:  # If it is a special model with a built-in optimizer
            log(logFile, "       This is a special model")
            smilesAttr = dataUtilities.getSMILESAttr(trainData)
            if smilesAttr:
                log(logFile, "Found SMILES attribute:" + smilesAttr)
                trainData = dataUtilities.attributeSelectionData(
                    trainData, [smilesAttr, trainData.domain.classVar.name])
            optInfo, SpecialModel = MLMETHODS[ML](name=ML).optimizePars(
                trainData, folds=5)
            return SpecialModel
        else:
            MLMethods[MLMethod["MLMethod"]] = MLMethod

    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        trainData = dataUtilities.attributeDeselectionData(
            trainData, [smilesAttr])

    # optimize all MLMethods
    for ML in MLMethods:
        log(logFile, "  Optimizing MLmethod: " + ML)
        learners[ML] = MLMETHODS[ML](name=ML)

        runPath = miscUtilities.createScratchDir(
            baseDir=AZOC.NFS_SCRATCHDIR, desc="competitiveWorkflow_BuildModel")
        trainData.save(os.path.join(runPath, "trainData.tab"))

        tunedPars = paramOptUtilities.getOptParam(learner=learners[ML],
                                                  trainDataFile=os.path.join(
                                                      runPath,
                                                      "trainData.tab"),
                                                  useGrid=False,
                                                  verbose=verbose,
                                                  queueType=queueType,
                                                  runPath=runPath,
                                                  nExtFolds=None,
                                                  logFile=logFile,
                                                  getTunedPars=True)

        if not learners[ML].optimized:
            print "WARNING: competitiveWorkflow: The learner " + str(
                learners[ML]) + " was not optimized."
            #print "         Using default parameters"
            print "         The " + str(learners[ML]) + " will not be included"
            #print "         Returning None"
            print "             DEBUG can be made in: " + runPath
            #Setting default parameters
            #learners[ML] = learners[ML].__class__()
            #return None
            learners.pop(ML)
            continue
        else:
            print "Optimized learner ", learners[ML]
            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                MLMethods[ML]["optAcc"] = tunedPars[0]
            else:
                res = orngTest.crossValidation(
                    [learners[ML]],
                    trainData,
                    folds=5,
                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                    randomGenerator=random.randint(0, 100))
                R2 = evalUtilities.R2(res)[0]
                MLMethods[ML]["optAcc"] = R2
            miscUtilities.removeDir(runPath)
    #Train the model
    if len(learners) == 1:
        log(logFile, "  Building the model:" + learners.keys()[0])
        model = learners[learners.keys()[0]](trainData)
    elif len(learners) > 1:
        model = buildConsensus(trainData, learners, MLMethods)
    else:
        print "ERROR: No Learners were selected!"
        return None

    return model
Example #41
import orange, orngSVM

data = orange.ExampleTable("iris.tab")

l1 = orngSVM.SVMLearner()
l1.kernelFunc = orngSVM.RBFKernelWrapper(
    orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5)
l1.kernel_type = orange.SVMLearner.Custom
l1.probability = True
c1 = l1(data)
l1.name = "SVM - RBF(Euclidean)"

l2 = orngSVM.SVMLearner()
l2.kernelFunc = orngSVM.RBFKernelWrapper(
    orange.ExamplesDistanceConstructor_Hamming(data), gamma=0.5)
l2.kernel_type = orange.SVMLearner.Custom
l2.probability = True
c2 = l2(data)
l2.name = "SVM - RBF(Hamming)"

l3 = orngSVM.SVMLearner()
l3.kernelFunc = orngSVM.CompositeKernelWrapper(
    orngSVM.RBFKernelWrapper(
        orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5),
    orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Hamming(data),
                             gamma=0.5),
    l=0.5)
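# CompositeKernelWrapper mixes the two wrapped kernels; with l=0.5 this is
# presumably the equal convex combination l*K1 + (1 - l)*K2.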
l3.kernel_type = orange.SVMLearner.Custom
l3.probability = True
c3 = l3(data)
l3.name = "SVM - Composite"

import orngTest, orngStat
tests = orngTest.crossValidation([l1, l2, l3], data, folds=5)
[ca1, ca2, ca3] = orngStat.CA(tests)
print l1.name, "CA:", ca1
print l2.name, "CA:", ca2
print l3.name, "CA:", ca3
Example #42
    def test_SVMD(self):

        # Train a svm
        svm = AZorngCvSVM.CvSVMLearner(self.inDataD,
                                       scaleData=False,
                                       gamma=4,
                                       C=1,
                                       nu=0.5,
                                       p=0.1,
                                       eps=0.001,
                                       coef0=0,
                                       degree=3)
        trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, svm)

        self.assertEqual(round(trainedAcc, 7), round(0.986666666667, 7))
        # Save model
        rc = svm.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(
            self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(trainedAcc, loadedAcc)

        svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False)

        svmLearner.name = "CvSVMLearner"
        svmLearner.eps = 0.001
        svmLearner.p = 0.0
        svmLearner.nu = 0.6
        svmLearner.kernel_type = 2
        svmLearner.svm_type = 101
        svmLearner.gamma = 0.0033
        svmLearner.C = 47
        svmLearner.scaleData = True
        svmLearner.scaleClass = False

        Res = orngTest.crossValidation(
            [svmLearner],
            self.inDataD,
            folds=5,
            strat=orange.MakeRandomIndices.StratifiedIfPossible)
        CA = evalUtilities.CA(Res)[0]
        self.assertEqual(round(CA, 2),
                         round(0.96666666666666667,
                               2))  # Before in AZSVM: 0.95999999999999996

        newSVM = svmLearner(self.inDataD)
        trainedAcc = evalUtilities.getClassificationAccuracy(
            self.inDataD, newSVM)
        # Save model
        rc = newSVM.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(
            self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(round(trainedAcc, 7),
                         round(0.96666669999999999,
                               7))  #Before in AZSVM: 0.953333300000
        self.assertEqual(round(trainedAcc, 1), round(loadedAcc, 1))
Example #43
# Description: Naive Bayes with feature subset selection; reports accuracy and attribute use from cross-
#              validation (how many and which attributes were used?).
# Category:    preprocessing
# Uses:        crx.tab
# Referenced:  orngFSS.htm

import orange, orngDisc, orngTest, orngStat, orngFSS

data = orange.ExampleTable("../datasets/crx")

bayes = orange.BayesLearner()
dBayes = orngDisc.DiscretizedLearner(bayes, name='disc bayes')
fss = orngFSS.FilterAttsAboveThresh(threshold=0.05)
fBayes = orngFSS.FilteredLearner(dBayes, filter=fss, name='bayes & fss')

learners = [dBayes, fBayes]
results = orngTest.crossValidation(learners, data, folds=10, storeClassifiers=1)

# how many attributes did each classifier use?

natt = [0.] * len(learners)
for fold in range(results.numberOfIterations):
  for lrn in range(len(learners)):
    natt[lrn] += len(results.classifiers[fold][lrn].domain.attributes)
for lrn in range(len(learners)):
  natt[lrn] = natt[lrn]/10.

print "\nLearner         Accuracy  #Atts"
for i in range(len(learners)):
  print "%-15s %5.3f     %5.2f" % (learners[i].name, orngStat.CA(results)[i], natt[i])

# which attributes were used in filtered case?
Example #44
    def test_SVM_Priors_D(self):
        """Test SVM with priors """
        # Train a svm
        svm = AZorngCvSVM.CvSVMLearner(self.inDataD,
                                       priors={
                                           "Iris-setosa": 0.2,
                                           "Iris-versicolor": 0.3,
                                           "Iris-virginica": 0.5
                                       })
        trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, svm)

        self.assertEqual(round(trainedAcc, 7), round(0.73333329999999997, 7))
        # Save model
        rc = svm.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(
            self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(trainedAcc, loadedAcc)

        svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False,
                                              priors={
                                                  "Iris-setosa": 0.2,
                                                  "Iris-versicolor": 0.3,
                                                  "Iris-virginica": 0.5
                                              })

        svmLearner.name = "CvSVMLearner"
        svmLearner.shrinking = 1
        svmLearner.eps = 0.001
        svmLearner.p = 0.0
        svmLearner.nu = 0.6
        svmLearner.kernel_type = 2
        svmLearner.svm_type = 103
        svmLearner.gamma = 0.0033
        svmLearner.C = 47
        svmLearner.probability = 1
        svmLearner.scaleData = True
        svmLearner.scaleClass = False
        #svmLearner.for_nomogram=1

        Res = orngTest.crossValidation(
            [svmLearner],
            self.inDataD,
            folds=5,
            strat=orange.MakeRandomIndices.StratifiedIfPossible)
        CA = evalUtilities.CA(Res)[0]
        self.assertEqual(round(CA, 2),
                         round(0.940000000,
                               2))  # orange1.0: 0.93333333333333335])

        svmLearner.priors = None
        Res = orngTest.crossValidation(
            [svmLearner],
            self.inDataD,
            folds=5,
            strat=orange.MakeRandomIndices.StratifiedIfPossible)
        CA = evalUtilities.CA(Res)[0]
        self.assertEqual(round(CA, 2), round(0.94666666666666666, 2))

        newSVM = svmLearner(self.inDataD)
        trainedAcc = evalUtilities.getClassificationAccuracy(
            self.inDataD, newSVM)
        # Save model
        rc = newSVM.write(self.modelPath)
        self.assertEqual(rc, True)
        # Load the saved model
        loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
        loadedAcc = evalUtilities.getClassificationAccuracy(
            self.inDataD, loadedsvm)
        # Assure equal accuracy
        self.assertEqual(round(trainedAcc, 7),
                         round(0.95999999999999996,
                               7))  #Before in AZSVM: 0.953333300000
        self.assertEqual(round(trainedAcc, 1), round(loadedAcc, 1))
Example #45
    def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
                
			parameters:
                algorithm - list of feature generation algorithms (set dependent features that have to be calculated inside the crossvalidation)
                params - dictionary of parameters
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if holdout:
            self.nExtFolds = 1

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            for i in algorithm:
                self.__log(" Algorithm: " + str(i))
            for j, v in params.iteritems():
                self.__log(" Parameter: " + str(j) + " = " + str(v))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = None
        if holdout:
            self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training")
            DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout)
        else:
            DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)
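        # DataIdxs holds one set of test-example indices per external fold; the
        # seeded sampler (judging by its name) makes the folds reproducible, so
        # every MLmethod is evaluated on identical splits.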

        # Vars for saving each fold's results
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check the data in advance so that we do not, by chance, fail only at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order learners so that PLS runs first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds
        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Vars for saving each fold's results
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []
                logTxt = ""

                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    refs = None
                    methods = [
                        "rdk_MACCS_keys",
                        "rdk_topo_fps",
                        "rdk_morgan_fps",
                        "rdk_morgan_features_fps",
                        "rdk_atompair_fps",
                    ]
                    train_domain = None
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                actData = orange.ExampleTable(trainData.domain)
                                for d in trainData:
                                    # only valid for simboosted qsar paper experiments!?
                                    if d.getclass() == "2":
                                        actData.append(d)

                                refs = structuralClustering.getReferenceStructures(
                                    actData,
                                    threshold=params["threshold"],
                                    minClusterSize=params["minClusterSize"],
                                    numThreads=2,
                                )
                                self.__log(
                                    " found "
                                    + str(len(refs))
                                    + " reference structures in "
                                    + str(len(actData))
                                    + " active structures"
                                )
                                orig_len = orig_len + (len(refs) * len(methods))
                                trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods)

                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, [])

                            elif algorithm[i] == "ECFP":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"])
                                train_domain = trainData_ecfp.domain
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, [])

                            else:
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_structDesc = getStructuralDesc.getStructuralDescResult(
                                    trainData, algorithm[i], params["minsup"]
                                )
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, [])

                                    # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab")
                    testData = self.data.select(DataIdxs[foldN])
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log(str(algorithm[i]))
                                testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, [])
                            elif algorithm[i] == "ECFP":
                                self.__log(str(algorithm[i]))
                                # testData_ecfp = orange.ExampleTable(train_domain)
                                tmp_dat = []
                                for d in testData:
                                    tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d)
                                    tmp_dat.append(tmp)
                                testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, [])

                            else:
                                cut_off = orig_len - len(atts)
                                smarts = trainData.domain.attributes[cut_off:]
                                self.__log("  Number of structural features added: " + str(len(smarts)))
                                testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, [])

                    #                testData.save("/home/girschic/proj/AZ/ProjDev/test.tab")
                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test whether the train sets inside the optimizer will respect the dataSize criteria;
                    #   if not, do not optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )

                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                print "Unexpected error:",
                print sys.exc_info()[0]
                print sys.exc_info()[1]
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models,
            #   but ONLY if more than one model is stable!
            #   When only one or no model is stable, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Vars for saving each fold's results
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
Example #46
# Description: Test of naive bayesian classifier with entropy-based discretization (as defined in nbdisc.py)
# Category:    modelling
# Uses:        iris.tab
# Classes:     orngTest.crossValidation, orngStat.CA
# Referenced:  c_nb_disc.htm

import orange, orngTest, orngStat, nbdisc
data = orange.ExampleTable("iris")
results = orngTest.crossValidation([nbdisc.Learner()], data, folds=10)
print "Accuracy = %5.3f" % orngStat.CA(results)[0]
Example #47
data = orange.ExampleTable("features.tab")
folds = 10
k = 150
cv = CrossVal(data, folds, k)
cv.run_kNN()
cv.printCA()


# Built-in cross-validation with several classifiers, only used for testing early on
if False:
	# set up the learners
	bayes = orange.BayesLearner()
	tree = orngTree.TreeLearner(mForPruning=2)
	knn = orange.kNNLearner(k=k)
	bayes.name = "bayes"
	tree.name = "tree"
	knn.name = "knn"
	learners = [bayes, tree, knn]

	# compute accuracies on data
	data = orange.ExampleTable("features.tab")

	# Cross-validate on the sample set so that no classifier is tested on its own training data
	results = orngTest.crossValidation(learners, data, folds=10)

	# output the results
	print "Learner \tAccuracy"
	for i in range(len(learners)):
		print "%-8s\t%5.3f%%" % (learners[i].name, orngStat.CA(results)[i]*100)
Example #48
# Description: Demonstrates the use of classification scores
# Category:    evaluation
# Uses:        voting.tab
# Referenced:  orngStat.htm

import orange, orngTest, orngTree

learners = [orange.BayesLearner(name = "bayes"),
            orngTree.TreeLearner(name="tree"),
            orange.MajorityLearner(name="majrty")]

voting = orange.ExampleTable("voting")
res = orngTest.crossValidation(learners, voting)

vehicle = orange.ExampleTable("vehicle")
resVeh = orngTest.crossValidation(learners, vehicle)

import orngStat

CAs = orngStat.CA(res)
APs = orngStat.AP(res)
Briers = orngStat.BrierScore(res)
ISs = orngStat.IS(res)

print
print "method\tCA\tAP\tBrier\tIS"
for l in range(len(learners)):
    print "%s\t%5.3f\t%5.3f\t%5.3f\t%6.3f" % (learners[l].name, CAs[l], APs[l], Briers[l], ISs[l])


CAs = orngStat.CA(res, reportSE=True)
Example #49
# Classes:     orngTest.crossValidation, orngTree.TreeLearner, orange.kNNLearner, orngRegression.LinearRegressionLearner
# Referenced:  regression.htm

import orange
import orngRegression
import orngTree
import orngStat, orngTest

data = orange.ExampleTable("housing")

# definition of learners (regressors)
lr = orngRegression.LinearRegressionLearner(name="lr")
rt = orngTree.TreeLearner(measure="retis",
                          mForPruning=2,
                          minExamples=20,
                          name="rt")
maj = orange.MajorityLearner(name="maj")
knn = orange.kNNLearner(k=10, name="knn")
learners = [maj, lr, rt, knn]

# evaluation and reporting of scores
results = orngTest.crossValidation(learners, data, folds=10)
scores = [("MSE", orngStat.MSE), ("RMSE", orngStat.RMSE),
          ("MAE", orngStat.MAE), ("RSE", orngStat.RSE),
          ("RRSE", orngStat.RRSE), ("RAE", orngStat.RAE), ("R2", orngStat.R2)]

print "Learner  " + "".join(["%-7s" % s[0] for s in scores])
for i in range(len(learners)):
    print "%-8s " % learners[i].name + "".join(
        ["%6.3f " % s[1](results)[i] for s in scores])
Example #50
    def classificationAccuracy(self, histOrImgs, labels=None, confThr=0.0,
                               peakThr=None, edgeThr=None, nu=0.6, gamma=2.0, doCrossVal=False):
        """
        Classify test data and (optionally) perform a cross validation on the training data.

        @param histOrImgs:    Either a list of SIFT descriptor arrays or an iterator of images.
        @type histOrImgs:     numpy.ndarray or [numpy.ndarray] or Image.Image or [Image.Image]
        
        @param labels:        A list of labels corresponding to the list of descriptors/images.
        @type labels:         [string]
    
        @param nu:            The S{nu}-Parameter of the support vector machine.
        @type nu:             float

        @param gamma:         The S{gamma}-Parameter of the RBF-Kernel.
        @type gamma:          float
        
        @param confThr:       All classifications made with a lower confidence
                              than this threshold are rejected. 1.0:
                              everything is rejected; 0.0: nothing is rejected.
        @type confThr:        float

        @param peakThr:       A SIFT parameter. Sensible values: 0.0 < x < 30.0.
        @type peakThr:        float

        @param edgeThr:       A SIFT parameter. Sensible values: 0.0 < x < 10.0.
        @type edgeThr:        float

        @param doCrossVal:    If True, also run a 10-fold cross-validation on
                              the training histograms.
        @type doCrossVal:     bool

        @rtype:               (float,float)
        @return:              The cross validation accuracy and the test data classification accuracy.
        """
        if self.learner is None:
            raise ValueError("Learner has to be loaded before classification can be done.")

        # Set SIFT member variables (so they get stored in the DB if requested)
        if peakThr is None:
            self.peakThreshold = self.learner.peakThreshold
        else:
            self.peakThreshold = peakThr

        if edgeThr is None:
            self.edgeThreshold = self.learner.edgeThreshold
        else:
            self.edgeThreshold = edgeThr

        # If we've been given an images iterator, extract features and vector quantize
        if isinstance(histOrImgs, collections.Iterator):
            if labels is None:
                raise ValueError("If argument 'histOrImgs' is an iterator of images, \
                                  the argument 'lables' must not be None.")

            desc, self.numTestDesc = im.extractFeatures(histOrImgs, self.peakThreshold, self.edgeThreshold)

            recognosco.logger.info("Found %i features/image on average.", self.numTestDesc / len(desc))

            tmpHistograms = _buildHistograms(self.learner.codebook, desc)
            histograms = _convertToOrangeDataSet(tmpHistograms, self.learner.domain, labels)
        else:
            histograms = histOrImgs


        values = histograms.domain.classVar.values
        self.values = values
        length = len(values)
        self.confusion = numpy.zeros((length, length), int)
        starttime = time.time()

        self.nu = nu
        self.gamma = gamma

        svm = orange.SVMLearner()
        svm.svm_type = orange.SVMLearner.Nu_SVC
        svm.nu = nu
        svm.gamma = gamma
        svm.kernel_type = orange.SVMLearner.RBF
        svm.probability = True
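        # Nu-SVC with an RBF kernel; probability=True enables the probability
        # estimates needed for the confidence-based rejection below.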

        recognosco.logger.debug("Training Support Vector Machine...")
        self.classifier = svm(self.learner.histograms)
        recognosco.logger.debug("Done...")

        crossVal = None
        if doCrossVal:
            crossVal = orngTest.crossValidation([svm], self.learner.histograms, folds=10)

        numCorrectClassified = 0.0
        numClassified = 0.0
        for i in range(len(histograms)):
                c = self.classifier(histograms[i])
                recognosco.logger.info("Has the Class: %s", histograms[i].getclass())
                recognosco.logger.info("Classified as: %s", c)

                prob = self.classifier(histograms[i], self.classifier.GetProbabilities)
                conf = self.__getConfidence(prob)
                recognosco.logger.info("Confidence: %f", conf)
                if conf < confThr:
                    recognosco.logger.info("Rejected classification (Threshold: %.2f)", confThr)
                    continue

                numClassified += 1.0
                predicted = values.index(str(c))
                actual = values.index(str(histograms[i].getclass()))
                self.confusion[predicted][actual] += 1

                if(c == histograms[i].getclass()):
                    numCorrectClassified += 1.0

        endtime = time.time()
        self.confusion = str(self.confusion)
        self.clAccuracy = numCorrectClassified / numClassified
        self.fracClassified = numClassified / len(histograms)

        if doCrossVal:
            self.cvAccuracy = orngStat.CA(crossVal)[0]
            recognosco.logger.info("Cross validation accuracy: %s", self.cvAccuracy)
        else:
            self.cvAccuracy = -1.0

        recognosco.logger.info("Classification accuracy of test data: %s", self.clAccuracy)
        self.testTime = endtime - starttime
        return (self.cvAccuracy, self.clAccuracy)