def test_SVM_Priors_D(self):
    """Test SVM with priors"""
    # Train an SVM with class priors
    svm = AZorngCvSVM.CvSVMLearner(
        self.inDataD,
        priors={"Iris-setosa": 0.2, "Iris-versicolor": 0.3, "Iris-virginica": 0.5})
    trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, svm)
    self.assertEqual(round(trainedAcc, 7), round(0.73333329999999997, 7))
    # Save the model
    rc = svm.write(self.modelPath)
    self.assertEqual(rc, True)
    # Load the saved model
    loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
    loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
    # Assure equal accuracy
    self.assertEqual(trainedAcc, loadedAcc)

    svmLearner = AZorngCvSVM.CvSVMLearner(
        scaleData=False,
        priors={"Iris-setosa": 0.2, "Iris-versicolor": 0.3, "Iris-virginica": 0.5})
    svmLearner.name = "CvSVMLearner"
    svmLearner.shrinking = 1
    svmLearner.eps = 0.001
    svmLearner.p = 0.0
    svmLearner.nu = 0.6
    svmLearner.kernel_type = 2
    svmLearner.svm_type = 103
    svmLearner.gamma = 0.0033
    svmLearner.C = 47
    svmLearner.probability = 1
    svmLearner.scaleData = True
    svmLearner.scaleClass = False
    # svmLearner.for_nomogram = 1
    Res = orngTest.crossValidation(
        [svmLearner], self.inDataD, folds=5,
        strat=orange.MakeRandomIndices.StratifiedIfPossible)
    CA = evalUtilities.CA(Res)[0]
    self.assertEqual(round(CA, 2), round(0.940000000, 2))  # orange1.0: 0.93333333333333335

    svmLearner.priors = None
    Res = orngTest.crossValidation(
        [svmLearner], self.inDataD, folds=5,
        strat=orange.MakeRandomIndices.StratifiedIfPossible)
    CA = evalUtilities.CA(Res)[0]
    self.assertEqual(round(CA, 2), round(0.94666666666666666, 2))

    newSVM = svmLearner(self.inDataD)
    trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, newSVM)
    # Save the model
    rc = newSVM.write(self.modelPath)
    self.assertEqual(rc, True)
    # Load the saved model
    loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
    loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
    # Assure equal accuracy
    self.assertEqual(round(trainedAcc, 7), round(0.95999999999999996, 7))  # Before in AZSVM: 0.953333300000
    self.assertEqual(round(trainedAcc, 1), round(loadedAcc, 1))
def WrapperFSS(data, learner, verbose=0, folds=10):
    classVar = data.domain.classVar
    currentAtt = []
    freeAttributes = list(data.domain.attributes)

    newDomain = orange.Domain(currentAtt + [classVar])
    d = data.select(newDomain)
    results = orngTest.crossValidation([learner], d, folds=folds)
    maxStat = orngStat.CA(results)[0]
    if verbose >= 2:
        print "start (%5.3f)" % maxStat

    # stop when no candidate attributes remain (the original `while 1`
    # would call max() on an empty list once all attributes were used)
    while freeAttributes:
        stat = []
        for a in freeAttributes:
            newDomain = orange.Domain([a] + currentAtt + [classVar])
            d = data.select(newDomain)
            results = orngTest.crossValidation([learner], d, folds=folds)
            stat.append(orngStat.CA(results)[0])
            if verbose >= 2:
                print "  %s gained %5.3f" % (a.name, orngStat.CA(results)[0])

        if max(stat) > maxStat:
            oldMaxStat = maxStat
            maxStat = max(stat)
            bestVarIndx = stat.index(max(stat))
            if verbose:
                print "gain: %5.3f, attribute: %s" % \
                    (maxStat - oldMaxStat, freeAttributes[bestVarIndx].name)
            currentAtt = currentAtt + [freeAttributes[bestVarIndx]]
            del freeAttributes[bestVarIndx]
        else:
            if verbose:
                print "stopped (%5.3f)" % (max(stat) - maxStat)
            return orange.Domain(currentAtt + [classVar])
    return orange.Domain(currentAtt + [classVar])
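# A minimal usage sketch for WrapperFSS above (iris.tab is an assumed
# example data set; any Orange classification table works):
import orange, orngTest, orngStat
data = orange.ExampleTable("iris.tab")
bestDomain = WrapperFSS(data, orange.BayesLearner(), verbose=1)
print "selected attributes:", [a.name for a in bestDomain.attributes]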
def crossValidation(self, nu=0.6, gamma=4.0):
    """
    Perform a cross validation on the training data.

    @param nu: The S{nu}-parameter of the support vector machine.
    @type nu: float
    @param gamma: The S{gamma}-parameter of the RBF kernel.
    @type gamma: float
    @rtype: float
    @return: The cross validation accuracy.
    """
    if self.learner is None:
        raise ValueError("Learner has to be loaded before cross validation can be done.")
    self.nu = nu
    self.gamma = gamma
    svm = orange.SVMLearner()
    svm.svm_type = orange.SVMLearner.Nu_SVC
    svm.nu = nu
    svm.gamma = gamma
    svm.probability = True
    values = self.learner.histograms.domain.classVar.values
    self.values = values
    length = len(values)
    self.confusion = numpy.zeros((length, length), int)
    result = orngTest.crossValidation([svm], self.learner.histograms, folds=10)
    self.cvAccuracy = orngStat.CA(result)[0]
    recognosco.logger.info("Cross validation accuracy: %s", self.cvAccuracy)
    return self.cvAccuracy
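# A hedged sketch of driving the crossValidation method above in a simple
# grid search; `recognizer` is a hypothetical instance of the surrounding
# class with its learner already loaded:
best = (0.0, None, None)
for nu in (0.3, 0.5, 0.7):
    for gamma in (0.5, 2.0, 4.0):
        acc = recognizer.crossValidation(nu=nu, gamma=gamma)
        if acc > best[0]:
            best = (acc, nu, gamma)
print "best CV accuracy %5.3f at nu=%s, gamma=%s" % best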
def test_fss(learner, data, t=0.01):
    fss = orngFSS.FilterAttsAboveThresh(threshold=t)
    fLearner = orngFSS.FilteredLearner(learner, filter=fss,
                                       name='%s & fss' % learner.name)
    learners = [learner, fLearner]
    results = orngTest.crossValidation(learners, data, folds=10, storeClassifiers=1)

    # how many attributes did each classifier use?
    natt = [0.] * len(learners)
    for fold in range(results.numberOfIterations):
        for lrn in range(len(learners)):
            natt[lrn] += len(results.classifiers[fold][lrn].domain.attributes)
    for lrn in range(len(learners)):
        natt[lrn] = natt[lrn] / 10.

    print "\nLearner         Accuracy  #Atts"
    for i in range(len(learners)):
        print "%-15s %5.3f %5.2f" % (learners[i].name,
                                     orngStat.CA(results)[i], natt[i])

    # which attributes were used in filtered case?
    print '\nAttribute usage (how many folds attribute was used):'
    used = {}
    for fold in range(results.numberOfIterations):
        for att in results.classifiers[fold][1].domain.attributes:
            a = att.name
            if a in used.keys():
                used[a] += 1
            else:
                used[a] = 1
    for a in used.keys():
        print '%2d x %s' % (used[a], a)
def evaluating(self, event):
    train_data = orange.ExampleTable("classification.tab")
    bayes2 = orange.BayesLearner()
    tree2 = orngTree.TreeLearner()
    knnLearner2 = orange.kNNLearner()
    knnLearner2.k = 10  # k == 18 seems to be best (at least for 2-3)
    #svm2 = svm.SVMLearner()
    bayes2.name = "bayes2"
    tree2.name = "tree2"
    knnLearner2.name = "knn2"
    learners = [bayes2, tree2, knnLearner2]
    results = orngTest.crossValidation(learners, train_data, folds=10)
    print "train_data:"
    print train_data
    print "k=="
    print knnLearner2.k
    print 'Learner  CA     IS     Brier  AUC'
    c = ''
    for i in range(len(learners)):
        print "%-8s %5.3f %5.3f %5.3f %5.3f" % (learners[i].name,
            orngStat.CA(results)[i], orngStat.IS(results)[i],
            orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
        c += "%-8s\t%5.3f\t%5.3f\t%5.3f\t%5.3f\n" % (learners[i].name,
            orngStat.CA(results)[i], orngStat.IS(results)[i],
            orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
    self.logger2.AppendText('Learner  CA     IS     Brier  AUC:\n%s\n' % c)
    return c
def classifier(personalrecords, lowage, upage, agetype, numcrossfolds, estimation):
    domain = createAttributes()
    data = orange.ExampleTable(domain,
        getallfeatures(personalrecords, domain, lowage, upage, 'MONTHS'))
    bayes = orange.BayesLearner()
    bayesMEst = orange.BayesLearner()
    bayesMEst.estimatorConstructor = orange.ProbabilityEstimatorConstructor_m(m=estimation)
    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    bayesMEst.name = 'bayesWithMEstimation'
    tree.name = "tree"
    learners = [bayesMEst]
    #learners = [tree]
    #print numcrossfolds, 'fold cross validation', 'for', learners
    results = orngTest.crossValidation(learners, data, folds=numcrossfolds,
                                       storeClassifiers=1)

    # output the results
    #print "Learner  CA     IS     Brier  AUC"
    #for i in range(len(learners)):
    #    print "%-8s %5.3f %5.3f %5.3f %5.3f" % (learners[i].name,
    #        orngStat.CA(results)[i], orngStat.IS(results)[i],
    #        orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
    #orngTree.printDot(tree, fileName='c:\\tree10.dot', internalNodeShape="ellipse", leafShape="box")
    #orngTree.printDot(bayes, fileName='c:\\bayes10.dot', internalNodeShape="ellipse", leafShape="box")

    TP = []
    FP = []
    TN = []
    FN = []
    for i in range(numcrossfolds):
        for j in range(len(learners)):
            classifier = results.classifiers[i][j]
            tp = 0
            fn = 0
            tn = 0
            fp = 0
            for dt in data:
                if dt.getclass() == '1':
                    p = str(classifier(dt))
                    if p == '1':
                        tp = tp + 1
                    else:
                        fn = fn + 1
                if dt.getclass() == '0':
                    p = str(classifier(dt))
                    if p == '0':
                        tn = tn + 1
                    else:
                        fp = fp + 1
            #print 'Results for learner:', learners[j].name, 'for crossfold number', i
            #print 'True positive:', tp, 'False positive:', fp, 'True negative:', tn, 'False negative:', fn
            TP.append(tp)
            FP.append(fp)
            TN.append(tn)
            FN.append(fn)
            #learners[j].dumpTree()
            #orngTree.printDot(learners[j], fileName=filename, internalNodeShape="ellipse", leafShape="box")
    #classiferProbabilites(results, len(learners), numcrossfolds, data, attlist[:1], '1')
    return (results, TP, FP, TN, FN)
def test_SVMC(self):
    # Train an SVM
    svmL = AZorngCvSVM.CvSVMLearner(scaleData=False, svm_type=103, gamma=0.01,
                                    C=1, nu=0.5, p=1, eps=0.001, coef0=0, degree=3)
    svm = svmL(self.inDataC)
    trainedAcc = evalUtilities.getRMSE(self.inDataC, svm)
    self.assertEqual(round(trainedAcc, 7), round(2.8525863999999999, 7))  # ver 0.3
    # Save the model
    rc = svm.write(self.modelPath)
    self.assertEqual(rc, True)
    # Load the saved model
    loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
    loadedAcc = evalUtilities.getRMSE(self.inDataC, loadedsvm)
    # Assure equal accuracy
    self.assertEqual(trainedAcc, loadedAcc)

    svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False)
    svmLearner.name = "CvSVMLearner"
    svmLearner.eps = 0.001
    svmLearner.p = 1
    svmLearner.nu = 0.6
    svmLearner.kernel_type = 2
    svmLearner.svm_type = 103
    svmLearner.gamma = 0.0033
    svmLearner.C = 47
    svmLearner.scaleData = True
    svmLearner.scaleClass = False
    Res = orngTest.crossValidation(
        [svmLearner], self.inDataC, folds=5,
        strat=orange.MakeRandomIndices.StratifiedIfPossible)
    RMSE = evalUtilities.RMSE(Res)[0]
    self.assertEqual(round(RMSE, 2), round(2.96, 2))  # Ver 0.3

    newSVM = svmLearner(self.inDataC)
    trainedAcc = evalUtilities.getRMSE(self.inDataC, newSVM)
    # Save the model
    rc = newSVM.write(self.modelPath)
    self.assertEqual(rc, True)
    # Load the saved model
    loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
    loadedAcc = evalUtilities.getRMSE(self.inDataC, loadedsvm)
    # Assure equal accuracy
    self.assertEqual(round(trainedAcc, 4), round(2.8289, 4))  # Ver 0.3
    self.assertEqual(round(trainedAcc, 4), round(loadedAcc, 4))
def testRMSEstdCalc(self):
    data = dataUtilities.DataTable(self.regDataPath)
    RFlearner = AZorngRF.RFLearner()
    learners = [RFlearner]
    nFolds = 5
    res = orngTest.crossValidation(
        learners, data,
        strat=orange.MakeRandomIndices.StratifiedIfPossible, folds=nFolds)
    RMSEstd = evalUtilities.getRMSEstd(res, nFolds)[0]
    self.assertEqual(round(RMSEstd, 3), round(0.141, 3))
def cforange_cross_validation(input_dict):
    import orange, orngTest, orngStat
    learners = [input_dict['learner']]
    data = input_dict['dataset']
    folds = int(input_dict['folds'])
    results = orngTest.crossValidation(learners, data, folds=folds)
    output_dict = {}
    output_dict['results'] = results
    return output_dict
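# A minimal sketch of calling the widget entry point above directly
# (iris.tab is an assumed example data set):
import orange
input_dict = {'learner': orange.BayesLearner(),
              'dataset': orange.ExampleTable("iris.tab"),
              'folds': 10}
print cforange_cross_validation(input_dict)['results']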
def generalCVconfMat(data, learners, nFolds=5):
    """
    General method for printing the X-fold CV confusion matrix of an Orange
    data set (data) with any number of classes.
    learners is a list of AZorange learners.
    """
    res = orngTest.crossValidation(
        learners, data,
        strat=orange.MakeRandomIndices.StratifiedIfPossible, folds=nFolds)
    classes = data.domain.classVar.values
    for idx in range(len(learners)):
        cm = orngStat.computeConfusionMatrices(res)[idx]
        print "Results for " + learners[idx].name
        print "\t" + "\t".join(classes)
        for className, classConfusions in zip(classes, cm):
            print ("%s" + ("\t%i" * len(classes))) % \
                ((className,) + tuple(classConfusions))
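# Usage sketch for generalCVconfMat, assuming the AZorange RF learner used
# elsewhere in this collection and an iris-like classification table:
import orange
data = orange.ExampleTable("iris.tab")
generalCVconfMat(data, [AZorngRF.RFLearner(name="RF")], nFolds=5)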
def tweaked(outfile, **kwargs):
    import orange, orngTest
    classes = []
    outfile = outfile.rsplit(".", 1)[0]
    data = orange.ExampleTable(outfile)
    #c45 = orange.C45Learner(minObjs=100)
    # minObjs is a C45Learner parameter; passing it to kNNLearner raises
    knn = orange.kNNLearner()
    results = orngTest.crossValidation([knn], data, folds=10)
    for i, example in enumerate(results.results, 1):
        p = example.probabilities[0]
        classes.append((i, p[1]))
    return classes
def test_SVMD(self):
    # Train an SVM
    svm = AZorngCvSVM.CvSVMLearner(self.inDataD, scaleData=False, gamma=4, C=1,
                                   nu=0.5, p=0.1, eps=0.001, coef0=0, degree=3)
    trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, svm)
    self.assertEqual(round(trainedAcc, 7), round(0.986666666667, 7))
    # Save the model
    rc = svm.write(self.modelPath)
    self.assertEqual(rc, True)
    # Load the saved model
    loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
    loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
    # Assure equal accuracy
    self.assertEqual(trainedAcc, loadedAcc)

    svmLearner = AZorngCvSVM.CvSVMLearner(scaleData=False)
    svmLearner.name = "CvSVMLearner"
    svmLearner.eps = 0.001
    svmLearner.p = 0.0
    svmLearner.nu = 0.6
    svmLearner.kernel_type = 2
    svmLearner.svm_type = 101
    svmLearner.gamma = 0.0033
    svmLearner.C = 47
    svmLearner.scaleData = True
    svmLearner.scaleClass = False
    Res = orngTest.crossValidation(
        [svmLearner], self.inDataD, folds=5,
        strat=orange.MakeRandomIndices.StratifiedIfPossible)
    CA = evalUtilities.CA(Res)[0]
    self.assertEqual(round(CA, 2), round(0.96666666666666667, 2))  # Before in AZSVM: 0.95999999999999996

    newSVM = svmLearner(self.inDataD)
    trainedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, newSVM)
    # Save the model
    rc = newSVM.write(self.modelPath)
    self.assertEqual(rc, True)
    # Load the saved model
    loadedsvm = AZorngCvSVM.CvSVMread(self.modelPath)
    loadedAcc = evalUtilities.getClassificationAccuracy(self.inDataD, loadedsvm)
    # Assure equal accuracy
    self.assertEqual(round(trainedAcc, 7), round(0.96666669999999999, 7))  # Before in AZSVM: 0.953333300000
    self.assertEqual(round(trainedAcc, 1), round(loadedAcc, 1))
def evaluation(data, folds):
    """
    Evaluates Bayes and DT and prints results for 4 different metrics:
    classification accuracy, information score, Brier score and area
    under the ROC curve.
    """
    bayes = orange.BayesLearner()
    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    tree.name = "tree"
    learners = [bayes, tree]
    print "Statistical measures per learner (using %d-fold cross-validation):" % folds
    results = orngTest.crossValidation(learners, data, folds=folds)
    print "Learner  CA     IS     Brier  AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f %5.3f %5.3f %5.3f" % (learners[i].name,
            orngStat.CA(results)[i], orngStat.IS(results)[i],
            orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
    print "-------"
    print
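# Usage sketch (voting.tab is an assumed example data set):
import orange
evaluation(orange.ExampleTable("voting.tab"), folds=10)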
def cross_validation(self):
    data = self.data
    # set up the learners
    bayes = orange.BayesLearner()
    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    tree.name = "tree"
    l = orange.SVMLearner()
    l.name = "SVM"
    l.svm_type = orange.SVMLearner.Nu_SVC
    l.nu = 0.3
    l.probability = True
    learners = [bayes, tree, l]
    # compute accuracies on data
    res = orngTest.crossValidation(learners, data, folds=10)
    cm = orngStat.computeConfusionMatrices(
        res, classIndex=data.domain.classVar.values.index('-1'))
    stat = (('CA', 'CA(res)'),
            ('Sens', 'sens(cm)'),
            ('Spec', 'spec(cm)'),
            ('AUC', 'AUC(res)'),
            ('IS', 'IS(res)'),
            ('Brier', 'BrierScore(res)'),
            ('F1', 'F1(cm)'),
            ('F2', 'Falpha(cm, alpha=2.0)'),
            ('MCC', 'MCC(cm)'),
            ('sPi', 'scottsPi(cm)'),
            )
    scores = [eval("orngStat." + s[1]) for s in stat]
    print "Learner  " + "".join(["%-7s" % s[0] for s in stat])
    for (i, l) in enumerate(learners):
        print "%-8s " % l.name + "".join(["%5.3f " % s[i] for s in scores])
    return None
def ensemble2(data=None):
    import orange, orngTree, orngEnsemble
    if not data:
        data = orange.ExampleTable('bupa.tab')
    forest = orngEnsemble.RandomForestLearner(trees=50, name="forest")
    tree = orngTree.TreeLearner(minExamples=2, mForPruning=2,
                                sameMajorityPruning=True, name='tree')
    learners = [tree, forest]
    import orngTest, orngStat
    #results = orngTest.leaveOneOut(learners, data)
    results = orngTest.crossValidation(learners, data, folds=2)
    print "Learner  CA     Brier  AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f %5.3f %5.3f" % (learners[i].name,
            orngStat.CA(results)[i], orngStat.BrierScore(results)[i],
            orngStat.AUC(results)[i])
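# A related sketch: bagging and boosting the same tree with orngEnsemble,
# evaluated the same way (bupa.tab as above; 2 folds keep it cheap):
import orange, orngTree, orngEnsemble, orngTest, orngStat
data = orange.ExampleTable('bupa.tab')
tree = orngTree.TreeLearner(mForPruning=2, name='tree')
bagged = orngEnsemble.BaggedLearner(tree, name='bagged')
boosted = orngEnsemble.BoostedLearner(tree, name='boosted')
res = orngTest.crossValidation([tree, bagged, boosted], data, folds=2)
for l, ca in zip([tree, bagged, boosted], orngStat.CA(res)):
    print "%-8s %5.3f" % (l.name, ca)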
def classifier(personalrecords, features, attlist, lowage, upage, estimation):
    data = orange.ExampleTable(createAttributes(attlist),
        getallfeatures(personalrecords, attlist, lowage, upage, 'MONTHS'))
    bayes = orange.BayesLearner()
    # orange.BayesLearner takes no m keyword; the m-estimate is set through
    # estimatorConstructor, as in the companion classifier() above
    bayesWithEstimation = orange.BayesLearner()
    bayesWithEstimation.estimatorConstructor = \
        orange.ProbabilityEstimatorConstructor_m(m=estimation)
    tree = orngTree.TreeLearner(mForPruning=2)
    bayes.name = "bayes"
    bayesWithEstimation.name = "bayesWithEstimation"
    tree.name = "tree"
    learners = [bayes, bayesWithEstimation]
    print '10 fold cross validation'
    results = orngTest.crossValidation(learners, data, folds=10)
    # output the results
    print "Learner  CA     IS     Brier  AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f %5.3f %5.3f %5.3f" % (learners[i].name,
            orngStat.CA(results)[i], orngStat.IS(results)[i],
            orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
    # orngTree.printDot expects a trained tree classifier, so build one first;
    # the original also passed the Bayes learner to printDot, which cannot work
    treeClassifier = tree(data)
    orngTree.printDot(treeClassifier, fileName='c:\\tree10.dot',
                      internalNodeShape="ellipse", leafShape="box")
def knn_classifier_xv(tree, outfile="dev.tab", **kwargs):
    import orange, orngTest, orngStat
    classes = []
    # TODO: skip writing to file
    outfile = outfile.rsplit(".", 1)[0]
    data = orange.ExampleTable(outfile)
    knn = orange.kNNLearner(k=21, name="knn")
    results = orngTest.crossValidation([knn], data, folds=10)
    # output the results
    print "Learner  CA     IS     Brier  AUC"
    print "%-8s %5.3f %5.3f %5.3f %5.3f" % (knn.name,
        orngStat.CA(results)[0], orngStat.IS(results)[0],
        orngStat.BrierScore(results)[0], orngStat.AUC(results)[0])
    print results.results[0].probabilities
    for i, example in enumerate(results.results, 1):
        p = example.probabilities[0]
        classes.append((i, p[1]))
    return classes
#scores = Orange.feature.scoring.score_all(start_data)
#data = Orange.feature.selection.select(start_data, scores, features)
train_data, test_data = proj_utils.partition_data(start_data)
#selection = orange.MakeRandomIndicesCV(data, cv_folds)
#sen1 = 0.0
#spe1 = 0.0
#acc1 = 0.0
#sen2 = 0.0
#spe2 = 0.0
#acc2 = 0.0
model = train_classifier(train_data, features)
train_results = orngTest.crossValidation([model], example_data, cv_folds)
#test_results = orngTest.crossValidation([model], test_data, cv_folds)
train_stats = proj_utils.get_stats(train_results)
#test_stats = proj_utils.get_stats(test_results)
print "Train:\n%s" % str(train_stats)
#print "\nTest:\n%s" % str(test_stats)
f = open(os.path.dirname(__file__) + '\\logisticRegressionFilteredCVResults_' +
         'V' + str(cv_folds) + '_F' + str(features) + '.txt', 'w+')
f.write("Train:\n")
f.write(str(train_stats) + "\n")
#f.write("Test:\n")
#f.write(str(test_stats))
f.close()
def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None):
    """
    For regression problems, it returns the RMSE and the Q2.
    For classification problems, it returns CA and the ConfMat.
    The return is made in a dict:
        {"RMSE":0.2, "Q2":0.1, "CA":0.98, "CM":[[TP, FP], [FN, TN]]}
    For the EvalResults not supported for a specific learner/dataset,
    the respective result will be None.

    If the learner is a dict {"LearnerName": learner, ...}, the results
    will be a dict with results for all learners and for a consensus
    made out of those that were stable.

    If some error occurred, the respective values in the dict will be None.

    Parameters:
        algorithm - key for the structural feature generation algorithm
                    (set dependent structural features that have to be
                    calculated inside the cross-validation)
        minsup    - minimum support for the algorithm
        atts      - attributes to be removed before learning (e.g. meta etc.)
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    if algorithm:
        self.__log(" Additional features to be calculated inside of cross-validation")
        self.__log(" Algorithm for structural features: " + str(algorithm))
        self.__log(" Minimum support parameter: " + str(minsup))

    # Set the response type
    self.responseType = (
        self.data.domain.classVar.varType == orange.VarTypes.Discrete
        and "Classification" or "Regression")
    self.__log("  " + str(self.responseType))

    # Create the train and test sets
    DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

    # Vars for saving each fold's result
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    rocs = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log("  " + str([x for x in MLmethods]))

    # Check data in advance so that, by chance, it will not fail at the last fold!
    for foldN in range(self.nExtFolds):
        trainData = self.data.select(DataIdxs[foldN], negate=1)
        self.__checkTrainData(trainData)  # Optional!!
    # Order learners so that PLS is the first
    sortedML = [ml for ml in MLmethods]
    if "PLS" in sortedML:
        sortedML.remove("PLS")
        sortedML.insert(0, "PLS")

    for ml in sortedML:
        self.__log("    > " + str(ml) + "...")
        try:
            # Vars for saving each fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            rocs[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            ### mods TG: domain extended with a class-probability attribute.
            # The original built a plain list and used data.domain.classvar;
            # orange.Domain and self.data.domain.classVar are the working forms.
            prediction_attribute = orange.FloatVariable("class_prob")
            domain = orange.Domain(
                list(self.data.domain.attributes) + [prediction_attribute],
                self.data.domain.classVar)
            data_new = orange.ExampleTable(domain)
            logTxt = ""
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN], negate=1)
                orig_len = len(trainData.domain.attributes)
                # add structural descriptors to the training data (TG)
                if algorithm:
                    trainData_structDesc = getStructuralDesc.getStructuralDescResult(
                        trainData, algorithm, minsup)
                    trainData = dataUtilities.attributeDeselectionData(
                        trainData_structDesc, atts)

                testData = self.data.select(DataIdxs[foldN])
                # print "IDX: ", DataIdxs[foldN]
                # calculate the feature values for the test data (TG)
                if algorithm:
                    cut_off = orig_len - len(atts)
                    smarts = trainData.domain.attributes[cut_off:]
                    self.__log("  Number of structural features added: " + str(len(smarts)))
                    testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(
                        testData, smarts)
                    testData = dataUtilities.attributeDeselectionData(
                        testData_structDesc, atts)

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))

                # Test if train sets inside the optimizer will respect the dataSize
                # criteria; if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and \
                   (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                    dontOptimize = True
                else:
                    tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                if dontOptimize:
                    logTxt += "       Fold " + str(foldN) + \
                              ": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                        res = orngTest.crossValidation(
                            [MLmethods[ml]], trainData, folds=5,
                            strat=orange.MakeRandomIndices.StratifiedIfPossible,
                            randomGenerator=random.randint(0, 100))
                        CA = evalUtilities.CA(res)[0]
                        optAcc[ml].append(CA)
                    else:
                        res = orngTest.crossValidation(
                            [MLmethods[ml]], trainData, folds=5,
                            strat=orange.MakeRandomIndices.StratifiedIfPossible,
                            randomGenerator=random.randint(0, 100))
                        R2 = evalUtilities.R2(res)[0]
                        optAcc[ml].append(R2)
                else:
                    runPath = miscUtilities.createScratchDir(
                        baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam",
                        seed=id(trainData))
                    trainData.save(os.path.join(runPath, "trainData.tab"))
                    tunedPars = paramOptUtilities.getOptParam(
                        learner=MLmethods[ml],
                        trainDataFile=os.path.join(runPath, "trainData.tab"),
                        paramList=self.paramList,
                        useGrid=False,
                        verbose=self.verbose,
                        queueType=self.queueType,
                        runPath=runPath,
                        nExtFolds=None,
                        nFolds=self.nInnerFolds,
                        logFile=self.logFile,
                        getTunedPars=True)
                    if not MLmethods[ml] or not MLmethods[ml].optimized:
                        self.__log("       WARNING: GETACCWOPTPARAM: The learner " +
                                   str(ml) + " was not optimized.")
                        self.__log("                It will be ignored")
                        # self.__log("                It will be set to default parameters")
                        self.__log("                DEBUG can be done in: " + runPath)
                        # Set learner back to default
                        # MLmethods[ml] = MLmethods[ml].__class__()
                        raise Exception("The learner " + str(ml) + " was not optimized.")
                    else:
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            optAcc[ml].append(tunedPars[0])
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]], trainData, folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)

                        miscUtilities.removeDir(runPath)

                # Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                # Test the model
                if self.responseType == "Classification":
                    results[ml].append((
                        evalUtilities.getClassificationAccuracy(testData, model),
                        evalUtilities.getConfMat(testData, model)))
                    roc = self.aroc(testData, [model])
                    rocs[ml].append(roc)
                    # save the prediction probabilities
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append((
                        evalUtilities.calcRMSE(local_exp_pred),
                        evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred

            res = self.createStatObj(
                results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],
                self.responseType, self.nExtFolds, logTxt, rocs[ml])
            if self.verbose > 0:
                print "UnbiasedAccuracyGetter!Results " + ml + ":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log("       OK")
        except:
            self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

    if not statistics or len(statistics) < 1:
        self.__log("ERROR: No statistics to return!")
        return None
    elif len(statistics) > 1:
        # We still need to build a consensus model out of the stable models,
        # ONLY if there is more than one stable model!
        # When only one or no stable models, build a consensus based on all models
        consensusMLs = {}
        for modelName in statistics:
            StabilityValue = statistics[modelName]["StabilityValue"]
            if StabilityValue is not None and statistics[modelName]["stable"]:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        self.__log("Found " + str(len(consensusMLs)) + " stable MLmethods out of " +
                   str(len(statistics)) + " MLmethods.")
        if len(consensusMLs) <= 1:  # we need more models to build a consensus!
            consensusMLs = {}
            for modelName in statistics:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        if len(consensusMLs) >= 2:
            # Vars for saving each fold's result
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log("Calculating the statistics for a Consensus model based on " +
                       str([ml for ml in consensusMLs]))
            for foldN in range(self.nExtFolds):
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = "(0"
                    for ml in consensusMLs:
                        exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + \
                                     str(optAcc[ml][foldN]) + " "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest0 += ", " + ml + " == " + CLASS0 + " "
                    exprTest0 += "]),1)"
                    exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                    expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0,
                                  " -> " + CLASS1]
                else:
                    Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                    expression = "(1 / " + str(Q2sum) + ") * (0"
                    for ml in consensusMLs:
                        expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                    expression += ")"

                testData = self.data.select(DataIdxs[foldN])
                CnTestEx.append(len(testData))
                consensusClassifiers = {}
                for learnerName in consensusMLs:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]

                model = AZorngConsensus.ConsensusClassifier(
                    classifiers=consensusClassifiers, expression=expression)
                CnTrainEx.append(model.NTrainEx)
                # Test the model
                if self.responseType == "Classification":
                    Cresults.append((
                        evalUtilities.getClassificationAccuracy(testData, model),
                        evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    Cresults.append((
                        evalUtilities.calcRMSE(local_exp_pred),
                        evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental value and correspondent predicted value
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx,
                                     self.responseType, self.nExtFolds)
            statistics["Consensus"] = copy.deepcopy(res)
            statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
            self.__writeResults(statistics)
        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
tree = orange.C45Learner(data, m=100)
for i in data[:5]:
    print tree(i), i.getclass()

print "\n\nC4.5 with minObjs=100"
tree = orange.C45Learner(data, minObjs=100)
for i in data[:5]:
    print tree(i), i.getclass()

print "\n\nC4.5 with -m 1 and -s"
lrn = orange.C45Learner()
lrn.commandline("-m 1 -s")
tree = lrn(data)
for i in data:
    if i.getclass() != tree(i):
        print i, tree(i)

import orngC45
tree = orange.C45Learner(data)
orngC45.printTree(tree)
print

import orngStat, orngTest
res = orngTest.crossValidation(
    [orange.C45Learner(), orange.C45Learner(convertToOrange=1)], data)
print "Classification accuracy: %5.3f (converted to tree: %5.3f)" % \
    tuple(orngStat.CA(res))
print "Brier score: %5.3f (converted to tree: %5.3f)" % \
    tuple(orngStat.BrierScore(res))
import orange, orngTest, orngStat, orngBayes

data = orange.ExampleTable("lung-cancer")

bayes = orngBayes.BayesLearner()
bayes_m = orngBayes.BayesLearner(m=2)
res = orngTest.crossValidation([bayes, bayes_m], data)
CAs = orngStat.CA(res)
print
print "Without m: %5.3f" % CAs[0]
print "With m=2:  %5.3f" % CAs[1]

data = orange.ExampleTable("voting")
model = orngBayes.BayesLearner(data)
orngBayes.printModel(model)
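# A small extension sketch: sweep a few m values on the same data and
# report CV accuracy for each (uses only the calls shown above):
ms = (0, 1, 2, 5, 10)
learners = [orngBayes.BayesLearner(m=m) for m in ms]
res = orngTest.crossValidation(learners, orange.ExampleTable("lung-cancer"))
for m, ca in zip(ms, orngStat.CA(res)):
    print "m=%-3s CA: %5.3f" % (m, ca)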
# Description: Compares naive Bayesian learner when all or just the most
#   important attribute is used. Shows how to find out which attributes
#   were used the most in ten-fold cross validation.
# Category: feature selection
# Uses: voting
# Referenced: Orange.feature.html#selection
# Classes: Orange.feature.selection.FilteredLearner

import Orange, orngTest, orngStat

voting = Orange.data.Table("voting")
nb = Orange.classification.bayes.NaiveLearner()
fl = Orange.feature.selection.FilteredLearner(
    nb, filter=Orange.feature.selection.FilterBestNAtts(n=1), name='filtered')
learners = (Orange.classification.bayes.NaiveLearner(name='bayes'), fl)
results = orngTest.crossValidation(learners, voting, storeClassifiers=1)

# output the results
print "Learner      CA"
for i in range(len(learners)):
    print "%-12s %5.3f" % (learners[i].name, orngStat.CA(results)[i])

# find out which attributes were retained by filtering
print "\nNumber of times attributes were used in cross-validation:"
attsUsed = {}
for i in range(10):
    for a in results.classifiers[i][1].atts():
        if a.name in attsUsed.keys():
            attsUsed[a.name] += 1
        else:
            attsUsed[a.name] = 1
for a in attsUsed.keys():
    print "%2d x %s" % (attsUsed[a], a)
# Description: Shows how to add class noise to data
# Category: preprocessing
# Uses: promoters
# Classes: Preprocessor_addClassNoise, orngTest.crossValidation
# Referenced: domain.htm

import orange, orngTest, orngStat

filename = "promoters.tab"
data = orange.ExampleTable(filename)
data.name = "unspoiled"
datasets = [data]

add_noise = orange.Preprocessor_addClassNoise()
for noiselevel in (0.2, 0.4, 0.6):
    add_noise.proportion = noiselevel
    add_noise.randomGenerator = 42
    d = add_noise(data)
    d.name = "class noise %4.2f" % noiselevel
    datasets.append(d)

learner = orange.BayesLearner()
for d in datasets:
    results = orngTest.crossValidation([learner], d, folds=10)
    print "%20s   %5.3f" % (d.name, orngStat.CA(results)[0])
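# A small extension sketch: add a majority-class baseline so the effect of
# class noise on Bayes can be read against chance performance (continues
# the variables defined above):
baseline = orange.MajorityLearner()
for d in datasets:
    results = orngTest.crossValidation([learner, baseline], d, folds=10)
    print "%20s   bayes: %5.3f   majority: %5.3f" % \
        (d.name, orngStat.CA(results)[0], orngStat.CA(results)[1])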
# Description: Compares naive Bayes with and without feature subset selection
#   in 10-fold cross validation (how many and which attributes were used?)
# Category: preprocessing
# Uses: crx.tab
# Referenced: orngFSS.htm

import orange, orngDisc, orngTest, orngStat, orngFSS

data = orange.ExampleTable("../datasets/crx")

bayes = orange.BayesLearner()
dBayes = orngDisc.DiscretizedLearner(bayes, name='disc bayes')
fss = orngFSS.FilterAttsAboveThresh(threshold=0.05)
fBayes = orngFSS.FilteredLearner(dBayes, filter=fss, name='bayes & fss')

learners = [dBayes, fBayes]
results = orngTest.crossValidation(learners, data, folds=10, storeClassifiers=1)

# how many attributes did each classifier use?
natt = [0.] * len(learners)
for fold in range(results.numberOfIterations):
    for lrn in range(len(learners)):
        natt[lrn] += len(results.classifiers[fold][lrn].domain.attributes)
for lrn in range(len(learners)):
    natt[lrn] = natt[lrn] / 10.

print "\nLearner         Accuracy  #Atts"
for i in range(len(learners)):
    print "%-15s %5.3f %5.2f" % (learners[i].name, orngStat.CA(results)[i], natt[i])

# which attributes were used in filtered case?
print '\nAttribute usage (how many folds attribute was used):'
used = {}
for fold in range(results.numberOfIterations):
    for att in results.classifiers[fold][1].domain.attributes:
        a = att.name
        if a in used.keys():
            used[a] += 1
        else:
            used[a] = 1
for a in used.keys():
    print '%2d x %s' % (used[a], a)
def score(self, ids):
    """compute scores for the list of learners"""
    if not self.data:
        for id in ids:
            self.learners[id].results = None
        return

    # test which learners can accept the given data set
    # e.g., regressions can't deal with classification data
    learners = []
    n = len(self.data.domain.attributes) * 2
    indices = orange.MakeRandomIndices2(
        p0=min(n, len(self.data)),
        stratified=orange.MakeRandomIndices2.StratifiedIfPossible)
    new = self.data.selectref(indices(self.data))
    # new = self.data.selectref([1]*min(n, len(self.data)) +
    #                           [0]*(len(self.data) - min(n, len(self.data))))
    self.warning(0)
    for l in [self.learners[id] for id in ids]:
        learner = l.learner
        if self.preprocessor:
            learner = self.preprocessor.wrapLearner(learner)
        try:
            predictor = learner(new)
            if predictor(new[0]).varType == new.domain.classVar.varType:
                learners.append(learner)
            else:
                l.scores = []
        except Exception as ex:
            self.warning(0, "Learner %s ends with exception: %s" % (l.name, str(ex)))
            l.scores = []

    if not learners:
        return

    # computation of results (res, and cm if classification)
    pb = None
    if self.resampling == 0:
        pb = OWGUI.ProgressBar(self, iterations=self.nFolds)
        res = orngTest.crossValidation(
            learners, self.data, folds=self.nFolds,
            strat=orange.MakeRandomIndices.StratifiedIfPossible,
            callback=pb.advance, storeExamples=True)
        pb.finish()
    elif self.resampling == 1:
        pb = OWGUI.ProgressBar(self, iterations=len(self.data))
        res = orngTest.leaveOneOut(learners, self.data,
                                   callback=pb.advance, storeExamples=True)
        pb.finish()
    elif self.resampling == 2:
        pb = OWGUI.ProgressBar(self, iterations=self.pRepeat)
        res = orngTest.proportionTest(learners, self.data, self.pLearning / 100.,
                                      times=self.pRepeat,
                                      callback=pb.advance, storeExamples=True)
        pb.finish()
    elif self.resampling == 3:
        pb = OWGUI.ProgressBar(self, iterations=len(learners))
        res = orngTest.learnAndTestOnLearnData(learners, self.data,
                                               storeExamples=True,
                                               callback=pb.advance)
        pb.finish()
    elif self.resampling == 4:
        if not self.testdata:
            for l in self.learners.values():
                l.scores = []
            return
        pb = OWGUI.ProgressBar(self, iterations=len(learners))
        res = orngTest.learnAndTestOnTestData(learners, self.data, self.testdata,
                                              storeExamples=True,
                                              callback=pb.advance)
        pb.finish()

    if self.isclassification():
        cm = orngStat.computeConfusionMatrices(res, classIndex=self.targetClass)

    if self.preprocessor:
        # Unwrap learners
        learners = [l.wrappedLearner for l in learners]

    res.learners = learners
    for l in [self.learners[id] for id in ids]:
        if l.learner in learners:
            l.results = res

    self.error(list(range(len(self.stat))))
    scores = []
    for i, s in enumerate(self.stat):
        try:
            scores.append(eval("orngStat." + s.f))
        except Exception as ex:
            self.error(i, "An error occurred while evaluating orngStat." + s.f +
                       " on %s due to %s" % (" ".join([l.name for l in learners]), ex))
            scores.append([None] * len(self.learners))

    for (i, l) in enumerate(learners):
        self.learners[l.id].scores = [s[i] if s else None for s in scores]

    self.sendResults()
import orange, orngTree, orngWrap, orngStat

learner = orngTree.TreeLearner()
data = orange.ExampleTable("voting")
tuner = orngWrap.Tune1Parameter(object=learner,
                                parameter="minSubset",
                                values=[1, 2, 3, 4, 5, 10, 15, 20],
                                evaluate=orngStat.AUC, verbose=2)
classifier = tuner(data)

print "Optimal setting: ", learner.minSubset

import orngTest
untuned = orngTree.TreeLearner()
res = orngTest.crossValidation([untuned, tuner], data)
AUCs = orngStat.AUC(res)

print "Untuned tree: %5.3f" % AUCs[0]
print "Tuned tree: %5.3f" % AUCs[1]

learner = orngTree.TreeLearner(minSubset=10).instance()
data = orange.ExampleTable("voting")
tuner = orngWrap.Tune1Parameter(
    object=learner,
    parameter=["split.continuousSplitConstructor.minSubset",
               "split.discreteSplitConstructor.minSubset"],
    values=[1, 2, 3, 4, 5, 10, 15, 20],
    evaluate=orngStat.AUC, verbose=2)
classifier = tuner(data)

print "Optimal setting: ", learner.split.continuousSplitConstructor.minSubset
import orange, orngSVM

data = orange.ExampleTable("iris.tab")
lin = orngSVM.SVMLearner(kernel_type=orngSVM.SVMLearner.Linear, name="SVM - Linear")
poly = orngSVM.SVMLearner(kernel_type=orngSVM.SVMLearner.Polynomial, name="SVM - Poly")
rbf = orngSVM.SVMLearner(kernel_type=orngSVM.SVMLearner.RBF, name="SVM - RBF")
learners = [lin, poly, rbf]

import orngTest, orngStat
res = orngTest.crossValidation(learners, data)
print "%15s%8s%8s" % ("Name", "CA", "AUC")
for l, ca, auc in zip(learners, orngStat.CA(res), orngStat.AUC(res)):
    print "%-15s %.3f  %.3f" % (l.name, ca, auc)
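# A follow-on sketch: tune the RBF learner's gamma with orngWrap's
# Tune1Parameter (the same wrapper used elsewhere in this collection);
# the value grid is an arbitrary assumption:
import orngWrap
tuned = orngWrap.Tune1Parameter(object=orngSVM.SVMLearner(name="SVM - RBF tuned"),
                                parameter="gamma",
                                values=[0.01, 0.1, 0.5, 1.0],
                                evaluate=orngStat.CA)
res = orngTest.crossValidation([rbf, tuned], data)
for name, ca in zip(["fixed gamma", "tuned gamma"], orngStat.CA(res)):
    print "%-12s CA: %5.3f" % (name, ca)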
nPermutations = 0

# Some arrays to hold values
pvalue = np.zeros(len(t0))
items = range(len(data))
# work on a copy: the original aliased data_p = data, which overwrote the
# class labels while they were still being read from during the permutation
data_p = orange.ExampleTable(data)

# Just loop through randomly permuted labels ...
for m in range(nPermutations):
    if m < nPermutations - 1:
        np.random.shuffle(items)
    for i in range(len(data)):
        j = items[i]
        data_p[i].setclass(data[j].getclass())
    results_p = orngTest.crossValidation(learners, data_p, folds=kFolds)
    cm = orngStat.computeConfusionMatrices(
        results_p, classIndex=data_p.domain.classVar.values.index(target))
    t = orngStat.CA(results_p)
    for p in range(len(learners)):
        if t[p] >= t0[p]:
            pvalue[p] += 1.0

if nPermutations > 0:
    pvalue /= (nPermutations * 1.0)

scores = [eval("orngStat." + s[1]) for s in stat] + [pvalue]

# Write out the empirical p-values
Headers = "Learner  " + "".join(["%-7s" % s[0] for s in stat]) + 'p-value'
print ""
import orange, orngSVM

data = orange.ExampleTable("iris.tab")

l1 = orngSVM.SVMLearner()
l1.kernelFunc = orngSVM.RBFKernelWrapper(
    orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5)
l1.kernel_type = orange.SVMLearner.Custom
l1.probability = True
c1 = l1(data)
l1.name = "SVM - RBF(Euclidean)"

l2 = orngSVM.SVMLearner()
l2.kernelFunc = orngSVM.RBFKernelWrapper(
    orange.ExamplesDistanceConstructor_Hamming(data), gamma=0.5)
l2.kernel_type = orange.SVMLearner.Custom
l2.probability = True
c2 = l2(data)
l2.name = "SVM - RBF(Hamming)"

l3 = orngSVM.SVMLearner()
l3.kernelFunc = orngSVM.CompositeKernelWrapper(
    orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5),
    orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Hamming(data), gamma=0.5),
    l=0.5)
l3.kernel_type = orange.SVMLearner.Custom
l3.probability = True
c3 = l3(data)  # was l1(data): the composite learner should build this classifier
l3.name = "SVM - Composite"

import orngTest, orngStat
tests = orngTest.crossValidation([l1, l2, l3], data, folds=5)
[ca1, ca2, ca3] = orngStat.CA(tests)
print l1.name, "CA:", ca1
print l2.name, "CA:", ca2
print l3.name, "CA:", ca3
import orange, orngImpute, orngTest, orngStat

data = orange.ExampleTable("voting")

ba = orange.BayesLearner()
imba = orngImpute.ImputeLearner(
    baseLearner=ba, imputerConstructor=orange.ImputerConstructor_minimal)

res = orngTest.crossValidation([ba, imba], data)
CAs = orngStat.CA(res)
print "Without imputation: %5.3f" % CAs[0]
print "With imputation:    %5.3f" % CAs[1]
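# Variant sketch: the same wrapper with average-value imputation instead of
# minimal-value imputation (orange.ImputerConstructor_average is part of
# the same family of constructors as the one used above):
imba_avg = orngImpute.ImputeLearner(
    baseLearner=ba, imputerConstructor=orange.ImputerConstructor_average)
res = orngTest.crossValidation([imba, imba_avg], data)
CAs = orngStat.CA(res)
print "Minimal imputation: %5.3f" % CAs[0]
print "Average imputation: %5.3f" % CAs[1]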
# Description: Demonstrates the use of discretization
# Category: discretization
# Classes: entropyDiscretization, DiscretizedLearner
# Uses: iris.tab

import orange
import orngDisc

data = orange.ExampleTable("iris.tab")
disc_data = orngDisc.entropyDiscretization(data)

disc_learner = orngDisc.DiscretizedLearner(orange.BayesLearner(), name="disc-bayes")
learner = orange.BayesLearner(name="bayes")
learners = [learner, disc_learner]

import orngTest, orngStat
results = orngTest.crossValidation(learners, data)
print "Classification Accuracy:"
for i in range(len(learners)):
    print "%15s: %5.3f" % (learners[i].name, orngStat.CA(results)[i])
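# A short inspection sketch: list the entropy-derived intervals the
# discretization produced in disc_data above:
for attr in disc_data.domain.attributes:
    print attr.name, [str(v) for v in attr.values]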
resList = [str(CM[0][0]), str(CM[0][1]), str(CM[1][0]), str(CM[1][1]),
           str(CA), str(MCC)]
wrtStr = string.join(resList, "\t")
print "nonIID test set results"
print wrtStr

# CV accuracy
res = orngTest.crossValidation(
    [learner], data,
    strat=orange.MakeRandomIndices.StratifiedIfPossible, folds=10)
CM = evalUtilities.ConfMat(res)[0]
CA = round(orngStat.CA(res)[0], 3)
MCC = round(evalUtilities.calcMCC(CM), 3)
# TH, FL, FH, TL
resList = [str(CM[0][0]), str(CM[0][1]), str(CM[1][0]), str(CM[1][1]),
           str(CA), str(MCC)]
wrtStr = string.join(resList, "\t")
import orange, orngWrap, orngTest, orngStat

data = orange.ExampleTable("bupa")

learner = orange.BayesLearner()
thresh = orngWrap.ThresholdLearner(learner=learner)
thresh80 = orngWrap.ThresholdLearner_fixed(learner=learner, threshold=.8)
res = orngTest.crossValidation([learner, thresh, thresh80], data)
CAs = orngStat.CA(res)
print "W/out threshold adjustment: %5.3f" % CAs[0]
print "With adjusted threshold:    %5.3f" % CAs[1]
print "With threshold at 0.80:     %5.3f" % CAs[2]
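# Inspection sketch: train the threshold-adjusting learner once and read
# the threshold it picked (reading the classifier's .threshold attribute
# is an assumption about orngWrap's threshold classifier):
classifier = thresh(data)
print "chosen threshold: %5.3f" % classifier.threshold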
def buildModel(trainData, MLMethod, queueType="NoSGE", verbose=0, logFile=None):
    """
    Build the method passed in MLMethod and optimize it
    ("IndividualStatistics" not in MLMethod).
    If MLMethod is a Consensus ("IndividualStatistics" in MLMethod),
    build and optimize all individual models first, and then build
    the consensus!
    """
    log(logFile, "Building and optimizing learner: " + MLMethod["MLMethod"] + "...")
    learners = {}
    MLMethods = {}
    if "IndividualStatistics" in MLMethod:
        # It is a consensus and will certainly not contain any
        # special model, as those were filtered in getUnbiasedAcc
        for ML in MLMethod["IndividualStatistics"]:
            MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML])
    else:
        ML = MLMethod["MLMethod"]
        if MLMETHODS[ML](name=ML).specialType == 1:
            # It is a special model and has a built-in optimizer
            log(logFile, "       This is a special model")
            smilesAttr = dataUtilities.getSMILESAttr(trainData)
            if smilesAttr:
                log(logFile, "Found SMILES attribute:" + smilesAttr)
                trainData = dataUtilities.attributeSelectionData(
                    trainData, [smilesAttr, trainData.domain.classVar.name])
            optInfo, SpecialModel = MLMETHODS[ML](name=ML).optimizePars(trainData, folds=5)
            return SpecialModel
        else:
            MLMethods[MLMethod["MLMethod"]] = MLMethod

    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])

    # optimize all MLMethods
    for ML in MLMethods:
        log(logFile, "    Optimizing MLmethod: " + ML)
        learners[ML] = MLMETHODS[ML](name=ML)

        runPath = miscUtilities.createScratchDir(
            baseDir=AZOC.NFS_SCRATCHDIR, desc="competitiveWorkflow_BuildModel")
        trainData.save(os.path.join(runPath, "trainData.tab"))

        tunedPars = paramOptUtilities.getOptParam(
            learner=learners[ML],
            trainDataFile=os.path.join(runPath, "trainData.tab"),
            useGrid=False,
            verbose=verbose,
            queueType=queueType,
            runPath=runPath,
            nExtFolds=None,
            logFile=logFile,
            getTunedPars=True)

        if not learners[ML].optimized:
            print "WARNING: competitiveWorkflow: The learner " + \
                  str(learners[ML]) + " was not optimized."
            #print "         Using default parameters"
            print "         The " + str(learners[ML]) + " will not be included"
            #print "         Returning None"
            print "         DEBUG can be made in: " + runPath
            # Setting default parameters
            #learners[ML] = learners[ML].__class__()
            #return None
            learners.pop(ML)
            continue
        else:
            print "Optimized learner ", learners[ML]

        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
            MLMethods[ML]["optAcc"] = tunedPars[0]
        else:
            res = orngTest.crossValidation(
                [learners[ML]], trainData, folds=5,
                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=random.randint(0, 100))
            R2 = evalUtilities.R2(res)[0]
            MLMethods[ML]["optAcc"] = R2
        miscUtilities.removeDir(runPath)

    # Train the model
    if len(learners) == 1:
        log(logFile, "  Building the model:" + learners.keys()[0])
        model = learners[learners.keys()[0]](trainData)
    elif len(learners) >= 1:
        model = buildConsensus(trainData, learners, MLMethods)
    else:
        print "ERROR: No Learners were selected!"
        return None

    return model
def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None):
    """
    For regression problems, it returns the RMSE and the Q2.
    For classification problems, it returns CA and the ConfMat.
    The return is made in a dict:
        {"RMSE":0.2, "Q2":0.1, "CA":0.98, "CM":[[TP, FP], [FN, TN]]}
    For the EvalResults not supported for a specific learner/dataset,
    the respective result will be None.

    If the learner is a dict {"LearnerName": learner, ...}, the results
    will be a dict with results for all learners and for a consensus
    made out of those that were stable.

    If some error occurred, the respective values in the dict will be None.

    Parameters:
        algorithm - list of feature generation algorithms (set dependent
                    features that have to be calculated inside the
                    cross-validation)
        params    - dictionary of parameters
        atts      - attributes to be removed before learning (e.g. meta etc.)
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None

    if holdout:
        self.nExtFolds = 1

    if algorithm:
        self.__log(" Additional features to be calculated inside of cross-validation")
        for i in algorithm:
            self.__log(" Algorithm: " + str(i))
        for j, v in params.iteritems():
            self.__log(" Parameter: " + str(j) + " = " + str(v))

    # Set the response type
    self.responseType = (
        self.data.domain.classVar.varType == orange.VarTypes.Discrete
        and "Classification" or "Regression")
    self.__log("  " + str(self.responseType))

    # Create the train and test sets
    if holdout:
        self.__log("Using hold out evaluation with " + str(holdout) +
                   "*100 % of data for training")
        DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout)
    else:
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

    # Vars for saving each fold's result
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    rocs = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log("  " + str([x for x in MLmethods]))

    # Check data in advance so that, by chance, it will not fail at the last fold!
    for foldN in range(self.nExtFolds):
        trainData = self.data.select(DataIdxs[foldN], negate=1)
        self.__checkTrainData(trainData)  # Optional!!

    # Order learners so that PLS is the first
    sortedML = [ml for ml in MLmethods]
    if "PLS" in sortedML:
        sortedML.remove("PLS")
        sortedML.insert(0, "PLS")

    stepsDone = 0
    nTotalSteps = len(sortedML) * self.nExtFolds
    for ml in sortedML:
        self.__log("    > " + str(ml) + "...")
        try:
            # Vars for saving each fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            rocs[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN], negate=1)
                orig_len = len(trainData.domain.attributes)
                refs = None
                methods = ["rdk_MACCS_keys", "rdk_topo_fps", "rdk_morgan_fps",
                           "rdk_morgan_features_fps", "rdk_atompair_fps"]
                train_domain = None
                # add structural descriptors to the training data (TG)
                if algorithm:
                    for i in range(len(algorithm)):
                        if algorithm[i] == "structClust":
                            self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                            actData = orange.ExampleTable(trainData.domain)
                            for d in trainData:
                                # only valid for simboosted qsar paper experiments!?
                                if d.getclass() == "2":
                                    actData.append(d)

                            refs = structuralClustering.getReferenceStructures(
                                actData,
                                threshold=params["threshold"],
                                minClusterSize=params["minClusterSize"],
                                numThreads=2)
                            self.__log(" found " + str(len(refs)) +
                                       " reference structures in " +
                                       str(len(actData)) + " active structures")
                            orig_len = orig_len + (len(refs) * len(methods))
                            trainData_sim = SimBoostedQSAR.getSimDescriptors(
                                refs, trainData, methods)
                            if i == (len(algorithm) - 1):
                                trainData = dataUtilities.attributeDeselectionData(
                                    trainData_sim, atts)
                            else:
                                trainData = dataUtilities.attributeDeselectionData(
                                    trainData_sim, [])
                        elif algorithm[i] == "ECFP":
                            self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                            trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(
                                trainData, ["rdk.FingerPrints"])
                            train_domain = trainData_ecfp.domain
                            if i == (len(algorithm) - 1):
                                trainData = dataUtilities.attributeDeselectionData(
                                    trainData_ecfp, atts)
                            else:
                                trainData = dataUtilities.attributeDeselectionData(
                                    trainData_ecfp, [])
                        else:
                            self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                            trainData_structDesc = getStructuralDesc.getStructuralDescResult(
                                trainData, algorithm[i], params["minsup"])
                            if i == (len(algorithm) - 1):
                                trainData = dataUtilities.attributeDeselectionData(
                                    trainData_structDesc, atts)
                            else:
                                trainData = dataUtilities.attributeDeselectionData(
                                    trainData_structDesc, [])
                # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab")

                testData = self.data.select(DataIdxs[foldN])
                # calculate the feature values for the test data (TG)
                if algorithm:
                    for i in range(len(algorithm)):
                        if algorithm[i] == "structClust":
                            self.__log(str(algorithm[i]))
                            testData_sim = SimBoostedQSAR.getSimDescriptors(
                                refs, testData, methods)
                            if i == (len(algorithm) - 1):
                                testData = dataUtilities.attributeDeselectionData(
                                    testData_sim, atts)
                            else:
                                testData = dataUtilities.attributeDeselectionData(
                                    testData_sim, [])
                        elif algorithm[i] == "ECFP":
                            self.__log(str(algorithm[i]))
                            # testData_ecfp = orange.ExampleTable(train_domain)
                            tmp_dat = []
                            for d in testData:
                                tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d)
                                tmp_dat.append(tmp)
                            testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat)
                            if i == (len(algorithm) - 1):
                                # print "removing atts"
                                testData = dataUtilities.attributeDeselectionData(
                                    testData_ecfp, atts)
                            else:
                                # print "removing no atts"
                                testData = dataUtilities.attributeDeselectionData(
                                    testData_ecfp, [])
                        else:
                            cut_off = orig_len - len(atts)
                            smarts = trainData.domain.attributes[cut_off:]
                            self.__log("  Number of structural features added: " +
                                       str(len(smarts)))
                            testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(
                                testData, smarts)
                            if i == (len(algorithm) - 1):
                                testData = dataUtilities.attributeDeselectionData(
                                    testData_structDesc, atts)
                            else:
                                testData = dataUtilities.attributeDeselectionData(
                                    testData_structDesc, [])
                # testData.save("/home/girschic/proj/AZ/ProjDev/test.tab")

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                # Test if train sets inside the optimizer will respect the dataSize criteria;
# if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) # self.__log(" run path:"+str(runPath)) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." ) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: print "Unexpected error:", print sys.exc_info()[0] print sys.exc_info()[1] self.__log(" Learner " + str(ml) + " failed to create/optimize the 
model!") res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) if len(consensusMLs) >= 2: # Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]) ) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0, CLASS1) expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) # Test the model if self.responseType == "Classification": Cresults.append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics # By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
# Description: Test of naive Bayesian classifier with entropy-based discretization (as defined in nbdisc.py)
# Category:    modelling
# Uses:        iris.tab
# Classes:     orngTest.crossValidation, orngStat.CA
# Referenced:  c_nb_disc.htm

import orange, orngTest, orngStat, nbdisc

data = orange.ExampleTable("iris")
results = orngTest.crossValidation([nbdisc.Learner()], data, folds=10)
print "Accuracy = %5.3f" % orngStat.CA(results)[0]
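# nbdisc is imported above but not shown in this document. A minimal sketch of
# what such a module could contain (an assumption, not the actual nbdisc.py):
# a learner that entropy-discretizes continuous attributes, then fits naive Bayes.
import orange

class Learner:
    def __init__(self, name="bayes+entropy-disc"):
        self.name = name

    def __call__(self, data, weight=0):
        # Replace continuous attributes by entropy-discretized (Fayyad-Irani)
        # versions; the derived attributes keep getValueFrom, so examples from
        # the original domain are converted automatically at prediction time.
        discData = orange.Preprocessor_discretize(data, method=orange.EntropyDiscretization())
        return orange.BayesLearner(discData, weight)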
data = orange.ExampleTable("features.tab") folds = 10 k = 150 cv = CrossVal(data, folds, k) cv.run_kNN() cv.printCA() # Builtin ClossValidator, with several classifiers, only used for testing early on if False: # set up the learners bayes = orange.BayesLearner() tree = orngTree.TreeLearner(mForPruning=2) knn = orange.kNNLearner(k=k) bayes.name = "bayes" tree.name = "tree" knn.name = "knn" learners = [bayes, tree, knn] # compute accuracies on data data = orange.ExampleTable("features.tab") # Create a crossvalidation on the sampleset so that you don't classify it's own data results = orngTest.crossValidation(learners, data, folds=10) # output the results print "Learner \tAccuracy" for i in range(len(learners)): print "%-8s\t%5.3f%%" % (learners[i].name, orngStat.CA(results)[i]*100)
# Description: Demonstrates the use of classification scores
# Category:    evaluation
# Uses:        voting.tab
# Referenced:  orngStat.htm

import orange, orngTest, orngTree

learners = [orange.BayesLearner(name="bayes"),
            orngTree.TreeLearner(name="tree"),
            orange.MajorityLearner(name="majrty")]

voting = orange.ExampleTable("voting")
res = orngTest.crossValidation(learners, voting)

vehicle = orange.ExampleTable("vehicle")
resVeh = orngTest.crossValidation(learners, vehicle)

import orngStat

CAs = orngStat.CA(res)
APs = orngStat.AP(res)
Briers = orngStat.BrierScore(res)
ISs = orngStat.IS(res)

print
print "method\tCA\tAP\tBrier\tIS"
for l in range(len(learners)):
    print "%s\t%5.3f\t%5.3f\t%5.3f\t%6.3f" % (learners[l].name, CAs[l], APs[l], Briers[l], ISs[l])

# With reportSE=True each score is returned as a (CA, standard error) tuple
CAs = orngStat.CA(res, reportSE=True)
print
print "method\tCA\t(SE)"
for l in range(len(learners)):
    print "%s\t%5.3f\t(%5.3f)" % (learners[l].name, CAs[l][0], CAs[l][1])
# Classes:    orngTest.crossValidation, orngTree.TreeLearner, orange.kNNLearner, orngRegression.LinearRegressionLearner
# Referenced: regression.htm

import orange
import orngRegression
import orngTree
import orngStat, orngTest

data = orange.ExampleTable("housing")

# definition of learners (regressors)
lr = orngRegression.LinearRegressionLearner(name="lr")
rt = orngTree.TreeLearner(measure="retis", mForPruning=2, minExamples=20, name="rt")
maj = orange.MajorityLearner(name="maj")
knn = orange.kNNLearner(k=10, name="knn")
learners = [maj, lr, rt, knn]

# evaluation and reporting of scores
results = orngTest.crossValidation(learners, data, folds=10)
scores = [("MSE", orngStat.MSE),
          ("RMSE", orngStat.RMSE),
          ("MAE", orngStat.MAE),
          ("RSE", orngStat.RSE),
          ("RRSE", orngStat.RRSE),
          ("RAE", orngStat.RAE),
          ("R2", orngStat.R2)]

print "Learner  " + "".join(["%-7s" % s[0] for s in scores])
for i in range(len(learners)):
    print "%-8s " % learners[i].name + "".join(["%6.3f " % s[1](results)[i] for s in scores])
def classificationAccuracy(self, histOrImgs, labels=None, confThr=0.0, peakThr=None,
                           edgeThr=None, nu=0.6, gamma=2.0, doCrossVal=False):
    """
    Classify test data and (optionally) perform a cross validation on the training data.

    @param histOrImgs: Either a list of SIFT descriptor arrays or an iterator of images.
    @type histOrImgs: numpy.ndarray or [numpy.ndarray] or Image.Image or [Image.Image]
    @param labels: A list of labels corresponding to the list of descriptors/images.
    @type labels: [string]
    @param confThr: Classifications with a confidence lower than this threshold
        are rejected. 1.0: everything is rejected, 0.0: nothing is rejected.
    @type confThr: float
    @param peakThr: A SIFT parameter. Sensible values: 0.0 < x < 30.0.
    @type peakThr: float
    @param edgeThr: A SIFT parameter. Sensible values: 0.0 < x < 10.0.
    @type edgeThr: float
    @param nu: The S{nu}-parameter of the support vector machine.
    @type nu: float
    @param gamma: The S{gamma}-parameter of the RBF-Kernel.
    @type gamma: float
    @param doCrossVal: Whether to also run a 10-fold cross validation on the training data.
    @type doCrossVal: bool
    @rtype: (float, float)
    @return: The cross validation accuracy and the test data classification accuracy.
    """
    if self.learner is None:
        raise ValueError("Learner has to be loaded before classification can be done.")

    # Set SIFT member variables (so they get stored in the DB if requested)
    if peakThr is None:
        self.peakThreshold = self.learner.peakThreshold
    else:
        self.peakThreshold = peakThr
    if edgeThr is None:
        self.edgeThreshold = self.learner.edgeThreshold
    else:
        self.edgeThreshold = edgeThr

    # If we've been given an images iterator, extract features and vector quantize
    if isinstance(histOrImgs, collections.Iterator):
        if labels is None:
            raise ValueError("If argument 'histOrImgs' is an iterator of images, "
                             "the argument 'labels' must not be None.")
        desc, self.numTestDesc = im.extractFeatures(histOrImgs, self.peakThreshold, self.edgeThreshold)
        recognosco.logger.info("Found %i features/image on average.", self.numTestDesc / len(desc))
        tmpHistograms = _buildHistograms(self.learner.codebook, desc)
        histograms = _convertToOrangeDataSet(tmpHistograms, self.learner.domain, labels)
    else:
        histograms = histOrImgs

    values = histograms.domain.classVar.values
    self.values = values
    length = len(values)
    self.confusion = numpy.zeros((length, length), int)

    starttime = time.time()
    self.nu = nu
    self.gamma = gamma
    svm = orange.SVMLearner()
    svm.svm_type = orange.SVMLearner.Nu_SVC
    svm.nu = nu
    svm.gamma = gamma
    svm.kernel_type = orange.SVMLearner.RBF
    svm.probability = True
    recognosco.logger.debug("Training Support Vector Machine...")
    self.classifier = svm(self.learner.histograms)
    recognosco.logger.debug("Done...")

    crossVal = 0.0
    if doCrossVal:
        crossVal = orngTest.crossValidation([svm], self.learner.histograms, folds=10)

    numCorrectClassified = 0.0
    numClassified = 0.0
    for i in range(len(histograms)):
        c = self.classifier(histograms[i])
        recognosco.logger.info("Has the class: %s", histograms[i].getclass())
        recognosco.logger.info("Classified as: %s", c)
        prob = self.classifier(histograms[i], self.classifier.GetProbabilities)
        conf = self.__getConfidence(prob)
        recognosco.logger.info("Confidence: %f", conf)
        if conf < confThr:
            recognosco.logger.info("Rejected classification (Threshold: %.2f)", confThr)
            continue
        numClassified += 1.0
        predicted = values.index(str(c))
        actual = values.index(str(histograms[i].getclass()))
        self.confusion[predicted][actual] += 1
        if c == histograms[i].getclass():
            numCorrectClassified += 1.0
    endtime = time.time()

    self.confusion = str(self.confusion)  # store the confusion matrix in string form
    self.clAccuracy = numCorrectClassified / numClassified
    self.fracClassified = numClassified / len(histograms)
    if doCrossVal:
        self.cvAccuracy = orngStat.CA(crossVal)[0]
        recognosco.logger.info("Cross validation accuracy: %s", self.cvAccuracy)
    else:
        self.cvAccuracy = -1.0
    recognosco.logger.info("Classification accuracy of test data: %s", self.clAccuracy)
    self.testTime = endtime - starttime
    return (self.cvAccuracy, self.clAccuracy)
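# A hedged usage sketch for classificationAccuracy above. The Recognizer class
# name, the loadLearner call, and the test-image variables are assumptions for
# illustration only; the surrounding API is not shown in this document.
rec = Recognizer()
rec.loadLearner("codebook.db")   # the learner must be loaded first, else ValueError
cvAcc, testAcc = rec.classificationAccuracy(iter(testImages), labels=testLabels,
                                            confThr=0.5,  # reject below 50% confidence
                                            nu=0.6, gamma=2.0, doCrossVal=True)
print "CV accuracy: %.3f, test accuracy: %.3f" % (cvAcc, testAcc)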