def readCross(num,type,numtrees): filename=resultFile+'_'+type+'_'+num+'_all.csv' loader=CSVLoader() loader.setSource(File(filename)) data=loader.getDataSet() #print data.numAttributes() data.setClassIndex(data.numAttributes()-1) rf=RF() rf.setNumTrees(numtrees) #pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) buffer = StringBuffer() # buffer for the predictions output=PlainText() output.setHeader(data) output.setBuffer(buffer) output.setOutputDistribution(True) attRange = Range() # attributes to output outputDistributions = Boolean(True) evaluator=Evaluation(data) evaluator.crossValidateModel(rf,data,10, Random(1),[output,attRange,outputDistributions]) print evaluator.toSummaryString() print evaluator.toClassDetailsString() print evaluator.toMatrixString() return [evaluator.weightedPrecision(),evaluator.weightedRecall(),evaluator.weightedFMeasure(),evaluator.weightedMatthewsCorrelation(),evaluator.weightedFalseNegativeRate(),evaluator.weightedFalsePositiveRate(),evaluator.weightedTruePositiveRate(),evaluator.weightedTrueNegativeRate(),evaluator.weightedAreaUnderROC()]
def myGridSearch(data,RBound,MBound): bestlogistic = None best_acc = -float('inf') class bestValues(object): m = float('nan') r = float('nan') for r in range(RBound[0],RBound[1]+RBound[2],RBound[2]): for m in range(MBound[0],MBound[1]+MBound[2],MBound[2]): logistic = Logistic() logistic.setMaxIts(int(m)) logistic.setRidge(pow(10,r)) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(logistic,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestlogistic = logistic best_acc = acc bestValues.m = int(m) bestValues.r = pow(10,r) print "Best accuracy: ", best_acc print "Best values: M = ", bestValues.m, ", Ridge = ", bestValues.r print "-----------------------------------------" return bestlogistic, bestValues.r, bestValues.m, best_acc
def myGridSearch(data,NTreeBounds,NFeaturesBounds): best_acc = -float('inf') bestrandomforest = None class bestValues(object): t = float('nan') f = float('nan') for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]): for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]): randomforest = RandomForest() randomforest.setNumTrees(int(t)) randomforest.setNumFeatures(int(f)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestrandomforest = randomforest best_acc = acc bestValues.t = t bestValues.f = f print "Best accuracy:", best_acc print "Best values: NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f print "-----------------------------------------" return bestrandomforest, bestValues.t, bestValues.f, best_acc
def Logistic_ParamFinder(data): # Possible set for Ridge-value RBounds = [-10,2,1] # possible set for maximum Iteration MBounds = [-1,10,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) logistic = Logistic() gridsearch.setClassifier(logistic) gridsearch.setXProperty(String('classifier.maxIts')) gridsearch.setYProperty(String('classifier.ridge')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('pow(BASE,I)')) gridsearch.setXMin(MBounds[0]) gridsearch.setXMax(MBounds[1]) gridsearch.setXStep(MBounds[2]) gridsearch.setYMin(RBounds[0]) gridsearch.setYMax(RBounds[1]) gridsearch.setYStep(RBounds[2]) gridsearch.setYBase(10) print "searching for logistic lcassifier Max Iteration = [", MBounds[0], ",", MBounds[1], "], Ridge = [ 10E", RBounds[0], ",10E", RBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestlogistic = Logistic() bestlogistic.setMaxIts(int(bestValues.x)) bestlogistic.setRidge(pow(10,bestValues.y)) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestlogistic,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best logistic classifier with Ridge = ", bestlogistic.getRidge(), " Max Iteration = ", bestlogistic.getMaxIts() OptLog = bestlogistic OptLogp1 = bestlogistic.getRidge() OptLogp2 = bestlogistic.getMaxIts() OptLogAcc = acc else: OptLog, OptLogp1, OptLogp2, OptLogAcc = myGridSearch(data,RBounds,MBounds) Description = 'Logistic classifier OptRidge = ' + str(OptLogp1) + \ ', OptMaxIts = ' + str(OptLogp2) + ', OptAcc = ' + str(OptLogAcc) print "-----------------------------------------" return OptLog, OptLogp1, OptLogp2, OptLogAcc, Description
def RandomForest_ParamFinder(data): # possible set for Number of trees NTreeBounds = [1,20,1] # possible set for number of features NFeaturesBounds = [0,20,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) randomforest = RandomForest() gridsearch.setClassifier(randomforest) gridsearch.setXProperty(String('classifier.numTrees')) gridsearch.setYProperty(String('classifier.numFeatures')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(NTreeBounds[0]) gridsearch.setXMax(NTreeBounds[1]) gridsearch.setXStep(NTreeBounds[2]) gridsearch.setYMin(NFeaturesBounds[0]) gridsearch.setYMax(NFeaturesBounds[1]) gridsearch.setYStep(NFeaturesBounds[2]) gridsearch.setYBase(10) print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestrandomforest = RandomForest() bestrandomforest.setNumTrees(int(bestValues.x)) bestrandomforest.setNumFeatures(int(bestValues.y)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y OptRndFrst = bestrandomforest OptRndFrstp1 = bestValues.x OptRndFrstp2 = bestValues.y OptRndFrstAcc = acc else: OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \ ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc) print "-----------------------------------------" return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
# Benchmark IBk (k-nearest-neighbours) over several neighbour-search
# structures and odd values of K, timing build/evaluation and recording RMSE.
# NOTE(review): relies on `tree_algorithms`, `data`, `file`, `log` and `rand`
# defined earlier in the file (outside this fragment) — confirm before reuse.
cover = CoverTree()
cover.setDistanceFunction(EuclideanDistance()) # only Euclidean Distance function
tree_algorithms.append(cover)
# the dependent variable is the last attribute
data.setClassIndex(data.numAttributes() - 1)
for num in range(1,30,2):  # odd K from 1 to 29
    file.write(str(num))
    for algoknn in tree_algorithms :
        log.write("---------------------------------\nK: " + str(num) + ", Search Algorithm: " + algoknn.__class__.__name__ + "\n")
        algo = IBk()
        algo.setNearestNeighbourSearchAlgorithm(algoknn)
        algo.setKNN(num)
        x = time.time()  # wall-clock timing of model construction
        algo.buildClassifier(data)
        log.write("Time to build classifier: " + str(time.time() - x) + "\n")
        evaluation = Evaluation(data)
        output = PlainText() # plain text output for predictions
        output.setHeader(data)
        buffer = StringBuffer() # buffer to use
        output.setBuffer(buffer)
        attRange = Range() # no additional attributes output
        outputDistribution = Boolean(False) # we don't want distribution
        x = time.time()  # wall-clock timing of 10-fold cross-validation
        #evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
        evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
        log.write("Time to evaluate model: " + str(time.time() - x) + "\n")
        log.write(evaluation.toSummaryString())
        # one RMSE column per search algorithm, one CSV row per K
        file.write("," + str(evaluation.rootMeanSquaredError()))
    file.write("\n")
file.close()
log.close()
# Report section: prints the confusion matrix, then demonstrates pulling a
# single statistic (kappa) from every stored evaluation, then re-runs 10-fold
# CV for each classifier.
# NOTE(review): relies on `confusion_matrix`, `algo_keys`, `my_evaluations`,
# `algo_dict` and `data` defined earlier in the file — confirm before reuse.
print "Confusion Matrix:"
for l in confusion_matrix:
    print '** ', ','.join('%2d' % int(x) for x in l)
# example to collect an individual statistic for all evaluated classifiers
print "------------------------------------"
print "Example to collect an individual statistic for all evaluated classifiers"
print "Kappa"
for index in range(len(algo_keys)):
    evaluation = my_evaluations[index]
    key = algo_keys[index]
    algo = algo_dict[key]
    print algo.__class__.__name__ + ": " + str(evaluation.kappa())
# Example K fold cross validate model against training data
# NOTE: This should be done against test data not training data.
print "Cross validation with 10 folds"
for index in range(len(algo_keys)):
    evaluation = my_evaluations[index]
    key = algo_keys[index]
    algo = algo_dict[key]
    output = PlainText() # plain text output for predictions
    output.setHeader(data)
    buffer = StringBuffer() # buffer to use
    output.setBuffer(buffer)
    rand = Random(1)  # fixed seed so folds are reproducible
    attRange = Range() # no additional attributes output
    outputDistribution = Boolean(False) # we don't want distribution
    evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
def myGridSearch(data,cBounds,GBound,eBounds):
    """Manual grid search for an SMO classifier over two kernels.

    Scans C x exponent for a PolyKernel and C x gamma-exponent (gamma =
    10^g) for an RBFKernel, scoring each candidate by cross-validated
    accuracy, then returns whichever kernel family won.

    cBounds is a list of [min, max, step] C ranges; eBounds / GBound are
    single [min, max, step] ranges for the exponent / gamma exponent.

    Returns (is RBF best, best classifier, best C, best exponent-or-gamma,
    best accuracy).
    """
    IsBestRBFKernel = False
    best_acc_poly = -float('inf')
    best_acc_rbf = -float('inf')
    # Poly Kernel
    class bestValues_poly(object):
        # mutable holder for the winning (C, exponent) pair
        x = float('nan')
        y = float('nan')
    for Cbnd in cBounds:
        for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]):
            for e in range(eBounds[0],eBounds[1]+eBounds[2],eBounds[2]):
                smo = SMO()
                kernel = PolyKernel()
                kernel.setExponent(e)
                smo.setC(c)
                smo.setKernel(kernel)
                evaluation = Evaluation(data)
                output = util.get_buffer_for_predictions()[0]
                attRange = Range() # no additional attributes output
                outputDistribution = Boolean(False) # we don't want distribution
                random = Random(1)
                # cap folds at the instance count so tiny datasets still work
                numFolds = min(10,data.numInstances())
                evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution])
                acc = evaluation.pctCorrect()
                if (acc>best_acc_poly):
                    best_smo_poly = smo
                    best_acc_poly = acc
                    bestValues_poly.x = c
                    bestValues_poly.y = e
    print "Best accuracy (Poly Kernel): ", best_acc_poly
    print "Best values (Poly Kernel): C = ", bestValues_poly.x, ", exponent = ", bestValues_poly.y
    print "-----------------------------------------"
    # RBF Kernel
    class bestValues_rbf(object):
        # mutable holder for the winning (C, gamma-exponent) pair
        x = float('nan')
        y = float('nan')
    for Cbnd in cBounds:
        for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]):
            for g in range(GBound[0],GBound[1]+GBound[2],GBound[2]):
                smo = SMO()
                kernel = RBFKernel()
                kernel.setGamma(pow(10,g))  # gamma searched on a log10 scale
                smo.setC(c)
                smo.setKernel(kernel)
                evaluation = Evaluation(data)
                output = util.get_buffer_for_predictions()[0]
                attRange = Range() # no additional attributes output
                outputDistribution = Boolean(False) # we don't want distribution
                random = Random(1)
                numFolds = min(10,data.numInstances())
                evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution])
                acc = evaluation.pctCorrect()
                if (acc>best_acc_rbf):
                    best_smo_rbf = smo
                    best_acc_rbf = acc
                    bestValues_rbf.x = c
                    bestValues_rbf.y = g  # stored as the exponent, not gamma itself
    print "Best accuracy (RBF Kernel): ", best_acc_rbf
    print "Best values (RBF Kernel): C = ", bestValues_rbf.x, ", gamma = ", bestValues_rbf.y
    # pick the better of the two kernel families
    if (best_acc_rbf > best_acc_poly):
        IsBestRBFKernel = True
        print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x," and gamma = ", pow(10,bestValues_rbf.y)
        best_smo = best_smo_rbf
        OptSMOp1 = bestValues_rbf.x
        OptSMOp2 = pow(10,bestValues_rbf.y)  # convert exponent back to gamma
        OptSMOAcc = best_acc_rbf
        OptSMOIsRBF = IsBestRBFKernel
    else:
        IsBestRBFKernel = False
        print "best smo classifier is Poly kernel with C = ", bestValues_poly.x," and exponent = ", bestValues_poly.y
        best_smo = best_smo_poly
        OptSMOp1 = bestValues_poly.x
        OptSMOp2 = bestValues_poly.y
        OptSMOAcc = best_acc_poly
        OptSMOIsRBF = IsBestRBFKernel
    return IsBestRBFKernel, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc
def SMO_ParamFinder(data): # Possible set for C-value cBounds = [[1,10,1],[10,100,10],[100,300,20]] # possible set for exponents eBounds = [1,3,1] # possible set for Gamma GBound = [-5,2,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 # Polynomials Kernel gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) smo = SMO() kernel = PolyKernel() smo.setKernel(kernel) gridsearch.setClassifier(smo) gridsearch.setXProperty(String('classifier.c')) gridsearch.setYProperty(String('classifier.kernel.Exponent')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) best_acc_poly = -float('inf') for cnt in range(0,len(cBounds)): cbound = cBounds[cnt] cmin = cbound[0] cmax = cbound[1] cstep = cbound[2] gridsearch.setXMin(cmin) gridsearch.setXMax(cmax) gridsearch.setXStep(cstep) gridsearch.setYMin(eBounds[0]) gridsearch.setYMax(eBounds[1]) gridsearch.setYStep(eBounds[2]) print "searching for Polykernel C = [", cmin, ",", cmax, "], exponent = [", eBounds[0], ",", eBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # --------------------------------- Evaluation bestsmo = SMO() kernel = PolyKernel() kernel.setExponent(bestValues.y) bestsmo.setC(bestValues.x) bestsmo.setKernel(kernel) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) print "numFolds : ", numFolds evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc_poly): best_smo_poly = bestsmo best_acc_poly = acc bestValues_poly = bestValues print "Best accuracy so far: ",best_acc_poly print "Best values so far: ",bestValues_poly print "Best accuracy (Poly Kernel): ", best_acc_poly print "Best values (Poly Kernel): ", bestValues_poly print "-----------------------------------------" # RBF Kernel smo = SMO() kernel = RBFKernel() smo.setKernel(kernel) gridsearch.setClassifier(smo) gridsearch.setXProperty(String('classifier.c')) gridsearch.setYProperty(String('classifier.kernel.gamma')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('pow(BASE,I)')) gridsearch.setYBase(10) best_acc_rbf = -float('inf') for cnt in range(0,len(cBounds)): cbound = cBounds[cnt] cmin = cbound[0] cmax = cbound[1] cstep = cbound[2] gridsearch.setXMin(cmin) gridsearch.setXMax(cmax) gridsearch.setXStep(cstep) gridsearch.setYMin(GBound[0]) gridsearch.setYMax(GBound[1]) gridsearch.setYStep(GBound[2]) gridsearch.setYBase(10) print "searching for RBF Kernel C = [", cmin, ",", cmax, "], gamma = [10^", GBound[0], ",10^", GBound[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------------------- Evaluation bestsmo = SMO() kernel = RBFKernel() kernel.setGamma(pow(10,bestValues.y)) bestsmo.setC(bestValues.x) bestsmo.setKernel(kernel) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc_rbf): best_smo_rbf = bestsmo best_acc_rbf = acc bestValues_rbf = bestValues print "Best accuracy so far: ",best_acc_rbf print "Best values so far: ",bestValues_rbf print "Best accuracy (RBF Kernel): ", best_acc_rbf print "Best values (RBF Kernel): ", bestValues_rbf print "-----------------------------------------" if (best_acc_rbf > best_acc_poly): IsBestRBFKernel = True print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x, " and gamma = ", pow(10,bestValues.y) best_smo = best_smo_rbf OptSMOp1 = bestValues_rbf.x OptSMOp2 = pow(10,bestValues.y) OptSMOAcc = best_acc_rbf OptSMOIsRBF = IsBestRBFKernel else: IsBestRBFKernel = False print "best smo classifier is Poly kernel with C = ", bestValues_poly.x, " and exponent = ", bestValues_poly.y best_smo = best_smo_poly OptSMOp1 = bestValues_poly.x OptSMOp2 = bestValues_poly.y OptSMOAcc = best_acc_poly OptSMOIsRBF = IsBestRBFKernel else: # we have very small ssample size OptSMOIsRBF, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc = myGridSearch(data,cBounds,GBound,eBounds) if OptSMOIsRBF: Description = 'SMO classifier(RBF kernel): OptC=' + str(OptSMOp1) + \ ', OptGamma=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc) else: Description = 'SMO classifier(Poly kernel): OptC=' + str(OptSMOp1) + \ ', OptExponent=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc) return OptSMOIsRBF, 
best_smo, OptSMOp1, OptSMOp2, OptSMOAcc, Description
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ If <test_filename>
            Run classifier algorithm <algo> on training data in <training_filename> to build a model
            then test on data in <test_filename> (equivalent of Weka "Supplied test set")
        else
            do cross-validation of classifier algorithm <algo> on data in <training_filename>
            (NOTE(review): the code below uses 4 folds, not 10 — confirm which is intended)
        <class_index> is the column containing the dependent variable

        Returns a dict with 'model', 'eval' and 'predict' string renderings.

        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    print ' runClassifierAlgo: training_filename= ', training_filename, ', test_filename=', test_filename
    misc.checkExists(training_filename)
    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    if test_filename:
        test_file = FileReader(test_filename)
        test_data = Instances(test_file)
    else:
        # no separate test set: evaluate against the training data itself
        test_data = training_data
    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)
    # create the model (only needed for the supplied-test-set path;
    # crossValidateModel trains its own per-fold copies)
    if test_filename:
        algo.buildClassifier(training_data)
    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer() # buffer for the predictions
        attRange = Range() # no additional attributes output
        outputDistribution = Boolean(False) # we don't want distribution
        if test_filename:
            evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])
        else:
            # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')])
            # print evaluation.toSummaryString()
            rand = Random(1)  # fixed seed so folds are reproducible
            evaluation.crossValidateModel(algo, training_data, 4, rand)
    # debugging block, disabled by default
    if False:
        print 'percentage correct =', evaluation.pctCorrect()
        print 'area under ROC =', evaluation.areaUnderROC(class_index)
        confusion_matrix = evaluation.confusionMatrix()
        for l in confusion_matrix:
            print '** ', ','.join('%2d'%int(x) for x in l)
    # NOTE(review): `verbose` is a global defined elsewhere in the file
    if verbose:
        if do_model:
            print '--> Generated model:\n'
            print algo.toString()
        if do_eval:
            print '--> Evaluation:\n'
            print evaluation.toSummaryString()
        if do_predict:
            print '--> Predictions:\n'
            print buffer
    return {'model':str(algo), 'eval':str(evaluation.toSummaryString()), 'predict':str(buffer) }
def Bayes_ParamFinder(data):
    """Compare three Bayes variants by cross-validated accuracy.

    Evaluates plain NaiveBayes, NaiveBayes with kernel density estimation,
    and NaiveBayesMultinomial (only if all attributes are positive), then
    reports which variant won.

    Returns (is multinomial best, is kernel-density best, best accuracy,
    description).
    """
    # ----------------------- Evaluation of Naive Bayes without kernel estimation
    naivebayes = NaiveBayes()
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range() # no additional attributes output
    outputDistribution = Boolean(False) # we don't want distribution
    random = Random(1)
    # cap folds at the instance count so tiny datasets still work
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution])
    acc_naivebayes = evaluation.pctCorrect()
    print "Naive Bayesisn accuracy (without kernel density estimation): ", acc_naivebayes
    # ----------------------- Evaluation of Naive Bayes with kernel estimation
    # NOTE(review): `output` from the first evaluation is reused below
    naivebayes = NaiveBayes()
    naivebayes.setUseKernelEstimator(Boolean(True)) # use kernel density estimation
    evaluation = Evaluation(data)
    attRange = Range() # no additional attributes output
    outputDistribution = Boolean(False) # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution])
    acc_naivebayes_withkernel = evaluation.pctCorrect()
    print "Naive Bayesisn accuracy (with kernel density estimation): ", acc_naivebayes_withkernel
    # ----------------------- Evaluation of Naive bayes multinomial
    naivebayesmultinomial = NaiveBayesMultinomial()
    evaluation = Evaluation(data)
    attRange = Range() # no additional attributes output
    outputDistribution = Boolean(False) # we don't want distribution
    random = Random(1)
    if (allAttributesPositive(data)):
        # multinomial bayes classifier only work on positive attributes
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(naivebayesmultinomial,data,numFolds,random,[output, attRange, outputDistribution])
        acc_naivemultinomialbayes = evaluation.pctCorrect()
    else:
        # disqualify multinomial Bayes when attributes can be negative
        acc_naivemultinomialbayes = 0
    print "Naive Multinomial Bayesisn accuracy : ", acc_naivemultinomialbayes
    # ------------------------- Comparision: pick the highest accuracy
    if (acc_naivemultinomialbayes > acc_naivebayes):
        if (acc_naivemultinomialbayes > acc_naivebayes_withkernel):
            IsOptMultinomialBayes = True
            IsOptNaiveKernelDensity = False
            acc = acc_naivemultinomialbayes
        else:
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = True
            acc = acc_naivebayes_withkernel
    else:
        if (acc_naivebayes > acc_naivebayes_withkernel):
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = False
            acc = acc_naivebayes
        else:
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = True
            acc = acc_naivebayes_withkernel
    print "-----------------------------------------"
    OptBayesAcc = acc
    if IsOptMultinomialBayes:
        Description = 'Optimal Bayes classifier is Multinomial Bayes: OptAcc = ' + str(OptBayesAcc)
    elif IsOptNaiveKernelDensity:
        Description = 'Optimal Bayes classifier is Naive Bayes with kernel density estimation: OptAcc = ' +\
            str(OptBayesAcc)
    else:
        Description = 'Optimal Bayes classifier is Naive Bayes: OptAcc = ' + str(OptBayesAcc)
    return IsOptMultinomialBayes, IsOptNaiveKernelDensity, OptBayesAcc, Description
def feat_trimming(cl_list, config, f, fe, min_feat, new_instances, num_feat, pos_class_weight, progress, progress_per_iteration, result_list, split_ratio, iterative = False):
    """Iteratively trim the feature set and evaluate a set of classifiers.

    cl_list selects classifiers by code (0=Liblinear, 1=BayesNet/K2,
    2=J48, 3=JRip).  Starting from num_feat features, repeatedly selects
    features, cross-validates each classifier and reports results, cutting
    the feature count by a split_ratio-derived amount each round until it
    drops below min_feat.  When iterative is False only a single round at
    min_feat features is run.

    Returns (possibly reduced instances, updated progress counter).
    NOTE(review): relies on module-level `debug`, `compute_cut_amount`,
    `select_features`, `tSelector`, `variance_analysis`, `do_temporal_cv`,
    `report_results`, `update_progress` and `is_slow_fs` — defined elsewhere.
    """
    # print "num_feat:%s"%num_feat
    # print "min_feat:%s"%min_feat
    min_feat = int(min_feat)
    num_feat = int(num_feat)
    if debug:
        print "num_feat:%s"%num_feat
        print "min_feat:%s"%min_feat
        print "In feat_trimming"
    if split_ratio == 0:
        cut_amount = 1
    else:
        cut_amount = compute_cut_amount(min_feat, num_feat, split_ratio)
    if not iterative:
        # non-iterative mode: evaluate once at the minimum feature count
        num_feat = min_feat
    if num_feat > new_instances.numAttributes():
        # cannot select more features than exist; 30 appears to be a status
        # code for the caller — TODO confirm its meaning
        return new_instances, 30
    # else:
    #     num_feat -= cut_amount
    # build the classifier instances requested by cl_list
    classifier_list = []
    for cl in cl_list:
        if cl == 0:
            liblinear = Liblinear()
            liblinear.setConvertNominalToBinary(True)
            # weight the positive class; " 1" is the negative-class weight
            liblinear.setWeights(str(pos_class_weight) + " 1")
            classifier_list.append(liblinear)
        elif cl == 1:
            # BayesNet with K2 structure search limited to one parent per node
            k2 = weka.classifiers.bayes.net.search.local.K2()
            k2. setMaxNrOfParents(1)
            bayesNet = BayesNet()
            bayesNet.setSearchAlgorithm(k2)
            classifier_list.append(bayesNet)
        elif cl == 2:
            j48 = J48()
            classifier_list.append(j48)
        elif cl == 3:
            jRip = JRip()
            classifier_list.append(jRip)
        else:
            raise ValueError('Unknown Classifier number -- %d given' % cl)
    while num_feat >= min_feat:
        if debug:
            print "Num_feat:%d, min_feat:%d"%(num_feat, min_feat)
        start = time.time()
        for classifier in classifier_list:
            if debug:
                print "Before selecting Features"
            # Assigns to t_selector the classifier
            if config.optimize:
                master_map, header_rows, new_instances, t_selector = select_features(classifier, config.fmeasure, fe, new_instances, num_feat, config.optimize)
            else:
                master_map, header_rows, new_instances, t_selector = select_features(classifier, config.fmeasure, fe, new_instances, num_feat)
            if debug:
                print "After selecting Features"
                print "Num_selected_features:%d"%(new_instances.numAttributes()-1)
            # t_selector may be a wrapper (tSelector) or a bare classifier
            if isinstance(t_selector, tSelector):
                classifier_name = t_selector.getClassifier().getClass().__name__
            else:
                classifier_name = t_selector.getClass().__name__
            evaluation = Evaluation(new_instances)
            variance_analysis(config, evaluation, new_instances, t_selector)
            if config.temporal_folds:
                do_temporal_cv(t_selector, new_instances, config.folds)
            else:
                evaluation.crossValidateModel(t_selector, new_instances, config.folds, Random(1), [])
            # Add to candidate feature list only if its in the iterative stage
            report_results(classifier_name, config, evaluation, f, fe, new_instances, num_feat, result_list, t_selector, add_to_list=not iterative)
            progress = update_progress(progress, progress_per_iteration)
        # recompute the cut for the (possibly changed) feature count
        cut_amount = compute_cut_amount(min_feat, num_feat, split_ratio)
        num_feat -= cut_amount
        # Break slow feature selection after first iteration
        if is_slow_fs(fe):
            break
        if debug:
            elapsed = (time.time() - start)
            print "Time elapsed:%d ms for num_feat=%d, min_feat=%d"%(elapsed, num_feat, min_feat)
    return new_instances, progress
def AdaBoostedSimpleLogistic_ParamFinder(data, param1, param2):
    """Tune AdaBoostM1 over a SimpleLogistic base classifier.

    Runs two GridSearch passes: (1) boost the pre-tuned SimpleLogistic
    (heuristicStop=param1, numBoostingIterations=param2), searching
    AdaBoost's weightThreshold x numIterations; (2) boost a fresh
    SimpleLogistic, searching its numBoostingIterations x AdaBoost's
    numIterations.  The better of the two (by CV accuracy) wins.

    Returns (was pass 1 best, classifier, optimal param 1, optimal param 2,
    accuracy, description).
    """
    # Adaboost params: Possible set for Weight Threshold: [min, max, step]
    WeightThresholdBounds = [99,100,1]
    # Adaboost params: possible set for NumIteration: [min, max, step]
    NumItrBound = [5,50,5]
    # Simple Logisitic params: Possible set for num of boosting: [min, max, step]
    NumBoostIterationBounds = [0,200,10]
    # This section tries to boost the best simple logistic
    print "searching for the best parameters to boosting on the optimal simple Logistic ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())  # optimise on accuracy
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(True))
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    # X axis: AdaBoost weightThreshold, Y axis: AdaBoost numIterations
    gridsearch.setXProperty(String('classifier.weightThreshold'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(WeightThresholdBounds[0])
    gridsearch.setXMax(WeightThresholdBounds[1])
    gridsearch.setXStep(WeightThresholdBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for best parameters for boosting simple Logistic weightThreshold = [", WeightThresholdBounds[0], ",", WeightThresholdBounds[1], "], # Iterations = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues1 = gridsearch.getValues()
    # ------------------------------ Evaluation of pass 1's winning point
    simplelogistic = SimpleLogistic()
    bestadaboostm1 = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    bestadaboostm1.setWeightThreshold(int(bestValues1.x))
    bestadaboostm1.setNumIterations(int(bestValues1.y))
    bestadaboostm1.setClassifier(simplelogistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range() # no additional attributes output
    outputDistribution = Boolean(False) # we don't want distribution
    random = Random(1)
    # cap folds at the instance count so tiny datasets still work
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestadaboostm1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by boosting the optimal simple Logistic classifier: ", best_acc1
    print "Optimal weight Threshold Percent : ", bestValues1.x , "Optimal number of Iterations : ", bestValues1.y
    print "-----------------------------------------"
    # -------------------------------------------------------------------------------------------------------------------------
    # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration
    # NOTE(review): despite the comment above, this pass actually tunes a fresh
    # SimpleLogistic's numBoostingIterations against AdaBoost's numIterations
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    # X axis: base classifier's numBoostingIterations, Y axis: AdaBoost numIterations
    gridsearch.setXProperty(String('classifier.classifier.numBoostingIterations'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXBase(10)  # NOTE(review): no effect — XExpression 'I' has no BASE term
    gridsearch.setXMin(NumBoostIterationBounds[0])
    gridsearch.setXMax(NumBoostIterationBounds[1])
    gridsearch.setXStep(NumBoostIterationBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for number of boosting Iterations bound = [", NumBoostIterationBounds[0], ",", NumBoostIterationBounds[1], "], # Iteration = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues2 = gridsearch.getValues()
    # ------------------ Evaluation of pass 2's winning point
    simplelogistic = SimpleLogistic()
    bestadaboostm2 = AdaBoostM1()
    simplelogistic.setNumBoostingIterations(int(bestValues2.x))
    bestadaboostm2.setNumIterations(int(bestValues2.y))
    bestadaboostm2.setClassifier(simplelogistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range() # no additional attributes output
    outputDistribution = Boolean(False) # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestadaboostm2,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc2 = evaluation.pctCorrect()
    print "best accuracy by boosting the Simple Logistic classifier (with optimization over ridge): ", best_acc2
    print "Optimal number of boosting Iteration : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y
    print "-----------------------------------------"
    print "Final optimal boosting classifier:"
    if (best_acc2 > best_acc1):
        print " Best boosting is based on simple logistic with optimal numBoostingIterations :",\
            bestValues2.x, " optimal numIteration :", bestValues2.y
        print " optimal accuracy: ", best_acc2
        IsOptimalBoostingOnOptSimpleLogistic = False # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm2
        OptBoostSimpLogp1 = bestValues2.x
        OptBoostSimpLogp2 = bestValues2.y
        OptBoostSimpLogAcc = best_acc2
    else:
        print " Best boosting is based on optimal simple Logistic with optimal weight Threshold :",\
            bestValues1.x, " optimal numIteration :", bestValues1.y
        print " optimal accuracy: ", best_acc1
        IsOptimalBoostingOnOptSimpleLogistic = True # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm1
        OptBoostSimpLogp1 = bestValues1.x
        OptBoostSimpLogp2 = bestValues1.y
        OptBoostSimpLogAcc = best_acc1
    if IsOptBoostOnOptSimpLog:
        Description = 'Boosting optimal simple logistic classifier: OptWeightThreshold = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    else:
        Description = 'Boosting simple logistic classifier: OptNumBoostingIterations = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    return IsOptBoostOnOptSimpLog, OptBoostSimpLog, OptBoostSimpLogp1, OptBoostSimpLogp2, \
        OptBoostSimpLogAcc, Description
def BaggingSMO_ParamFinder(data, BestSMOIsRBFKernel, param1, param2): # Possible set for C-value cBounds = [[1,10,1],[10,100,10],[100,300,20]] # possible set bag size percent BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 ) ,100,10] # max operation is to make sure that least number of samples are provided to the classifier # possible set for Iteration ItrBound = [5,50,5] # This section tries to boost the best smo print "searching for the best parameters to Bag the best SMO ...." gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(False)) smo = SMO() bagging = Bagging() if BestSMOIsRBFKernel: kernel = RBFKernel() kernel.setGamma(param2) smo.setKernel(kernel) smo.setC(param1) else: kernel = PolyKernel() kernel.setExponent(param2) smo.setKernel(kernel) smo.setC(param1) bagging.setClassifier(smo) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.bagSizePercent')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(BagSizePercentBound[0]) gridsearch.setXMax(BagSizePercentBound[1]) gridsearch.setXStep(BagSizePercentBound[2]) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for best parameters for bagging SMO bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...." 
gridsearch.buildClassifier(data) #bestbagging1 = gridsearch.getBestClassifier() bestValues1 = gridsearch.getValues() # ------------------ Evaluation smo = SMO() bestbagging1 = Bagging() smo.setKernel(kernel) smo.setC(param1) bestbagging1.setBagSizePercent(int(bestValues1.x)) bestbagging1.setNumIterations(int(bestValues1.y)) bestbagging1.setClassifier(smo) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution]) best_acc1 = evaluation.pctCorrect() bestValues1 = gridsearch.getValues() print "best accuracy by bagging the optimal SMO classifier: ", best_acc1 print "Optimal Bag size Percent : ", bestValues1.x , "Optimal number of Iteration : ", bestValues1.y print "-----------------------------------------" # ------------------------------------------------------------------------------------------------------------------------ # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration smo = SMO() kernel = PolyKernel() smo.setKernel(kernel) bagging.setClassifier(smo) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.classifier.c')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setGridIsExtendable(Boolean(True)) best_acc2 = -float('inf') for cnt in range(0,len(cBounds)): cbound = cBounds[cnt] cmin = cbound[0] cmax = cbound[1] cstep = cbound[2] gridsearch.setXMin(cmin) gridsearch.setXMax(cmax) gridsearch.setXStep(cstep) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for RBF Kernel C = [", cmin, ",", cmax, "], # Iteration 
= [", ItrBound[0], ",", ItrBound[1], "] ...." gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ------------ Evaluation smo = SMO() bestbagging = Bagging() kernel = PolyKernel() smo.setKernel(kernel) smo.setC(bestValues.x) bestbagging.setNumIterations(int(bestValues.y)) bestbagging.setClassifier(smo) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc2): bestbagging2 = bestbagging best_acc2 = acc bestValues2 = bestValues print "Best accuracy so far by bagging linear SMO: ", best_acc2 print "Best values so far by bagging linear SMO: ", bestValues2 print "Best accuracy by bagging linear SMO: ", best_acc2 print "Best values by bagging linear SMO: ", bestValues2 print "-----------------------------------------" print "Final optimal bagging classifier:" if (best_acc2 > best_acc1): print " Best bagging is based on linear SMO with optimal c-value :", bestValues2.x, " optimal numIteration = ", bestValues2.y print " optimal accuracy: ", best_acc2 IsOptimalBaggingIsOptSMO = False # is optimal bagging based on optimal SMO ? IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO OptBagSMO = bestbagging2 OptBagSMOp1 = bestValues2.x OptBagSMOp2 = bestValues2.y OptBagSMOAcc = best_acc2 else: print " Best bagging is based on optimal SMO with optimal bagSizePercent :", bestValues1.x, " optimal numIteration = ", bestValues1.y print " optimal accuracy: ", best_acc1 IsOptimalBaggingIsOptSMO = True # is optimal bagging based on optimal SMO ? 
IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO OptBagSMO = bestbagging1 OptBagSMOp1 = bestValues1.x OptBagSMOp2 = bestValues1.y OptBagSMOAcc = best_acc1 if IsOptBagOnOptSMO: Description = 'Bagging on optimal SMO classifier: OptBagSizePercent=' + str(OptBagSMOp1) + \ ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc) else: Description = 'Bagging on linear SMO classifier: OptC=' + str(OptBagSMOp1) + \ ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc) return IsOptBagOnOptSMO, OptBagSMO, OptBagSMOp1, OptBagSMOp2, OptBagSMOAcc, Description
def BaggingLogistic_ParamFinder(data, param1, param2): # Possible set for Ridge-value RBounds = [-10,2,1] # possible set bag size percent BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 ) ,100,10] # max operation is to make sure that least number of samples are provided to the classifier # possible set for Iteration ItrBound = [5,50,5] # This section tries to boost the best logistic print "searching for the best parameters to Bag the optimal Logistic ...." gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(False)) logistic = Logistic() bagging = Bagging() logistic.setRidge(param1) logistic.setMaxIts(param2) bagging.setClassifier(logistic) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.bagSizePercent')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(BagSizePercentBound[0]) gridsearch.setXMax(BagSizePercentBound[1]) gridsearch.setXStep(BagSizePercentBound[2]) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for best parameters for bagging Logistic bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...." 
gridsearch.buildClassifier(data) #bestbagging1 = gridsearch.getBestClassifier() bestValues1 = gridsearch.getValues() # ------------------------------ Evaluation logistic = Logistic() bestbagging1 = Bagging() logistic.setRidge(param1) logistic.setMaxIts(param2) bestbagging1.setBagSizePercent(int(bestValues1.x)) bestbagging1.setNumIterations(int(bestValues1.y)) bestbagging1.setClassifier(logistic) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution]) best_acc1 = evaluation.pctCorrect() print "best accuracy by bagging the optimal Logistic classifier: ", best_acc1 print "Optimal Bag size Percent: ", bestValues1.x, " Optimal number of Iterations: ", bestValues1.y print "-----------------------------------------" # ------------------------------------------------------------------------------------------------------------------------- # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration logistic = Logistic() bagging = Bagging() bagging.setClassifier(logistic) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.classifier.ridge')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('pow(BASE,I)')) gridsearch.setYExpression(String('I')) gridsearch.setXBase(10) gridsearch.setGridIsExtendable(Boolean(True)) gridsearch.setXMin(RBounds[0]) gridsearch.setXMax(RBounds[1]) gridsearch.setXStep(RBounds[2]) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for ridge bound = [10^", RBounds[0], ",10^", RBounds[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...." 
gridsearch.buildClassifier(data) #bestbagging = gridsearch.getBestClassifier() bestValues2 = gridsearch.getValues() # ------------------ Evaluation logistic = Logistic() bestbagging2 = Bagging() logistic.setRidge(pow(10,bestValues2.x)) bestbagging2.setNumIterations(int(bestValues2.y)) bestbagging2.setClassifier(logistic) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging2,data,numFolds,random,[output, attRange, outputDistribution]) best_acc2 = evaluation.pctCorrect() print "best accuracy by bagging the Logistic classifier (with optimization over ridge): ", best_acc2 print "Optimal Ridge value : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y print "-----------------------------------------" print "Final optimal bagging classifier:" if (best_acc2 > best_acc1): print " Best bagging is based on logistic with optimal ridge-value :", bestValues2.x, " optimal numIteration :", bestValues2.y print " optimal accuracy: ", best_acc2 IsOptimalBaggingIsOptLogistic = False # is optimal bagging based on optimal Logistic ? IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic OptBagLog = bestbagging2 OptBagLogp1 = pow(10,bestValues2.x) OptBagLogp2 = bestValues2.y OptBagLogAcc = best_acc2 else: print " Best bagging is based on optimal Logistic with optimal bagSizePercent :", bestValues1.x, " optimal numIteration :", bestValues1.y print " optimal accuracy: ", best_acc1 IsOptimalBaggingIsOptLogistic = True # is optimal bagging based on optimal Logistic ? 
IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic OptBagLog = bestbagging1 OptBagLogp1 = bestValues1.x OptBagLogp2 = bestValues1.y OptBagLogAcc = best_acc1 if IsOptBagOnOptLog: Description = 'Bagging on optimal logistic classifier: OptBagSizePercent= ' + str(OptBagLogp1) + \ ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc) else: Description = 'Bagging on logistic classifier: OptRidge= ' + str(OptBagLogp1) + \ ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc) return IsOptBagOnOptLog, OptBagLog, OptBagLogp1, OptBagLogp2, OptBagLogAcc, Description