def bagging_logistic(trainData, testData, params, exparams):
    IsOptBagOnOptLog = str2bool(params[0])
    logistic = Logistic()
    bagging = Bagging()
    if IsOptBagOnOptLog:
        # optimal bagging is based on optimal logistic
        ridge = float(exparams[0])
        maxIt = int(float(exparams[1]))
        logistic.setMaxIts(maxIt)
        bagSizePercent = int(float(params[1]))
        bagging.setBagSizePercent(bagSizePercent)
    else:
        # ridge parameter is also optimized in the process
        ridge = float(params[1])
        numIterations = int(float(params[2]))
        bagging.setNumIterations(numIterations)
    logistic.setRidge(ridge)
    bagging.setClassifier(logistic)
    bagging.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
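# The wrappers in this listing lean on three helpers that are not shown:
# str2bool, util.get_buffer_for_predictions and makeTrainEvalSummary.
# The sketch below is inferred from the call sites only -- treat every
# signature and behavior here as an assumption, not the real code.
from java.lang import StringBuffer
from weka.classifiers.evaluation.output.prediction import PlainText

def str2bool(s):
    # parameter files carry booleans as strings; accept the common spellings
    return str(s).strip().lower() in ('true', '1', 'yes', 't')

def get_buffer_for_predictions(data):
    # lives in the util module at the call sites; pairs a plain-text
    # prediction writer with the StringBuffer it writes into
    buffer = StringBuffer()
    output = PlainText()
    output.setHeader(data)
    output.setBuffer(buffer)
    return output, buffer

def makeTrainEvalSummary(evaluation):
    # one plausible reading: keep the training summary string for the caller
    return evaluation.toSummaryString()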
def simple_logistic(trainData, testData, params, exparams):
    heuristicStop = int(float(params[0]))
    numBoostingIterations = int(float(params[1]))
    simplelogistic = SimpleLogistic()
    simplelogistic.setHeuristicStop(heuristicStop)
    simplelogistic.setNumBoostingIterations(numBoostingIterations)
    if trainData.numInstances() < 5:
        # special case for small sample size
        simplelogistic.setUseCrossValidation(False)
    simplelogistic.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(simplelogistic, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(simplelogistic, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def bayesian(trainData, testData, params, exparams):
    IsOptMultinomialBayes = str2bool(params[0])
    IsOptNaiveKernelDensity = str2bool(params[1])
    if IsOptMultinomialBayes:
        # optimal bayesian classifier is multinomial
        bayes = NaiveBayesMultinomial()
    else:
        bayes = NaiveBayes()
    if IsOptNaiveKernelDensity and not IsOptMultinomialBayes:
        # kernel density estimation only applies to plain NaiveBayes;
        # NaiveBayesMultinomial has no kernel estimator option
        bayes.setUseKernelEstimator(Boolean(True))
    bayes.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bayes, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bayes, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def smo(trainData, testData, params, exparams):
    kerType = str2bool(params[0])
    cValue = float(params[1])
    kerParam = float(params[2])
    if kerType:
        # RBF kernel
        kernel = RBFKernel()
        kernel.setGamma(kerParam)
    else:
        # Polynomial kernel
        kernel = PolyKernel()
        kernel.setExponent(kerParam)
    smo = SMO()
    smo.setKernel(kernel)
    smo.setC(cValue)
    smo.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(smo, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(smo, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
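# Hypothetical driver for the wrappers above: the ARFF file names and the
# parameter values are made up for illustration and do not come from the
# source.
from java.io import FileReader
from weka.core import Instances

trainData = Instances(FileReader('train.arff'))
testData = Instances(FileReader('test.arff'))
trainData.setClassIndex(trainData.numAttributes() - 1)
testData.setClassIndex(testData.numAttributes() - 1)

# RBF-kernel SMO with C = 1.0 and gamma = 0.01
trainBuf, testBuf, summary = smo(trainData, testData, ['True', '1.0', '0.01'], [])
print summary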
def readCross(num, type, numtrees):
    filename = resultFile + '_' + type + '_' + num + '_all.csv'
    loader = CSVLoader()
    loader.setSource(File(filename))
    data = loader.getDataSet()
    #print data.numAttributes()
    data.setClassIndex(data.numAttributes() - 1)

    rf = RF()
    rf.setNumTrees(numtrees)

    #pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    buffer = StringBuffer()  # buffer for the predictions
    output = PlainText()
    output.setHeader(data)
    output.setBuffer(buffer)
    output.setOutputDistribution(True)
    attRange = Range()  # attributes to output
    outputDistributions = Boolean(True)

    evaluator = Evaluation(data)
    evaluator.crossValidateModel(rf, data, 10, Random(1), [output, attRange, outputDistributions])
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.weightedPrecision(), evaluator.weightedRecall(),
            evaluator.weightedFMeasure(), evaluator.weightedMatthewsCorrelation(),
            evaluator.weightedFalseNegativeRate(), evaluator.weightedFalsePositiveRate(),
            evaluator.weightedTruePositiveRate(), evaluator.weightedTrueNegativeRate(),
            evaluator.weightedAreaUnderROC()]
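# A hypothetical call of readCross: 10-fold cross-validate a 100-tree random
# forest on resultFile_rna_5_all.csv. The '5'/'rna' arguments are invented
# for illustration; resultFile is a module-level global.
weighted = readCross('5', 'rna', 100)
print 'weighted AUC = %.3f' % weighted[-1]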
def readFeature(num_features, type, select_feature, numtrees):
    #filename1 = resultFileTest
    #filename2 = resultFileTest2
    filename1 = resultFile + '_' + type + '_' + num_features + '_' + select_feature + '_train.csv'
    filename2 = resultFile + '_' + type + '_' + num_features + '_' + select_feature + '_test.csv'
    #print filename1
    loader = CSVLoader()
    loader.setSource(File(filename1))
    data = loader.getDataSet()
    #print data.numAttributes()
    data.setClassIndex(data.numAttributes() - 1)

    rf = RF()
    rf.setNumTrees(numtrees)
    rf.buildClassifier(data)
    #print rf

    loader.setSource(File(filename2))
    test_data = Instances(loader.getDataSet())
    test_data.setClassIndex(test_data.numAttributes() - 1)

    '''
    num = test_data.numInstances()
    print num
    for i in xrange(num):
        r1 = rf.distributionForInstance(test_data.instance(i))
        r2 = rf.classifyInstance(test_data.instance(i))
        print r1
        print r2
    '''

    buffer = StringBuffer()  # buffer for the predictions
    output = PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)

    evaluator = Evaluation(data)  # priors taken from the training data
    evaluator.evaluateModel(rf, test_data, [output, attRange, outputDistribution])
    #print evaluator.evaluateModel(RF(), ['-t', filename1, '-T', filename2, '-I', str(numtrees)])
    #evaluator1 = Evaluation(test_data)
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.precision(1), evaluator.recall(1), evaluator.fMeasure(1),
            evaluator.matthewsCorrelationCoefficient(1), evaluator.numTruePositives(1),
            evaluator.numFalsePositives(1), evaluator.numTrueNegatives(1),
            evaluator.numFalseNegatives(1), evaluator.areaUnderROC(1)]
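# Imports that readCross/readFeature and the surrounding snippets appear to
# rely on, assuming Jython with weka.jar on the classpath. The RF alias for
# RandomForest matches its use here but is a guess about the original import;
# the LR alias used in the fold loops further down is defined elsewhere and
# is not guessed at here.
from java.io import File, FileReader
from java.lang import Boolean, StringBuffer
from java.util import Random
from weka.core import Instances, Range
from weka.core.converters import CSVLoader
from weka.classifiers import Evaluation
from weka.classifiers.bayes import NaiveBayes, NaiveBayesMultinomial
from weka.classifiers.functions import Logistic, SimpleLogistic, SMO
from weka.classifiers.functions.supportVector import PolyKernel, RBFKernel
from weka.classifiers.lazy import IBk
from weka.classifiers.meta import AdaBoostM1, Bagging
from weka.classifiers.trees import J48, RandomForest as RF
from weka.classifiers.evaluation.output.prediction import PlainText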
def bagging_smo(trainData, testData, params, exparams):
    IsOptBagOnOptSMO = str2bool(params[0])
    if IsOptBagOnOptSMO:
        # optimal bagging is based on optimal SMO, so the SMO settings
        # come from the extra params
        kerType = str2bool(exparams[0])
        cValue = float(exparams[1])
        kerParam = float(exparams[2])
        if kerType:
            # RBF kernel
            kernel = RBFKernel()
            kernel.setGamma(kerParam)
        else:
            # Polynomial kernel
            kernel = PolyKernel()
            kernel.setExponent(kerParam)
        bagSizePercent = int(float(params[1]))
        numIterations = int(float(params[2]))
        smo = SMO()
        bagging = Bagging()
        smo.setKernel(kernel)
        smo.setC(cValue)
        bagging.setBagSizePercent(bagSizePercent)
        bagging.setNumIterations(numIterations)
        bagging.setClassifier(smo)
    else:
        # optimal bagging is based on linear SMO
        cValue = float(params[1])
        numIterations = int(float(params[2]))
        smo = SMO()
        bagging = Bagging()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        smo.setC(cValue)
        bagging.setNumIterations(numIterations)
        bagging.setClassifier(smo)
    bagging.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def random_forest(trainData, testData, params, exparams):
    numTrees = int(float(params[0]))
    numFeatures = int(float(params[1]))
    randomforest = RandomForest()
    randomforest.setNumTrees(numTrees)
    randomforest.setNumFeatures(numFeatures)
    randomforest.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def adaboostM1_simple_logistic(trainData, testData, params, exparams):
    IsOptBoostOnOptSimpLog = str2bool(params[0])
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    if IsOptBoostOnOptSimpLog:
        # optimal adaboost is based on optimal simple logistic
        heuristicStop = int(float(exparams[0]))
        numBoostingIterations = int(float(exparams[1]))
        weightThreshold = int(float(params[1]))
        numIterations = int(float(params[2]))
        simplelogistic.setHeuristicStop(heuristicStop)
        simplelogistic.setNumBoostingIterations(numBoostingIterations)
        adaboostm.setWeightThreshold(weightThreshold)
        adaboostm.setNumIterations(numIterations)
    else:
        numBoostingIterations = int(float(params[1]))
        numIterations = int(float(params[2]))
        simplelogistic.setNumBoostingIterations(numBoostingIterations)
        adaboostm.setNumIterations(numIterations)
    adaboostm.setClassifier(simplelogistic)
    adaboostm.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(adaboostm, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(adaboostm, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def logistic(trainData, testData, params, exparams):
    ridge = float(params[0])
    maxIt = int(float(params[1]))
    print "Ridge=%s, maxIt=%s" % (str(ridge), str(maxIt))
    logistic = Logistic()
    logistic.setMaxIts(maxIt)
    logistic.setRidge(ridge)
    logistic.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training data
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(logistic, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the testing data
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(logistic, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def runClassifierAlgo(algo, training_filename, test_filename, do_model, do_eval, do_predict):
    """ Run classifier algorithm <algo> on training data in <training_filename> to build
        a model, then run it on the data in <test_filename> (equivalent of the Weka
        "Supplied test set" option)
    """
    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    test_file = FileReader(test_filename)
    test_data = Instances(test_file)

    # set the class index - the index of the dependent variable
    # (class_index is a module-level global here)
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()  # buffer for the predictions
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])

    if verbose:
        if do_model:
            print "--> Generated model:\n"
            print algo.toString()
        if do_eval:
            print "--> Evaluation:\n"
            print evaluation.toSummaryString()
        if do_predict:
            print "--> Predictions:\n"
            print buffer

    return {"model": str(algo),
            "eval": str(evaluation.toSummaryString()),
            "predict": str(buffer)}
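# Hypothetical invocation of runClassifierAlgo above; J48 stands in for any
# Weka classifier and the ARFF names are placeholders. Note that class_index
# and verbose are module-level globals in this version.
from weka.classifiers.trees import J48

result = runClassifierAlgo(J48(), 'train.arff', 'test.arff',
                           do_model=True, do_eval=True, do_predict=False)
print result['eval']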
# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
output = PlainText()  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()  # buffer to use
output.setBuffer(buffer)
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
print j48
print "--> Evaluation:\n"
print evaluation.toSummaryString()
print "--> Predictions:\n"
print buffer
# cover is presumably a CoverTree instance joining the list of
# nearest-neighbour search algorithms to benchmark
tree_algorithms.append(cover)

data.setClassIndex(data.numAttributes() - 1)
for num in range(1, 30, 2):
    file.write(str(num))
    for algoknn in tree_algorithms:
        log.write("---------------------------------\nK: " + str(num) +
                  ", Search Algorithm: " + algoknn.__class__.__name__ + "\n")
        algo = IBk()
        algo.setNearestNeighbourSearchAlgorithm(algoknn)
        algo.setKNN(num)
        x = time.time()
        algo.buildClassifier(data)
        log.write("Time to build classifier: " + str(time.time() - x) + "\n")
        evaluation = Evaluation(data)
        output = PlainText()  # plain text output for predictions
        output.setHeader(data)
        buffer = StringBuffer()  # buffer to use
        output.setBuffer(buffer)
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        x = time.time()
        #evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
        evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
        log.write("Time to evaluate model: " + str(time.time() - x) + "\n")
        log.write(evaluation.toSummaryString())
        file.write("," + str(evaluation.rootMeanSquaredError()))
    file.write("\n")
file.close()
log.close()
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ If <test_filename> is given, run classifier algorithm <algo> on training data in
        <training_filename> to build a model, then test on data in <test_filename>
        (equivalent of the Weka "Supplied test set" option); otherwise run 4-fold
        cross-validation of <algo> on the data in <training_filename>.
        <class_index> is the column containing the dependent variable.

        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    print ' runClassifierAlgo: training_filename= ', training_filename, ', test_filename=', test_filename
    misc.checkExists(training_filename)

    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    if test_filename:
        test_file = FileReader(test_filename)
        test_data = Instances(test_file)
    else:
        test_data = training_data

    # set the class index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    if test_filename:
        algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()  # buffer for the predictions
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        if test_filename:
            evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])
        else:
            # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')])
            # print evaluation.toSummaryString()
            rand = Random(1)
            evaluation.crossValidateModel(algo, training_data, 4, rand)

    if False:  # debugging block, disabled
        print 'percentage correct =', evaluation.pctCorrect()
        print 'area under ROC =', evaluation.areaUnderROC(class_index)
        confusion_matrix = evaluation.confusionMatrix()
        for l in confusion_matrix:
            print '** ', ','.join('%2d' % int(x) for x in l)

    if verbose:
        if do_model:
            print '--> Generated model:\n'
            print algo.toString()
        if do_eval:
            print '--> Evaluation:\n'
            print evaluation.toSummaryString()
        if do_predict:
            print '--> Predictions:\n'
            print buffer

    return {'model': str(algo),
            'eval': str(evaluation.toSummaryString()),
            'predict': str(buffer)}
for fld in range(0, folds):
    train = folds_train[fld]
    test = folds_test[fld]
    train.setClassIndex(data.numAttributes() - 1)
    test.setClassIndex(data.numAttributes() - 1)

    lr = LR()
    lr.buildClassifier(train)

    buf = StringBuffer()  # buffer for the predictions
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)
    evaluation = Evaluation(test)
    evaluation.evaluateModel(lr, test, [buf, attRange, outputDistribution])
    buffers.append(buf)

    ## Writing Evaluation Summaries
    f = open(''.join([directory, ''.join(['summary_', str(fld), '.report'])]), 'w')
    f.write(evaluation.toSummaryString(True))
    f.close()
    f = open(''.join([directory, ''.join(['coeff_', str(fld), '.report'])]), 'w')
    f.write(str(lr))
    f.close()

## Writing predictions in a file
f = open(''.join([directory, 'prediction.weka']), 'w')
for prediction in buffers:
    f.write(str(prediction))
f.close()

## Finding top n features
# loop over different values of the confidence factor using the full dataset
data.setClassIndex(data.numAttributes() - 1)
for num in [x * 0.05 for x in range(0, 10)]:
    log.write("---------------------------------\nCF: " + str(num) + "\n")
    algo = J48()
    algo.setConfidenceFactor(num)  # must be set before building, or it has no effect
    x = time.time()
    algo.buildClassifier(data)
    log.write("Time to build classifier: " + str(time.time() - x) + "\n")
    evaluation = Evaluation(data)
    output = PlainText()  # plain text output for predictions
    output.setHeader(data)
    buffer = StringBuffer()  # buffer to use
    output.setBuffer(buffer)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    x = time.time()
    evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
    #evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
    log.write("Time to evaluate model: " + str(time.time() - x) + "\n")
    log.write(evaluation.toSummaryString())
    file.write(str(num) + "," + str(evaluation.rootMeanSquaredError()) + "\n")

    # create graph
    graphfilename = "image/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
                    str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_" + str(num) + ".dot"
    graphfile = open(graphfilename, 'wb')
    graphfile.write(algo.graph())
    graphfile.close()
file.close()
log.close()
# check commandline parameters
if len(sys.argv) != 2:
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
buffer = StringBuffer()  # buffer for the predictions
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
print j48
print "--> Evaluation:\n"
print evaluation.toSummaryString()
print "--> Predictions:\n"
print buffer
# inside the per-fold loop: build and evaluate one fold
test = folds_test[fld]
train.setClassIndex(data.numAttributes() - 1)
test.setClassIndex(data.numAttributes() - 1)

lr = LR()
lr.buildClassifier(train)

buf = StringBuffer()  # buffer for the predictions
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)
evaluation = Evaluation(test)
evaluation.evaluateModel(lr, test, [buf, attRange, outputDistribution])
buffers.append(buf)

## Writing Evaluation Summaries
f = open(''.join([directory, ''.join(['summary_', str(fld), '.report'])]), 'w')
f.write(evaluation.toSummaryString(True))
f.close()
f = open(''.join([directory, ''.join(['coeff_', str(fld), '.report'])]), 'w')
f.write(str(lr))
f.close()

## Writing predictions in a file
f = open(''.join([directory, 'prediction.weka']), 'w')
for prediction in buffers:
    f.write(str(prediction))
f.close()