fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file

for num in range(int(p['j48.initial']), fulltrainset.numInstances(), (fulltrainset.numInstances() / int(p['j48.numdatapoints']))):
    filelimit.write(str(num))
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
    for dataset in [testset, fulltrainset]:
        algo = J48()
        algo.setConfidenceFactor(float(p['j48.C']))  # set pruning confidence before building, or it has no effect
        algo.buildClassifier(trainset)
        evaluation = Evaluation(trainset)
        output = PlainText()                 # plain text output for predictions
        output.setHeader(trainset)
        buffer = StringBuffer()              # buffer to use
        output.setBuffer(buffer)
        attRange = Range()                   # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want the distribution
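        # --- Hedged sketch (not in the original excerpt): evaluate the trained
        # tree on the current dataset and record its error rate; the column
        # order matches the "instances,pctincorrecttest,pctincorrecttrain"
        # CSV header written above. The [output] varargs form follows the Weka
        # 3.7-style AbstractOutput prediction-printing API; older 3.6 builds
        # instead take [buffer, attRange, outputDistribution].
        evaluation.evaluateModel(algo, dataset, [output])
        filelimit.write("," + str(evaluation.pctIncorrect()))
        log.write(evaluation.toSummaryString("\nResults\n======\n", False))
    filelimit.write("\n")  # end the CSV row for this training-set size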
print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." datafile = FileReader(sys.argv[1]) data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize=0 dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) # loop for different amounts of data with fixed test set datasize = data.numInstances() limit = (datasize*2)/3 # loop until we use 2/3 data as training set testset = Instances(data,limit,datasize-limit) # create training set using the last 1/3 of data testset.setClassIndex(testset.numAttributes() - 1) saver = ArffSaver() saver.setInstances(testset) testsetfile = "./data/split/" + dataname + "-" + "test.arff" file = File(testsetfile) saver.setFile(file) saver.writeBatch() trainset = Instances(data,0,limit) # create training set saver = ArffSaver() saver.setInstances(trainset) trainsetfile = "./data/split/" + dataname + "-" + "train.arff"
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,lineartest,lineartrain,polytest,polytrain,radialtest,radialtrain,sigmoidtest,sigmoidtrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file

for num in range(int(p['svm.initial']), fulltrainset.numInstances(), (fulltrainset.numInstances() / int(p['svm.numdatapoints']))):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    filelimit.write(str(num))
    for kerneltype in range(0, 4):  # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
        log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
        for dataset in [testset, fulltrainset]:
            algo = LibSVM()
            tag = SelectedTag(kerneltype, algo.TAGS_KERNELTYPE)  # pass the kernel ID as an int; the string form may not match the tag IDs
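            # --- Hedged sketch (not in the original excerpt): apply the kernel
            # choice, train, and record the error rate for this kernel/dataset
            # pair; the eight error columns per row match the CSV header above.
            # setKernelType(SelectedTag) is the standard LibSVM wrapper call.
            algo.setKernelType(tag)
            algo.buildClassifier(trainset)
            evaluation = Evaluation(trainset)
            evaluation.evaluateModel(algo, dataset)
            filelimit.write("," + str(evaluation.pctIncorrect()))
    filelimit.write("\n")  # end the CSV row for this training-set size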
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
timefile.write("instances,timetest,timetrain\n")

for num in range(int(p['mlp.initial']), fulltrainset.numInstances(), (fulltrainset.numInstances() / int(p['mlp.numdatapoints']))):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
    filelimit.write(str(trainset.numInstances()))
    timefile.write(str(num))
    for dataset in [testset, fulltrainset]:
        algo = MultilayerPerceptron()
        algo.setTrainingTime(int(p['mlp.N']))  # number of training epochs
        x = time.time()
        algo.buildClassifier(trainset)
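        # --- Hedged sketch (not in the original excerpt): record the elapsed
        # training time started with x above, then evaluate and record the
        # error rate, matching the CSV headers written earlier.
        timefile.write("," + str(time.time() - x))
        evaluation = Evaluation(trainset)
        evaluation.evaluateModel(algo, dataset)
        filelimit.write("," + str(evaluation.pctIncorrect()))
    filelimit.write("\n")
    timefile.write("\n")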
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
timefile.write("instances,timetest,timetrain\n")

for num in range(int(p['adaboost.initial']), fulltrainset.numInstances(), (fulltrainset.numInstances() / int(p['adaboost.numdatapoints']))):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
    filelimit.write(str(num))
    timefile.write(str(num))
    for dataset in [testset, fulltrainset]:
        algo = AdaBoostM1()
        weaklearner = J48()  # boost J48 decision trees as the weak learner
        algo.setClassifier(weaklearner)
        algo.setNumIterations(int(p['adaboost.iterations']))
        x = time.time()
        algo.buildClassifier(trainset)
        timefile.write("," + str(time.time() - x))  # record only the build time, not evaluation setup
        evaluation = Evaluation(trainset)
        output = PlainText()  # plain text output for predictions
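        # --- Hedged sketch (not in the original excerpt): finish the
        # prediction-output setup and record the boosted ensemble's error
        # rate, mirroring the other learners in this repo.
        output.setHeader(trainset)
        buffer = StringBuffer()
        output.setBuffer(buffer)
        evaluation.evaluateModel(algo, dataset, [output])
        filelimit.write("," + str(evaluation.pctIncorrect()))
    filelimit.write("\n")
    timefile.write("\n")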
print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." datafile = FileReader(sys.argv[1]) data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize = 0 dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) # loop for different amounts of data with fixed test set datasize = data.numInstances() limit = (datasize * 2) / 3 # loop until we use 2/3 data as training set testset = Instances(data, limit, datasize - limit) # create training set using the last 1/3 of data testset.setClassIndex(testset.numAttributes() - 1) saver = ArffSaver() saver.setInstances(testset) testsetfile = "./data/split/" + dataname + "-" + "test.arff" file = File(testsetfile) saver.setFile(file) saver.writeBatch() trainset = Instances(data, 0, limit) # create training set saver = ArffSaver() saver.setInstances(trainset)