def readFeature(num_features, type, select_feature, numtrees):
    filename1 = resultFile + '_' + type + '_' + num_features + '_' + select_feature + '_train.csv'
    filename2 = resultFile + '_' + type + '_' + num_features + '_' + select_feature + '_test.csv'

    # load the training data from CSV; the last attribute is the class
    loader = CSVLoader()
    loader.setSource(File(filename1))
    data = loader.getDataSet()
    data.setClassIndex(data.numAttributes() - 1)

    # train a random forest on the training set
    rf = RF()
    rf.setNumTrees(numtrees)
    rf.buildClassifier(data)

    # load the test data
    loader.setSource(File(filename2))
    test_data = Instances(loader.getDataSet())
    test_data.setClassIndex(test_data.numAttributes() - 1)

    # evaluate the trained model on the test set, capturing the predictions
    buffer = StringBuffer()             # buffer for the predictions
    output = PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    attRange = Range()                  # attributes to output
    outputDistribution = Boolean(True)  # also output the class distributions
    evaluator = Evaluation(data)
    evaluator.evaluateModel(rf, test_data, [output, attRange, outputDistribution])

    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()

    # per-class statistics for class index 1
    return [evaluator.precision(1), evaluator.recall(1), evaluator.fMeasure(1),
            evaluator.matthewsCorrelationCoefficient(1),
            evaluator.numTruePositives(1), evaluator.numFalsePositives(1),
            evaluator.numTrueNegatives(1), evaluator.numFalseNegatives(1),
            evaluator.areaUnderROC(1)]
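# Not part of the original file: a minimal sketch of the imports and module-level
# names readFeature() assumes, under Jython with weka.jar (>= 3.7) on the classpath.
from java.io import File
from java.lang import Boolean, StringBuffer
from weka.core import Instances, Range
from weka.core.converters import CSVLoader
from weka.classifiers import Evaluation
from weka.classifiers.trees import RandomForest as RF
from weka.classifiers.evaluation.output.prediction import PlainText

resultFile = 'result'  # hypothetical placeholder; the real prefix is defined elsewhere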
def load_arff(self, arff):
    # read an ARFF file; the last attribute becomes the class
    file = FileReader(arff)
    data = Instances(file)
    data.setClassIndex(data.numAttributes() - 1)
    return data
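# Not part of the original file: load_arff assumes these imports in the enclosing
# module; the usage below is an illustrative sketch (class name and path are made up).
from java.io import FileReader
from weka.core import Instances

#   loader = MyLoader()                    # hypothetical enclosing class
#   data = loader.load_arff('iris.arff')   # last attribute becomes the class
#   print data.numInstances(), 'instances,', data.numAttributes(), 'attributes'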
   weka.classifiers.Evaluation class)
"""

# check commandline parameters
if len(sys.argv) != 2:
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)
# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
buffer = StringBuffer()               # buffer for the predictions
attRange = Range()                    # no additional attributes output
outputDistribution = Boolean(False)   # we don't want distribution
j48 = J48()
j48.buildClassifier(data)             # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
print j48

print "--> Evaluation:\n"
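# Not part of the original excerpt: the fragment stops after the "--> Evaluation:"
# header; presumably it continues by printing the Evaluation results, e.g.:
print evaluation.toSummaryString()
print evaluation.toClassDetailsString()
print evaluation.toMatrixString()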
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()  # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0  # unbuffered, so results appear immediately
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file

# learning curve: step through increasing numbers of training instances
for num in range(int(p['j48.initial']), fulltrainset.numInstances(),
                 fulltrainset.numInstances() / int(p['j48.numdatapoints'])):
    filelimit.write(str(num))
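    # Not part of the original excerpt: the loop body is truncated above. Judging by
    # the CSV header, each iteration presumably trains J48 on the first num instances
    # and records the error on the test set and on the training subset (this sketch
    # assumes J48 and Evaluation are imported at the top of the script):
    trainset = Instances(fulltrainset, 0, num)  # first num training instances
    trainset.setClassIndex(trainset.numAttributes() - 1)
    j48 = J48()
    j48.buildClassifier(trainset)
    for dataset in [testset, trainset]:
        evaluation = Evaluation(dataset)
        evaluation.evaluateModel(j48, dataset)
        filelimit.write("," + str(evaluation.pctIncorrect()))
    filelimit.write("\n")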
if numReqOpt < 2:
    usage()
    return 1

options = {'idFlag': True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0}

# read the first dataset
fn = inputList[0]
fid = FileReader(fn)
Data = Instances(fid)
Data, IDs = PreprocessData(Data, options)

# remove the class label (the last attribute) from the first dataset
attributeremove = AttributeRemove()
attributeremove.setInvertSelection(Boolean(False))
attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
attributeremove.setInputFormat(Data)
newData = Filter.useFilter(Data, attributeremove)

# loop over the remaining input arff files
cnt = Data.numAttributes()
for fnCnt in range(1, len(inputList)):
    fn = inputList[fnCnt]
    fid = FileReader(fn)
    Data = Instances(fid)
    Data, IDs = PreprocessData(Data, options)
    # remove every attribute but the last one, which is the class label
    attributeremove = AttributeRemove()
    attributeremove.setInvertSelection(Boolean(True))
    attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
    attributeremove.setInputFormat(Data)
    labels = Filter.useFilter(Data, attributeremove)
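# Not part of the original excerpt: neither PreprocessData nor AttributeRemove is
# defined here. AttributeRemove is presumably Weka's Remove filter imported under
# another name, along these lines; PreprocessData presumably returns the (possibly
# ID-stripped) Instances plus the extracted IDs.
from java.io import FileReader
from java.lang import Boolean, String
from weka.core import Instances
from weka.filters import Filter
from weka.filters.unsupervised.attribute import Remove as AttributeRemove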
# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()  # seed from the system time
data.randomize(rand)  # shuffle the data with the random number generator

# open output files
bufsize = 0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# fixed split: first 2/3 of the shuffled data for training, last 1/3 for testing
datasize = data.numInstances()
limit = (datasize * 2) / 3
testset = Instances(data, limit, datasize - limit)  # test set: the last 1/3 of the data
testset.setClassIndex(testset.numAttributes() - 1)
saver = ArffSaver()
saver.setInstances(testset)
testsetfile = "./data/split/" + dataname + "-" + "test.arff"
file = File(testsetfile)
saver.setFile(file)
saver.writeBatch()

trainset = Instances(data, 0, limit)  # training set: the first 2/3 of the data
saver = ArffSaver()
saver.setInstances(trainset)
trainsetfile = "./data/split/" + dataname + "-" + "train.arff"
file = File(trainsetfile)
saver.setFile(file)
saver.writeBatch()
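# Not part of the original file: Instances(data, offset, count) copies count
# instances starting at offset, so the two pieces cover the shuffled dataset
# exactly once. A sanity-check sketch:
assert trainset.numInstances() + testset.numInstances() == datasize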
Based on this code example:
    http://www.btbytes.com/2005/11/30/weka-j48-classifier-example-using-jython/

Commandline parameter(s):
    first parameter must be the ARFF file one wants to process with J48
"""

# check commandline parameters
if len(sys.argv) != 2:
    print "Usage: UsingJ48.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)
# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
print "Training J48..."
j48 = J48()
j48.buildClassifier(data)

# print out the built model
print "Generated model:\n"
print j48
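# Not part of the original file: the imports this script needs, as a sketch
# (run with weka.jar on the Jython classpath, e.g. "jython UsingJ48.py iris.arff"):
import sys
from java.io import FileReader
from weka.core import Instances
from weka.classifiers.trees import J48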
print "Loading data..." datafile = FileReader(sys.argv[1]) data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize = 0 dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) # loop for different amounts of data with fixed test set datasize = data.numInstances() limit = (datasize * 2) / 3 # loop until we use 2/3 data as training set testset = Instances(data, limit, datasize - limit) # create training set using the last 1/3 of data testset.setClassIndex(testset.numAttributes() - 1) saver = ArffSaver() saver.setInstances(testset) testsetfile = "./data/split/" + dataname + "-" + "test.arff" file = File(testsetfile) saver.setFile(file) saver.writeBatch() trainset = Instances(data, 0, limit) # create training set saver = ArffSaver() saver.setInstances(trainset) trainsetfile = "./data/split/" + dataname + "-" + "train.arff" file = File(trainsetfile) saver.setFile(file) saver.writeBatch()
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()  # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,letest,letrain,lmtest,lmtrain,kdtest,kdtrain,balltest,balltrain,covertest,covertrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
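# Not part of the original excerpt: the fragment stops after opening the output
# files. The kd/ball/cover columns in the CSV header suggest the script compares
# IBk under different nearest-neighbour search structures (KDTree, BallTree,
# CoverTree; the le/lm columns are unclear from this fragment). A hedged sketch
# of one such evaluation:
from weka.classifiers import Evaluation
from weka.classifiers.lazy import IBk
from weka.core.neighboursearch import KDTree

knn = IBk()
knn.setNearestNeighbourSearchAlgorithm(KDTree())
knn.buildClassifier(fulltrainset)
evaluation = Evaluation(fulltrainset)
evaluation.evaluateModel(knn, testset)
log.write(evaluation.toSummaryString())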