def readCross(num,type,numtrees): filename=resultFile+'_'+type+'_'+num+'_all.csv' loader=CSVLoader() loader.setSource(File(filename)) data=loader.getDataSet() #print data.numAttributes() data.setClassIndex(data.numAttributes()-1) rf=RF() rf.setNumTrees(numtrees) #pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) buffer = StringBuffer() # buffer for the predictions output=PlainText() output.setHeader(data) output.setBuffer(buffer) output.setOutputDistribution(True) attRange = Range() # attributes to output outputDistributions = Boolean(True) evaluator=Evaluation(data) evaluator.crossValidateModel(rf,data,10, Random(1),[output,attRange,outputDistributions]) print evaluator.toSummaryString() print evaluator.toClassDetailsString() print evaluator.toMatrixString() return [evaluator.weightedPrecision(),evaluator.weightedRecall(),evaluator.weightedFMeasure(),evaluator.weightedMatthewsCorrelation(),evaluator.weightedFalseNegativeRate(),evaluator.weightedFalsePositiveRate(),evaluator.weightedTruePositiveRate(),evaluator.weightedTrueNegativeRate(),evaluator.weightedAreaUnderROC()]
def myGridSearch(data,NTreeBounds,NFeaturesBounds): best_acc = -float('inf') bestrandomforest = None class bestValues(object): t = float('nan') f = float('nan') for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]): for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]): randomforest = RandomForest() randomforest.setNumTrees(int(t)) randomforest.setNumFeatures(int(f)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestrandomforest = randomforest best_acc = acc bestValues.t = t bestValues.f = f print "Best accuracy:", best_acc print "Best values: NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f print "-----------------------------------------" return bestrandomforest, bestValues.t, bestValues.f, best_acc
def readFeature(num_features,type,select_feature,numtrees): #filename1=resultFileTest #filename2=resultFileTest2 filename1=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_train.csv' filename2=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_test.csv' #print filename1 loader=CSVLoader() loader.setSource(File(filename1)) data=loader.getDataSet() #print data.numAttributes() data.setClassIndex(data.numAttributes()-1) rf=RF() rf.setNumTrees(numtrees) rf.buildClassifier(data) #print rf loader.setSource(File(filename2)) test_data=Instances(loader.getDataSet()) test_data.setClassIndex(test_data.numAttributes()-1) ''' num=test_data.numInstances() print num for i in xrange(num): r1=rf.distributionForInstance(test_data.instance(i)) r2=rf.classifyInstance(test_data.instance(i)) ptrixrint r1 print r2''' buffer = StringBuffer() # buffer for the predictions output=PlainText() output.setHeader(test_data) output.setBuffer(buffer) attRange = Range() # attributes to output outputDistribution = Boolean(True) evaluator=Evaluation(data) evaluator.evaluateModel(rf,test_data,[output,attRange,outputDistribution]) #print evaluator.evaluateModel(RF(),['-t',filename1,'-T',filename2,'-I',str(numtrees)]) #evaluator1=Evaluation(test_data) print evaluator.toSummaryString() print evaluator.toClassDetailsString() print evaluator.toMatrixString() return [evaluator.precision(1),evaluator.recall(1),evaluator.fMeasure(1),evaluator.matthewsCorrelationCoefficient(1),evaluator.numTruePositives(1),evaluator.numFalsePositives(1),evaluator.numTrueNegatives(1),evaluator.numFalseNegatives(1),evaluator.areaUnderROC(1)]
def RandomForest_ParamFinder(data): # possible set for Number of trees NTreeBounds = [1,20,1] # possible set for number of features NFeaturesBounds = [0,20,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) randomforest = RandomForest() gridsearch.setClassifier(randomforest) gridsearch.setXProperty(String('classifier.numTrees')) gridsearch.setYProperty(String('classifier.numFeatures')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(NTreeBounds[0]) gridsearch.setXMax(NTreeBounds[1]) gridsearch.setXStep(NTreeBounds[2]) gridsearch.setYMin(NFeaturesBounds[0]) gridsearch.setYMax(NFeaturesBounds[1]) gridsearch.setYStep(NFeaturesBounds[2]) gridsearch.setYBase(10) print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestrandomforest = RandomForest() bestrandomforest.setNumTrees(int(bestValues.x)) bestrandomforest.setNumFeatures(int(bestValues.y)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y OptRndFrst = bestrandomforest OptRndFrstp1 = bestValues.x OptRndFrstp2 = bestValues.y OptRndFrstAcc = acc else: OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \ ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc) print "-----------------------------------------" return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
def main():
    """Train a 100-tree random forest on Data/train.csv and write
    predictions for Data/test.csv to Data/submission2.csv.
    """
    # create the training & test sets, skipping the header row with [1:]
    dataset = genfromtxt(open('Data/train.csv', 'r'), delimiter=',', dtype='f8')[1:]
    target = [x[0] for x in dataset]   # first column is the label
    train = [x[1:] for x in dataset]   # remaining columns are features
    test = genfromtxt(open('Data/test.csv', 'r'), delimiter=',', dtype='f8')[1:]

    # create and train the random forest
    # multi-core CPUs can use: rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
    # fixed: original did "rf = RandomForest.setNumTrees(100)" (setter returns
    # None, so rf was None) followed by the nonexistent "rf.Evaluation(...)";
    # the sklearn usage below matches the comment's own template.
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(train, target)

    savetxt('Data/submission2.csv', rf.predict(test), delimiter=',', fmt='%f')
def random_forest(trainData,testData,params,exparams): numTrees = int(float(params[0])) numFeatures = int(float(params[1])) randomforest = RandomForest() randomforest.setNumTrees(numTrees) randomforest.setNumFeatures(numFeatures) randomforest.buildClassifier(trainData) # only a trained classifier can be evaluated # evaluate it on the training evaluation = Evaluation(trainData) (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution]) print "--> Evaluation:\n" print evaluation.toSummaryString() trainSummary = makeTrainEvalSummary(evaluation) # evaluate it on testing evaluation = Evaluation(testData) (testOutput, testBuffer) = util.get_buffer_for_predictions(testData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution]) return trainBuffer, testBuffer, trainSummary
}] test = [{"sex": "f", "subject": "Phil"}, {"sex": "m", "subject": "CS"}] numericAttributes = [] classAttr = "subject" tree = DecisionTree(numericAttributes) for i in inst: tree.addInstance(i) tree.learn(classAttr, unpruned=True, minNumObj=0) svm = SVM(numericAttributes) for i in inst: svm.addInstance(i) svm.learn(classAttr) ada = AdaBoost(numericAttributes) for i in inst: ada.addInstance(i) ada.learn(classAttr) forest = RandomForest(numericAttributes) for i in inst: forest.addInstance(i) forest.learn(classAttr) for j, model in enumerate((tree, svm, ada, forest)): print "\nmodel", j for i in test: #del i[classAttr] print model.classify(i)
if (not (len(sys.argv) == 2)): print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." file = FileReader(sys.argv[1]) data = Instances(file) # set the class Index - the index of the dependent variable data.setClassIndex(data.numAttributes() - 1) # define the algorithms to be used. algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(), 'BayesNet'), (J48(), 'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'), (RandomForest(), 'RandomForest'), (AdaBoostM1(), 'AdaBoostM1'), (MultilayerPerceptron(), 'MultilayerPerceptron'), (LibSVM(), 'LibSVM')] algo_dict = dict([(x[1], x[0]) for x in algo_list]) algo_keys = [ 'NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar', 'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron' ] # example to set kernal type on libsvm. Default is 2 #algo = algo_dict['LibSVM'] #tag = SelectedTag("1",algo.TAGS_KERNELTYPE) # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid #algo.setKernelType(tag) # train classifiers but filter out the name column first print "Training classifiers..."
{"sex":"m", "subject":"CS"} ] test = [ {"sex":"f", "subject":"Phil"}, {"sex":"m", "subject":"CS"} ] numericAttributes=[] classAttr = "subject" tree = DecisionTree(numericAttributes) for i in inst: tree.addInstance(i) tree.learn(classAttr, unpruned=True, minNumObj=0) svm = SVM(numericAttributes) for i in inst: svm.addInstance(i) svm.learn(classAttr) ada = AdaBoost(numericAttributes) for i in inst: ada.addInstance(i) ada.learn(classAttr) forest = RandomForest(numericAttributes) for i in inst: forest.addInstance(i) forest.learn(classAttr) for j,model in enumerate((tree, svm, ada, forest)): print "\nmodel", j for i in test: #del i[classAttr] print model.classify(i)
# check commandline parameters if (not (len(sys.argv) == 2)): print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." file = FileReader(sys.argv[1]) data = Instances(file) # set the class Index - the index of the dependent variable data.setClassIndex(data.numAttributes() - 1) # define the algorithms to be used. algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(),'BayesNet'), (J48(),'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'), (RandomForest(), 'RandomForest'), (AdaBoostM1(),'AdaBoostM1'), (MultilayerPerceptron(),'MultilayerPerceptron'), (LibSVM(), 'LibSVM')] algo_dict = dict([(x[1], x[0]) for x in algo_list]) algo_keys = ['NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar', 'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron'] # example to set kernal type on libsvm. Default is 2 algo = algo_dict['LibSVM'] tag = SelectedTag("1",algo.TAGS_KERNELTYPE) # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid algo.setKernelType(tag) # train classifiers print "Training classifiers..." for key in algo_keys : algo = algo_dict[key] algo.buildClassifier(data)
def run(basename,train_filename,test_filename, num_trees=100,tree_depth=0,class_index=0): with timer.Timer("loading data"): training = read_dataset(train_filename,class_index=class_index) testing = read_dataset(test_filename,class_index=class_index) """ print "====== naive Bayes =====" with timer.Timer("training"): nb = NaiveBayes() nb.buildClassifier(training) with timer.Timer("testing"): eval_training = evaluate_dataset(nb,training) eval_testing = evaluate_dataset(nb,testing) print "=== evaluation (training):" print eval_training.toSummaryString() print "=== evaluation (testing):" print eval_testing.toSummaryString() """ print "====== random forest =====" with timer.Timer("training"): rf = RandomForest() #rf.setOptions([ # u'-P', u'100', u'-I', u'100', u'-num-slots', u'1', u'-K', u'0', u'-M', u'1.0', u'-V', u'0.001', u'-S', u'1', # u'-num-decimal-places', u'6' #]) rf.setNumIterations(num_trees) if tree_depth: rf.setMaxDepth(tree_depth) rf.buildClassifier(training) with timer.Timer("testing"): eval_training = evaluate_dataset(rf,training) eval_testing = evaluate_dataset(rf,testing) print "=== evaluation (training):" print eval_training.toSummaryString() print "=== evaluation (testing):" print eval_testing.toSummaryString() #print rf.getmembers() num_classifiers = len(rf.m_Classifiers) for i,tree in enumerate(rf.m_Classifiers): options_arr = tree.getOptions() options_arr_python = [x for x in options_arr] options_arr_python += [u'-num-decimal-places',u'6'] tree.setOptions(options_arr_python) #print tree.toString() #binarize(tree) filename = basename % i with open(filename,"w") as f: f.writelines(tree.graph()) correct,incorrect = 0,0 for instance in testing: pos,neg = 0,0 for tree in rf.m_Classifiers: #print tree.classifyInstance(instance) if tree.classifyInstance(instance) >= 0.5: pos += 1 else: neg += 1 my_label = 1.0 if pos >= neg else 0.0 if my_label == instance.classValue(): correct += 1 else: incorrect += 1 print " trees : %d" % num_trees print "--- evaluating 
majority vote on random forest:" print " correct : %d" % correct print "incorrect : %d" % incorrect