def baggedDecTree(depth, dataSet):
    """Train one decision tree on a bagged training set and pickle it.

    Builds the data vectors for ``dataSet`` using information-gain feature
    selection, passes the training vectors through ``createBag``, grows a
    tree with ``createTree`` and writes the pickled tree into the
    ``baggedTrees/`` directory under the hard-coded base directory.  A
    one-line timing record is then appended to ``baggedTrees.op``.

    Args:
        depth: Depth handed to ``createTree``; also embedded in the pickle
            file name and in the log line.
        dataSet: Data set identifier; coerced with ``int()`` before use.

    Side effects:
        Rebinds the module-level ``liFeatures``, ``trainVectors`` and
        ``testVectors``; writes one pickle file and appends one log line.
    """
    # Only the names that are actually assigned below need a global
    # declaration.
    global liFeatures, trainVectors, testVectors

    startTime = time.time()
    baseDir = "/N/u/hydargah/BigRed2/ml/"
    dataSet = int(dataSet)

    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)
    baggedTrainVectors = createBag(trainVectors)
    tree = createTree(baggedTrainVectors, depth)

    # Directory that stores all the bagged trees.
    bagTreeDir = baseDir + "baggedTrees/"

    # NOTE(review): the random prefix has only 50 possible values and the
    # file is opened with "wb", so two runs with the same dataSet/depth can
    # silently overwrite each other's tree.
    randomNumber = random.randrange(0, 50, 1)
    treePath = (bagTreeDir + str(randomNumber) + "-" + str(dataSet) +
                "Tree" + str(depth) + ".pickle")

    # Context managers close the files even if dump()/write() raises.
    with open(treePath, "wb") as treePickleFile:
        pickle.dump(tree, treePickleFile)

    with open(baseDir + "baggedTrees.op", "a") as outputFile:
        outputFile.write("Tree with depth " + str(depth) +
                         " written in time " +
                         str(time.time() - startTime) + "\n")
def baggedDecTree(depth, dataSet):
    """Build, pickle and log a single bagged decision tree.

    Steps, in order:
      1. Build feature list / train / test vectors for ``int(dataSet)``
         with ``main2.FeatureSelection.InformationGain``.
      2. Feed the training vectors to ``createBag`` and grow a tree of the
         given ``depth`` from the result with ``createTree``.
      3. Pickle the tree to
         ``baggedTrees/{random 0-49}-{dataSet}Tree{depth}.pickle`` below the
         hard-coded base directory.
      4. Append "Tree with depth ... written in time ..." to
         ``baggedTrees.op`` in the base directory.

    Args:
        depth: Tree depth passed through to ``createTree``.
        dataSet: Data set identifier (anything ``int()`` accepts).

    Side effects:
        Overwrites the module globals ``liFeatures``, ``trainVectors`` and
        ``testVectors`` and writes to the two files described above.
    """
    global liFeatures, trainVectors, testVectors

    startTime = time.time()
    baseDir = "/N/u/hydargah/BigRed2/ml/"
    dataSet = int(dataSet)

    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)
    tree = createTree(createBag(trainVectors), depth)

    logPath = baseDir + "baggedTrees.op"
    # NOTE(review): only 50 distinct prefixes exist, so name collisions
    # (and therefore overwritten trees) are possible across runs.
    randomNumber = random.randrange(0, 50, 1)
    pickleName = "%s-%sTree%s.pickle" % (randomNumber, dataSet, depth)
    picklePath = baseDir + "baggedTrees/" + pickleName

    # The log is opened first, as before, so an unwritable log location
    # fails before any tree file is created.  Both handles are released by
    # the with-blocks even when an exception interrupts the work.
    with open(logPath, "a") as outputFile:
        with open(picklePath, "wb") as treePickleFile:
            pickle.dump(tree, treePickleFile)
        elapsed = time.time() - startTime
        outputFile.write("Tree with depth " + str(depth) +
                         " written in time " + str(elapsed) + "\n")
def runDecTree(dataSet, featureSelectionMethod):
    """Train a decision tree per depth and print an accuracy/rate table.

    Builds the data vectors for ``dataSet``, then for every depth in the
    sweep grows a tree on the training vectors, classifies the first 20
    test vectors and records accuracy plus the four values returned by
    ``TPTNRates`` (printed under the headings TP, TN, FP, FN).  The table
    and the total runtime are printed to stdout.

    Args:
        dataSet: Data set identifier forwarded to
            ``main2.buildDataVectors``.
        featureSelectionMethod: Currently unused (see note below).

    Side effects:
        Rebinds the module-level ``liFeatures``, ``trainVectors`` and
        ``testVectors``.
    """
    global liFeatures, trainVectors, testVectors

    startTime = time.time()

    # NOTE(review): featureSelectionMethod is accepted but ignored;
    # information gain is always used.  Confirm with callers before
    # wiring the argument through.
    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)

    depthAccuracyTPTN = []
    # NOTE(review): the sweep covers only depth 0 and only the first 20
    # test vectors -- this looks like a debugging restriction.
    for d in range(0, 1):
        tree = createTree(trainVectors, depth=d)
        results = classifyNewSample(tree, testVectors[:20], depth=d)
        accuracy = resultsToAccuracy(results)
        tptnRates = TPTNRates(results)
        depthAccuracyTPTN.append((d, accuracy, tptnRates))

    # Single-argument print(...) produces the same output under Python 2's
    # print statement and Python 3's print function.
    print("")
    print("")
    print("")
    print("Depth\t Accuracy\t TP\t TN\t FP\t FN")
    for d, accuracy, tptnRates in depthAccuracyTPTN:
        columns = [d, accuracy,
                   tptnRates[0], tptnRates[1], tptnRates[2], tptnRates[3]]
        print("\t ".join([str(value) for value in columns]))

    totalTime = time.time() - startTime
    print("Total Runtime = " + str(totalTime))
def runDecTree(dataSet, featureSelectionMethod):
    """Evaluate decision trees over a depth sweep and report the results.

    For each depth in the sweep a tree is grown from the training vectors
    with ``createTree`` and scored on ``testVectors[:20]`` via
    ``classifyNewSample``, ``resultsToAccuracy`` and ``TPTNRates``.  A
    tab-separated table (Depth, Accuracy, TP, TN, FP, FN) and the elapsed
    wall-clock time are printed.

    Args:
        dataSet: Data set identifier passed to ``main2.buildDataVectors``.
        featureSelectionMethod: Not used by the current implementation;
            ``main2.FeatureSelection.InformationGain`` is hard-coded.
            NOTE(review): confirm whether this is intentional.

    Side effects:
        Assigns the module globals ``liFeatures``, ``trainVectors`` and
        ``testVectors``.
    """
    global liFeatures, trainVectors, testVectors

    startTime = time.time()
    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)

    def evaluateDepth(d):
        # Same call order as before: build, classify, accuracy, rates.
        tree = createTree(trainVectors, depth=d)
        results = classifyNewSample(tree, testVectors[:20], depth=d)
        accuracy = resultsToAccuracy(results)
        return (d, accuracy, TPTNRates(results))

    # NOTE(review): range(0, 1) evaluates depth 0 only, and only 20 test
    # vectors are classified; presumably left over from debugging.
    depthAccuracyTPTN = [evaluateDepth(d) for d in range(0, 1)]

    # Three blank lines separate the table from earlier output.  print()
    # with one argument behaves identically on Python 2 and Python 3.
    for _ in range(3):
        print("")
    print("Depth\t Accuracy\t TP\t TN\t FP\t FN")
    for row in depthAccuracyTPTN:
        rates = row[2]
        cells = (str(row[0]), str(row[1]), str(rates[0]), str(rates[1]),
                 str(rates[2]), str(rates[3]))
        print("\t ".join(cells))

    print("Total Runtime = " + str(time.time() - startTime))
def _loadBaggedTrees(baggedDir, marker):
    """Unpickle every file in *baggedDir* whose name contains *marker*."""
    trees = []
    # os.listdir returns names in arbitrary order; sorting makes the
    # selection of trees reproducible across runs and file systems.
    for fileName in sorted(os.listdir(baggedDir)):
        if marker in fileName:
            # NOTE: pickle.load can execute arbitrary code -- only load
            # tree files that this project wrote itself.
            with open(baggedDir + fileName, "rb") as pickleFile:
                trees.append(pickle.load(pickleFile))
    return trees


def _majorityVote(votes):
    """Reduce per-tree [predicted, expected] pairs to one bagged pair."""
    countPos = sum(1 for vote in votes if vote[0] == "POSITIVE")
    countNeg = sum(1 for vote in votes if vote[0] == "NEGATIVE")
    # Every vote is for the same test vector, so they are presumed to carry
    # the same expected label; the last one is used.
    expectedValue = votes[-1][1] if votes else None

    if countPos == countNeg:
        prediction = coinToss()  # tie: decided by coinToss()
    elif countPos > countNeg:
        prediction = "POSITIVE"
    else:
        prediction = "NEGATIVE"
    return [prediction, expectedValue]


def _printReport(title, accuracy, tptnRates):
    """Print *title*, the column header and one tab-separated result row."""
    print(title)
    print("Accuracy\t TP\t TN\t FP\t FN")
    print("\t".join([str(accuracy), str(tptnRates[0]), str(tptnRates[1]),
                     str(tptnRates[2]), str(tptnRates[3])]))


def main():
    """Compare one bagged tree against a majority vote of up to nine trees.

    Loads every pickled tree in ``baggedTrees/`` whose file name contains
    "Tree20", builds the test vectors for data set 1, and prints accuracy
    and the four ``TPTNRates`` values twice: once for the tree at index 6
    alone and once for the majority vote of the first nine trees (ties are
    settled by ``coinToss``).

    Raises:
        IndexError: If fewer than seven matching trees are found.
    """
    baseDir = ""
    baggedDir = baseDir + "baggedTrees/"

    # Load all the bagged trees into main memory.
    listOfTrees = _loadBaggedTrees(baggedDir, "Tree20")

    # Only the test vectors are needed here.
    _, _, testVectors = main2.buildDataVectors(
        1, main2.FeatureSelection.InformationGain)

    singleTree = listOfTrees[6]
    singleResults = [
        decTree.classifyNewSample(singleTree, [vec], depth=20)[0]
        for vec in testVectors
    ]
    _printReport("Results for a single tree ",
                 decTree.resultsToAccuracy(singleResults),
                 decTree.TPTNRates(singleResults))

    # Slicing (rather than indexing 0..8) keeps the vote working when only
    # seven or eight trees are available.
    ensemble = listOfTrees[:9]
    allPredictions = []
    for vec in testVectors:
        votes = [
            decTree.classifyNewSample(tree, [vec], depth=20)[0]
            for tree in ensemble
        ]
        allPredictions.append(_majorityVote(votes))

    _printReport("Results for bagged Tree ",
                 decTree.resultsToAccuracy(allPredictions),
                 decTree.TPTNRates(allPredictions))
def main():
    """Report single-tree and bagged (majority-vote) test results.

    Every file in ``baggedTrees/`` whose name contains "Tree20" is
    unpickled.  The test vectors of data set 1 are classified one at a
    time, first by the tree at index 6 only, then by the first nine trees
    whose POSITIVE/NEGATIVE votes are tallied (a tie is resolved by
    ``coinToss``).  Accuracy and the four ``TPTNRates`` values are printed
    for both runs.

    Raises:
        IndexError: If fewer than seven matching tree files exist.
    """
    baseDir = ""
    baggedDir = baseDir + "baggedTrees/"

    # Load all the bagged trees into main memory.  Names are sorted because
    # os.listdir gives no ordering guarantee, which would otherwise make
    # "tree 6" and "the first nine trees" vary between runs.
    listOfTrees = []
    for f in sorted(os.listdir(baggedDir)):
        if "Tree20" not in f:
            continue
        # NOTE: unpickling runs arbitrary code; the directory must contain
        # only files produced by this project.
        with open(baggedDir + f, "rb") as pickleFile:
            listOfTrees.append(pickle.load(pickleFile))

    # The feature list and training vectors are not needed here.
    testVectors = main2.buildDataVectors(
        1, main2.FeatureSelection.InformationGain)[2]

    # --- single tree -----------------------------------------------------
    tree = listOfTrees[6]
    biglist = []
    for vec in testVectors:
        biglist.append(decTree.classifyNewSample(tree, [vec], depth=20)[0])
    accuracy = decTree.resultsToAccuracy(biglist)
    tptnRates = decTree.TPTNRates(biglist)
    # One-argument print(...) is output-identical on Python 2 and 3.
    print("Results for a single tree ")
    print("Accuracy\t TP\t TN\t FP\t FN")
    print(str(accuracy) + "\t" + str(tptnRates[0]) + "\t" +
          str(tptnRates[1]) + "\t" + str(tptnRates[2]) + "\t" +
          str(tptnRates[3]))

    # --- bagged vote -----------------------------------------------------
    # A slice tolerates seven or eight available trees, where indexing
    # 0..8 would raise IndexError.
    votingTrees = listOfTrees[:9]
    allPredictions = []
    for vec in testVectors:
        countPos = 0
        countNeg = 0
        expectedValue = None
        for votingTree in votingTrees:
            vote = decTree.classifyNewSample(votingTree, [vec], depth=20)[0]
            if vote[0] == "POSITIVE":
                countPos += 1
            elif vote[0] == "NEGATIVE":
                countNeg += 1
            # Presumably identical for every tree, since all votes concern
            # the same test vector; the last value wins.
            expectedValue = vote[1]

        if countPos > countNeg:
            prediction = "POSITIVE"
        elif countPos < countNeg:
            prediction = "NEGATIVE"
        else:
            prediction = coinToss()
        allPredictions.append([prediction, expectedValue])

    accuracy = decTree.resultsToAccuracy(allPredictions)
    tptnRates = decTree.TPTNRates(allPredictions)
    print("Results for bagged Tree ")
    print("Accuracy\t TP\t TN\t FP\t FN")
    print(str(accuracy) + "\t" + str(tptnRates[0]) + "\t" +
          str(tptnRates[1]) + "\t" + str(tptnRates[2]) + "\t" +
          str(tptnRates[3]))