def main():
    """Build the MONK trees and plot mean/variance curves for MONK1 and MONK3.

    For each of the two plotted datasets ``plotMonk`` is called twice with
    the same data: once with flag ``True`` (mean-error title) and once with
    flag ``False`` (variance title). All figures are shown at the end.
    """
    # Full trees for every training set; the results are not used below.
    tree1 = dtree.buildTree(m.monk1, m.attributes)
    tree2 = dtree.buildTree(m.monk2, m.attributes)
    tree3 = dtree.buildTree(m.monk3, m.attributes)

    plot_jobs = (
        (
            m.monk1,
            m.monk1test,
            "Mean error vs. Fraction - MONK1\n500 runs in each batch",
            "Variance vs. Fraction - MONK1\n500 runs in each batch",
        ),
        (
            m.monk3,
            m.monk3test,
            "Mean error vs. Fraction - MONK3\n500 runs in each batch",
            "Variance vs. Fraction - MONK3\n500 runs in each batch",
        ),
    )
    for dataset, testdata, mean_title, variance_title in plot_jobs:
        # Benchmark tree on the whole training set (built but not reported).
        benchmark_tree = dtree.buildTree(dataset, m.attributes)
        plotMonk(dataset, testdata, mean_title, True)
        plotMonk(dataset, testdata, variance_title, False)

    plt.show()
def calculate_best(Td, Vd):
    """Greedily prune a tree grown from ``Td`` using ``Vd`` as validation data.

    Starting from the unpruned tree, each round scores every tree returned
    by ``tree.allPruned`` for the current best tree with ``tree.check`` on
    ``Vd`` and moves to the highest-scoring candidate if it is strictly
    better. The search stops when a round brings no improvement.

    The original version computed the pruned candidates only once, so it
    could never look beyond a single pruning step, and it ignored the
    score of the unpruned tree. Both are fixed here.

    Returns:
        ``(score, tree)`` where ``score`` is ``tree.check(tree, Vd)`` of the
        returned tree. Note that the first element is the check score (the
        quantity being maximised), despite the original variable name
        ``error``.
    """
    best_tree = tree.buildTree(Td, m.attributes)
    best_score = tree.check(best_tree, Vd)

    while True:
        improved = False
        # The candidate list is evaluated once per round, so rebinding
        # best_tree inside the loop does not change what is iterated.
        for candidate in tree.allPruned(best_tree):
            score = tree.check(candidate, Vd)
            if score > best_score:
                best_score = score
                best_tree = candidate
                improved = True
        if not improved:
            break

    return best_score, best_tree
def assignment5_id3():
    """Print ``1 - d.check(...)`` on the test set for the MONK-1, -2 and -3 trees."""
    monk_pairs = (
        (m.monk1, m.monk1test),
        (m.monk2, m.monk2test),
        (m.monk3, m.monk3test),
    )
    for train, test in monk_pairs:
        full_tree = d.buildTree(train, m.attributes)
        print(1 - d.check(full_tree, test))
def build_tree():
    """Print a table of ``1 - dt.check`` on train and test data for MONK-1..3.

    The original used Python 2 print statements, which are syntax errors
    on Python 3. Each one is now a single-argument ``print(...)`` call,
    which produces the same output on both interpreters.
    """
    print("\n------------------------------\nAssignment 3 - Error\n------------------------------")
    print("Dataset\tE(train)\tE(test)")
    monks = (
        ("Monk1", data.monk1, data.monk1test),
        ("Monk2", data.monk2, data.monk2test),
        ("Monk3", data.monk3, data.monk3test),
    )
    for label, train, test in monks:
        tree = dt.buildTree(train, data.attributes)
        train_error = 1 - dt.check(tree, train)
        test_error = 1 - dt.check(tree, test)
        print("%s:\t%.6f\t%.6f" % (label, train_error, test_error))
def bldTree():
    """Build one tree per MONK training set and print its check score on train and test data."""
    tree_monk1 = dtree.buildTree(mdata.monk1, mdata.attributes)
    tree_monk2 = dtree.buildTree(mdata.monk2, mdata.attributes)
    tree_monk3 = dtree.buildTree(mdata.monk3, mdata.attributes)

    # (label, tree, samples) in the order the lines are printed.
    reports = (
        ('MONK1 Performance on training set', tree_monk1, mdata.monk1),
        ('MONK1 Performance on test set', tree_monk1, mdata.monk1test),
        ('MONK2 Performance on training set', tree_monk2, mdata.monk2),
        ('MONK2 Performance on test set', tree_monk2, mdata.monk2test),
        ('MONK3 Performance on training set', tree_monk3, mdata.monk3),
        ('MONK3 Performance on test set', tree_monk3, mdata.monk3test),
    )
    for label, fitted, samples in reports:
        print(label, dtree.check(fitted, samples))
def ASSIGNMENT5():
    """For each MONK set, print the tree's check score on test data, then on training data."""
    monk_pairs = (
        (m.monk1, m.monk1test),
        (m.monk2, m.monk2test),
        (m.monk3, m.monk3test),
    )
    for train, test in monk_pairs:
        fitted = dtree.buildTree(train, m.attributes)
        print(dtree.check(fitted, test))
        print(dtree.check(fitted, train))
def build_and_check_trees():
    """Print ``1 - d.check`` for the three MONK trees: first on training data, then on test data."""
    train_sets = (m.monk1, m.monk2, m.monk3)
    test_sets = (m.monk1test, m.monk2test, m.monk3test)

    # All trees are built before anything is printed.
    trees = [d.buildTree(train, m.attributes) for train in train_sets]

    for fitted, samples in zip(trees, train_sets):
        print(1 - d.check(fitted, samples))
    for fitted, samples in zip(trees, test_sets):
        print(1 - d.check(fitted, samples))
def ASSIGNMENT_5():
    """Print one line per MONK set with ``1 - check`` on training and on test data."""
    print(" ")
    print("ASSIGNMENT(5)")
    print("ERROR:")

    cases = (
        ("MONK-1 %f %f", m.monk1, m.monk1test),
        ("MONK-2 %f %f", m.monk2, m.monk2test),
        ("MONK-3 %f %f", m.monk3, m.monk3test),
    )
    for template, train, test in cases:
        fitted = buildTree(train, m.attributes)
        train_error = 1 - check(fitted, train)
        test_error = 1 - check(fitted, test)
        print(template % (train_error, test_error))
def a5():
    """Print the ``d.check`` score on training and test data for each MONK tree."""
    cases = (
        (m.monk1, m.monk1test, "Accuracy monk 1 train", "Accuracy monk 1 test"),
        (m.monk2, m.monk2test, "Accuracy monk 2 train", "Accuracy monk 2 test"),
        (m.monk3, m.monk3test, "Accuracy monk 3 train", "Accuracy monk 3 test"),
    )
    for train, test, train_label, test_label in cases:
        fitted = d.buildTree(train, m.attributes)
        # Label and value are concatenated without a separator, as before.
        print(train_label + str(d.check(fitted, train)))
        print(test_label + str(d.check(fitted, test)))
def optimisePartitions1():
    """Collect pruned-tree test errors on MONK-1 for every fraction in ``partitions``.

    Reads the module-level ``partitions`` and ``runs`` and appends to the
    module-level lists ``bigList1``, ``errorList1`` and ``varianceList1``.

    Returns:
        ``(errorList1, varianceList1)``.
    """
    full_tree = d.buildTree(m.monk1, m.attributes)
    full_score = d.check(full_tree, m.monk1test)
    print("Performance of monk1 tree: " + str(full_score) + "\n")

    for position, fraction in enumerate(partitions):
        for _ in range(runs):
            train_part, val_part = d.partition(m.monk1, fraction)
            grown = d.buildTree(train_part, m.attributes)
            pruned = bestPrunedTree(grown, val_part)
            bigList1.append(1 - d.check(pruned, m.monk1test))
        # NOTE(review): bigList1 is not reset between fractions here, so each
        # mean/variance also covers the runs of earlier fractions -- confirm
        # whether that is intended.
        errorList1.append(sum(bigList1) / len(bigList1))
        varianceList1.append(variance(bigList1, errorList1[position]))

    return errorList1, varianceList1
def optimisePartitions3():
    """Collect pruned-tree test errors on MONK-3 for every fraction in ``partitions``.

    Reads the module-level ``partitions`` and ``runs`` and appends to the
    module-level lists ``bigList3``, ``errorList3`` and ``varianceList3``.

    Returns:
        ``(errorList3, varianceList3)``.
    """
    full_tree = d.buildTree(m.monk3, m.attributes)
    full_score = d.check(full_tree, m.monk3test)
    print("Performance of monk3 tree: " + str(full_score) + "\n")

    for position, fraction in enumerate(partitions):
        for _ in range(runs):
            train_part, val_part = d.partition(m.monk3, fraction)
            grown = d.buildTree(train_part, m.attributes)
            pruned = bestPrunedTree(grown, val_part)
            bigList3.append(1 - d.check(pruned, m.monk3test))
        # NOTE(review): bigList3 is not reset between fractions here, so each
        # mean/variance also covers the runs of earlier fractions -- confirm
        # whether that is intended.
        errorList3.append(sum(bigList3) / len(bigList3))
        varianceList3.append(variance(bigList3, errorList3[position]))

    return errorList3, varianceList3
def A3():
    """Print check scores of the full MONK-1, -2 and -3 trees.

    MONK-1 is reported on test and training data, MONK-2 and MONK-3 on
    test data only; each group is followed by a blank separator.

    The original mixed ``print(...)`` with Python 2 ``print '\\n'``
    statements; all prints are now single-argument calls, which behave the
    same on Python 2 and Python 3.
    """
    t1 = dT.buildTree(m.monk1, m.attributes)
    print(dT.check(t1, m.monk1test))
    print(dT.check(t1, m.monk1))
    print('\n')
    # draw.drawTree( t1 )

    t2 = dT.buildTree(m.monk2, m.attributes)
    print(dT.check(t2, m.monk2test))
    print('\n')
    # draw.drawTree( t2 )

    t3 = dT.buildTree(m.monk3, m.attributes)
    print(dT.check(t3, m.monk3test))
    print('\n')
def A3():
    """Print check scores of the full MONK-1, -2 and -3 trees.

    MONK-1 is reported on test and training data, MONK-2 and MONK-3 on
    test data only; each group is followed by a blank separator.

    The original mixed ``print(...)`` with Python 2 ``print '\\n'``
    statements; all prints are now single-argument calls, which behave the
    same on Python 2 and Python 3.
    """
    separator = '\n'

    t1 = dT.buildTree(m.monk1, m.attributes)
    print(dT.check(t1, m.monk1test))
    print(dT.check(t1, m.monk1))
    print(separator)

    # MONK-2 and MONK-3 only report the test-set score.
    for train, test in ((m.monk2, m.monk2test), (m.monk3, m.monk3test)):
        fitted = dT.buildTree(train, m.attributes)
        print(dT.check(fitted, test))
        print(separator)
def check_pruning(data_set):
    """Score every one-step pruning of the tree grown on ``data_set.Train``.

    Each tree from ``d.allPruned`` is scored with ``d.check`` on
    ``data_set.Test``; the resulting tree -> score mapping is passed to
    ``key_with_maxval`` and its result is returned.
    """
    grown = d.buildTree(data_set.Train, m.attributes)
    scores = {
        candidate: d.check(candidate, data_set.Test)
        for candidate in d.allPruned(grown)
    }
    return key_with_maxval(scores)
def best_partition(full_dataset):
    """Try fractions 0.3 .. 0.8 and report the one whose pruned tree scores best.

    For every fraction the data is partitioned, a tree is grown and pruned
    via ``prune`` and the pruned tree is scored with ``check`` on
    ``test_set[k]`` (``test_set`` and ``k`` come from the enclosing module).

    Returns:
        ``(best_fraction, best_score, errors)`` where ``errors`` holds
        ``1 - score`` for each fraction in order. ``best_fraction`` stays
        ``None`` if no score exceeds 0.
    """
    best_score = 0
    best_fraction = None
    errors = []

    for step in range(6):
        fraction = (float(step) + 3) / 10  # 0.3, 0.4, ..., 0.8
        monk_train, monk_val = partition(full_dataset, fraction)

        # Only the pruned tree is needed; prune's first result is discarded.
        _, pruned_tree = prune(buildTree(monk_train, m.attributes), monk_val)

        score = check(pruned_tree, test_set[k])
        errors.append(1 - score)

        if score > best_score:
            best_score = score
            best_fraction = fraction

    return best_fraction, best_score, errors
def findBestPrunedTree(originalTrainSet, fraction):
    """
    Prune the tree grown on a partition of ``originalTrainSet.dataset``.

    The data is split with ``partition``; the tree grown on the first part
    is repeatedly replaced by the best of its ``d.allPruned`` variants (as
    chosen by ``getBestPerformingTree`` on the second part) for as long as
    that variant scores at least as well as the current tree.

    Returns ``(tree, score)`` for the last accepted tree.
    """
    trainSet, validationSet = partition(originalTrainSet.dataset, fraction)
    current = d.buildTree(trainSet, m.attributes)
    currentScore = d.check(current, validationSet)
    print("Pruning " + originalTrainSet.name + " with fraction = " +
          str(fraction) + " and performance on new validation set = " +
          str(currentScore))

    candidates = d.allPruned(current)
    while len(candidates) != 0:
        challenger, challengerScore = getBestPerformingTree(
            candidates, validationSet)
        if challengerScore >= currentScore:
            print("Found pruned tree which performed better: " +
                  str(challengerScore))
            current = challenger
            currentScore = challengerScore
            candidates = d.allPruned(current)
            continue
        print("All pruned trees perform worse. Stopping here.")
        return current, currentScore

    print("No more ways to prune tree. Returning.")
    return current, currentScore
def pruningTest(dataset, fraction):
    """Return ``1 - prune(...)`` for a tree grown on a partition of ``dataset``.

    ``prune`` receives the unpruned tree's ``dtree.check`` score on the
    validation part, the tree itself and the validation part.
    """
    monktrain, monkval = partition(dataset, fraction)
    grown = dtree.buildTree(monktrain, m.attributes)
    start_ratio = dtree.check(grown, monkval)
    return 1 - prune(start_ratio, grown, monkval)
def test_error_per_partition(monk, monktest):
    """Plot the mean test error of pruned trees for fractions 0.3 .. 0.8.

    For every fraction, 600 random partitions of ``monk`` are pruned with
    ``getPrunedTree`` and scored on ``monktest``. The means are drawn as a
    scatter plot coloured by the standard deviation of each batch.
    """
    fraction_values = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    n_iters = 600
    test_error_mean = []
    test_error_std = []

    for fraction in fraction_values:
        print(fraction)
        batch_errors = []
        for _ in range(n_iters):
            monktrain, monkval = partition(monk, fraction)
            grown = d.buildTree(monktrain, m.attributes)
            pruned = getPrunedTree(grown, monkval)
            batch_errors.append(1 - d.check(pruned, monktest))
        print("all iters calculated")

        batch = np.array(batch_errors)
        test_error_mean.append(batch.mean())
        test_error_std.append(batch.std())

    plt.scatter(fraction_values, test_error_mean, c=test_error_std)
    cbar = plt.colorbar()
    cbar.set_label('Standard deviation', rotation=270, labelpad=30)
    plt.xlabel("fraction parameter")
    plt.ylabel("Average classification error (test set)")
    plt.show()
def calcNextTreeLevel():
    """Split MONK-1 on attribute index 4, then print and draw a tree.

    The four subsets and their ``mostCommon`` values are computed but not
    reported. The tree that is printed and drawn is grown from
    ``m.monk2test``.
    """
    split_attribute = m.attributes[4]
    subsets = [
        dtree.select(m.monk1, split_attribute, value) for value in (1, 2, 3, 4)
    ]
    # Majority class of each subset (kept for parity; not printed).
    majority_classes = [dtree.mostCommon(subset) for subset in subsets]

    grown = dtree.buildTree(m.monk2test, m.attributes)
    print(grown)
    draw.drawTree(grown)
def PRINT_TREE_AT_LEVEL_2():  # A5
    """Print a hand-built two-level view of the MONK-1 tree, then draw the real one.

    Level 1 is attribute index 4. For each of its four value subsets the
    best attribute is printed, followed by ``mostCommon`` for every value
    of that attribute.
    """
    # Number of values iterated for m.attributes[0] .. m.attributes[5].
    value_counts = (3, 3, 2, 3, 4, 2)

    print(" ")
    print("LEVEL 1:")
    print(m.attributes[4])
    subsets = [select(m.monk1, m.attributes[4], value) for value in range(1, 5)]

    print("LEVEL 2:")
    for subset in subsets:
        chosen = bestAttribute(subset, m.attributes)
        print(chosen)
        for index, count in enumerate(value_counts):
            if chosen == m.attributes[index]:
                for value in range(1, count + 1):
                    print(mostCommon(select(subset, chosen, value)))
        print(" ")

    drawTree(buildTree(m.monk1, m.attributes))
def getData1(iterations):
    """Run repeated pruning experiments on MONK-1 for fractions 0.3 .. 0.8.

    Returns:
        A list with one row per fraction; row ``f`` holds, for each of the
        ``iterations`` runs, ``dtree.check`` of the pruned tree on
        ``mdata.monk1test`` (the check score itself, not ``1 - score``).
    """
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    error = [[0] * iterations for _ in range(6)]

    for row, frac in enumerate(fraction):
        for run in range(iterations):
            monk1train, monk1val = partition(mdata.monk1, frac)
            monk1tree = dtree.buildTree(monk1train, mdata.attributes)

            # Keep stepping to a pruned variant that scores at least as well
            # on the validation part until the tree stops changing.
            while True:
                chosen = monk1tree
                for candidate in dtree.allPruned(monk1tree):
                    if dtree.check(candidate, monk1val) >= dtree.check(
                            chosen, monk1val):
                        chosen = candidate
                if chosen == monk1tree:
                    break
                monk1tree = chosen

            error[row][run] = dtree.check(monk1tree, mdata.monk1test)

    return error
def assignment4_p3(data, attributes, fraction):
    """Return ``1 - d.check`` on the validation part for the selected tree.

    The tree grown on the training part is handed to ``getPrunedChildren``
    together with its validation error. The first tree of the returned
    list is used, or the unpruned tree when that list is empty.
    """
    trainData, validData = partition(data, fraction)
    full_tree = d.buildTree(trainData, attributes)
    baseline_error = 1 - d.check(full_tree, validData)

    roots = [full_tree]
    survivors = getPrunedChildren(roots, baseline_error, validData)

    chosen = roots[0] if len(survivors) == 0 else survivors[0]
    return 1 - d.check(chosen, validData)
def pruneTree(train, validation, acc_desired):
    """Grow a tree on ``train`` and prune it for up to ``acc_desired`` rounds.

    ``acc_desired`` is used as the number of pruning rounds (the loop
    counter ``tt`` runs from 0 up to it). In every round the pruned variant
    with the highest ``d.check`` score on ``validation`` is selected (ties
    go to the later variant); the round's result is discarded in favour of
    the previous tree if the previous tree scores strictly higher.

    Returns the tree selected after the last round.
    """
    t = d.buildTree(train, m.attributes)
    accuracy = d.check(t, validation)
    accuracy_p = accuracy
    temp = t
    tt = 0
    while (tt < acc_desired):
        tt += 1
        # Remember the tree we started this round with.
        temp = t
        tlist = d.allPruned(t)
        # Best score seen among this round's candidates.
        accuracy_p = 0
        for i in range(0, len(tlist)):
            accuracy = d.check(tlist[i], validation)
            if (accuracy >= accuracy_p):
                accuracy_p = accuracy
                t = tlist[i]
        # Revert when pruning made the validation score worse.
        if (d.check(temp, validation) > d.check(t, validation)):
            t = temp
    """ print(t) print("Final accuracy: " + str(d.check(t, validation))) pyqt.drawTree(t) """
    return t
def prunedtree(data, fraction):
    """Grow a tree on a partition of ``data`` and report its validation score.

    Fixes two defects of the original:

    * it partitioned the module-level ``trainingset`` and ignored its own
      ``data`` argument;
    * its log line concatenated a string with the training partition
      itself, which raises ``TypeError`` before anything is returned.

    Returns:
        ``(tree, score)`` where ``score`` is ``d.check`` of the tree on the
        validation part. No pruning is performed.
    """
    trainset, validationSet = partition(data, fraction)
    tree = d.buildTree(trainset, m.attributes)
    bestTreeSoFar = tree
    bestPerformance = d.check(tree, validationSet)
    print("Pruning with fraction = " + str(fraction) +
          " and performance on new validationSet = " + str(bestPerformance))
    return bestTreeSoFar, bestPerformance
def pruning(data_set, fraction = 0.6):
    """Return a pruned decision tree grown from a partition of ``data_set``.

    Each round picks the pruned variant with the highest ``dtree.check``
    score on the validation part, provided that score is at least the
    current tree's score and greater than 0. Pruning stops when no variant
    qualifies.
    """
    data_train, data_val = partition(data_set, fraction)
    current = dtree.buildTree(data_train, m.attributes)
    current_score = dtree.check(current, data_val)

    while True:
        winner = None
        winner_score = 0
        for candidate in dtree.allPruned(current):
            score = dtree.check(candidate, data_val)
            if score >= current_score and score > winner_score:
                winner = candidate
                winner_score = score
        if winner is None:
            return current
        current = winner
        current_score = winner_score
def assignment7():
    """Collect mean and variance of pruned-tree test errors for MONK-1 and MONK-3.

    Returns:
        A list with one entry per dataset; each entry is a list of
        ``(mean, variance)`` tuples, one per fraction 0.3 .. 0.8, computed
        over 50 runs.
    """
    fractions = [i * .1 for i in range(3, 9)]
    runs = 50
    scores_numbers = []

    experiments = zip([m.monk1, m.monk3], [m.monk1test, m.monk3test])
    for dataset, testset in experiments:
        per_fraction = []
        for fraction in fractions:
            results = []
            for _ in range(runs):
                monktrain, monkval = partition(dataset, fraction)
                grown = dtree.buildTree(monktrain, m.attributes)
                # getTree also returns a score, which is not used here.
                pruned, _score = getTree(grown, monkval)
                results.append(1 - dtree.check(pruned, testset))
            per_fraction.append((mean(results), variance(results)))
        scores_numbers.append(per_fraction)

    return scores_numbers
def buildtree():
    """Print test and training error for a tree built on each entry of ``trainingset``.

    Entry ``i`` of the module-level ``trainingset`` is paired with entry
    ``i`` of ``testset``; both expose ``.dataset`` and ``.name``.
    """
    for i, train in enumerate(trainingset):
        tree = d.buildTree(train.dataset, m.attributes)
        performanceOnTrainData = d.check(tree, train.dataset)
        performanceOnTestData = d.check(tree, testset[i].dataset)
        print("Error of " + train.name + "on " + testset[i].name + ":" +
              str(1 - performanceOnTestData))
        print("Error of " + train.name + "on " + train.name + ":" +
              str(1 - performanceOnTrainData))
def assignment4_p1(data, attributes, fraction):
    """Prune a tree grown on a partition of ``data``, printing progress.

    Each round scores every ``d.allPruned`` variant of the current best
    tree on the validation part and moves to the variant with the lowest
    error if that error is strictly lower than the current one.

    Fixes two defects of the original loop:

    * once a round found no better variant after at least one successful
      pruning step, none of its exit conditions could become true and it
      looped forever;
    * the ``break`` path fell off the end of the function and returned
      ``None``.

    Returns:
        The best tree found; the unpruned tree if no variant improves on it.
    """
    trainData, validData = partition(data, fraction)
    bestTree = d.buildTree(trainData, attributes)
    bestErr = 1 - d.check(bestTree, validData)
    print("ORIGINAL ERR", bestErr)

    while True:
        prunedTrees = d.allPruned(bestTree)
        print(len(prunedTrees))

        roundTree = bestTree
        roundErr = bestErr
        for i, candidate in enumerate(prunedTrees):
            err = 1 - d.check(candidate, validData)
            print(i, err)
            if err < roundErr:
                roundErr = err
                roundTree = candidate
        print("Best Error Rate:", roundTree, roundErr)

        # No strictly better variant this round: pruning has converged.
        if roundTree is bestTree:
            return bestTree
        bestTree = roundTree
        bestErr = roundErr
def pruneTree(dataset, testSet):
    """Return the test error of the best one-step pruning for fractions 0.3 .. 0.8.

    For every fraction the tree grown on the training part is pruned by
    one step; the variant with the lowest validation error is drawn and
    its error on ``testSet`` is recorded.

    The original initialised the best tree to ``0``; when ``allPruned``
    produced no variants, that integer was passed to ``draw.drawTree`` and
    ``tree.check``. The unpruned tree is now the fallback.

    Returns:
        A list with one test error per fraction, in order.
    """
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    errorList = []

    for x in fractions:
        train, val = partition(dataset, x)
        theTree = tree.buildTree(train, data.attributes)

        # Fallback only: any pruned variant still replaces it, because the
        # starting error bound is above every possible error.
        bestTree = theTree
        theBest = 1000
        for t in tree.allPruned(theTree):
            error = 1 - tree.check(t, val)
            if error < theBest:
                theBest = error
                bestTree = t

        draw.drawTree(bestTree)
        errorList.append(1 - tree.check(bestTree, testSet))

    return errorList
def tests(pair):
    """Build a tree on ``pair[0]`` and return ``[pair[2], train score, test score]``.

    ``pair[0]`` is the training data and ``pair[1]`` the data for the
    second score; ``pair[2]`` is passed through unchanged.
    """
    fitted = dtree.buildTree(pair[0], monkdata.attributes)
    label = pair[2]
    train_score = dtree.check(fitted, pair[0])
    test_score = dtree.check(fitted, pair[1])
    return [label, train_score, test_score]
def gen_validate_data(monkset, monktest, fraction):
    """Return the test errors of 99 independently partitioned and pruned trees."""

    def one_run():
        # Fresh random partition, grow, prune on the validation part.
        train, valid = partition(monkset, fraction)
        grown = d.buildTree(train, m.attributes)
        pruned = get_pruned(grown, valid)
        return 1 - d.check(pruned, monktest)

    # range(1, 100) yields 99 runs, matching the original loop bounds.
    return [one_run() for _ in range(1, 100)]
def evaluate_fraction(data, fraction, monktest):
    """Return 2000 test errors of pruned trees for one partition fraction."""
    res = []
    for _ in range(2000):
        monktrain, monkval = partition(data, fraction)
        grown = dtree.buildTree(monktrain, monkdata.attributes)
        pruned = prune(grown, monkval)
        res.append(1 - dtree.check(pruned, monktest))
    return res
def evaluate_pruning():
    """Tabulate and plot pruning results for MONK-1 and MONK-3.

    ``prune_trees`` is run 100 times per dataset; the results are
    transposed so that each row collects one position across runs
    (presumably one row per fraction -- the rows are plotted against
    ``fractions``). Means and standard deviations are printed, the mean
    errors are compared with the error of the complete tree, and one
    figure per dataset is shown.

    Fix: the standard-deviation row for MONK-3 was labelled
    'MONK-1 - STDEV', producing two rows with the same label.
    """
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    monk1_pruned = []
    monk3_pruned = []
    for _ in range(100):
        monk1_pruned.append(prune_trees(m.monk1, m.monk1test))
        monk3_pruned.append(prune_trees(m.monk3, m.monk3test))

    # Transpose so that axis 1 runs over the 100 repetitions.
    monk1_pruned = np.transpose(monk1_pruned)
    monk3_pruned = np.transpose(monk3_pruned)
    mean1 = np.mean(monk1_pruned, axis=1)
    mean3 = np.mean(monk3_pruned, axis=1)
    std1 = np.std(monk1_pruned, axis=1)
    std3 = np.std(monk3_pruned, axis=1)

    stat_table = PrettyTable(['Dataset/Stat', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
    stat_rows = (
        ('MONK-1 - MEAN', mean1),
        ('MONK-3 - MEAN', mean3),
        ('MONK-1 - STDEV', std1),
        ('MONK-3 - STDEV', std3),
    )
    for label, values in stat_rows:
        stat_table.add_row(
            np.concatenate(([label], np.around(values, decimals=6)), axis=0))
    print(stat_table)

    complete_tree1 = dt.buildTree(m.monk1, m.attributes)
    complete_tree3 = dt.buildTree(m.monk3, m.attributes)
    prn_table = PrettyTable(['Dataset', 'Error on Complete Tree', 'Error on Pruned Tree (mean)'])
    prn_table.add_row(['MONK-1', 1 - dt.check(complete_tree1, m.monk1test), np.amin(mean1)])
    prn_table.add_row(['MONK-3', 1 - dt.check(complete_tree3, m.monk3test), np.amin(mean3)])
    print(prn_table)

    plt.plot(fractions, mean1, color='#49abc2', marker='o', label="Means")
    plt.title("Mean Error vs Fractions on MONK-1")
    plt.xlabel("Fractions")
    plt.ylabel("Means of Error")
    plt.legend(loc='upper right', frameon=False)
    plt.show()

    plt.plot(fractions, mean3, color='#fe5f55', marker='o', label="Means")
    plt.title("Mean Error vs Fractions on MONK-3")
    plt.xlabel("Fractions")
    plt.ylabel("Means of Error")
    plt.legend(loc='upper right', frameon=False)
    plt.show()
def ass3():
    """Print training and test error (``1 - dtree.check``) for the three MONK trees."""
    test_sets = [mdata.monk1test, mdata.monk2test, mdata.monk3test]
    train_sets = [mdata.monk1, mdata.monk2, mdata.monk3]

    for number, (dset, tset) in enumerate(zip(train_sets, test_sets), start=1):
        fitted = dtree.buildTree(dset, mdata.attributes)
        print("Training error for set " + str(number) + ": " +
              str(1 - dtree.check(fitted, dset)))
        print("Test error for set " + str(number) + ": " +
              str(1 - dtree.check(fitted, tset)))
def assignment4():
    """Print a table of pruned-tree test scores per fraction for MONK-1..3.

    Each row holds, for fractions 0.3 .. 0.8, the ``d.check`` score on the
    test data of the tree chosen by ``best_pruned`` (its first element),
    followed by the score of the unpruned tree grown on the whole training
    set as a benchmark.

    The original used Python 2 print statements, which are syntax errors
    on Python 3; they are now single-argument ``print(...)`` calls with the
    same output on both interpreters.
    """
    print("--- Assignment 4 ---")
    print("Selecting the best fraction to divide training and validation sets for pruning")
    table = Texttable(max_width=100)
    table.add_row(["Dataset", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "Benchmark"])

    for i in range(3):
        row = ["Monk-" + str(i + 1)]
        for frac in [(x * 0.1) for x in range(3, 9)]:
            train_set, valid_set = m.partition(monkdata[i], frac)
            base = d.buildTree(train_set, m.attributes)
            best = best_pruned(base, valid_set)
            row.append(d.check(best[0], testdata[i]))
        # Benchmark: unpruned tree grown on the complete training set.
        row.append(d.check(d.buildTree(monkdata[i], m.attributes), testdata[i]))
        table.add_row(row)

    print(table.draw())
    print("")
def getMean(data, testData, frac, iter):
    """Average ``pruneNow`` over ``iter`` random partitions of ``data``.

    Each round partitions ``data`` with ``frac``, grows a tree on the
    first part and adds ``pruneNow(tree, second part, testData)`` to the
    running total. Returns the total divided by ``iter``.
    """
    total = 0
    rounds_done = 0
    while rounds_done < iter:
        monktrain, monkval = partition(data, frac)
        grown = d.buildTree(monktrain, m.attributes)
        total = total + pruneNow(grown, monkval, testData)
        rounds_done += 1
    return total / iter
def getClasification(dataset, fraction):
    """Return the best one-step pruning of a tree grown on a partition of ``dataset``.

    Fixes two defects of the original:

    * the tree was grown on the *validation* part and the candidates were
      scored on the *training* part -- the two partitions were swapped;
    * ``bestTree`` was never bound when no candidate scored above 0, so
      the ``return`` raised ``UnboundLocalError``.

    Returns:
        ``(score, tree)``: the highest ``tree.check`` score on the
        validation part among the pruned variants and the variant that
        achieved it. If no variant scores above 0, the result is
        ``(0, unpruned tree)``.
    """
    monk1train, monk1val = partition(dataset, fraction)
    testTree = tree.buildTree(monk1train, m.attributes)

    bestTree = testTree
    pValue = 0
    for pruned in tree.allPruned(testTree):
        score = tree.check(pruned, monk1val)
        if score > pValue:
            bestTree = pruned
            pValue = score
    return pValue, bestTree
def assignment3():
    """Print a table with the ``d.check`` score on train and test data for MONK-1..3.

    The original used Python 2 print statements, which are syntax errors
    on Python 3; they are now single-argument ``print(...)`` calls with the
    same output on both interpreters.
    """
    print("--- Assignment 3 ---")
    print("Performance of the decision trees")
    table = Texttable(max_width=100)
    table.add_row(["Dataset", "Training", "Test"])

    for i in range(3):
        tree = d.buildTree(monkdata[i], m.attributes)
        perf = [d.check(tree, monkdata[i]), d.check(tree, testdata[i])]
        table.add_row(["Monk-" + str(i + 1)] + perf)

    print(table.draw())
    print("")
def find_prunned(data_part, f_part):
    """Return the best tree among the unpruned tree and its one-step prunings.

    Trees are compared by ``tree.check`` on the validation part of the
    partition; a pruned variant wins only if it scores strictly higher
    than the best seen so far.
    """
    train_part, val_part = partition(data_part, f_part)
    best_tree = tree.buildTree(train_part, dataset.attributes)
    candidates = tree.allPruned(best_tree)
    best_score = tree.check(best_tree, val_part)

    for candidate in candidates:
        score = tree.check(candidate, val_part)
        if score > best_score:
            best_score = score
            best_tree = candidate
    return best_tree
def main(argv):
    """Print entropies, per-attribute average gains and tree scores for MONK-1..3.

    Output order: the three entropies; the average gain of attributes
    a1..a6 for each dataset; then, per dataset, the full tree's
    ``tree.check`` score on test data followed by training data.

    The original used Python 2 print statements (syntax errors on
    Python 3) and 21 copy-pasted lines; the prints are now single-argument
    ``print(...)`` calls and the repetition is folded into loops. The
    printed text is unchanged. ``argv`` is accepted but not used.
    """
    monks = (
        ("Monk1", m.monk1, m.monk1test),
        ("Monk2", m.monk2, m.monk2test),
        ("Monk3", m.monk3, m.monk3test),
    )

    for name, train, _test in monks:
        print("Entropy " + name + ": " + str(tree.entropy(train)))

    for name, train, _test in monks:
        for index in range(6):
            gain = tree.averageGain(train, m.attributes[index])
            print("Average Gain " + name + "(a" + str(index + 1) + "): " + str(gain))

    for _name, train, test in monks:
        fitted = tree.buildTree(train, m.attributes)
        print(tree.check(fitted, test))
        print(tree.check(fitted, train))
def generateErrorTable(dataset, testset, fractions, tries):
    """Return ``(fraction, mean score)`` pairs for pruned trees.

    For every fraction, ``tries`` random partitions are pruned with
    ``findBestPrune`` and the ``dtree.check`` scores on ``testset`` are
    averaged. The averaged quantity is the check score itself.
    """

    def mean_score(fraction):
        acc = 0
        for _ in range(tries):
            trainSet, valSet = partition(dataset, fraction)
            grown = dtree.buildTree(trainSet, m.attributes)
            pruned = findBestPrune(grown, valSet)
            acc += dtree.check(pruned, testset)
        return acc / tries

    return [(x, mean_score(x)) for x in fractions]
def test_pruning(dataset, testset):
    """Print test scores of the unpruned and the pruned tree for fractions 0.3 .. 0.8.

    For every fraction the data is partitioned, a tree is grown on the
    training part and pruned with ``prune_tree`` against the validation
    part. The benchmark printed first is the tree grown on the complete
    dataset.

    Fix: the original pruned the tree grown on the *complete* dataset, so
    the validation samples had already been used for training, and the
    ``training`` partition was computed but never used. The benchmark tree
    does not depend on the fraction and is now built once.
    """
    fraction_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    print("TESTING PRUNING")

    monk_tree = d.buildTree(dataset, m.attributes)
    for fraction in fraction_list:
        print("--------------")
        print(fraction)
        training, validation = partition(dataset, fraction)
        partition_tree = d.buildTree(training, m.attributes)
        pruned_monk_tree = prune_tree(partition_tree, validation)
        print(d.check(monk_tree, testset))
        print(d.check(pruned_monk_tree, testset))
    print("--------------")
def _mean_pruned_test_errors(train_data, test_data, part_sizes, runs, accept_ties):
    """Return the mean test error of pruned trees for each partition size.

    For every size, ``runs`` random partitions are made; the tree grown on
    the first part is repeatedly replaced by pruned variants that score
    higher on the second part (or equally high when ``accept_ties`` is
    true) until a full pass brings no replacement.
    """
    means = []
    for size in part_sizes:
        total = 0.0
        for _ in range(runs):
            training, validation = partition(train_data, size)
            bestTree = dt.buildTree(training, data.attributes)
            bestClass = dt.check(bestTree, validation)
            better = True
            while better:
                better = False
                # The candidate list is evaluated once per pass.
                for subTree in dt.allPruned(bestTree):
                    score = dt.check(subTree, validation)
                    if score > bestClass or (accept_ties and score == bestClass):
                        bestTree = subTree
                        bestClass = score
                        better = True
            total += 1 - dt.check(bestTree, test_data)
        means.append(total / runs)
    return means


def prune():
    """Print mean pruned-tree test errors for MONK-1 and MONK-3, fractions 0.3 .. 0.8.

    Each figure is the average over 100 random partitions. As in the
    original, MONK-1 accepts a pruned variant only when it is strictly
    better on the validation part, while MONK-3 also accepts ties.

    The original used Python 2 print statements (syntax errors on
    Python 3) and duplicated the whole experiment loop per dataset; the
    prints are now single-argument ``print(...)`` calls and the loop lives
    in ``_mean_pruned_test_errors``.
    """
    print("\n------------------------------\nAssignment 4 - Pruning\n------------------------------")
    print("Dataset\t 0.3\t\t 0.4\t\t 0.5\t\t 0.6\t\t 0.7\t\t 0.8")
    partSizes = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    row_format = "%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t"

    monk1 = _mean_pruned_test_errors(
        data.monk1, data.monk1test, partSizes, 100, accept_ties=False)
    print("Monk1\t" + row_format % tuple(monk1))

    monk3 = _mean_pruned_test_errors(
        data.monk3, data.monk3test, partSizes, 100, accept_ties=True)
    print("Monk3\t" + row_format % tuple(monk3))
def make_pruned(dataset, testset, ratio = 0.5):
    '''
    Partition ``dataset``, grow a tree on the first part and prune it.

    Both performance figures are measured on the validation part via
    ``check_tree_performance``; ``testset`` is accepted but not used.

    Returns ``(pruned tree, performance before pruning, performance after
    pruning)``.
    '''
    train_part, val = partition(dataset, ratio)
    grown = dt.buildTree(train_part, m.attributes)
    per_ref = check_tree_performance(grown, val)
    best = prune(grown, val, per_ref)
    per_pruned = check_tree_performance(best, val)
    return best, per_ref, per_pruned
def best_pruned_tree(dataset, fraction):
    """Greedily prune a tree grown on a partition of ``dataset``.

    Each round moves to the pruned variant with the highest ``dt.check``
    score on the validation part, provided it is strictly higher than the
    current tree's score. Returns the tree once no variant improves.
    """
    train, val = partition(dataset, fraction)
    current = dt.buildTree(train, m.attributes)

    while True:
        score_to_beat = dt.check(current, val)
        replacement = None
        for candidate in dt.allPruned(current):
            score = dt.check(candidate, val)
            if score > score_to_beat:
                score_to_beat = score
                replacement = candidate
        if replacement is None:
            return current
        current = replacement
def main():
    """Run assignments 1-4 in order and print their results.

    Assignment 4 averages ``pruneDecisionTree`` over 1000 runs for every
    fraction 0.3 .. 0.8 on MONK-1 and MONK-3 and prints the rounded means.
    """
    # Assignment 1
    print("==Ass 01==")
    calcentropy()

    # Assignment 2
    print("==Ass 02==")
    calcgain()

    # Assignment 3.1: hand-built two-level tree next to the library's.
    print("==Ass 03.1==")
    mytree = buildMonk1DecisionTreeTo2ndLevel()
    prebuildttree = d.buildTree(m.monk1, m.attributes, 2)
    print(mytree)
    print(prebuildttree)

    # Assignment 3.2
    print("==Ass 03.2==")
    for train, test, label in (
            (m.monk1, m.monk1test, "Monk 1"),
            (m.monk2, m.monk2test, "Monk 2"),
            (m.monk3, m.monk3test, "Monk 3")):
        buildAndCheckDecisionTreeForDatasets(train, test, label)

    print("==Ass 04==")
    trainingdatapercentage = [.3, .4, .5, .6, .7, .8]
    treeerrormonk1 = []
    treeerrormonk3 = []
    tries = 1000

    for fraction in trainingdatapercentage:
        for dataset in [m.monk1, m.monk3]:
            total = 0.0
            for _ in range(0, tries):
                total += pruneDecisionTree(dataset, m.attributes, fraction)
            average = round(total / tries, 4)
            target = treeerrormonk1 if dataset == m.monk1 else treeerrormonk3
            target.append(average)

    print("Errors for fractions")
    print(trainingdatapercentage)
    print(treeerrormonk1)
    print(treeerrormonk3)
def pruning( trainingSet, testSet, fraction ):
    """Prune a tree grown on a partition of ``trainingSet``; return its score on ``testSet``.

    A pruned variant replaces the current tree whenever its ``dT.check``
    score on the second partition is at least as high. Passes over the
    variants repeat until one brings no replacement.
    """
    grow_part, val_part = partition( trainingSet, fraction )
    bestTree = dT.buildTree( grow_part, m.attributes )
    bestTreePerf = dT.check( bestTree, val_part )

    while True:
        replaced = False
        # The variant list is evaluated once per pass.
        for candidateTree in dT.allPruned( bestTree ):
            candidatePerf = dT.check( candidateTree, val_part )
            if candidatePerf >= bestTreePerf:
                bestTree = candidateTree
                bestTreePerf = candidatePerf
                replaced = True
        if not replaced:
            break

    return dT.check( bestTree, testSet )
def assignment4helper(dataset, fraction):
    """Greedily prune a tree grown on a partition of ``dataset``.

    Each round moves to the pruned variant with the highest ``d.check``
    score on the validation part, provided it is strictly higher than the
    best score so far; pruning stops when a round brings no improvement.

    Fixes two defects of the original:

    * the best tree started as ``None``, so a tree with no pruned variants
      made the function return ``None``;
    * the starting score was ``-1`` rather than the unpruned tree's score,
      so the first round always replaced the tree, even with a worse one.

    Returns:
        The selected tree (the unpruned tree if nothing improves on it).
    """
    monk1train, monk1val = partition(dataset, fraction)
    tree = d.buildTree(monk1train, m.attributes)
    maxVal = d.check(tree, monk1val)

    cont = True
    while cont:
        cont = False
        bestTree = tree
        for t in d.allPruned(tree):
            val = d.check(t, monk1val)
            if val > maxVal:
                cont = True
                bestTree = t
                maxVal = val
        tree = bestTree
    return tree
def calc_next_level():
    """Print second-level gains for MONK-1 split on attribute index 4, and the resulting tree.

    For every value of ``data.attributes[4]`` the average gain of each
    attribute on the matching subset is printed. A textual tree is built
    from the best attribute of each subset ("+" where ``dt.mostCommon`` of
    a branch is truthy, "-" otherwise) and printed next to the depth-2
    tree from ``dt.buildTree``.

    The original used Python 2 print statements, which are syntax errors
    on Python 3. They are now single-argument ``print(...)`` calls that
    reproduce the Python 2 output, including its rule of not inserting a
    space after an item that ends in a tab.
    """
    print("\nA5\t a1\t\t a2\t\t a3\t\t a4\t\t a5\t\t a6")
    split_attribute = data.attributes[4]
    parts = ["A5("]

    for val in split_attribute.values:
        subset = dt.select(data.monk1, split_attribute, val)
        gains = "\t" + "".join(
            "%.6f\t" % (dt.averageGain(subset, attr)) for attr in data.attributes)
        print("%s %s" % (val, gains))

        best = dt.bestAttribute(subset, data.attributes)
        parts.append(best.name + "(")
        for value in best.values:
            if dt.mostCommon(dt.select(subset, best, value)):
                parts.append("+")
            else:
                parts.append("-")
        parts.append(")")
    parts.append(")")
    s = "".join(parts)

    print("\nOur tree:\t" + s)
    print("Build tree:\t" + str(dt.buildTree(data.monk1, data.attributes, 2)))
def assignment3():
    """Print train error, test error and the tree itself for each MONK set.

    The three datasets are reported first with ``d.buildTree`` called
    without a depth argument, then again with a third argument of 2.
    """

    def report(heading, train, test, extra_args):
        print(heading)
        fitted = d.buildTree(train, m.attributes, *extra_args)
        print(1 - d.check(fitted, train))
        print(1 - d.check(fitted, test))
        print(fitted)

    cases = (
        ("Monk1", m.monk1, m.monk1test, ()),
        ("Monk2", m.monk2, m.monk2test, ()),
        ("Monk3", m.monk3, m.monk3test, ()),
        ("Monk1 -- 2 Levels", m.monk1, m.monk1test, (2,)),
        ("Monk2 -- 2 Levels", m.monk2, m.monk2test, (2,)),
        ("Monk3 -- 2 Levels", m.monk3, m.monk3test, (2,)),
    )
    for heading, train, test, extra_args in cases:
        report(heading, train, test, extra_args)
breakPoint= int(len(ldata) * fraction) return ldata[:breakPoint], ldata[breakPoint:] def unzip(values): return [list(t) for t in zip(*values)] fractions = [0.3,0.4,0.5,0.6,0.7,0.8] series=[] for pair in setpairs: values = [] for fraction in fractions: s = pair[0] testdata = pair[1] training, validation = partition(s, fraction) tree=dtree.buildTree(training, monkdata.attributes) keepPruning = True while keepPruning: alternatives = dtree.allPruned(tree) keepPruning = False for alternative in alternatives: if(dtree.check(alternative,validation) > dtree.check(tree,validation)): tree = alternative keepPruning = True error=dtree.check(tree,testdata) values.append((fraction,error)) #convert pairs to two lists [xs, ys] data=unzip(values) data.append(pair[2]) series.append(data)
# Splitting the data on the best attribute of MONK-1.
# NOTE(review): bestAttribute is called unqualified while every other helper
# goes through dt. -- presumably a function defined earlier in this file.
a = bestAttribute(m.monk1, m.attributes)
data = [dt.select(m.monk1, a, v) for v in a.values]

# Average information gain of every attribute on each subset (next level).
# The loop variables have their own names so the split attribute `a` is no
# longer overwritten by the inner loop.
for subset in data:
    for attribute in m.attributes:
        print(dt.averageGain(subset, attribute))
    print('\n')
print('\n')

# Comparison with the tree from the predefined function.
tree = dt.buildTree(m.monk1, m.attributes, 2)
#draw.drawTree(tree)

# Building the trees for the monk datasets (assignment 3): dt.check on the
# training set first, then on the test set.
tree1 = dt.buildTree(m.monk1, m.attributes)
print(dt.check(tree1, m.monk1))
print(dt.check(tree1, m.monk1test))
#draw.drawTree(tree)
print('\n')
tree2 = dt.buildTree(m.monk2, m.attributes)
print(dt.check(tree2, m.monk2))
print(dt.check(tree2, m.monk2test))
#draw.drawTree(tree)
def print_non_pruned_performance(training_set, test_set):
    """Print the ``dt.check`` score of an unpruned tree.

    A tree is built from ``training_set`` over ``m.attributes`` and checked
    against ``test_set``; the resulting value is printed after a fixed label.
    """
    score = dt.check(dt.buildTree(training_set, m.attributes), test_set)
    message = 'Performance without pruning: {}'.format(score)
    print(message)
gain_partition3.append(dt.averageGain(partition3,m.attributes[x])) gain_partition4.append(dt.averageGain(partition4,m.attributes[x])) print "Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6" print "Part 1: ","\t".join(["%.7f"%y for y in gain_partition1]) print "Part 2: ","\t".join(["%.7f"%y for y in gain_partition2]) print "Part 3: ","\t".join(["%.7f"%y for y in gain_partition3]) print "Part 4: ","\t".join(["%.7f"%y for y in gain_partition4]) print print "Own tree" print "A5(",dt.mostCommon(partition1),"A4(",dt.mostCommon(partition2),")","A6",dt.mostCommon(partition3),")","A1(",dt.mostCommon(partition4), "))" print print "BuildTree function" print dt.buildTree(m.monk1,m.attributes,2) #draw.drawTree(dt.buildTree(m.monk1,m.attributes,2)) print print "Building Trees" t1 = dt.buildTree(m.monk1,m.attributes) t2 = dt.buildTree(m.monk2,m.attributes) t3 = dt.buildTree(m.monk3,m.attributes) print "Checking Full Tree" print "Dataset\tE train\t\tE test" print "Monk1\t","%.7f"%dt.check(t1,m.monk1), "\t%.7f"%dt.check(t1,m.monk1test) print "Monk1\t","%.7f"%dt.check(t2,m.monk2), "\t%.7f"%dt.check(t2,m.monk2test) print "Monk1\t","%.7f"%dt.check(t3,m.monk3), "\t%.7f"%dt.check(t3,m.monk3test)
d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]), d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]), d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5]) )) print("monk-3: %f %f %f %f %f %f" % ( d.averageGain(m.monk3, m.attributes[0]), d.averageGain(m.monk3, m.attributes[1]), d.averageGain(m.monk3, m.attributes[2]), d.averageGain(m.monk3, m.attributes[3]), d.averageGain(m.monk3, m.attributes[4]), d.averageGain(m.monk3, m.attributes[5]) )) monk1_subset = d.select(m.monk1, m.attributes[4], 3) print len(monk1_subset) print(d.mostCommon(monk1_subset)) monk1_subset_tree = d.buildTree(monk1_subset, m.attributes, 5) print(monk1_subset_tree) t1 = d.buildTree(m.monk1, m.attributes); print(d.check(t1, m.monk1test)) print(d.check(t1, m.monk1)) t2 = d.buildTree(m.monk2, m.attributes); print(d.check(t2, m.monk2test)) print(d.check(t2, m.monk2)) t3 = d.buildTree(m.monk3, m.attributes); print(d.check(t3, m.monk3test)) print(d.check(t3, m.monk3))
def assignment3_p2():
    """Print the result of ``myBuildTree`` next to ``d.buildTree`` for MONK-1.

    Both calls receive 2 as their final argument; the ``myBuildTree`` result
    is printed after the label "splits", then the ``d.buildTree`` tree.
    """
    print("\n#####Start Assignment 3 part 2")
    level_splits = myBuildTree(m.monk1, 2)
    print("splits", level_splits)
    reference_tree = d.buildTree(m.monk1, m.attributes, 2)
    print(reference_tree)
# The attribute at index 4 (a5) and the MONK-1 subsets for a5 == 3 and
# a5 == 4 are selected once and reused by every gain line below.
a5_attr = m.attributes[4]
monk1_a5_3 = tree.select(m.monk1, a5_attr, 3)
selec1 = tree.select(m.monk1, a5_attr, 4)

for index in (4, 5):
    label = "Gain Monk1 a5(3) - a%d: " % (index + 1)
    print(label + str(tree.averageGain(monk1_a5_3, m.attributes[index])))

for index in range(6):
    label = "Gain Monk1 a5(4) - a%d: " % (index + 1)
    print(label + str(tree.averageGain(selec1, m.attributes[index])))

# Majority class of the a5 == 4 subset for values 1..3 of the attribute at index 1.
for value in (1, 2, 3):
    label = "Most Common Level2 Monk1(%d): " % value
    print(label + str(tree.mostCommon(tree.select(selec1, m.attributes[1], value))))

# Build each full tree once and check it on both its training and test set.
# NOTE(review): the labels say "E" but the value is tree.check as returned --
# confirm whether 1 - tree.check(...) was intended.
for number, train, test in (
    (1, m.monk1, m.monk1test),
    (2, m.monk2, m.monk2test),
    (3, m.monk3, m.monk3test),
):
    full_tree = tree.buildTree(train, m.attributes)
    print("Monk %d Etrain : " % number + str(tree.check(full_tree, train)))
    print("Monk %d Etest : " % number + str(tree.check(full_tree, test)))

print("ID3 built tree : \n")
tree1 = tree.buildTree(m.monk1,m.attributes,2)
#d.drawTree(tree1)

#x = [0.3,0.4,0.5,0.6,0.7,0.8]
#y = []
#for fraction in x:
#    monk1train, monk1val = partition(m.monk1,fraction)
#    testTree = tree.buildTree(monk1val,m.attributes)
# print(sel)
# For every subset in `sel`: average gain of the attributes at indices
# 0, 1, 2, 3 and 5 (index 4 is skipped), plus the subset's majority class.
sub = []
mC = []
for subset in sel:
    for i in [0, 1, 2, 3, 5]:
        sub.append(t.averageGain(subset, m.attributes[i]))
    mC.append(t.mostCommon(subset))
    # print(sub)
    sub = []  # start a fresh gain list for the next subset

# Highest information gain on second level of the tree
# 2 - A4 , 3 - A6 , 4 - A1

# Assignment 3
tree1 = t.buildTree(m.monk1, m.attributes)
tree2 = t.buildTree(m.monk2, m.attributes)
tree3 = t.buildTree(m.monk3, m.attributes)

draw.drawTree(tree1)
# draw.drawTree(tree2)
# draw.drawTree(tree3)

print("Assignment 3: Decision tree performances")
print("Train errors:")
# Round after subtracting: 1 - round(x, 5) can print float noise such as
# 0.17130000000000001 instead of 0.1713.
for fitted, train in ((tree1, m.monk1), (tree2, m.monk2), (tree3, m.monk3)):
    print(round(1 - t.check(fitted, train), 5))
print("Test errors:")
import monkdata as m
import dtree as d

# For each MONK dataset: print its name, then d.check of the full tree on the
# test set followed by the training set.  Stanzas are separated by a blank line.
suites = (
    ('monk1', m.monk1, m.monk1test),
    ('monk2', m.monk2, m.monk2test),
    ('monk3', m.monk3, m.monk3test),
)
for position, (label, train, test) in enumerate(suites):
    if position:
        print()
    print(label)
    t = d.buildTree(train, m.attributes)
    print(d.check(t, test))
    print(d.check(t, train))
currentgain = d.check(prunedTrees[x], validation) #print("Rate for tree %d: %f " % (x + 1, currentgain)) if(currentgain > maxgain): maxgain = currentgain; bestTree = prunedTrees[x] prunedTrees = d.allPruned(bestTree) if(maxgain > bestGain): bestGain = maxgain else: run = False #print("Max accuracy reached. Pruning stopped.") #print("Best accuracy: %f" % bestGain); return bestTree i = 1; for set in monkset: print("Pruning for MONK-%d" % (monkset.index(set) + 1)); for frac in fractions: print("Fraction: %f" % frac) newmonk, monkval = partition(set, frac) monktree = d.buildTree(newmonk, m.attributes) t = pruneTree(monktree, monkval) print("Accuracy for pruned tree against test data: %f (vs nonpruned: %f)" \ % (d.check(t, monktestset[monkset.index(set)]), d.check(monktree, monktestset[monkset.index(set)]))) i += 1; print()