def get_best_tree(currtree):
    """Hill-climb pruning: keep accepting pruned variants of `currtree`
    that score better on the global validation set `monkval`, until a
    full pass over the candidates yields no improvement.
    """
    while True:
        improved = False
        # Candidates come from pruning the tree as it was at the start of
        # this pass; each is compared against the best tree found so far.
        for candidate in dtree.allPruned(currtree):
            if dtree.check(candidate, monkval) > dtree.check(currtree, monkval):
                currtree = candidate
                improved = True
        if not improved:
            return currtree
def pick_best_tree(old_tree, validation_set):
    """Greedily prune `old_tree` guided by `validation_set` accuracy.

    Evaluates every single-step pruned variant; if the best one is at least
    as accurate as `old_tree`, continues pruning from it recursively.

    Returns the best tree found (possibly `old_tree` itself).
    """
    old_accuracy = dec.check(old_tree, validation_set)
    best_accuracy = 0
    best_tree = None
    for candidate in dec.allPruned(old_tree):
        candidate_accuracy = dec.check(candidate, validation_set)
        if candidate_accuracy > best_accuracy:
            best_accuracy = candidate_accuracy
            best_tree = candidate
    # Robustness fix: `allPruned` can yield no candidates (e.g. a leaf);
    # previously `best_tree` stayed None and was returned/recursed on.
    if best_tree is None or best_accuracy < old_accuracy:
        return old_tree
    # BUG FIX: the recursive call's result was discarded (missing `return`),
    # so only one level of pruning ever took effect.
    return pick_best_tree(best_tree, validation_set)
def optimum_prune(tree, val_data):
    """Iteratively prune `tree` while pruning strictly improves accuracy
    on `val_data`.

    Returns a `(tree, accuracy)` pair: the best tree found and its
    validation accuracy. If no pruning helps, the original tree and its
    own accuracy are returned (never `(None, 0)`).
    """
    def get_local_opt(tp):
        # Best (tree, performance) pair among the candidates; `>=` means
        # ties favour the later candidate, as in the original code.
        opt = (None, 0)
        for candidate, perf in tp:  # renamed: no longer shadows `tree`
            opt = (candidate, perf) if perf >= opt[1] else opt
        return opt

    # BUG FIX: start from the unpruned tree's own performance so a pruned
    # tree that is worse than the original can never be accepted, and the
    # returned tree is never None.
    optimum = (tree, check(tree, val_data))
    while True:
        pruned_trees = allPruned(tree)
        performance = [(t, check(t, val_data)) for t in pruned_trees]
        local_opt = get_local_opt(performance)
        if local_opt[1] > optimum[1]:
            optimum = local_opt
            # BUG FIX: continue pruning from the improved tree. Previously
            # `tree` was never updated, so the second iteration re-pruned
            # the same tree and the loop always stopped after one level.
            tree = local_opt[0]
        else:
            break
    return optimum
def assignment_5():
    """Build a full decision tree per MONK dataset and report the
    training and test error rates (1 - accuracy)."""
    print("*** ASSIGNMENT 5 ***")
    result_text = "{} -- E_train: {}; E_test: {}"
    experiments = (
        ('MONK1', monk1, monk1test),
        ('MONK2', monk2, monk2test),
        ('MONK3', monk3, monk3test),
    )
    for label, train_set, test_set in experiments:
        tree = buildTree(train_set, attributes)
        print(result_text.format(label,
                                 1.0 - check(tree, train_set),
                                 1.0 - check(tree, test_set)))
    print("\n")
# PERFORMANCE CHECK PART
# Build one full-depth tree per MONK training set and tabulate the
# (rounded) train/test error rates.
name_sets = ('MONK-1', 'MONK-2', 'MONK-3')
training_sets = (monkdata.monk1, monkdata.monk2, monkdata.monk3)
test_sets = (monkdata.monk1test, monkdata.monk2test, monkdata.monk3test)

trees = [dtree.buildTree(training_set, monkdata.attributes)
         for training_set in training_sets]

print('# Performance Check')
header = ['Dataset', 'Train', 'Test']
data = [
    [name,
     1 - round(dtree.check(tree, training_set), 5),
     1 - round(dtree.check(tree, test_set), 5)]
    for tree, name, training_set, test_set
    in zip(trees, name_sets, training_sets, test_sets)
]
print(tabulate(data, header))
# NOTE(review): this is the TAIL of a function whose `def` header (and the
# rest of its parameter list ending in `datasets_test`) is cut off before
# this chunk — it sweeps split `fractions`, averaging pruned-tree test error
# over `n` partitions, then tabulates and plots mean error with stdev bars.
# Left byte-identical because the signature is not visible here.
datasets_test): data = [] mean_errors = [] stdev = [] for fraction in fractions: errors = [] for i in range(n): monktrain, monkval = partition(dataset, fraction) built_tree = dtree.buildTree(monktrain, m.attributes) best_tree = get_best_tree(built_tree) errors.append(1 - dtree.check(best_tree, dataset_test)) mean_error = round(statistics.mean(errors), decimals) mean_errors.append(round(statistics.mean(errors), decimals)) stdev.append(round(statistics.stdev(errors), decimals)) data.append([fraction, mean_error, statistics.mean(stdev)]) print(tabulate(data, header), '\n') plt.errorbar(fractions, mean_errors, yerr=stdev, marker='o') plt.title('{} (n = {})'.format(dataset_name, n)) plt.xlabel('fraction') plt.ylabel('mean error')
def main():
    """Print entropy, per-attribute information gain, and train/test error
    for the three MONK datasets."""
    monk1 = m.monk1
    monk2 = m.monk2
    monk3 = m.monk3

    print("MONK 1: ", dec.entropy(m.monk1))
    print("MONK 2: ", dec.entropy(m.monk2))
    print("MONK 3: ", dec.entropy(m.monk3))

    print(dec.bestAttribute(monk1, m.attributes))

    # Information gain of each of the six attributes, per dataset.
    for label, dataset in (("MONK1:", monk1), ("MONK2:", monk2), ("MONK3:", monk3)):
        print((label))
        for i in range(6):
            print("Information gain of a" + str(i + 1) + " is "
                  + str(dec.averageGain(dataset, m.attributes[i])))

    # Full-tree training/test error per dataset.
    for label, train_set, test_set in (
        ("MONK1:", monk1, m.monk1test),
        ("MONK2:", monk2, m.monk2test),
        ("MONK3:", monk3, m.monk3test),
    ):
        print((label))
        tree = dec.buildTree(train_set, m.attributes)
        print("Training Error: ", 1 - dec.check(tree, train_set))
        print("Test Error: ", 1 - dec.check(tree, test_set))


def partition(data, fraction):
    """Randomly split `data` in two; the first part holds `fraction` of
    the (shuffled) examples, the second part the rest."""
    ldata = list(data)
    random.shuffle(ldata)
    cut = int(len(ldata) * fraction)
    return ldata[:cut], ldata[cut:]


values = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# MONK-3 sweep: for each split fraction, average the pruned tree's test-set
# score over 1000 random train/validation partitions.
# NOTE: despite the names, `best_test_error` / `avg` track test *accuracy*
# (dec.check returns accuracy), so `>` picks the best fraction.
best_test_error = 0
model_score = []
best_fraction = 0
for fraction in values:
    accumulated = 0
    for _ in range(1000):
        monk3train, monk3val = partition(m.monk3, fraction)
        grown = dec.buildTree(monk3train, m.attributes)
        accumulated += dec.check(pick_best_tree(grown, monk3val), m.monk3test)
    avg = accumulated / 1000
    model_score.append(avg)
    if avg > best_test_error:
        best_test_error = avg
        best_fraction = fraction
plt.scatter(values, model_score)
plt.xlabel("Split fraction")
plt.ylabel("Test accuracy")
plt.savefig("Monk3.png")
plt.show()

# MONK-1 sweep: same experiment as for MONK-3 above.
# NOTE: despite the names, these variables track test *accuracy*.
best_test_error = 0
model_score = []
best_fraction = 0
for fraction in values:
    accumulated = 0
    for _ in range(1000):
        monk1train, monk1val = partition(m.monk1, fraction)
        grown = dec.buildTree(monk1train, m.attributes)
        accumulated += dec.check(pick_best_tree(grown, monk1val), m.monk1test)
    mean_accuracy = accumulated / 1000
    model_score.append(mean_accuracy)
    if mean_accuracy > best_test_error:
        best_test_error = mean_accuracy
        best_fraction = fraction
plt.scatter(values, model_score)
plt.xlabel("Split fraction")
plt.ylabel("Test accuracy")
plt.savefig("Monk1.png")
plt.show()
print(best_fraction)
# --- Python 2 fragment (note the `print j` statement) ---
# Pruning experiment on MONK-3: for fractions 0.3..0.8, hill-climb prune the
# tree while any single prune improves validation accuracy.
# NOTE(review): `raw`, `size`, `partition`, `d` and `m` are defined earlier in
# the original file (not visible here), and the `for j` loop body appears to
# be truncated after the inner `while` — left logically unchanged.
raw.set_title('Mean and raw values')
raw.set_xlabel('fraction')
raw.set_ylabel('error')
stat = plt.subplot(2, 1, 2)
stat.set_title('Standard deviation')
stat.set_xlabel('fraction')
stat.set_ylabel('standard deviation')
for j in range(3, 9):
    print j
    total = 0
    for k in range(size):
        monk1train, monk1val = partition(m.monk3, j / 10.0)
        t = d.buildTree(monk1train, m.attributes)
        checkT = d.check(t, monk1val)
        # Keep the best single-prune tree until no prune improves `checkT`.
        while True:
            hold = checkT
            tprune = d.allPruned(t)
            for i in tprune:
                temp = (d.check(i, monk1val))
                if (temp > checkT):
                    checkT = temp
                    t = i
            #print checkT
            if (checkT == hold):
                break
from python import dtree as d
from python import monkdata as m

# Train a full tree on each MONK training set and report its test error.
for label, train_set, test_set in (
    ("MONK 1", m.monk1, m.monk1test),
    ("MONK 2", m.monk2, m.monk2test),
    ("MONK 3", m.monk3, m.monk3test),
):
    print("Error rate " + label)
    tree = d.buildTree(train_set, m.attributes)
    print(1 - d.check(tree, test_set))
def assignment_7():
    """Prune trees for MONK-1/MONK-3 across a range of train/validation
    split fractions; print summary statistics and plot classification
    error (mean with one-standard-deviation error bars)."""
    print("*** ASSIGNMENT 7 ***")
    samples = 100
    fractions = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.925, 0.95, 0.99)
    datasets = {
        'monk1': {'training': monk1, 'test': monk1test},
        'monk3': {'training': monk3, 'test': monk3test},
    }

    results = {}
    for dataset_name, dataset_data in datasets.items():
        per_fraction = {}
        for fraction in fractions:
            errors = []
            for _ in range(samples):
                train, validation = partition(dataset_data['training'], fraction)
                opt_tree, _ = optimum_prune(buildTree(train, attributes), validation)
                errors.append(1.0 - check(opt_tree, dataset_data['test']))
            per_fraction[fraction] = {
                'mean': np.mean(errors),
                'median': np.median(errors),
                'std': np.std(errors),
                'max': max(errors),
                'min': min(errors),
            }
        results[dataset_name] = per_fraction

    PrettyPrinter(indent=4).pprint(results)

    # dict insertion order matches `fractions`, so these line up with the
    # x-values passed to errorbar below.
    y_monk1 = [(s['mean'], s['std']) for s in results['monk1'].values()]
    y_monk3 = [(s['mean'], s['std']) for s in results['monk3'].values()]

    plt.figure()
    plt.errorbar(fractions,
                 [e[0] for e in y_monk1],
                 yerr=[e[1] for e in y_monk1],
                 fmt='or', capsize=5, label='MONK-1')
    plt.errorbar(fractions,
                 [e[0] for e in y_monk3],
                 yerr=[e[1] for e in y_monk3],
                 fmt='ob', capsize=5, label='MONK-3')
    plt.xlabel('Pruning fraction size (relative size of training set)')
    plt.ylabel('Classification error')
    plt.legend()
    plt.title(
        'Error vs. fraction size (mean of {} samples, errorbars represent one standard deviation)'
        .format(samples))
    plt.show()