def main():
    """Grow a decision tree, prune it, and print test accuracy at both stages.

    Workflow:
      1. Load the dataset via ``utils.load_dataset()`` and shuffle it in place.
      2. Use the first 32000 rows for training and rows 32000-39999 for testing.
      3. Grow a tree with the entropy criterion and plot it to
         ``conf.org_tree_filepath``.
      4. Print the leaf count and the accuracy on the test rows.
      5. Apply top-down pessimistic pruning, plot the result to
         ``conf.prn_tree_filepath`` and print the same two figures again.

    ``DecisionTree.evaluate`` is treated as returning the number of
    misclassified test rows (the code subtracts it from the test-set size to
    get the number of correct predictions) -- confirm against its definition.
    """
    # --- 1. Prepare the dataset -------------------------------------------
    headings, dataset = utils.load_dataset()
    random.shuffle(dataset)

    # Fixed-size split. Reduced-error pruning (REP) would need a third,
    # separate pruning subset, so the split would have to be changed for it.
    train_data = dataset[:32000]
    test_data = dataset[32000:40000]

    def accuracy_fields(errors):
        # (correct, total, ratio) in the order the accuracy format strings
        # below consume them.
        total = len(test_data)
        correct = total - errors
        return correct, total, correct / total

    # --- 2. Grow the tree -------------------------------------------------
    # DecisionTree.gini is the alternative criterion noted by the author.
    dt = DecisionTree.build_tree(train_data, DecisionTree.entropy)

    # --- 3. Visualize the unpruned tree -----------------------------------
    DecisionTree.plot_tree(dt, headings, conf.org_tree_filepath)
    print('Leaves count before pruning: %d' % DecisionTree.count_leaves(dt))

    # --- 4. Baseline accuracy on the test rows ----------------------------
    print('Accuracy before pruning: %d/%d = %f'
          % accuracy_fields(DecisionTree.evaluate(dt, test_data)))

    # --- 5. Prune ---------------------------------------------------------
    # The return value is ignored and `dt` is reused afterwards, so the call
    # presumably prunes the tree in place. Other strategies the author listed:
    # bottom_up_pessimistic_pruning(dt) and minimum_error_pruning(dt).
    DecisionTree.top_down_pessimistic_pruning(dt)

    # --- 6. Visualize the pruned tree -------------------------------------
    DecisionTree.plot_tree(dt, headings, conf.prn_tree_filepath)
    print('Leaves count after pruning: %d' % DecisionTree.count_leaves(dt))

    # --- 7. Accuracy after pruning, for comparison with step 4 ------------
    print('Accuracy after pruning: %d/%d = %f'
          % accuracy_fields(DecisionTree.evaluate(dt, test_data)))
# Demo script (visible portion only).
#
# Builds a DecisionTree from `training_data` -- five three-element rows made
# of a colour string, an integer and a fruit name (presumably features
# followed by the label; confirm against tree.DecisionTree) -- prints the
# tree, and then classifies the first training row.
#
# pretty_print_leaf_predictions(counts) returns a new dict that maps every key
# of `counts` to its share of sum(counts.values()), truncated to a whole
# percent by int() and rendered as a string such as "50%".
#
# NOTE(review): the dict returned for the classify() result is discarded;
# nothing prints it -- confirm whether a print() was intended.
# NOTE(review): `decison_tree` looks like a misspelling of `decision_tree`;
# it is spelled the same way at every use, so it is harmless at runtime.
# NOTE(review): the `testing_data` list opened at the end of this line is not
# closed here, so the statement continues beyond this line.
from tree import DecisionTree training_data = [ ['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Red', 1, 'Grape'], ['Red', 1, 'Grape'], ['Yellow', 3, 'Lemon'], ] decison_tree = DecisionTree() tree = decison_tree.build_tree(training_data) decison_tree.print_tree(tree) def pretty_print_leaf_predictions(counts): total = sum(counts.values()) * 1.0 probabilities = {} for label in counts.keys(): probabilities[label] = str(int(counts[label] / total * 100)) + "%" return probabilities pretty_print_leaf_predictions(decison_tree.classify(training_data[0], tree)) testing_data = [ ['Green', 3, 'Apple'], ['Yellow', 4, 'Apple'], ['Red', 2, 'Grape'],