def test_findSplit_easy(self):
    """findSplit finds the known best split on the small fake dataset."""
    dataset = load_testdata('fake_dataset_small.txt')
    best_split = DecisionTree.findSplit(None, dataset)
    self.assertEqual(best_split, (2, -15), "Should be (2,-15)")
    # After partitioning: the left side yields no further split,
    # while the right side splits again at column 4, value -20.
    left_part, right_part = DecisionTree.partitionData(self, dataset, best_split)
    self.assertEqual(DecisionTree.findSplit(None, left_part), (None, None), "Should be (None,None)")
    self.assertEqual(DecisionTree.findSplit(None, right_part), (4, -20), "Should be (4,-20)")
def load_tree(file, manager):
    """Load a decision tree from *file* and record it as the current tree.

    An empty file produces a fresh tree flagged as ready for training;
    a non-empty file is loaded as an existing tree.

    Args:
        file: path to the tree file; becomes the new CURRENT_TREE global.
        manager: passed through to the DecisionTree constructor.

    Returns:
        The newly constructed DecisionTree.
    """
    global CURRENT_TREE
    # Context manager guarantees the handle is closed even if read() raises
    # (the original leaked the descriptor on an exception).
    with open(file, 'r') as f:
        CURRENT_TREE = file
        is_empty = not f.read()
    if is_empty:
        tree = DecisionTree(manager, file, True)  # ready for training
    else:
        tree = DecisionTree(manager, file)
    # tree.print_tree()
    return tree
def run(params, manager):
    """Dispatch one parsed CLI command against the global TREE.

    params[0] is the command ('train', 'play' or 'tree'), params[1] its
    first option; further elements are command-specific.  Returns the tree
    that should become the caller's current tree (a newly loaded tree for
    'tree -l', otherwise the global TREE).
    """
    global TREE
    command = params[0]
    options = params[1]
    if command == 'train':  # train
        if len(params) != 3:
            bad_input()
        else:
            dimension = params[2]
            file = options
            if os.path.isfile(file):
                manager.train(file, TREE, dimension)
            else:
                print("file %s does not exists" % file)
            # NOTE(review): the tree is saved even when the training file
            # was missing and no training happened — confirm intended.
            TREE.save_tree(CURRENT_TREE)
    elif command == 'play':  # start game
        numbered = False
        if len(params) > 2:
            if params[2] == '-n':
                numbered = True
        dimension = options
        # TREE.print_tree()
        Game(TREE, manager, dimension, numbered)
        # Persist the tree after the game, then reload it from disk so the
        # in-memory tree matches the saved state.
        TREE.save_tree(CURRENT_TREE)
        TREE = DecisionTree(manager, CURRENT_TREE)
    elif command == 'tree':  # load or view
        if options == '-l':  # load tree
            file = params[2]
            if os.path.isfile(file):
                new_tree = load_tree(file, manager)
                return new_tree
            else:
                print("file %s does not exist." % file)
        elif options == '-p':
            TREE.print_tree()
        else:
            # Unrecognized sub-option: show usage for the 'tree' command.
            print("tree command unrecognized")
            print("tree -l - load a new decision tree")
            print("tree -p - print the current decision tree\n")
        pass
    else:
        bad_input()
    return TREE
def test_predict(self):
    """A tree fit on YearMade/Price pairs predicts the matching prices."""
    frame = pd.DataFrame(
        {
            'YearMade': [1984, 1986, 2000, 1984, 1986, 2000, 1984, 1986],
            'Price': [2000, 2000, 4000, 2000, 2000, 4000, 2000, 2000]
        },
        index=[0, 1, 2, 3, 4, 5, 6, 7])
    features = frame.iloc[:, 0:1]
    targets = np.array(frame['Price'])
    model = DecisionTree(features, targets)
    queries = pd.DataFrame({'YearMade': [1985, 1985, 2002]}, index=[0, 1, 2])
    predicted = model.predict(queries)
    expected = [2000, 2000, 4000]
    # Element-wise comparison must hold for every query row.
    matches = np.array(predicted) == np.array(expected)
    self.assertEqual(matches.all(), True)
def test_partition_data(self):
    """partitionData splits rows around the given (column, threshold) pair."""
    rows = load_testdata('fake_dataset_small.txt')
    column, threshold = 2, -15
    left, right = DecisionTree.partitionData(self, rows, (column, threshold))
    # Left holds rows strictly below the threshold, right the rest.
    np.testing.assert_array_equal(
        left, rows[rows[:, column] < threshold], "Split arrays do not match")
    np.testing.assert_array_equal(
        right, rows[rows[:, column] >= threshold], "Split arrays do not match")
def test_setupTree(self):
    """The constructor records the data dimensions and keeps the data."""
    built = DecisionTree(self.data)
    n_attrs = self.x_train.shape[1]
    n_points = self.x_train.shape[0]
    self.assertEqual(built.xm, n_attrs,
                     "Should be " + str(n_attrs) + " attributes")
    self.assertEqual(built.xn, n_points,
                     "Should be " + str(n_points) + " datapoints")
    np.testing.assert_array_equal(built.data, self.data,
                                  "x_train data does not match")
# Defaults for the command-line options.
delimiter = ';'
metadata = ""
seed = None
# Scan argv for recognized flags; each flag consumes the following token.
# NOTE(review): `filename` is only bound when --file/-f is supplied — without
# it the Dataset() call below raises NameError; confirm that is acceptable.
for i in range(len(argv)):
    if argv[i] in ('--file', '-f'):
        filename = argv[i + 1]
    elif argv[i] in ('--delimiter', '-d'):
        delimiter = argv[i + 1]
    elif argv[i] in ('--meta', '-m'):
        metadata = argv[i + 1]
    elif argv[i] in ('--seed', '-s'):
        seed = int(argv[i + 1])
    elif argv[i] in ('--help', '-h'):
        print_usage(argv[0])
        exit(0)
if seed is not None:
    # `seed` was already converted to int while parsing; the original
    # converted it a second time for no effect.
    random.seed(seed)
# load the dataset to memory
db = Dataset(filename, delimiter=delimiter, metadata=metadata)
# generates a decision tree from the dataset
tree = DecisionTree(db.data, db.attributes, db.target_attribute,
                    db.numeric, single_tree_print=True)
# print the resultant tree on the terminal
print(tree)
def cross_val(data, folds):
    """Run k-fold cross-validation, with and without pruning.

    For every test fold i, every remaining fold j serves once as the
    validation set used for pruning; the rest is the training set.
    Per-fold statistics are averaged twice (inner runs, then folds).

    Args:
        data: 2-D array of samples (rows) to split into *folds* parts.
        folds: number of cross-validation folds.

    Returns:
        (result, result_pruned): averaged statistics dicts for the
        unpruned and pruned trees.
    """
    start = datetime.datetime.now()
    splits = crossValidation_split(data, folds)
    stats = []
    stats_pruned = []
    # Creates a tree and evaluates for every fold. Results collected in "stats".
    for i in range(folds):
        # Take one testset
        test = data[splits[i], :]
        results_lowlevel_unpruned = []
        results_lowlevel_pruned = []
        # Take only non-test folds for train and val
        remaining_folds = np.delete(np.arange(folds), i)
        for j in remaining_folds:
            # Train on everything except the test fold i and the
            # validation fold j.
            mask_train = np.ones(data.shape[0], dtype=bool)
            mask_train[splits[i]] = False
            mask_train[splits[j]] = False
            train = data[mask_train, :]
            val = data[splits[j]]
            decisiontree = DecisionTree()
            decisiontree.buildTree(train)
            results_lowlevel_unpruned.append(decisiontree.evaluate(test))
            decisiontree.prune(train, val)
            results_lowlevel_pruned.append(decisiontree.evaluate(test))
            print("Tree Complete! Test Set: {} Validation Set: {}".format(
                i, j))
        stats.append(average_statistics(results_lowlevel_unpruned))
        stats_pruned.append(average_statistics(results_lowlevel_pruned))
    result = average_statistics(stats)
    result_pruned = average_statistics(stats_pruned)
    # Use now() for both timestamps (the original mixed now()/today(),
    # which are equivalent for naive datetimes but inconsistent in style).
    end = datetime.datetime.now()
    print("")
    print("#############################")
    # BUG FIX: the original string literal was broken across a physical
    # line break (a syntax error); reconstructed with an explicit \n.
    print("{}-fold Crossvalidation complete. \n"
          "Average scores unpruned tree over all folds:".format(folds))
    print("Average Confusion Matrix:")
    print(result['confusionmatrix'])
    print("Average Precision: {:.2%}".format(np.mean(result['precision'])))
    print("Average Precision per Class:")
    print(result['precision'])
    print("Average Recall: {:.2%}".format(np.mean(result['recall'])))
    print("Average Recall per Class:")
    print(result['recall'])
    print("Average F1: {:.2%}".format(np.mean(result['F1score'])))
    print("Average F1 score per Class:")
    print(result['F1score'])
    print("Average Classification Rate: {:.2%}".format(result['posClassRate']))
    print("#############################")
    print("Average scores pruned tree over all folds:")
    print("Average Confusion Matrix:")
    print(result_pruned['confusionmatrix'])
    print("Average Precision: {:.2%}".format(
        np.mean(result_pruned['precision'])))
    print("Average Precision per Class:")
    print(result_pruned['precision'])
    print("Average Recall: {:.2%}".format(np.mean(result_pruned['recall'])))
    print("Average Recall per Class:")
    print(result_pruned['recall'])
    print("Average F1 score: {:.2%}".format(np.mean(result_pruned['F1score'])))
    print("Average F1 score per Class:")
    print(result_pruned['F1score'])
    print("Average Classification Rate: {:.2%}".format(
        result_pruned['posClassRate']))
    print("Runtime: {}".format(end - start))
    print("#############################")
    print("")
    return result, result_pruned
def train(self, training_set):
    """Grow one DecisionTree per training subset and add it to the forest."""
    for subset in training_set:
        tree = DecisionTree(subset, self.db.attributes,
                            self.db.target_attribute, self.db.numeric)
        self.Forest.append(tree)
# Random Forest / Naive Bayes driver script.
from src.loader import DataFactory
from src.DecisionTree import DecisionTree
from src.RandomForest import RandomForest
import numpy as np
import matplotlib.pyplot as plt

# Build the animal dataframe once and run both models against it.
factory = DataFactory()
animals = factory.get_dataframe()

decision_tree = DecisionTree(animals)
decision_tree.run_simulation('class_type')

random_forest = RandomForest(animals)
random_forest.run_simulation('class_type')
def test_score_split(self):
    """Scoring the split at row 1, column 0 yields the known value 1000."""
    fitted = DecisionTree(self.x, self.y)
    split_score, _unused_a, _unused_b = fitted.score_split(1, 0)
    self.assertEqual(split_score, 1000)
def test_find_best_split(self):
    """Fitting the sample data selects 1985 as the best split row."""
    fitted = DecisionTree(self.x, self.y)
    self.assertEqual(fitted.split_row, 1985)
def test_is_leaf(self):
    """The left child of the fitted tree reports itself as a leaf."""
    fitted = DecisionTree(self.x, self.y)
    self.assertEqual(fitted.left.is_leaf(), True)
# Hyperparameters for tree growth: minimum sample count and minimum
# information gain required to keep splitting.
cnt_cutoff = 3
gain_cutoff = 0.1
train_ac = []
test_ac = []
# k-fold cross-validation: fold i of the shuffled `indexes` is the test
# set, the remaining rows are the training set.
for i in range(k):
    idx = np.arange(l)
    cv_set = indexes[int(i * l / k):int((i + 1) * l / k)]
    train_data = data[np.logical_not(np.isin(idx, cv_set)), :]
    test_data = data[cv_set, :]
    # Last column is the label; the rest are features.
    X_train = train_data[:, :-1]
    Y_train = train_data[:, -1]
    X_test = test_data[:, :-1]
    Y_test = test_data[:, -1]
    tree = DecisionTree()
    tree.train(X_train, Y_train, cnt_cutoff, gain_cutoff)
    pred_train = tree.run(X_train)
    train_ac.append(accuracy(Y_train.T, pred_train.flatten()))
    pred_test = tree.run(X_test)
    test_ac.append(accuracy(Y_test.T, pred_test.flatten()))
    y_true = Y_test.flatten()
    y_pred = pred_test.flatten()
    tp += np.logical_and(y_true, y_pred).sum()
    tn += np.logical_and(np.logical_not(y_true),
                         np.logical_not(y_pred)).sum()
    # BUG FIX: the original swapped these two. By the standard definitions,
    # FP = predicted positive but actually negative, and
    # FN = predicted negative but actually positive.
    fp += np.logical_and(np.logical_not(y_true), y_pred).sum()
    fn += np.logical_and(y_true, np.logical_not(y_pred)).sum()