Example #1
 def test_findSplit_easy(self):
     data = load_testdata('fake_dataset_small.txt')
     split = DecisionTree.findSplit(None, data)
     self.assertEqual(split, (2, -15), "Should be (2,-15)")
     left, right = DecisionTree.partitionData(self, data, split)
     self.assertEqual(DecisionTree.findSplit(None, left), (None, None),
                      "Should be (None,None)")
     self.assertEqual(DecisionTree.findSplit(None, right), (4, -20),
                      "Should be (4,-20)")
Example #2
def load_tree(file, manager):
    global CURRENT_TREE
    CURRENT_TREE = file
    # an empty file means a fresh tree that still needs training
    with open(file, 'r') as f:
        is_empty = not f.read()
    if is_empty:
        tree = DecisionTree(manager, file, True)
        # ready for training
    else:
        tree = DecisionTree(manager, file)
        # tree.print_tree()
    return tree
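
A minimal usage sketch for the loader above; the TreeManager class and the file name are assumptions, not part of the original code:

manager = TreeManager()  # hypothetical manager object
# an empty file yields an untrained tree; a non-empty one is loaded as-is
tree = load_tree('tictactoe.tree', manager)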
Example #3
def run(params, manager):
    global TREE
    command = params[0]
    options = params[1]
    if command == 'train':
        # train
        if len(params) != 3:
            bad_input()
        else:
            dimension = params[2]
            file = options
            if os.path.isfile(file):
                manager.train(file, TREE, dimension)
                TREE.save_tree(CURRENT_TREE)
            else:
                print("file %s does not exist" % file)
    elif command == 'play':
        # start game
        numbered = False
        if len(params) > 2:
            if params[2] == '-n':
                numbered = True
        dimension = options
        # TREE.print_tree()
        Game(TREE, manager, dimension, numbered)
        TREE.save_tree(CURRENT_TREE)
        TREE = DecisionTree(manager, CURRENT_TREE)
    elif command == 'tree':
        # load or view
        if options == '-l':
            # load tree
            file = params[2]
            if os.path.isfile(file):
                new_tree = load_tree(file, manager)
                return new_tree
            else:
                print("file %s does not exist." % file)
        elif options == '-p':
            TREE.print_tree()
        else:
            print("tree command unrecognized")
            print("tree -l - load a new decision tree")
            print("tree -p - print the current decision tree\n")
    else:
        bad_input()
    return TREE
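
A hypothetical driver loop for the command handler above; the prompt text, the quit command, and the tokenization are assumptions:

while True:
    line = input('> ').strip()
    if line == 'quit':
        break
    params = line.split()
    if len(params) >= 2:  # run() expects at least a command and an option
        TREE = run(params, manager)
    else:
        bad_input()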
Example #4
 def test_predict(self):
     df = pd.DataFrame(
         {
             'YearMade': [1984, 1986, 2000, 1984, 1986, 2000, 1984, 1986],
             'Price': [2000, 2000, 4000, 2000, 2000, 4000, 2000, 2000]
         },
         index=[0, 1, 2, 3, 4, 5, 6, 7])
     x = df.iloc[:, 0:1]
     y = np.array(df['Price'])
     tree = DecisionTree(x, y)
     x_test = pd.DataFrame({'YearMade': [1985, 1985, 2002]},
                           index=[0, 1, 2])
     test_predictions = tree.predict(x_test)
     expected_predictions = [2000, 2000, 4000]
     self.assertTrue(
         (np.array(test_predictions) == np.array(expected_predictions)).all())
Example #5
    def test_partition_data(self):
        data = load_testdata('fake_dataset_small.txt')
        split = (2, -15)

        left, right = DecisionTree.partitionData(self, data, split)
        np.testing.assert_array_equal(left, data[data[:, split[0]] < split[1]],
                                      "Split arrays do not match")
        np.testing.assert_array_equal(right,
                                      data[data[:, split[0]] >= split[1]],
                                      "Split arrays do not match")
Example #6
 def test_setupTree(self):
     decisiontree = DecisionTree(self.data)
     self.assertEqual(
         decisiontree.xm, self.x_train.shape[1],
         "Should be " + str(self.x_train.shape[1]) + " attributes")
     self.assertEqual(
         decisiontree.xn, self.x_train.shape[0],
         "Should be " + str(self.x_train.shape[0]) + " datapoints")
     np.testing.assert_array_equal(decisiontree.data, self.data,
                                   "x_train data does not match")
Example #7
        delimiter = ';'
        metadata = ""
        seed = None
        for i in range(len(argv)):
            if argv[i] in ['--file', '-f']:
                filename = argv[i + 1]
            elif argv[i] in ['--delimiter', '-d']:
                delimiter = argv[i + 1]
            elif argv[i] in ['--meta', '-m']:
                metadata = argv[i + 1]
            elif argv[i] in ['--seed', '-s']:
                seed = int(argv[i + 1])
            elif argv[i] in ['--help', '-h']:
                print_usage(argv[0])
                exit(0)
        if seed is not None:
            random.seed(seed)

        # load the dataset to memory
        db = Dataset(filename, delimiter=delimiter, metadata=metadata)

        # generates a decision tree from the dataset
        tree = DecisionTree(db.data,
                            db.attributes,
                            db.target_attribute,
                            db.numeric,
                            single_tree_print=True)

        # print the resultant tree on the terminal
        print(tree)
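
A hypothetical invocation of the script this argument parser belongs to; the script and data file names are assumptions:

python id3.py --file weather.csv --delimiter ',' --seed 42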
Example #8
def cross_val(data, folds):

    start = datetime.datetime.now()

    splits = crossValidation_split(data, folds)
    stats = []
    stats_pruned = []

    # Creates a tree and evaluates it for every fold; results are collected in the list "stats"
    for i in range(folds):

        # Take one fold as the test set
        test = data[splits[i], :]
        results_lowlevel_unpruned = []
        results_lowlevel_pruned = []

        # Use the remaining folds for training and validation
        remaining_folds = np.delete(np.arange(folds), i)
        for j in remaining_folds:

            mask_train = np.ones(data.shape[0], dtype=bool)
            mask_train[splits[i]] = False
            mask_train[splits[j]] = False

            train = data[mask_train, :]
            val = data[splits[j]]

            decisiontree = DecisionTree()
            decisiontree.buildTree(train)

            results_lowlevel_unpruned.append(decisiontree.evaluate(test))

            decisiontree.prune(train, val)
            results_lowlevel_pruned.append(decisiontree.evaluate(test))
            print("Tree Complete! Test Set: {} Validation Set: {}".format(
                i, j))

        stats.append(average_statistics(results_lowlevel_unpruned))
        stats_pruned.append(average_statistics(results_lowlevel_pruned))

    result = average_statistics(stats)
    result_pruned = average_statistics(stats_pruned)

    end = datetime.datetime.now()

    print("")
    print("#############################")
    print(
        "{}-fold cross-validation complete. Average scores for unpruned tree over all folds:"
        .format(folds))
    print("Average Confusion Matrix:")
    print(result['confusionmatrix'])
    print("Average Precision: {:.2%}".format(np.mean(result['precision'])))
    print("Average Precision per Class:")
    print(result['precision'])
    print("Average Recall: {:.2%}".format(np.mean(result['recall'])))
    print("Average Recall per Class:")
    print(result['recall'])
    print("Average F1: {:.2%}".format(np.mean(result['F1score'])))
    print("Average F1 score per Class:")
    print(result['F1score'])
    print("Average Classification Rate: {:.2%}".format(result['posClassRate']))
    print("#############################")
    print("Average scores pruned tree over all folds:")
    print("Average Confusion Matrix:")
    print(result_pruned['confusionmatrix'])
    print("Average Precision: {:.2%}".format(
        np.mean(result_pruned['precision'])))
    print("Average Precision per Class:")
    print(result_pruned['precision'])
    print("Average Recall: {:.2%}".format(np.mean(result_pruned['recall'])))
    print("Average Recall per Class:")
    print(result_pruned['recall'])
    print("Average F1 score: {:.2%}".format(np.mean(result_pruned['F1score'])))
    print("Average F1 score per Class:")
    print(result_pruned['F1score'])
    print("Average Classification Rate: {:.2%}".format(
        result_pruned['posClassRate']))
    print("Runtime: {}".format(end - start))
    print("#############################")
    print("")

    return result, result_pruned
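
The helper crossValidation_split is assumed above but not shown; a minimal sketch, assuming it returns one array of shuffled row indices per fold:

def crossValidation_split(data, folds):
    # shuffle the row indices, then cut them into `folds` roughly equal parts
    indices = np.random.permutation(data.shape[0])
    return np.array_split(indices, folds)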
Example #9
 def train(self, training_set):
     for train_set in training_set:
         self.Forest.append(
             DecisionTree(train_set, self.db.attributes,
                          self.db.target_attribute, self.db.numeric))
Example #10
# Random Forest / Naive Bayes
from src.loader import DataFactory

from src.DecisionTree import DecisionTree
from src.RandomForest import RandomForest
import numpy as np

import matplotlib.pyplot as plt

df = DataFactory()
animal_df = df.get_dataframe()

dt = DecisionTree(animal_df)
dt.run_simulation('class_type')

rf = RandomForest(animal_df)
rf.run_simulation('class_type')
Example #11
 def test_score_split(self):
     tree = DecisionTree(self.x, self.y)
     score, _, _ = tree.score_split(1, 0)
     self.assertEqual(score, 1000)
Example #12
 def test_find_best_split(self):
     tree = DecisionTree(self.x, self.y)
     self.assertEqual(tree.split_row, 1985)
Example #13
 def test_is_leaf(self):
     tree = DecisionTree(self.x, self.y)
     self.assertEqual(tree.left.is_leaf(), True)
Example #14
# assumed setup (not shown in the original snippet): `data` is an (l, m+1)
# array with binary labels in the last column
k = 10                              # number of cross-validation folds
l = data.shape[0]
indexes = np.random.permutation(l)  # shuffled row indices
tp = tn = fp = fn = 0               # confusion-matrix counters
cnt_cutoff = 3
gain_cutoff = 0.1
train_ac = []
test_ac = []

for i in range(k):
    idx = np.arange(l)
    cv_set = indexes[int(i * l / k):int((i + 1) * l / k)]
    train_data = data[np.logical_not(np.isin(idx, cv_set)), :]
    test_data = data[cv_set, :]
    X_train = train_data[:, :-1]
    Y_train = train_data[:, -1]
    X_test = test_data[:, :-1]
    Y_test = test_data[:, -1]

    tree = DecisionTree()
    tree.train(X_train, Y_train, cnt_cutoff, gain_cutoff)

    pred_train = tree.run(X_train)
    train_ac.append(accuracy(Y_train.T, pred_train.flatten()))

    pred_test = tree.run(X_test)
    test_ac.append(accuracy(Y_test.T, pred_test.flatten()))

    # accumulate confusion-matrix counts over the test folds
    tp += np.logical_and(Y_test.flatten(), pred_test.flatten()).sum()
    tn += np.logical_and(np.logical_not(Y_test.flatten()),
                         np.logical_not(pred_test.flatten())).sum()
    # false positives: predicted positive but actually negative
    fp += np.logical_and(np.logical_not(Y_test.flatten()),
                         pred_test.flatten()).sum()
    # false negatives: predicted negative but actually positive
    fn += np.logical_and(Y_test.flatten(),
                         np.logical_not(pred_test.flatten())).sum()
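
After the loop, the counters give the usual derived metrics; a short follow-up sketch using only names defined above:

precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("precision={:.2%} recall={:.2%} F1={:.2%}".format(precision, recall, f1))
print("mean train accuracy: {:.2%}".format(np.mean(train_ac)))
print("mean test accuracy: {:.2%}".format(np.mean(test_ac)))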