def __init__(self, n_trees, tree_config=None):
        """Create ``n_trees`` DecisionTree instances that share one configuration.

        ``cut_dim="random_best"`` is applied by default so the trees get some
        randomness; any key present in ``tree_config`` overrides the default.
        """
        overrides = tree_config or {}
        settings = {**{"cut_dim": "random_best"}, **overrides}

        built = []
        for _ in range(n_trees):
            built.append(DecisionTree(**settings))
        self.trees = built
예제 #2
0
def create_tree(_dataset):
    '''
    Wrap a dataset in a new DecisionTree.

    :param _dataset: dataset handed to the DecisionTree constructor (the
                     original note says it is assigned to the tree's root)
    :return: the freshly constructed DecisionTree; no training is done here
    '''
    new_tree = DecisionTree(_dataset)
    return new_tree
예제 #3
0
def part1(t_data, v_data):
    """Grow one tree on ``t_data`` and plot training/validation accuracy.

    The tree is built with ``create_dt_classifier(t_data, 9, tree, 0)`` and
    then scored with ``check_accuracy_with_trees`` for the values 0..9, which
    the plot labels as "Depth".
    """
    root = DecisionTree()
    forest = [root]
    started_at = time.time()
    create_dt_classifier(t_data, 9, root, 0)
    print("Completed ", " Time : ", (time.time() - started_at))

    train_scores = []
    val_scores = []
    levels = []
    for level in range(10):
        train_scores.append(check_accuracy_with_trees(t_data, forest, level))
        val_scores.append(check_accuracy_with_trees(v_data, forest, level))
        levels.append(level)

    curves = [train_scores, val_scores]

    print("Completed ", " Time : ", (time.time() - started_at))

    print(curves)
    plot(
        [levels, levels],
        curves,
        "Accuracy Vs Depth",
        ["Training", "Validation"],
        ["Accuracy in %", "Depth"],
    )
예제 #4
0
def create_dt_classifier(data, depth, tree, m):
    """Recursively grow ``tree`` from ``data`` down to ``depth`` more levels.

    Column 0 of ``data`` is passed to ``gini_function``; candidate split
    columns are taken from the output of ``get_sampled_features(data, m)``,
    starting at column 1. A leaf is inserted when ``depth`` reaches 0 or when
    no candidate yields a gain above 0. Otherwise the node is filled in and
    two child trees are created and grown recursively.
    """
    if depth == 0:
        tree.insert(None, None, get_leaf_prediction_value(data), True)
        return

    root_impurity = gini_function(data[:, 0])
    sampled, sampled_columns = get_sampled_features(data, m)

    # Keep the candidate with the strictly largest gain seen so far.
    best_gain, best_feature, best_threshold = 0, 0, 0
    for column in range(1, sampled.shape[1]):
        cand_feature, cand_gain, cand_threshold = get_feature_gain(
            sampled, column, root_impurity)
        if cand_gain > best_gain:
            best_gain = cand_gain
            best_feature = cand_feature
            best_threshold = cand_threshold

    if best_gain == 0:
        tree.insert(None, None, get_leaf_prediction_value(data), True)
        return

    # presumably maps a sampled column position back to a column of ``data``
    if sampled_columns is not None:
        best_feature = sampled_columns[best_feature]

    # Sort rows by the chosen column and cut at the first value >= threshold.
    ordered = data[np.argsort(data[:, best_feature])]
    cut_at = np.where(ordered[:, best_feature] >= best_threshold)[0][:1]
    pieces = np.split(ordered, cut_at)
    at_or_above = pieces[1]
    below = pieces[0]

    tree.insert(best_threshold, best_feature,
                get_leaf_prediction_value(data), False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()
    remaining = depth - 1
    create_dt_classifier(at_or_above, remaining, tree.left, m)
    create_dt_classifier(below, remaining, tree.right, m)
예제 #5
0
def create_dt_classifier(data, depth, tree):
    """Recursively grow ``tree`` from ``data`` down to ``depth`` more levels.

    Columns 0-1 of ``data`` are passed to ``ada_gini_function``; candidate
    split columns start at column 2. A leaf is inserted when ``depth``
    reaches 0 or when no candidate yields a gain above 0. Otherwise the node
    is filled in and two child trees are created and grown recursively.
    """
    if depth == 0:
        tree.insert(None, None, ada_get_leaf_prediction_value(data), True)
        return

    root_impurity = ada_gini_function(data[:, 0:2])

    # Keep the candidate with the strictly largest gain seen so far.
    best_gain, best_feature, best_threshold = 0, 0, 0
    for column in range(2, data.shape[1]):
        cand_feature, cand_gain, cand_threshold = get_feature_gain(
            data, column, root_impurity)
        if cand_gain > best_gain:
            best_gain = cand_gain
            best_feature = cand_feature
            best_threshold = cand_threshold

    if best_gain == 0:
        tree.insert(None, None, ada_get_leaf_prediction_value(data), True)
        return

    # Sort rows by the chosen column and cut at the first value >= threshold.
    ordered = data[np.argsort(data[:, best_feature])]
    cut_at = np.where(ordered[:, best_feature] >= best_threshold)[0][:1]
    pieces = np.split(ordered, cut_at)
    at_or_above = pieces[1]
    below = pieces[0]

    tree.insert(best_threshold, best_feature,
                ada_get_leaf_prediction_value(data), False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()
    remaining = depth - 1
    create_dt_classifier(at_or_above, remaining, tree.left)
    create_dt_classifier(below, remaining, tree.right)
예제 #6
0
def part2():
    """Plot test accuracy against training-sample percentage for two datasets.

    For each of the rates 5%, 10%, 20%, 50% and 100%, a tree is trained on
    every training subset yielded by ``selectSample`` and scored on the test
    set. The average, maximum and minimum accuracy per rate are drawn in one
    subplot per dataset and the figure is saved to ``../part2.pdf``.
    """
    subplot_slot = {'diabetes': 1, 'heart': 2}
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]

    plt.figure()
    for trainFileName, testFileName, key in datasets:
        attribute, trainset = data_provider(trainFileName)
        _, testset = data_provider(testFileName)
        m = 4

        percents = []
        mean_scores = []
        best_scores = []
        worst_scores = []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            scores = []
            for subset in selectSample(trainset, rate):
                root = TreeNode(subset, attribute)
                curTree = DecisionTree(root)
                curTree.createTree(root, m)

                hits = 0
                misses = 0
                for instance in testset:
                    if curTree.predict(root, instance) == instance[-1]:
                        hits += 1
                    else:
                        misses += 1
                scores.append(float(hits) / (hits + misses))

            mean_score = float(sum(scores)) / len(scores)
            percents.append(int(rate * 100))
            mean_scores.append(mean_score)
            best_scores.append(max(scores))
            worst_scores.append(min(scores))

        ax = plt.subplot(1, 2, subplot_slot[key])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        ax.plot(list(percents), mean_scores, label='average')
        ax.plot(list(percents), best_scores, label='maximum')
        ax.plot(list(percents), worst_scores, label='minimum')
        ax.legend()

    plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
예제 #7
0
    def __init__(self, x, y, depth=2):
        """
        Build a DecisionTree over ``(x, y)`` and the linear layers sized from it.

        :param x: forwarded to ``DecisionTree`` as its first argument
        :param y: forwarded to ``DecisionTree``; ``range(len(y))`` is passed
                  as ``idxs``
        :param depth: depth forwarded to ``DecisionTree`` (default 2)

        Creates ``linear1`` (``D_in`` -> 100), one ``D_in`` -> 1 linear layer
        per tree node in ``theta``, and a sigmoid; each is moved to CUDA.
        """

        H = 100
        D_out = 1
        # Honour the caller's ``depth``; it used to be hard-coded to 2 here,
        # which silently ignored the parameter.
        self.dTree = DecisionTree(x, y, idxs=range(len(y)), depth=depth)

        super(treeNet, self).__init__()
        self.linear1 = torch.nn.Linear(self.dTree.D_in, H).cuda()
        self.theta = torch.nn.ModuleList([
            torch.nn.Linear(self.dTree.D_in, D_out)
            for _ in range(self.dTree.nNodes)
        ]).cuda()
        self.sigmoid = torch.nn.Sigmoid().cuda()
예제 #8
0
    def calc_misclassification_rate(training_dataframe, validation_dataframe,
                                    criterion):
        """Fit and prune a DecisionTree, then measure its validation error.

        The tree is fitted on ``training_dataframe[categorical_columns]``
        against the ``'num'`` column, pruned with the validation data, and
        evaluated row by row on ``validation_dataframe``.

        :param training_dataframe: frame with the feature columns and ``'num'``
        :param validation_dataframe: frame used for pruning and evaluation
        :param criterion: value passed straight to ``DecisionTree``
        :return: ``(err, dt)`` — the fraction of validation rows whose
                 prediction differs from ``'num'``, and the pruned tree
        """
        err = 0
        # ``categorical_columns`` is not defined in this function; it must
        # come from an enclosing or module scope.
        x = training_dataframe[categorical_columns]
        y = training_dataframe['num']
        dt = DecisionTree(criterion)
        dt.fit(x, y)
        # Prune using every validation column except the target.
        dt.prune(
            validation_dataframe.loc[:, validation_dataframe.columns != "num"],
            validation_dataframe.loc[:, "num"])
        # Count the rows whose predicted value differs from the true 'num'.
        for i in validation_dataframe.index:
            if (dt.root.evaluate(validation_dataframe.loc[
                    i, validation_dataframe.columns != "num"]) !=
                    validation_dataframe.loc[i, "num"]):
                err += 1
        err = err / len(validation_dataframe)
        print((err, dt))
        return (err, dt)

        # NOTE(review): as indented here, everything below follows the
        # ``return`` above and is unreachable. It looks like it belongs to an
        # enclosing scope — confirm against the original file.
        # NOTE(review): these calls pass only ``criterion`` although the
        # function also requires both dataframes, and ``max(...)`` iterates
        # the returned ``(err, dt)`` tuple — verify intent.
        gini_trees = calc_misclassification_rate(criterion="gini")
        # NOTE(review): max() on x[0] selects the largest first element; if
        # that is the error rate this picks the worst tree — confirm.
        gtree = max(gini_trees, key=lambda x: x[0])[1]
        print("best gini tree = {}".format(gtree))
        Gg = Digraph("", filename="tree_gini.pdf")
        gtree.plot(Gg)
        Gg.view()
        entropy_trees = calc_misclassification_rate(criterion="entropy")
        etree = max(entropy_trees, key=lambda x: x[0])[1]
        print("best entropy tree = {}".format(etree))
        Ge = Digraph("", filename="tree_entropy.pdf")
        etree.plot(Ge)
        Ge.view()

        # Reference tree using the "entropy" criterion, saved as an image.
        fig, ax = plt.subplots(nrows=1, ncols=1)
        clf = tree.DecisionTreeClassifier(criterion="entropy")
        clf = clf.fit(categorical_features, df.num)
        tree.plot_tree(clf, ax=ax)
        plt.savefig("sklearn_entropy")
        plt.show()

        # Same again using the "gini" criterion.
        fig, ax = plt.subplots(nrows=1, ncols=1)
        clf = tree.DecisionTreeClassifier(criterion="gini")
        clf = clf.fit(categorical_features, df.num)
        tree.plot_tree(clf, ax=ax)
        plt.savefig("sklearn_gini")
        plt.show()
예제 #9
0
파일: dtree.py 프로젝트: csammcgrath/CS450
def compare_algorithm():
    """Compare the local DecisionTree with ``tree.DecisionTreeClassifier``.

    Both classifiers are fitted on the same split of the voting data and
    their accuracies on the test split are printed.
    """
    data, targets, headers = get_voting()

    # split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # reset the indexes so the dataframes can be addressed by position below
    for frame in (train_data, test_data, train_target, test_target):
        frame.reset_index(inplace=True, drop=True)

    # initialise and fit both trees
    samClassifier = DecisionTree()
    skClassifer = tree.DecisionTreeClassifier()
    samModel = samClassifier.fit(train_data, train_target, headers)
    skModel = skClassifer.fit(train_data, train_target)

    samPredicted = samModel.predict(test_data)
    skPredicted = skModel.predict(test_data)

    # the last header names the target column
    expected = test_target[headers[-1]]

    sk_hits = 0
    sam_hits = 0
    total = len(test_data)
    for row in range(total):
        if skPredicted[row] == expected[row]:
            sk_hits += 1

        if samPredicted[row] == expected[row]:
            sam_hits += 1

    samAccuracy = get_accuracy(sam_hits, len(test_data))
    skAccuracy = get_accuracy(sk_hits, len(test_data))

    report = "Sam's ID3 Accuracy: {:.2f}%. \nSK's ID3 Accuracy: {:.2f}%."
    print(report.format(samAccuracy, skAccuracy))
예제 #10
0
파일: dtree.py 프로젝트: csammcgrath/CS450
def execute_algorithm(dataset):
    """Train the DecisionTree on one dataset and print its test accuracy.

    :param dataset: 1 loads the data from ``get_loans()``, 2 from
                    ``get_voting()``
    :raises ValueError: if ``dataset`` is neither 1 nor 2
    """
    # Reject unknown ids up front. Previously such a value skipped both
    # branches and failed later with an UnboundLocalError on ``data``.
    if dataset not in (1, 2):
        raise ValueError(
            "Unknown dataset id {!r}; expected 1 (loans) or 2 (voting).".format(
                dataset))

    # this whole shell is designed just for the Decision Tree
    classifier = DecisionTree()

    # determine which dataset to retrieve
    if dataset == 1:
        data, targets, headers = get_loans()
    else:
        data, targets, headers = get_voting()

    # split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # reset the indexes so the dataframes can be addressed by position below
    for frame in (train_data, test_data, train_target, test_target):
        frame.reset_index(inplace=True, drop=True)

    # build the tree
    model = classifier.fit(train_data, train_target, headers)

    # prompt the user if he/she wants to display the tree
    print_id3(model)

    # one prediction per test row
    target_predicted = model.predict(test_data)

    # the last header names the target column
    test_target = test_target[headers[-1]]

    # count the predictions that match the expected target
    count = 0
    for index in range(len(target_predicted)):
        if target_predicted[index] == test_target[index]:
            count += 1

    accuracy = get_accuracy(count, len(test_data))

    # report to the user
    print("Accuracy: {:.2f}%".format(accuracy))
예제 #11
0
def main():
    """Score every row of ``car_data.csv`` with the tree and plot the scores.

    The tree is loaded from ``jsonTrees/<testName>.json``. The three
    highest-scoring rows are pretty-printed, and a normalized histogram of
    all scores is saved under ``graphs/`` and then shown.
    """
    # import the data from a csv
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    # call the tree creator module and pass the name of the json file to it
    dTree = DecisionTree('jsonTrees/' + testName + '.json')

    scores = []
    # track the best score and data
    best = [0, 0]
    test = []

    for data in car_data:

        # change the inputs for each of the cars in the tree
        dTree.changeInputs(convertCarData(data))
        # get the score for that car
        score = dTree.run()

        # check if it is the highest
        if score > best[0]:
            best[0] = score
            best[1] = data
        # add it to the list of scores
        scores.append(score)
        test.append([score, data.tolist()])

    test = sorted(test, key=lambda x: x[0], reverse=True)

    pprint(test[0:3])

    # create a normalized histogram of the scores; ``density=True`` replaces
    # the ``normed`` keyword, which Matplotlib has removed
    plt.hist(scores,
             density=True,
             facecolor='green',
             alpha=0.75)
    plt.title(testName + wValue)
    # save the image to a file
    plt.savefig("graphs/" + testName + wValue + ".png", bbox_inches='tight')
    # show the image
    plt.show()
예제 #12
0
def part3():
    """Plot test accuracy for m in (2, 5, 10, 20) on two datasets.

    For each dataset one tree object is rebuilt with ``createTree(root, m)``
    for every ``m`` and scored on the test set. Each (m, accuracy) point is
    drawn and annotated in that dataset's subplot, and the figure is saved
    to ``../part3.pdf``.
    """
    subplot_row = {'diabetes': 1, 'heart': 2}
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]

    plt.figure()
    for trainFileName, testFileName, key in datasets:
        attribute, trainset = data_provider(trainFileName)
        _, testset = data_provider(testFileName)
        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)

        measured = []
        for m in (2, 5, 10, 20):
            curTree.createTree(root, m)
            hits = 0
            misses = 0
            for instance in testset:
                if curTree.predict(root, instance) == instance[-1]:
                    hits += 1
                else:
                    misses += 1
            measured.append((m, float(hits) / (hits + misses)))

        for m_value, acc in measured:
            ax = plt.subplot(2, 1, subplot_row[key])
            ax.set_xlim(0, 22)
            ax.set_ylim(0.6, 0.8)
            ax.set_ylabel('accuracy')
            ax.set_title(key)
            plt.annotate('%.3f' % acc, xy=(m_value - 0.02, acc + 0.02))
            plt.annotate('m=%d' % m_value, xy=(m_value - 0.02, acc - 0.07))
            ax.plot(m_value, acc, 'o-')

    plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
예제 #13
0
 def setUp(self):
     """Store a new ``DecisionTree("c4.5")`` on ``self.decision_tree``."""
     self.decision_tree = DecisionTree("c4.5")
예제 #14
0
def main():
    """Run the first entry of ``inputs2`` through three trees and plot it.

    The trees are loaded from the JSON files named by ``testA``, ``testN``
    and ``testE``. Their scores are printed and drawn on one figure labelled
    "AlphaCuts", "Crisp" and "Extention Principle" respectively. The loop
    ends with ``break``, so only the first input is processed.
    """

    fmemFile = File("fmemFile.csv")

    # import the data from a csv
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    # build one tree per json description
    alpha_tree = DecisionTree('jsonTrees/' + testA + '.json')
    crisp_tree = DecisionTree('jsonTrees/' + testN + '.json')
    extension_tree = DecisionTree('jsonTrees/' + testE + '.json')

    for sample in inputs2:

        # feed the same inputs to every tree
        for fuzzy_tree in (alpha_tree, crisp_tree, extension_tree):
            fuzzy_tree.changeInputs(sample)

        # get the score from each tree
        alpha_score = alpha_tree.run()
        crisp_score = crisp_tree.run()
        extension_score = extension_tree.run()

        print("Inputs:", sample)
        print("ASCORE #######:", alpha_score)
        print("NSCORE #######:", crisp_score)
        print("ESCORE #######:", extension_score)

        extension_points = np.array(extension_score)

        membership = MemFunc('trap', alpha_score)
        grid = np.arange(0, 1, .05)
        memberships = [membership.memFunc(value) for value in grid]

        (alpha_line,) = plt.plot(grid, memberships,
                                 c='r',
                                 linewidth=2.0,
                                 label="AlphaCuts")
        (extension_line,) = plt.plot(extension_points[:, 0],
                                     extension_points[:, 1],
                                     c='b',
                                     linewidth=2.0,
                                     label="Extention Principle")
        crisp_line = plt.axvline(crisp_score, c='g', linewidth=2.0,
                                 label="Crisp")

        plt.legend(handles=[alpha_line, extension_line, crisp_line])
        plt.title("Regular Title")
        plt.xlabel("Output Score")
        plt.ylabel("Membership Value")

        plt.show()
        # only the first input is plotted
        break
예제 #15
0
from tree import DecisionTree

# Toy training set. Each row looks like [color, size, label] — TODO confirm
# the column meaning against DecisionTree.build_tree.
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

# NOTE: the name is spelled "decison_tree" (sic). It is referenced again
# further down the script, so the spelling is left unchanged here.
decison_tree = DecisionTree()

# Build a tree from the training rows and print it.
tree = decison_tree.build_tree(training_data)

decison_tree.print_tree(tree)


def pretty_print_leaf_predictions(counts):
    """Turn label counts into whole-number percentage strings.

    Each count is divided by the sum of all counts, scaled to 100 and
    truncated to an int, e.g. ``{'a': 1, 'b': 2}`` gives
    ``{'a': '33%', 'b': '66%'}``. Despite the name, nothing is printed; the
    mapping is returned.
    """
    grand_total = sum(counts.values()) * 1.0
    return {
        label: str(int(count / grand_total * 100)) + "%"
        for label, count in counts.items()
    }


# Classify the first training row and format the result as percentages.
# NOTE(review): the returned dict is discarded here, so nothing is displayed
# when this runs as a script — confirm whether a print() was intended.
pretty_print_leaf_predictions(decison_tree.classify(training_data[0], tree))

testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
from sklearn.preprocessing import LabelEncoder
from tree import DecisionTree
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # These three names are used below but are not imported at the top of
    # this script, so running it failed with a NameError.
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    # Load the training data and encode the 'Sex' column as integers.
    train_df = pd.read_csv('/app/data/train.csv')
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    # Single feature (encoded sex) against the 'Survived' target.
    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=71)

    # Local implementation.
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)

    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))

    # tree.make_graph()

    # scikit-learn tree with the same depth limit, for comparison.
    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    s_tree.predict_proba(X_test)
예제 #17
0
        sys.exit()
    trainFileName = sys.argv[1]
    testFileName = sys.argv[2]
    try:
        m = int(sys.argv[3])
    except:
        print >> sys.stderr, "[ERROR] [m] should be in integer!"
        sys.exit()

    attribute, trainset = data_provider(trainFileName)
    testAttribute, testset = data_provider(testFileName)
    try:
        assert (testAttribute == attribute)
    except AssertionError:
        print >> sys.stderr, "[ERROR] pls check the attributes of test data."
        sys.exit()

    # train
    root = TreeNode(trainset, attribute)
    curTree = DecisionTree(root)
    curTree.createTree(root, m)
    curTree.printTree(root, 0)

    # test
    print '<Predictions for the Test Set Instances>'
    index = 1
    for instance in testset:
        print '{}: Actual: {} Predicted: {}'.format(
            index, instance[-1], curTree.predict(root, instance))
        index += 1
예제 #18
0
import pandas as pd
import numpy as np
# ``sklearn.cross_validation`` has been removed from scikit-learn; the same
# helpers live in ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from pprint import pprint

# Load the data
data = pd.read_table('Font_dataset.txt', header=None, sep=',')

# Feature data and labels (column 4 holds the label)
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree
clf = DecisionTree()

print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
# 10 consecutive, unshuffled folds. The old call passed ``random_state``
# without shuffling, so it had no effect; it is omitted here because the
# current API rejects it unless ``shuffle=True``.
kf = KFold(n_splits=10)
count = 0
for train_index, test_index in kf.split(X):
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # Train on the training fold only. Fitting on the full data set, as
    # before, leaked the test fold into training and inflated the accuracy.
    clf.fit(X_train, y_train)
    # Test
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
예제 #19
0
def printInputs():
    """Build a DecisionTree from ``jsonTrees/<testName>.json`` and call its ``printInputs()``."""
    json_path = 'jsonTrees/' + testName + '.json'
    DecisionTree(json_path).printInputs()
예제 #20
0
from tree import DecisionTree
from iris_dataset import vectors, labels

# Use the first 80% of the samples for training and the rest for testing.
# This is a plain positional split; nothing is shuffled here.
N = int(len(vectors)*0.8)
training_vectors = vectors[:N]
training_labels = labels[:N]
test_vectors = vectors[N:]
test_labels = labels[N:]

# Fit the tree on the training split, predict the held-out vectors and
# display the tree.
tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

# Print predictions next to the expected labels for manual comparison.
print("results:{}".format(results))
print("answers:{}".format(test_labels))
예제 #21
0
    ts = time.time()

    D = np.full((len(t_data)), 1 / len(t_data))
    print(D)
    ada_test_data = np.insert(t_data, 1, D, axis=1)
    ada_val_data = np.insert(v_data, 1, np.full(len(v_data), 1), axis=1)

    # print(ada_test_data)
    for j in range(0, len(l_list)):
        print("L : ", l_list[j])
        tree_list = []
        alpha_list = []
        ada_test_data = np.insert(t_data, 1, D, axis=1)
        # np.zeros((2, 1))
        for l in range(0, l_list[j]):
            tree = DecisionTree()
            create_dt_classifier(ada_test_data, d, tree)
            tree_list.append(tree)
            alpha = get_params(ada_test_data, tree, d)
            alpha_list.append(alpha)
            print(ada_test_data[:, 1])
            # print(ada_test_data)

        train_accuracy = check_accuracy_with_trees(ada_test_data, tree_list, d,
                                                   alpha_list)
        train_accuracy_list.append(train_accuracy)
        val_accuracy = check_accuracy_with_trees(ada_val_data, tree_list, d,
                                                 alpha_list)
        val_accuracy_list.append(val_accuracy)
        iterations.append(l_list[j])
        # print(train_accuracy)