Exemplo n.º 1
0
def test(dataset, trials=35):
    """Estimate decision-tree accuracy on `dataset` via repeated random splits.

    Each round shuffles the data, splits it 2/3 train / 1/3 test, builds a
    tree on the training part, prunes it against the test part, and records
    accuracy before and after pruning.  Mean accuracies over all rounds are
    printed at the end.

    Args:
        dataset: raw dataset in whatever form processDataset() accepts.
        trials: number of random resampling rounds (default 35, the count
            previously hard-coded).
    """
    pruned_test_acc = []
    pruned_train_acc = []
    test_acc = []
    train_acc = []
    data = processDataset(dataset)
    for _ in range(trials):
        random.shuffle(data)
        # BUG FIX: must be floor division — '/' produces a float in
        # Python 3, which is not a valid slice index.
        split = 2 * len(data) // 3
        train_set = data[:split]
        test_set = data[split:]
        tree = decisionTree.buildTree(train_set, 'Class')
        train_acc.append(decisionTree.test(tree, train_set))
        test_acc.append(decisionTree.test(tree, test_set))
        # Prune against the held-out set, then re-measure both splits.
        pruned = decisionTree.pruneTree(tree, test_set)
        pruned_test_acc.append(decisionTree.test(pruned, test_set))
        pruned_train_acc.append(decisionTree.test(pruned, train_set))
    print()
    print("Before the pruning,the trainSet accuracy : ", sum(train_acc) / len(train_acc))
    print("Before the pruning,the testSet accuracy  : ", sum(test_acc) / len(test_acc))
    print("After the pruning, the trainSet accuracy : ", sum(pruned_train_acc) / len(pruned_train_acc))
    print("After the pruning, the testSet accuracy  : ", sum(pruned_test_acc) / len(pruned_test_acc))
Exemplo n.º 2
0
def __main_():  # TODO: to make this real, add an extra _ at the end of the name
    """CLI driver: build, evaluate and post-prune two decision trees.

    Expected arguments (in order):
        l, k               post-pruning parameters
        training_set       path/handle passed to dt.buildTree
        validation_set     used by dt.postPruning
        test_set           used by tree.test
        to_print           "yes" to also print each tree
    """
    # BUG FIX: sys.argv[0] is the script path, not the first argument —
    # the real command-line arguments start at index 1.
    l = sys.argv[1]
    k = sys.argv[2]
    trainingSet = sys.argv[3]
    validationSet = sys.argv[4]
    testSet = sys.argv[5]
    toPrint = sys.argv[6]

    # Entropy-based tree ("e").
    tree = dt.buildTree(trainingSet, "e")
    # str() guards against non-string accuracy values: concatenating a
    # number onto a str raises TypeError.
    print("Test results, entropy: " + str(tree.test(testSet)))
    print("Test results, post pruning, entropy: " +
          str(dt.postPruning(tree, l, k, validationSet)))
    if toPrint == "yes":
        tree.printTree()

    # Variance-impurity-based tree ("v").
    tree = dt.buildTree(trainingSet, "v")
    print("Test results, VI: " + str(tree.test(testSet)))
    print("Test results, post pruning, VI: " +
          str(dt.postPruning(tree, l, k, validationSet)))
    if toPrint == "yes":
        tree.printTree()
Exemplo n.º 3
0
import decisionTree
import sys

# Train a depth-1 decision tree (a single stump) on the file named by
# argv[1], then report training and testing error against argv[2].
trainIns, trainOuts = decisionTree.loadData(sys.argv[1])
testIns, testOuts = decisionTree.loadData(sys.argv[2])

tree = decisionTree.buildTree(trainIns, trainOuts, 1)
(theta, feature, gain, leftCategory, rightCategory) = tree

# Describe the single learned split: which feature, its threshold, and
# the category predicted on each side.
print("Learned Tree: ")
print(f"\tFeature {feature}: (Information gain: {gain})")
print(f"\t| > {theta} -> {leftCategory}")
print(f"\t| < {theta} -> {rightCategory}")

trainErrs = decisionTree.runTree(tree, trainIns, trainOuts)
testErrs = decisionTree.runTree(tree, testIns, testOuts)

# Error counts converted to percentages of each split's size.
print(f"\nTraining Error:\t{trainErrs * 100. / len(trainOuts)}%")
print(f"Testing Error:\t{testErrs * 100. / len(testOuts)}%")
Exemplo n.º 4
0
                    inp = list(i[1].iloc[j][:-1])
                    expect = i[1].iloc[j][-1:]
                    nn.setInput(inp)
                    nn.allProcess()
                    o = nn.classify()
                    if o == int(expect):
                        success += 1
                # nn.getLayerList()
                t_e = success / float(len(i[1]))
                print("This error :", t_e)
                error.append(t_e)

            else:  #Decision tree
                input.normalize(i[0])
                input.normalize(i[1])
                target = [xp for xp in range(21)]
                root = dt.DecisionTree.Node(target)
                root.avaiable_attr = list(i[0])[:-1]
                dt.buildTree(i[0], root, 'G3')
                success = 0
                for j in range(len(i[1])):
                    result = root.classify(i[1].iloc[j])
                    if result == int(i[1].iloc[j]['G3']):
                        success += 1
                t_e = success / float(len(i[1]))
                print("This error :", t_e)
                error.append(t_e)

    a_error = sum(error) / 10.0
    print('ERROR is', a_error)
Exemplo n.º 5
0
import pandas as pd
import scipy as sc
import numpy as np
import math
import decisionTree as dt
import sys

# Build and print one tree per split criterion — "e" (entropy) first,
# then "v" — and evaluate the last-built ("v") tree on the test set.
for criterion in ("e", "v"):
    tree = dt.buildTree("training_set_1.csv", criterion)
    tree.printTree()
print(tree.test("test_set_1.csv"))


def findEntropy(matches, mismatches):
    """Return the binary (Shannon) entropy, in bits, of a match/mismatch split.

    Args:
        matches: count of matching samples (>= 0).
        mismatches: count of mismatching samples (>= 0).

    Returns:
        Entropy in [0.0, 1.0].  By the usual 0*log(0) == 0 convention, a
        pure split (either count zero) — or an empty split — has entropy
        0.0.  The original raised/NaN'd on these cases via log2(0) and a
        zero-division on total == 0.
    """
    total = matches + mismatches
    if total == 0:
        return 0.0  # empty split: nothing to be uncertain about
    entropy = 0.0
    for count in (matches, mismatches):
        if count:  # skip zero counts: lim p->0 of -p*log2(p) is 0
            p = count / total
            entropy -= p * math.log2(p)
    return entropy


# n = dt.node("asdf")
# n.total = 4
# n.mismatchesOne = 4
# n.matchesOne = 0
# n.mismatchesZero = 0
# n.matchesZero = 0