예제 #1
0
def main():
    """Train a decision tree and print its predictions for a test set.

    The training file name, test file name and the parameter ``m`` come from
    ``getArgs()``. Both files are parsed with ``arffParser.parse``; the
    parsed result is indexed by ``'data'`` and ``'attributes'``. The tree is
    built from the training data with the last attribute as the target.

    If the resulting tree has no root, nothing is printed. Otherwise the
    tree is printed, followed by one ``Actual``/``Predicted`` line per test
    instance (numbered from 1) and a final summary of how many predictions
    matched the value stored in the last column of each test row.
    """
    trainSetFilename, testSetFilename, m = getArgs()
    trainData = arffParser.parse(trainSetFilename)
    testData = arffParser.parse(testSetFilename)

    attributes = trainData['attributes']
    # The last attribute is used as the target to predict.
    targetIndex = len(attributes) - 1
    tree = DecisionTree(trainData['data'], attributes, targetIndex, m)

    # Identity check: ``None`` is a singleton, and ``==`` could be hijacked
    # by a custom ``__eq__`` on the node type.
    if tree.root is None:
        return
    tree.printTree()
    print('<Predictions for the Test Set Instances>')

    # Classify everything before printing any result line, so a failure in
    # classify() never leaves a half-printed report.
    testAttributes = testData['attributes']
    predictions = [
        # row[-1] (rather than row[-1:] + [...]) works for any sequence
        # type, not only lists.
        (row[-1], tree.classify(row, testAttributes))
        for row in testData['data']
    ]

    numCorrect = 0
    for i, (actual, predicted) in enumerate(predictions, start=1):
        print('{0}: Actual: {1} Predicted: {2}'.format(i, actual, predicted))
        if actual == predicted:
            numCorrect += 1
    print('Number of correctly classified: ' + str(numCorrect),
          'Total number of test instances: ' + str(len(predictions)))
예제 #2
0
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree


if __name__ == '__main__':
    # Script entry point: load the voting records, fill in missing values,
    # then build and score a decision tree for several split fractions.
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)

    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)

    # Split fractions 0.3, 0.4, ..., 0.7 are tried in turn.
    for percent in range(3, 8):
        split_fraction = percent / float(10)
        training_data, testing_data = (
            preprocessing.split_into_training_and_testing(data, split_fraction)
        )
        # NOTE(review): attributes_number and root_node are not read again
        # in the visible code; kept so behaviour stays exactly the same.
        attributes_number = len(training_data[0]) - 1
        decision_tree = DecisionTree()
        root_node = decision_tree.build(training_data)
        # decision_tree.print()
        # print("Classification: ")

        hits = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            # A prediction counts as correct when the majority party
            # (> 50 %) matches the label in the row's first column.
            if classified.republicans_percent > 50.0 and row[0] == 'republican':
                hits += 1
            elif classified.democrats_percent > 50.0 and row[0] == 'democrat':
                hits += 1

        accuracy = hits / float(len(testing_data))
        print("Accuracy using training data", split_fraction * 100, "% is: ", accuracy)
예제 #3
0
    train_input = '../handout/education_train.tsv'
    test_input = '../handout/education_test.tsv'
    train_output = '../result/education_train.labels'
    test_output = '../result/education_test.labels'

    ld = LoadData()
    dataset = ld.load_data(train_input)
    dt = DecisionTree(ld)
    tr_err = []
    te_err = []
    x_arr = []
    print(ld.head)
    for i in range(len(ld.head)):
        root = dt.construct(dataset, i)
        # dt.traverse(root)
        dt.classify(ld.load_data(train_input), root, train_output)
        dt.classify(ld.load_data(test_input), root, test_output)
        with open(train_output, 'r') as f:
            predcol = f.read().splitlines()
        realcol = np.loadtxt(train_input,
                             dtype=str,
                             delimiter='\t',
                             skiprows=1)[:, -1]
        eva_train = Evaluate(realcol, predcol)
        train_errate = eva_train.error_rate()
        with open(test_output, 'r') as f:
            predcol = f.read().splitlines()
        realcol = np.loadtxt(test_input, dtype=str, delimiter='\t',
                             skiprows=1)[:, -1]
        eva_test = Evaluate(realcol, predcol)
        test_errate = eva_test.error_rate()