def main():
    """Build a decision tree from the training set and score it on the test set.

    The two file names and the ``m`` value come from ``getArgs()``; both files
    are parsed with ``arffParser.parse``. A ``DecisionTree`` is built from the
    training rows with the last training attribute as the target, and ``m`` is
    passed to it unchanged. If the resulting tree has no root the function
    returns without printing anything. Otherwise it prints the tree, one
    "Actual / Predicted" line per test row, and the final counts.
    """
    trainSetFilename, testSetFilename, m = getArgs()
    trainData = arffParser.parse(trainSetFilename)
    testData = arffParser.parse(testSetFilename)

    data = trainData['data']
    attr = trainData['attributes']
    # The target is the last attribute of the training set.
    targetIndex = len(attr) - 1

    tree = DecisionTree(data, attr, targetIndex, m)
    # Identity test: None is a singleton, and `==` could be intercepted by a
    # custom __eq__ on the root object.
    if tree.root is None:
        return
    tree.printTree()

    print('<Predictions for the Test Set Instances>')
    # Each entry is [actual, predicted]; the actual class is the last value of
    # the test row.
    predictedData = []
    for row in testData['data']:
        predictedClass = tree.classify(row, testData['attributes'])
        predictedData.append(row[-1:] + [predictedClass])

    numCorrect = 0
    # Instances are numbered from 1 in the report.
    for i, row in enumerate(predictedData, 1):
        actual, predicted = row[0], row[1]
        print('{0}: Actual: {1} Predicted: {2}'.format(i, actual, predicted))
        if actual == predicted:
            numCorrect += 1

    print('Number of correctly classified: ' + str(numCorrect),
          'Total number of test instances: ' + str(len(predictedData)))
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree

if __name__ == '__main__':
    # Read the voting records and let the preprocessing step deal with
    # missing values before any splitting happens.
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)

    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)

    # Repeat the experiment for split fractions 0.3, 0.4, ..., 0.7.
    for percent in range(3, 8):
        fraction = percent / float(10)
        training_data, testing_data = preprocessing.split_into_training_and_testing(
            data, fraction)

        # NOTE(review): attributes_number and root_node are not referenced
        # again in this block; kept so the same expressions are evaluated.
        attributes_number = len(training_data[0]) - 1
        decision_tree = DecisionTree()
        root_node = decision_tree.build(training_data)
        # decision_tree.print()
        # print("Classification: ")

        # Count the test rows whose label (column 0) matches the party that
        # holds more than 50% in the node the row is classified into.
        hits = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            if classified.republicans_percent > 50.0 and row[0] == 'republican':
                hits += 1
            elif classified.democrats_percent > 50.0 and row[0] == 'democrat':
                hits += 1

        accuracy = hits / float(len(testing_data))
        print("Accuracy using training data", fraction * 100, "% is: ", accuracy)
# Input data sets and the files the predicted labels are written to.
train_input = '../handout/education_train.tsv'
test_input = '../handout/education_test.tsv'
train_output = '../result/education_train.labels'
test_output = '../result/education_test.labels'

ld = LoadData()
dataset = ld.load_data(train_input)
dt = DecisionTree(ld)

# NOTE(review): these three lists are not filled anywhere in this portion of
# the script; presumably later code collects per-iteration results in them.
tr_err = []
te_err = []
x_arr = []

print(ld.head)

# One run per index in range(len(ld.head)); the index is handed to
# dt.construct as its second argument.
for i in range(len(ld.head)):
    root = dt.construct(dataset, i)
    # dt.traverse(root)

    # Write predictions for both data sets to their label files.
    dt.classify(ld.load_data(train_input), root, train_output)
    dt.classify(ld.load_data(test_input), root, test_output)

    # Training error: predicted lines versus the last column of the TSV
    # (first row skipped).
    with open(train_output, 'r') as f:
        predcol = f.read().splitlines()
    train_table = np.loadtxt(train_input, dtype=str, delimiter='\t', skiprows=1)
    realcol = train_table[:, -1]
    eva_train = Evaluate(realcol, predcol)
    train_errate = eva_train.error_rate()

    # Test error, computed the same way from the test files.
    with open(test_output, 'r') as f:
        predcol = f.read().splitlines()
    test_table = np.loadtxt(test_input, dtype=str, delimiter='\t', skiprows=1)
    realcol = test_table[:, -1]
    eva_test = Evaluate(realcol, predcol)
    test_errate = eva_test.error_rate()