import matplotlib.pyplot as plt
import numpy as np
from loadData import LoadData
from decisionTree import Node, DecisionTree, Evaluate
from inspection import Inspection

# Driver script: for every index of ld.head, build a tree from the training
# data, classify the train and test sets with it, and read the results back.
if __name__ == '__main__':
    # Input TSVs, and the paths handed to dt.classify as its third argument.
    train_input = '../handout/education_train.tsv'
    test_input = '../handout/education_test.tsv'
    train_output = '../result/education_train.labels'
    test_output = '../result/education_test.labels'
    ld = LoadData()
    dataset = ld.load_data(train_input)
    dt = DecisionTree(ld)
    # Accumulators; nothing in the visible part of the script fills them.
    # NOTE(review): names suggest train/test error per x value for a plot - confirm.
    tr_err = []
    te_err = []
    x_arr = []
    print(ld.head)
    for i in range(len(ld.head)):
        # i is forwarded as the second argument of construct.
        # NOTE(review): presumably the maximum tree depth - confirm against
        # DecisionTree.construct.
        root = dt.construct(dataset, i)
        # dt.traverse(root)
        # The data files are re-loaded for each classify call.
        dt.classify(ld.load_data(train_input), root, train_output)
        dt.classify(ld.load_data(test_input), root, test_output)
        # NOTE(review): assumes classify wrote one label per line to
        # train_output - confirm.
        with open(train_output, 'r') as f:
            predcol = f.read().splitlines()
        # Last column of the training TSV as strings, first row skipped.
        realcol = np.loadtxt(train_input, dtype=str, delimiter='\t', skiprows=1)[:, -1]
self.gi=0 else: count1 = 0 for item in dataset: if item[-1]==dataset[0][-1]: count1+=1 count2 = len(dataset)-count1 self.gi = (count1/len(dataset))*(count2/len(dataset))+(count2/len(dataset))*(count1/len(dataset)) return self.gi # evaluate with error_rate and gini_impurity def evaluate(self): err_rate = self.error_rate(self.ori_dataset) gini_impurity = self.gini_impurity(self.ori_dataset) return err_rate,gini_impurity if __name__ == '__main__': infile = sys.argv[1] outfile = sys.argv[2] ld = LoadData() ori_dataset = ld.load_data(infile) ins = Inspection(ori_dataset) eva = ins.evaluate() err_rate = eva[0] gini_impurity = eva[1] with open(outfile, 'w') as f: f.writelines("gini_impurity: {}\n".format(gini_impurity)) f.writelines("error: {}\n".format(err_rate)) # print(err_rate) # print(gini_impurity)
right_branch = self.construct(new_dataset[1], col_index, depth) node.right = right_branch # print('col_index:',col_index) self.col.remove(col_index) return node def traverse(self, node): if node: # print(node.dataset,'\n') print(node.depth, '\t', node.attribute) self.traverse(node.left) self.traverse(node.right) if __name__ == '__main__': ld = LoadData() dataset = ld.load_data('../handout/small_train.tsv') dt = DecisionTree(dataset, 0) ds = dt.divide_dataset(dataset, 1) # gini = dt.gini_impurity(dataset,1) giga = dt.gini_gain(dataset, 1) # col = dt.get_attribute(dataset) root = dt.construct(dataset) # print(root.left.left.left.right.depth) dt.traverse(root) # print(root.dataset) # print(dataset) # print(ds[0]) # print(ds[1])