def test_accuracy_compare(root, data, prune1, prune2):
    """Compare two pruning settings on the same labelled data.

    Each ``(point, location)`` pair in ``data`` is classified twice with
    ``classify(root, point, prune)`` — once per pruning value.

    Returns a tuple ``(d1, d2)``: ``d1`` counts the points only the
    ``prune1`` run got right, ``d2`` the points only the ``prune2`` run got
    right.  Points where both runs agree (both right or both wrong) are
    not counted.
    """
    only_first = 0
    only_second = 0
    for point, location in data:
        first_ok = classify(root, point, prune1) == location
        second_ok = classify(root, point, prune2) == location
        if first_ok and not second_ok:
            only_first += 1
        elif second_ok and not first_ok:
            only_second += 1
    return (only_first, only_second)
def classify(self, row, label=True):
    """Aggregate the votes of every tree in ``self.trees`` for ``row``.

    Each tree's ``dt.classify(tree, row)`` result is treated as a mapping
    of label -> count; the tree votes for its label with the largest
    strictly positive count (or for ``None`` when there is none).

    Returns the winning label when ``label`` is true, otherwise the full
    ``{label: number_of_votes}`` dictionary.
    """
    def _top_positive(counts):
        # First key holding the largest count above zero; None if no
        # count is positive.
        best_key = None
        best_count = 0
        for key in counts.keys():
            if counts[key] > best_count:
                best_key = key
                best_count = counts[key]
        return best_key

    votes = {}
    for tree in self.trees:
        winner = _top_positive(dt.classify(tree, row))
        if winner not in votes:
            votes[winner] = 0
        votes[winner] += 1

    if not label:
        return votes
    return _top_positive(votes)
def test_accuracy(root, data, prune=-1):
    """Measure the classification error of a tree on labelled data.

    Args:
        root: tree passed through to ``classify``.
        data: iterable of ``(point, location)`` pairs, ``location`` being
            the expected label.
        prune: pruning argument passed through to ``classify``.

    Returns:
        ``(err_locs, err_all)`` where ``err_locs`` maps each location seen
        in ``data`` to its error rate and ``err_all`` is the overall error
        rate.  For empty ``data`` the result is ``({}, 0.0)``.
    """
    wrong = {}
    total = {}
    for point, location in data:
        total[location] = total.get(location, 0) + 1.0
        if classify(root, point, prune) != location:
            wrong[location] = wrong.get(location, 0) + 1.0

    # Nothing was evaluated: avoid dividing by a zero total below.
    if not total:
        return ({}, 0.0)

    # Every key of ``wrong`` is also a key of ``total``, so iterating
    # ``total`` covers all locations.
    err_locs = dict(
        (loc, wrong.get(loc, 0) / count) for loc, count in total.items()
    )
    err_all = sum(wrong.values()) / sum(total.values())
    return (err_locs, err_all)
def run_dt(data, tests):
    """Train a decision tree on ``data`` and classify every row of ``tests``.

    The tree is built with ``dt.DTL`` over all column indices except the
    last one and stored on ``dt.tree``.  When the fifth command-line item
    (``sys.argv[4]``) is ``"print"`` the tree is printed.

    Returns a list with ``"yes"`` or ``"no"`` per test row, depending on
    the truthiness of ``dt.classify``.
    """
    # Create attributes with all the columns except the last one.
    attributes = list(range(len(data[0]) - 1))

    # Initial call
    dt.tree = dt.DTL(data, attributes, False)

    # Index 4 only exists when argv has more than four items; the former
    # ``>= 4`` guard raised IndexError for exactly four.
    if len(sys.argv) > 4 and sys.argv[4] == "print":
        dt.printTree(dt.tree, 0)

    return ["yes" if dt.classify(dt.tree, test) else "no" for test in tests]
def get_risks(records):
    """Predict a risk for every area in ``records``.

    A tree is built from ``../data/artif_data.txt`` (relative to the
    current working directory) with its first line dropped, then each
    record is classified.

    Args:
        records: mapping of area -> record passed to ``classify``.

    Returns:
        dict mapping each area to the first key of the ``classify`` result.
    """
    # NOTE: the tree is trained on every call — if that turns out to be
    # slow, the tree could be saved to a file instead.
    path = str(Path(os.getcwd()).parent) + "/data/artif_data.txt"
    # Context manager so the file handle is closed once the data is read.
    with open(path, 'r') as f:
        training_data = [line.rstrip().split(',') for line in f]
    training_data.pop(0)  # the first line is not used for training

    tree = build_tree(training_data)
    predicted_risks = {}
    for area in records:
        predicted_risks[area] = list(classify(records[area], tree).keys())[0]
    return predicted_risks
def _is_yes(answer):
    """Return True only for an explicit ``y`` / ``yes`` answer (any case)."""
    return answer.strip().lower() in ('y', 'yes')


def main():
    """Build an ID3 tree, optionally draw it, optionally classify examples.

    Command line handling:
      * one argument: it is handed to ``find_files`` for the training
        data; the user is later asked whether to classify new examples.
      * two arguments: the first is used as above, the second is handed to
        ``find_files`` for the examples, which are classified without
        asking.
      * otherwise: ``find_files`` is called without arguments.
    """
    global graph

    test = True
    if len(sys.argv) == 2:
        data = find_files(argv=sys.argv[1:])
    elif len(sys.argv) == 3:
        data = find_files(argv=sys.argv[1:])
        test = False
    else:
        data = find_files()

    data.print_data()
    target_attr = data.attributes[-1]
    tree = id3(data.values, data.attributes, target_attr)

    op = input("Do you want to create picture of tree graph? [y/n]: ")
    # The former ``op in 'yY'`` was a substring test, so an empty answer
    # (just Enter) or fragments such as 'es' were treated as "yes".
    if _is_yes(op):
        graph = pydot.Dot(graph_type='graph')
        print_tree(data.values, data.attributes, tree)
        f_name = data.name.split('.')[1][1:] + '.png'
        graph.write_png(f_name)
        print("Generated graph to file:", f_name)
    else:
        print_tree(data.values, data.attributes, tree)

    if test:
        op = input("Do you want to classify new examples? [y/n]: ")
    else:
        op = 'Y'

    if _is_yes(op):
        if test:
            test_data = find_files()
        else:
            test_data = find_files(argv=sys.argv[2:])
        # When the attribute row differs from the training attributes it
        # is put back as the first data row and the training attributes
        # (without the target) are used instead.
        if test_data.attributes != data.attributes[:-1]:
            test_data.values.insert(0, test_data.attributes)
            test_data.attributes = data.attributes[:-1]
        try:
            class_results = classify(tree, test_data.values, test_data.attributes)
            test_data.print_data(data=data, test=class_results)
        except ValueError:
            print("Unable to classify examples in:", test_data.name)
    exit(0)
def classify(tree_model, testlabels, testdata):
    """
    Predict by majority vote over the base classifiers.

    :param tree_model: trees of the base classifiers, list
    :param testlabels: feature labels of the test sample, list
    :param testdata: the test sample, list
    :return: label chosen by the combined classifier, or None when no
             tree was able to vote
    """
    vote = {}
    for tree in tree_model:
        # Why exceptions are caught: because of randomness a tree may not
        # contain every value of some feature and then cannot predict;
        # such a tree simply casts no vote.
        try:
            label = dtree.classify(tree, testlabels, testdata)
        except Exception:
            continue
        vote[label] = vote.get(label, 0) + 1

    # max() on an empty sequence raises ValueError; report "no decision".
    if not vote:
        return None
    # Highest count wins; equal counts are resolved by the larger label.
    result = max(zip(vote.values(), vote.keys()))[1]
    return result
def alternative_classifier(train_set, train_labels, test_set, test_labels, **kwargs):
    """Classify ``test_set`` with a tree built from two selected features.

    Both sets are reduced with ``reduce_data(..., [9, 12])`` and their
    labels are inserted as column index 2 before the tree is built and the
    rows are classified.  The accuracy is printed and the confusion matrix
    is plotted and shown.

    Returns the list of predictions, one per test row.
    """
    reduced_train, reduced_test = reduce_data(train_set, test_set, [9, 12])

    # Labels become the third column of each row.
    labelled_train = np.insert(reduced_train, 2, train_labels, axis=1)
    labelled_test = np.insert(reduced_test, 2, test_labels, axis=1)

    tree = build_tree(labelled_train)
    pred_set = [classify(row, tree) for row in labelled_test]

    print(calculate_accuracy(test_labels, pred_set))

    plot_matrix(calculate_confusion_matrix(test_labels, pred_set))
    plt.show()
    return pred_set
newTestSet = my_model.transform(test).tolist() newTrainSet = my_model.transform(training).tolist() ############# Model Building ############## for k in range(len(newSet)): newSet[k].append(trainingLabels[k]) passingData = newSet[:] models.append(dt.buildtree(passingData)) # dt.prune(b,0.1) ############# Classification of Test Records ############## for j in range(len(newTestSet)): if j not in test_classify: test_classify[j] = [] test_classify[j].append(dt.classify(newTestSet[j], models[i])) ############# Accuracy Calculations ############## d = [] f = [] flat = [] for l in test_classify.values(): flat = [] d = [] for m in l: d.append(list(m.keys())) flat = [item for sublist in d for item in sublist] f.append(flat) count = 0
import decision_tree as dtree

labels = ['年龄', '有工作', '有自己的房子', '信贷情况']

# labels is modified while the decision tree is being built, so it cannot
# be reused for prediction; keep an untouched copy for that purpose.
testlabel = list(labels)

data = [
    ['青年', '否', '否', '一般', '否'],
    ['青年', '否', '否', '好', '否'],
    ['青年', '是', '否', '好', '是'],
    ['青年', '是', '是', '一般', '是'],
    ['青年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '好', '否'],
    ['中年', '是', '是', '好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '好', '是'],
    ['老年', '是', '否', '好', '是'],
    ['老年', '是', '否', '非常好', '是'],
    ['老年', '否', '否', '一般', '否'],
]

# Build the tree and show it.
mytree = dtree.create_tree(data, labels)
print(mytree)

# Classify one unseen sample.
testdata = ['青年', '否', '否', '非常好']
result = dtree.classify(mytree, testlabel, testdata)
print(result)
def predict_classify(forest, test):
    """Classify ``test`` with every tree of ``forest`` and combine the results.

    Each tree's ``decision_tree.classify`` result is collected, in forest
    order, and the list is reduced with ``decision_tree.max_cnt``.
    """
    per_tree = [decision_tree.classify(tree, test) for tree in forest]
    return decision_tree.max_cnt(per_tree)
def main(argv):
    """Entry point: run the decision tree classifier.

    ``argv`` is accepted but not used by this function.
    """
    # run decision tree classifier
    decision_tree.classify()
def main():
    """Build a tree from the CSV dataset and print one prediction.

    The dataset is read from ``../csv_data/data_set.csv``; a single
    hard-coded sample is classified and the leaf is printed via
    ``print_leaf``.
    """
    headers, data_set = read_dataset("../csv_data/data_set.csv")
    my_tree = build_tree(data_set, headers)

    sample = [6.44, 21.0, 65.22, 1431.0, 19.0, 99.0]
    leaf = classify(sample, my_tree)
    print(print_leaf(leaf))
import decision_tree
import json
import tree_plotter

# Read the dataset: one tab-separated instance per line.  The context
# manager closes the file once the rows are loaded.
with open(r'/home/zhaoguanyi/PycharmProjects/Decision Tree/watermelon.txt') as fr:
    listWm = [inst.strip().split('\t') for inst in fr.readlines()]
print(listWm)

labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']  # feature labels
Trees = decision_tree.createTree(listWm, labels)  # build the decision tree
print(json.dumps(Trees, ensure_ascii=False))  # print the decision tree

# Test.
# The label list is rebuilt before classifying — presumably createTree
# modifies the list it is given; confirm against decision_tree.
labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
for i in range(17):
    testData = listWm[i][:6]  # first six columns are passed as the sample
    print(testData)
    testClass = decision_tree.classify(Trees, labels, testData)  # classify
    print(json.dumps(testClass, ensure_ascii=False))

tree_plotter.createPlot(Trees)  # visualise the decision tree
import decision_tree
import tree_plotter
import numpy as np


# Create the dataset
def createDataSet():
    """Return the toy dataset and its two feature labels.

    Every row holds two feature values followed by the class.
    """
    labels = ['不浮出水面是否可以生存', '是否有脚蹼']
    dataSet = [
        ['可以生存', '有', "鱼类"],
        ['可以生存', '有', "鱼类"],
        ['可以生存', '没有', "非鱼类"],
        ['不能生存', '有', "非鱼类"],
        ['不能生存', '有', "非鱼类"],
    ]
    return dataSet, labels


if __name__ == "__main__":
    dataSet, labels = createDataSet()
    tree = decision_tree.createTree(dataSet, labels)
    print(tree)

    # A fresh label list is fetched for classification.
    _, labels = createDataSet()
    sample = ["不能生存", "有"]
    result = decision_tree.classify(tree, labels, sample)
    print(result)
def classify(self, obs):
    """Return the median of the per-tree predicted labels for ``obs``.

    Every tree in ``self.trees`` classifies ``obs`` with ``dt.classify``;
    each result is turned into a label with ``dt.convertToLabel`` and the
    median of those labels is returned.
    """
    # Build a real list: on Python 3 ``map`` is a lazy iterator, which
    # ``np.median`` cannot treat as a sequence of numbers.
    labels = [dt.convertToLabel(dt.classify(obs, tree)) for tree in self.trees]
    preds = np.median(labels)
    return preds
from decision_tree import get_header
from decision_tree import set_header
from decision_tree import get_unique_values
import csv

# Load the training rows: the first field of every CSV record is itself
# split on commas to obtain the row values.
with open('data.csv', encoding="utf8") as csvfile:
    training_data = [
        row[0].split(',') for row in csv.reader(csvfile, delimiter=',')
    ]

my_tree = build_tree(training_data)
print_tree(my_tree)
print()

# Ask the user for a value of every column except the last one; the prompt
# shows the column name followed by the values seen in the training data.
testing_data = [
    input(''.join((
        'Введіть ',
        str(get_header()[i]),
        str(get_unique_values(training_data, i)),
        ': ',
    )))
    for i in range(len(get_header()) - 1)
]

prediction = print_leaf(classify(testing_data, my_tree))
print("Передбачено: %s" % (prediction))

# Wait for Enter before the script ends.
input()
my_model = PCA(n_components=pca_comps, svd_solver='full') newSet = my_model.fit_transform(training).tolist() newTestSet = my_model.transform(test).tolist() newTrainSet = my_model.transform(training).tolist() ############# Model Building ############## for i in range(len(newSet)): newSet[i].append(trainingLabels[i]) passingData = newSet[:] b = dt.buildtree(passingData) dt.prune(b, 0.1) ############# Classification of Train Records ############## count = 0 for i in range(len(newTrainSet)): a = dt.classify(newTrainSet[i], b) for key in a.keys(): if (key == trainingLabels[i]): count = count + 1 ############# Accuracy Calculations for Training DataSet ############## accuracy = (count / len(newTrainSet)) * 100 final_train_acc += accuracy print('Train accuracy:', accuracy) ############# Classification of Test Records ############## count = 0 accuracy = 0 for i in range(len(newTestSet)): a = dt.classify(newTestSet[i], b) for key in a.keys():
import decision_tree

if __name__ == '__main__':
    # Read every line of the data file and split it on tabs into a list.
    # The context manager closes the file once the rows are loaded.
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ["age", "prescript", "astigmatic", "tearRate"]

    # Build the decision tree with decision_tree.createTree().
    lensesTree = decision_tree.createTree(lenses, lensesLabels)
    print(lensesTree)

    # Note: the classifier must be given a new label list; the one used
    # above cannot be reused because createTree removes entries from it.
    labels = ["age", "prescript", "astigmatic", "tearRate"]

    # Predict an unseen sample with the classifier function.
    result = decision_tree.classify(lensesTree, labels, ["young", "hyper", "yes", "reduced"])
    print(result)
x_train, x_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.33)

# concatenate features and labels
data_train = np.column_stack((x_train, y_train))
data_test = np.column_stack((x_test, y_test))

# build decision tree using entropy
decision_tree = dt.buildtree(data_train, dt.entropy, 0.01)

min_gain_error = {}

# test minimal gain values for pruning
for min_gain_value in np.arange(0, 1, 0.01):
    # Deep copy: dt.prune is used for its effect on the tree it is given
    # (its return value is never read), and a shallow copy would share
    # sub-nodes with the unpruned tree, so each trial would alter the tree
    # the next trial starts from.
    dt_temp = copy.deepcopy(decision_tree)
    dt.prune(dt_temp, min_gain_value)

    # classify test data; a list is built explicitly because np.array on
    # a lazy map object does not produce an array of labels on Python 3
    y_hat = np.array(
        [dt.convertToLabel(dt.classify(obs, dt_temp)) for obs in x_test]
    )

    error = (y_hat != y_test).sum() / float(y_test.shape[0])
    min_gain_error[min_gain_value] = error

# prune tree with optimal min_gain value: the key with the LOWEST error.
# Taking min() over (gain, error) pairs would compare the gains first and
# always select the smallest gain.
min_gain_opt = min(min_gain_error, key=min_gain_error.get)
dt.prune(decision_tree, min_gain_opt)

# print and draw decision tree
# dt.drawtree(decision_tree,png='census_decision_tree.png')
# dt.printtree(decision_tree)
def forest_classify(trees, input):
    """Return the label most trees vote for.

    Every tree classifies ``input`` via ``classify(tree, input)``; the
    most common result is returned.
    """
    tally = Counter(classify(tree, input) for tree in trees)
    top_entries = tally.most_common(1)
    winner = top_entries[0][0]
    return winner
my_model = PCA(n_components=pca_comps, svd_solver='full') newSet = my_model.fit_transform(rows_total).tolist() newtestSet = my_model.transform(rows_test_total).tolist() ############# Model Building ############## for i in range(len(rows_total)): newSet[i].append(training_labels[i]) b = dt.buildtree(newSet) dt.prune(b, 0.1) ############# Classification of Test Records ############## number = 0 accuracy = 0 for i in range(testSize): a = dt.classify(newtestSet[i], b) for key in a.keys(): if (key == testing_labels[i]): number = number + 1 ############# Accuracy Calculations ############## accuracy = (number / testSize) * 100 final_test_acc += accuracy print('Test accuracy:', accuracy) ############# Classification of Training Records ############## number = 0 accuracy = 0 train_label = [] for i in range(trainSize):