def testClass():
    """Build the sample tree, round-trip it through a file, and classify two vectors.

    The tree is created from ``tree.createDataSet()``, written to
    ``'myTree.train'`` with ``tree.storeTree``, read back with
    ``tree.grabTree``, and the reloaded copy is the one used for
    classification.  Results are printed; nothing is returned.
    """
    samples, labels = tree.createDataSet()
    trained = tree.createTree(samples, labels)

    # Persist the decision tree, then work with the copy read back from disk.
    tree.storeTree(trained, 'myTree.train')
    restored = tree.grabTree('myTree.train')

    for testVec in ([1, 0], [1, 1]):
        outcome = tree.classify(restored, labels, testVec)
        # One pre-formatted argument: the doubled spaces reproduce the
        # separator that each comma of a print statement inserts.
        print("Test  %s  result:  %s" % (testVec, outcome))
def classify(datapoint, tree):
    """Return the predicted label for ``datapoint`` by walking ``tree``.

    At a ``Leaf`` the label with the largest count in ``tree.labels`` is
    returned.  Otherwise the first branch whose ``value`` equals
    ``datapoint[tree.feature]`` is followed recursively.  When no branch
    matches, the result is ``None``.
    """
    if not isinstance(tree, Leaf):
        observed = datapoint[tree.feature]
        # First branch (in list order) that carries the observed value.
        chosen = next(
            (branch for branch in tree.branches if branch.value == observed),
            None,
        )
        if chosen is None:
            return None
        return classify(datapoint, chosen)

    # Leaf: pick the (label, count) pair with the highest count.
    best_label, _count = max(tree.labels.items(), key=operator.itemgetter(1))
    return best_label
from collections import Counter

from tree import tree, classify

# --- Classify a single car with the imported tree ---
car = ['med', 'med', '4', 'more', 'big', 'high']
print(classify(car, tree))

# --- Gini impurity of a label list: 1 minus the sum of squared label frequencies ---
labels = ["unacc", "unacc", "acc", "acc", "good", "good"]
#labels = ["unacc","unacc","unacc", "good", "vgood", "vgood"]
#labels = ["unacc", "unacc", "unacc", "unacc", "unacc", "unacc"]
label_counts = Counter(labels)
print(label_counts)

impurity = 1
for occurrences in label_counts.values():
    # Subtract one squared frequency at a time, in Counter order.
    impurity -= (occurrences / len(labels)) ** 2
print(impurity)

# --- Label sets before a split and after two candidate splits ---
unsplit_labels = (
    ["unacc"] * 6
    + ["good"] * 4
    + ["vgood"] * 3
)

split_labels_1 = [
    ["unacc"] * 6 + ["good"] * 2 + ["vgood"],
    ["good", "good"],
    ["vgood", "vgood"],
]

split_labels_2 = [
    ["unacc"] * 6 + ["good"] * 4,
    ["vgood", "vgood", "vgood"],
]
from tree import build_tree, print_tree, car_data, car_labels, classify
import random
random.seed(4)

# The features are the price of the car, the cost of maintenance, the number
# of doors, the number of people the car can hold, the size of the trunk, and
# the safety rating.
unlabeled_point = ['high', 'vhigh', '3', 'more', 'med', 'med']

# Bagging: train 20 trees, each on its own bootstrap sample, and let them vote.
predictions = []
for _ in range(20):
    # Draw a NEW sample (1000 row indices, with replacement) for every tree.
    # Sampling once before the loop would train all 20 trees on the same rows
    # and make every vote identical.
    # NOTE(review): indices range over 0..999, so car_data / car_labels are
    # assumed to hold at least 1000 rows — confirm.
    indices = [random.randint(0, 999) for _ in range(1000)]
    data_subset = [car_data[index] for index in indices]
    labels_subset = [car_labels[index] for index in indices]
    subset_tree = build_tree(data_subset, labels_subset)
    predictions.append(classify(unlabeled_point, subset_tree))
print(predictions)

# Majority vote: the label that occurs most often among the predictions.
final_prediction = max(predictions, key=predictions.count)
print(final_prediction)
# NOTE(review): the next three statements use `indices`, `car_data`,
# `car_labels` and `find_best_split` before anything visible here defines or
# imports them; presumably the start of this snippet is cut off — confirm
# against the complete file.
data_subset = [car_data[index] for index in indices]
labels_subset = [car_labels[index] for index in indices]
print(find_best_split(data_subset, labels_subset))

from tree import build_tree, print_tree, car_data, car_labels, classify
import random
random.seed(4)

# The features are the price of the car, the cost of maintenance, the number
# of doors, the number of people the car can hold, the size of the trunk, and
# the safety rating.
unlabeled_point = ['high', 'vhigh', '3', 'more', 'med', 'med']

# Train 20 trees, each on a freshly drawn sample of 1000 row indices, and
# collect each tree's prediction for the unlabeled point.
predictions = []
for _ in range(20):
    indices = [random.randint(0, 999) for _ in range(1000)]
    data_subset = [car_data[index] for index in indices]
    labels_subset = [car_labels[index] for index in indices]
    subset_tree = build_tree(data_subset, labels_subset)
    predictions.append(classify(unlabeled_point, subset_tree))
print(predictions)

# The most frequent prediction wins.
final_prediction = max(predictions, key=predictions.count)
print(final_prediction)

from tree import training_data, training_labels, testing_data, testing_labels, make_random_forest, make_single_tree, classify
import numpy as np
import random
np.random.seed(1)
random.seed(1)

tree = make_single_tree(training_data, training_labels)
forest = make_random_forest(40, training_data, training_labels)
forest_correct = 0
single_tree_correct = 0

# Classify every test row with the single tree.
for i, test_row in enumerate(testing_data):
    prediction = classify(test_row, tree)
import treePlot
import tree

if __name__ == '__main__':
    # Build the tree inputs: the sample data set and its class labels.
    my_data, class_labels = tree.create_dataset()

    # Disabled: build the tree from the data and inspect/plot it.
    # my_tree = tree.create_tree(my_data, class_labels)
    # number_leafs = tree.get_number_leafs(my_tree)
    # print(number_leafs)
    #
    # tree_depth = tree.get_tree_depth(my_tree)
    # print(tree_depth)
    # treePlot.create_plot(my_tree)

    # Use the tree returned by retrieve_tree(0) and classify the vector [1, 0].
    my_tree = treePlot.retrieve_tree(0)
    class_label = tree.classify(my_tree, class_labels, [1, 0])
    print(class_label)
# -*- coding:utf-8 -*-
import tree
import treePlotter

feature, labels = tree.create_data_set()

# Disabled experiments with entropy, splitting and best-feature selection:
# en = tree.calcShannomEnt(feature)
# print en
# print feature
# print labels
# feature[0][-1] = "maybe"
# en2 = tree.calcShannomEnt(feature)
# print feature
# print en2
# split = tree.splitDataSet(feature,0, 0)
# print tree.splitDataSet(feature,0, 0)
# print tree.splitDataSet(feature,0, 1)
# bestFeature = tree.chooseBestFeature(feature)
# print bestFeature

# Build the tree and show it.
myTree = tree.create_tree(feature, labels)
print(myTree)
# treePlotter.createPlot()

# The data set is created a second time before classifying, so `labels` is a
# fresh list here.
# NOTE(review): presumably because create_tree alters the list it is given —
# confirm in tree.create_tree.
feature, labels = tree.create_data_set()
pre = tree.classify(myTree, labels, [1, 0])
print(pre)
# -*- coding: utf-8 -*-
import tree
import copy

# Where the trained tree is stored and read back from.
# Raw string on purpose: in a normal Python 3 string literal "\N" begins a
# named-character escape, so "F:\NatureRecognition..." does not even compile.
# The raw form yields exactly the text the Python 2 literal produced
# (backslash followed by "NatureRecognition").
TREE_FILE = r"F:\NatureRecognition/tree.txt"

dataset, label = tree.createDataSet()
print(label)

# A plain `labels = label` is not enough here: both names would refer to the
# same list object, so a copy is handed to createTree instead.
labels = copy.deepcopy(label)
myTree = tree.createTree(dataset, labels)
# print(myTree)
print(label)

# Classify with the original, untouched label list.
testResult = tree.classify(myTree, label, [1, 1])
print(testResult)

# Save the tree, load it back from the same file, and show the loaded copy.
tree.storeTree(myTree, TREE_FILE)
tt = tree.grabTree(TREE_FILE)
print(tt)
def _sample_rows():
    """Return a new list of the five sample rows (two 0/1 values, then 'yes'/'no')."""
    return [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]


def _feature_names():
    """Return a new list holding the two feature names."""
    return ['No Surfacing?', 'Flippers?']


def _sample_tree():
    """Return a new nested-dict tree keyed by feature name, then feature value."""
    return {'No Surfacing?': {0: 'no', 1: {'Flippers?': {0: 'no', 1: 'yes'}}}}


# Each call below receives freshly built arguments, so no call can observe
# changes another call may have made to its inputs.

# Entropy of a data set whose rows all carry the label 'yes'.
print(tree.calcShannonEnt(
    [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'yes'], [0, 1, 'yes'], [0, 1, 'yes']]))

print(tree.splitDataSet(_sample_rows(), 0, 1))
print(tree.chooseBestFeatureToSplit(_sample_rows()))
print(tree.createTree(_sample_rows(), _feature_names()))

# Leaf count, depth and plot for a hand-written tree.
t = _sample_tree()
print(treePlotter.getNumLeafs(t))
print(treePlotter.getTreeDepth(t))
treePlotter.createPlot(t)

# Classify the vector [1, 0] with an equal, separately built tree.
print(tree.classify(_sample_tree(), _feature_names(), [1, 0]))
# labels = ['no surfacing', 'filppers']
# dataset[0][-1] = 'maybe'
# shannonEnt = tree.calcShannonEnt(dataset)
# print shannonEnt
# print tree.splitDataSet(dataset, 0, 0)
# print tree.chooseBestFeature(dataset)
# print tree.createTree(dataset, labels)
# treeplotter.createPlot()
# myTree = treeplotter.retrieveTree(0)
# print myTree
# print treeplotter.getNumLeafs(myTree)
# print treeplotter.getTreeDepth(myTree)
# treeplotter.createPlot(myTree)
# print tree.classify(myTree, labels,[1,1])

# Read the tab-separated lenses data. The `with` block closes the file as soon
# as the lines are in memory (the handle used to stay open).
with open('lenses.txt') as fr:
    lines = fr.readlines()

# strip() removes the line terminator before splitting; without it the last
# field of every row (used below as the expected class) ends in "\n".
lensesAll = [inst.strip().split("\t") for inst in lines]

# The first five rows are held out for checking; the rest train the tree.
lensesTrain = lensesAll[5:]
lensesLables = ['age', 'prescript', 'astigmatic', 'tearRate']
# createTree receives a copy of the label list; the original list is reused
# for classification below.
lensesTree = tree.createTree(lensesTrain, lensesLables[:])
# treeplotter.createPlot(lensesTree)
# lensesTree = tree.grabTree( 'Decision.txt')
# treeplotter.createPlot(lensesTree)

# For each held-out row print the predicted class and the class in the file.
for heldOut in lensesAll[:5]:
    predicted = tree.classify(lensesTree, lensesLables, heldOut[0:-1])
    print("分类为%s, 正确为%s" % (predicted, heldOut[-1]))
import tree
import treeplotter

dataset, labels = tree.createDataSet()
print(dataset)
print(labels)

# Keep a copy of the labels taken before createTree runs; that copy is what
# classify receives further down.
# NOTE(review): presumably needed because createTree alters `labels` —
# confirm in tree.createTree.
label = labels.copy()
#classlist = [example[-1] for example in dataset]

mytree = tree.createTree(dataset, labels)
print(mytree)
#treeplotter.createPlot()
#print(treeplotter.getTreeDepth(mytree))
#createPlot(mytree)
#print(label)

# Classify the vector [1, 0], print the answer, then draw the tree.
answer = tree.classify(mytree, label, [1, 0])
print(answer)
treeplotter.createPlot(mytree)
from tree import training_data, training_labels, testing_data, testing_labels, make_random_forest, make_single_tree, classify
import numpy as np
import random
np.random.seed(1)
random.seed(1)
from collections import Counter

# One tree, and a forest built with 40 as its first argument, from the same
# training data.
tree = make_single_tree(training_data, training_labels)
single_tree_correct = 0
forest = make_random_forest(40, training_data, training_labels)
forest_correct = 0

for i, test_row in enumerate(testing_data):
    # Single tree: count a hit when its prediction equals the true label.
    prediction = classify(test_row, tree)
    if prediction == testing_labels[i]:
        single_tree_correct += 1

    # Forest: every tree votes; the most frequent prediction is the answer.
    predictions = [classify(test_row, forest_tree) for forest_tree in forest]
    forest_prediction = max(predictions, key=predictions.count)
    if forest_prediction == testing_labels[i]:
        forest_correct += 1

# Fraction of test rows each model classified correctly.
print(single_tree_correct / len(testing_data))
print(forest_correct / len(testing_data))
# tree.choose_best_feature_to_split(dataset)
#
# my_tree = tree.create_tree(dataset, labels)
#
# tree_plotter.retrieve_tree(1)
my_tree = tree_plotter.retrieve_tree(0)
#
# tree_plotter.get_num_leafs(my_tree)
#
# tree_plotter.get_tree_depth(my_tree)
tree_plotter.create_plot(my_tree)

data, labels = tree.create_dataset()
tree.classify(my_tree, labels, [1, 0])
tree.classify(my_tree, labels, [1, 1])

# File the classifier is stored in and loaded back from.
CLASSIFIER_STORAGE = (
    "/home/zhangzhiliang/Documents/my_git/DATA-SCIENTIST-/"
    "machine_learing_algorithm/machine_learning_in_action/3_decision_tree/classifierStorage.txt"
)

# Store the tree object itself. The call used to pass the string literal
# 'my_tree', which would store that seven-character string rather than the
# tree.
# NOTE(review): assumes store_tree's first parameter is the tree to save —
# confirm against tree.store_tree.
tree.store_tree(my_tree, CLASSIFIER_STORAGE)
tree.load_tree(CLASSIFIER_STORAGE)

# Contact lenses
# NOTE(review): the source is cut off in the middle of the next statement; the
# fragment is kept below as a comment so that this section still parses.
# fr = open(
import tree as t
import treePlotter as tp
import os

# Locate lenses.txt next to this script. abspath() comes first because
# dirname() of a bare relative __file__ is '', and '' + '/lenses.txt' would
# point at the filesystem root instead of the script's directory.
lensesPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lenses.txt')

# Read the tab-separated rows; the `with` block closes the file afterwards.
with open(lensesPath) as f:
    lenses = [r.strip().split('\t') for r in f]

lensesLabel = ['age', 'prescript', 'astigmatic', 'tearRate']

# createTree gets its own copy of the label list, because the list is reused
# below for the table header and for every classify call.
# NOTE(review): this matters only if createTree alters the list it receives —
# confirm in tree.createTree.
lensesTree = t.createTree(lenses, lensesLabel[:])
tp.createPlot(lensesTree)

# Print a header row, then every data row next to the class the tree predicts
# for it. Each cell is right-aligned in a 10-character column.
fmt = '%10s'
print([fmt % x for x in lensesLabel])
for lense in lenses:
    cells = [fmt % x for x in lense]
    predicted = t.classify(lensesTree, lensesLabel, lense[0:-1])
    print("%s %s" % (cells, predicted))
# Source from Codecademy
from tree import build_tree, print_tree, car_data, car_labels, classify
import random
random.seed(4)

# The features are the price of the car, the cost of maintenance, the number
# of doors, the number of people the car can hold, the size of the trunk, and
# the safety rating.
unlabeled_point = ['high', 'vhigh', '3', 'more', 'med', 'med']

# Build 20 trees, each trained on its own random sample of 1000 row indices
# drawn from 0..999, and record what each tree predicts for the point.
predictions = []
for _ in range(20):
    indices = [random.randint(0, 999) for _ in range(1000)]
    data_subset = [car_data[index] for index in indices]
    labels_subset = [car_labels[index] for index in indices]
    subset_tree = build_tree(data_subset, labels_subset)
    result = classify(unlabeled_point, subset_tree)
    predictions.append(result)
print(predictions)

# The final answer is the prediction that occurs most often.
final_prediction = max(predictions, key=predictions.count)
print(final_prediction)
import treePlotter
import tree

myDat, labels = tree.createDataSet()

# Independent list with the same items as `labels`.
labelsTemp = list(labels)
print('00000000000000000000labels = ', labelsTemp)

# Disabled: build the tree from the data and write it to Tree.txt.
#myTree = tree.createTree(myDat, labelsTemp)
#tree.storeTree(myTree,'Tree.txt')

# Load the tree from Tree.txt and classify the vector [1, 0] with it.
myTreeFromFile = tree.grabTree('Tree.txt')
print('myTreeFromFile = ', myTreeFromFile)
print('labels = ', labels)

result = tree.classify(myTreeFromFile, labels, [1, 0])
print('result = ', result)
# When considering buying a car, what factors go into making that decision?
#
# Each car can fall into four different classes which represent how satisfied
# someone would be with purchasing the car — unacc (unacceptable),
# acc (acceptable), good, vgood.
#
# Each car has 6 features:
# - The price of the car which can be "vhigh", "high", "med", or "low".
# - The cost of maintaining the car which can be "vhigh", "high", "med", or "low".
# - The number of doors which can be "2", "3", "4", "5more".
# - The number of people the car can hold which can be "2", "4", or "more".
# - The size of the trunk which can be "small", "med", or "big".
# - The safety rating of the car which can be "low", "med", or "high".
from tree import tree, classify, data

# One car, with its six feature values in the order listed above.
car = ["low", "low", "4", "4", "big", "high"]

# Ask the imported tree for this car's class and print it.
car_class = classify(car, tree)
print(car_class)
import arff
import tree
import sys

# Command line: <training ARFF file> <test ARFF file> <m>
arg = sys.argv
if len(arg) < 4:
    # Fail with a readable message instead of an IndexError further down.
    sys.exit("usage: %s <train.arff> <test.arff> <m>" % arg[0])
m = int(arg[3])

# Load both data sets; the `with` blocks close the files afterwards.
# NOTE(review): assumes arff.load reads the whole file before returning (its
# result is indexed like a dict right away) — confirm for the arff package in
# use.
with open(arg[1], 'r') as trainFile:
    trainData = arff.load(trainFile)
with open(arg[2], 'r') as testFile:
    testData = arff.load(testFile)

# Build the tree from the training data and plot it.
myTree = tree.createTree(trainData['data'], trainData['attributes'], m)
tree.plotTree(myTree, trainData['attributes'])

# Predict every test instance; the last value of each instance is taken as its
# actual class.
prediction = [tree.classify(myTree, testData['attributes'], obs) for obs in testData['data']]
true = [obs[-1] for obs in testData['data']]

print("<Predictions for the Test Set Instances>")
n = 0  # number of correctly classified instances
for index, (actual, predicted) in enumerate(zip(true, prediction), 1):
    if predicted == actual:
        n += 1
    # Number each line with the 1-based instance position. The line used to
    # show the running correct count `n` here, while the computed position
    # went unused.
    print("{}: Actual: {} Predicted: {}".format(index, actual, predicted))

print("Number of correctly classified: {} Total number of test instances: {}".format(n, len(testData['data'])))