def prune(tree, testData):
    """Post-prune a regression tree against held-out data.

    Subtrees are pruned recursively first; afterwards, if both children of
    this node are leaves, they are merged into their average whenever that
    strictly lowers the squared error measured on ``testData``.

    :param tree: node mapping; the keys 'left', 'right', 'spInd' and 'spVal'
        are read here. The mapping is mutated in place: pruned children are
        written back into 'left' / 'right'.
    :param testData: 2-D data indexed as ``testData[:, -1]``; the last column
        is compared against leaf values (presumably a numpy matrix -- confirm
        against callers).
    :return: ``getMean(tree)`` when ``testData`` has no rows, the averaged
        leaf value when merging lowers the error, otherwise ``tree`` itself.
    """
    # No test rows reach this node: collapse the subtree via getMean()
    # (the tree is NOT returned unchanged in this case).
    if shape(testData)[0] == 0:
        return getMean(tree)

    # At least one child is still a subtree: route the test rows using this
    # node's split feature index and threshold so each side can be pruned.
    if isTree(tree['right']) or isTree(tree['left']):
        lSet, rSet = regTrees.binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Both children are leaves now: compare the error with and without merging.
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = regTrees.binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        # Squared error when the two leaves are kept separate.
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
            sum(power(rSet[:, -1] - tree['right'], 2))
        # Candidate merged leaf: plain average of the two leaf values.
        treeMean = (tree['left'] + tree['right']) / 2.0
        # Squared error when every row is predicted by the merged leaf.
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            # Function-call form works on both Python 2 and Python 3.
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree
def test_reg_trees(self):
    """Print a 4x4 identity matrix and the two parts binSplitDataSet returns.

    The split is requested on feature index 1 with threshold 0.5. Nothing is
    asserted; the results are only printed.
    """
    # regTrees.loadDataSet()
    # Identity matrix used as a tiny, easy-to-read data set.
    identity = mat(eye(4))
    print("\n testMat == %s" % (identity))

    # Split the rows on feature 1 at threshold 0.5 into two sub-matrices.
    first_part, second_part = regTrees.binSplitDataSet(identity, 1, 0.5)
    print("\n mat0 == %s" % (first_part))
    print("\n mat1 == %s" % (second_part))
def prune(tree, testData):
    """Post-prune ``tree`` using ``testData``.

    The function first checks whether any test rows are left. If so, the
    rows are routed down the tree and every child that is still a subtree is
    pruned recursively. Once both children are leaves, the squared error of
    keeping them separate is compared with the error of replacing them by
    their average; the merge is taken only when it is strictly better.

    :param tree: the tree to prune (children are replaced in place)
    :param testData: test data used to evaluate the pruning
    :return: ``getMean(tree)`` for empty test data, the merged leaf value
        when merging lowers the error, otherwise ``tree``
    """
    if shape(testData)[0] == 0:
        return getMean(tree)

    # Split the test rows only if some branch is still a subtree.
    if any(isTree(tree[side]) for side in ('right', 'left')):
        left_rows, right_rows = regTrees.binSplitDataSet(
            testData, tree['spInd'], tree['spVal'])
    # Recurse into the left branch with the left share of the rows.
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], left_rows)
    # Recurse into the right branch with the right share of the rows.
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], right_rows)

    # Merging is only considered when both children have become leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    left_rows, right_rows = regTrees.binSplitDataSet(
        testData, tree['spInd'], tree['spVal'])
    # power(x, 2) squares the residuals; this is the error without merging.
    left_error = sum(power(left_rows[:, -1] - tree['left'], 2))
    right_error = sum(power(right_rows[:, -1] - tree['right'], 2))
    unmerged_error = left_error + right_error
    # Error when both leaves are replaced by their average.
    merged_leaf = (tree['left'] + tree['right']) / 2.0
    merged_error = sum(power(testData[:, -1] - merged_leaf, 2))

    if merged_error < unmerged_error:
        print("merging")
        return merged_leaf
    return tree
def prune(tree, testData):
    """Post-prune a regression tree against held-out data.

    Child subtrees are pruned recursively; when both children end up as
    leaves they are replaced by their average if that strictly lowers the
    squared error on ``testData``.

    :param tree: node mapping; keys 'left', 'right', 'spInd' and 'spVal' are
        read, and pruned children are written back in place.
    :param testData: 2-D data whose last column (``testData[:, -1]``) is
        compared against leaf values.
    :return: ``getMean(tree)`` when ``testData`` has no rows, the averaged
        leaf when merging helps, otherwise ``tree``.
    """
    # If we have no test data, collapse the tree.
    if shape(testData)[0] == 0:
        return getMean(tree)

    # The split is needed on every remaining path (for recursion, for the
    # merge test, or both), so compute it once instead of twice. This node's
    # 'spInd' / 'spVal' and testData are not modified by the recursion below.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])

    # Prune whichever branches are still subtrees.
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # A remaining subtree on either side means there is nothing to merge here.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    # Both children are leaves: compare squared error with and without merging.
    errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
        sum(power(rSet[:, -1] - tree['right'], 2))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree
def test1():
    """Walk through the indexing steps behind a binary split, printing each.

    Uses a 4x4 identity matrix and column 2 with threshold 0.5, then calls
    regTrees.binSplitDataSet on feature 1 for comparison.
    """
    identity = mat(eye(4))
    print(identity)

    column = identity[:, 2]
    print(column)

    # Element-wise comparison: an m-by-1 matrix of True/False values.
    below = column < 0.5
    print(below)

    # nonzero() returns one index array per dimension (row indices, column
    # indices) for the entries that are True.
    hits = nonzero(identity[:, 2] < 0.5)
    print(hits)
    row_idx = nonzero(identity[:, 2] < 0.5)[0]
    print(row_idx)

    # Rows selected by the mask; may be empty (e.g. with "< -0.5").
    selected = identity[nonzero(identity[:, 2] < 0.5)[0], :]
    print(selected)
    # Taking [0] fails with an index error when the selection above is empty.
    # Compare: dataSet[nonzero(dataSet[:, feature] > value)[0], :][0]
    print(identity[nonzero(identity[:, 2] < 0.5)[0], :][0])

    print('--')
    greater, not_greater = regTrees.binSplitDataSet(identity, 1, 0.5)
    # Original author's note: first row with feature 1 > 0.5 -- "needs
    # changing???" (left as an open question; confirm against regTrees).
    print(greater)
    # Original author's note: first row with feature 1 <= 0.5.
    print(not_greater)
#!/usr/bin/env python3
#-*- coding:UTF-8 -*-
# Demo driver for the regTrees module: splits a toy matrix, then builds
# trees from several data files and prints them.
import regTrees
from numpy import *

# --- binary split of a 4x4 identity matrix ---
testMat = mat(eye(4))
print(testMat)
# Arguments: feature index 1, threshold 0.5.
mat0, mat1 = regTrees.binSplitDataSet(testMat, 1, 0.5)
print("mat0=\n", mat0)
print("mat1=\n", mat1)

# --- tree built from ex00.txt ---
myDat = regTrees.loadDataSet('ex00.txt')
# print(myDat)
myMat = mat(myDat)
print("myMat size:", shape(myMat))
# print(myMat)
print(regTrees.createTree(myMat))

# --- tree built from ex0.txt ---
print("ex0.txt")
myDat1 = regTrees.loadDataSet('ex0.txt')
myMat1 = mat(myDat1)
print(shape(myMat1))  # original author's note: 200 3
print(regTrees.createTree(myMat1))  # tree construction finished

# --- ex2.txt: default options, then ops=(0, 1) ---
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
print(regTrees.createTree(myMat2))
myTree = regTrees.createTree(myMat2, ops=(0, 1))

# Test rows loaded for later use; nothing below this line consumes them.
myDatTest = regTrees.loadDataSet('ex2test.txt')
# Demo driver for regTrees: split a toy matrix, build trees from the sample
# data files, post-prune one of them and build a model tree.
# print() is called with a single argument throughout so the script produces
# the same output on Python 2 and Python 3.
import regTrees
from numpy import *

# --- binary split of a 4x4 identity matrix (feature 1, threshold 0.5) ---
testMat = mat(eye(4))
print(testMat)
mat0, mat1 = regTrees.binSplitDataSet(testMat, 1, 0.5)
print(mat0)
print(mat1)

# --- tree from ex00.txt ---
myDat = regTrees.loadDataSet('ex00.txt')
myMat = mat(myDat)
print(regTrees.createTree(myMat))

# --- tree from ex0.txt ---
myDat1 = regTrees.loadDataSet('ex0.txt')
myMat1 = mat(myDat1)
print(regTrees.createTree(myMat1))
# print(regTrees.createTree(myMat, ops=(0, 1)))

# --- ex2.txt with explicit ops settings ---
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
# print(regTrees.createTree(myMat2))
print(regTrees.createTree(myMat2, ops=(10000, 4)))

# --- build a tree with ops=(0, 1), then prune it with the test file ---
myTree = regTrees.createTree(myMat2, ops=(0, 1))
myDatTest = regTrees.loadDataSet('ex2test.txt')
myMat2Test = mat(myDatTest)
print(regTrees.prune(myTree, myMat2Test))

# --- tree on exp2.txt using modelLeaf / modelErr (myMat2 is rebound here) ---
myMat2 = mat(regTrees.loadDataSet('exp2.txt'))
print(regTrees.createTree(myMat2, regTrees.modelLeaf, regTrees.modelErr, (1, 10)))
# Minimal check of regTrees.binSplitDataSet on a 4x4 identity matrix.
import regTrees
# mat() and eye() were used below without any import, which raises NameError.
from numpy import eye, mat

testMat = mat(eye(4))
# Split on feature index 1 with threshold 0.5 and print the returned value.
# Single-argument print() behaves the same on Python 2 and Python 3.
print(regTrees.binSplitDataSet(testMat, 1, 0.5))
# Walk-through of the regTrees API: binary split, tree construction on the
# sample files, post-pruning, and a tree built with modelLeaf / modelErr.
# Each print() receives exactly one argument, so the output is identical on
# Python 2 and Python 3 (the former print statements fail to parse on 3).
import regTrees
from numpy import *

# Binary split of the identity matrix on feature 1 at threshold 0.5.
testMat = mat(eye(4))
print(testMat)
mat0, mat1 = regTrees.binSplitDataSet(testMat, 1, 0.5)
print(mat0)
print(mat1)

# Tree from ex00.txt with createTree's default options.
myDat = regTrees.loadDataSet('ex00.txt')
myMat = mat(myDat)
print(regTrees.createTree(myMat))

# Tree from ex0.txt with createTree's default options.
myDat1 = regTrees.loadDataSet('ex0.txt')
myMat1 = mat(myDat1)
print(regTrees.createTree(myMat1))
# print(regTrees.createTree(myMat, ops=(0, 1)))

# Tree from ex2.txt with ops=(10000, 4).
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
# print(regTrees.createTree(myMat2))
print(regTrees.createTree(myMat2, ops=(10000, 4)))

# Tree built with ops=(0, 1), then pruned against ex2test.txt.
myTree = regTrees.createTree(myMat2, ops=(0, 1))
myDatTest = regTrees.loadDataSet('ex2test.txt')
myMat2Test = mat(myDatTest)
print(regTrees.prune(myTree, myMat2Test))

# Tree on exp2.txt with modelLeaf / modelErr; note that myMat2 is reused.
myMat2 = mat(regTrees.loadDataSet('exp2.txt'))
print(regTrees.createTree(myMat2, regTrees.modelLeaf, regTrees.modelErr, (1, 10)))
# Smoke test: print what regTrees.binSplitDataSet returns for an identity matrix.
import regTrees
# The original used mat() and eye() with no numpy import (NameError at runtime).
from numpy import eye, mat

testMat = mat(eye(4))
# Feature index 1, threshold 0.5. The call form of print works on Python 2
# and Python 3 alike because only one argument is passed.
print(regTrees.binSplitDataSet(testMat, 1, 0.5))
# Program name: Chapter 9 -- tree regression
# Description: tree regression comes in two kinds depending on the leaf model:
#   1. constant leaves (regression tree); 2. linear-model leaves (model tree)
# Features: 1. build a tree (the model) from training data; 2. use the tree to
#   predict a test data set;
#   3. compute the correlation coefficient between predictions and true values
#      to compare the two algorithms (model tree beats regression tree);
#   4. show true data and predictions in a GUI to make parameter tuning easier
# Timeline: started 2 July 2018; annotation of everything except the GUI was
#   finished on 4 July at 15:20.
# treeExplore.py: graphical user interface (GUI)
import numpy
import regTrees

# Every print() below receives one pre-formatted string, so the script parses
# on Python 3 and prints the same text the Python 2 print statements did.

testMat = numpy.mat(numpy.eye(4))
testMat[2, 1] = 5
print(testMat)
rMat0, lMat0 = regTrees.binSplitDataSet(testMat, 1, 0.5)
# NOTE(review): the labels assume the first returned matrix holds the rows
# with feature 1 <= 0.5 -- confirm against regTrees.binSplitDataSet.
print("%s%s" % ("\n特征1 小于等于0.5的数据是:\n", rMat0))
print("%s%s" % ("\n特征1 大于0.5 部分的数据是:\n", lMat0))

################### Tree regression with pre-pruning, p.164
myDat1 = regTrees.loadDataSet('ex00.txt')
myMat1 = numpy.mat(myDat1)
# Built once; the original repeated this identical call after printing.
trees1 = regTrees.createTree(myMat1, ops=(1, 4))
print("%s %s" % ("预剪枝方法生成树回归是:", trees1))

################## Tree regression with post-pruning, p.169
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = numpy.mat(myDat2)
trees2 = regTrees.createTree(myMat2, ops=(10, 10))
print("%s%s" % ("\n后剪枝之前的树trees2是:\n", trees2))