Example 1
# Assumes the decisionTree and plotDecisionTree modules used below are importable,
# and that createDataSet() is defined alongside this demo.
import decisionTree
import plotDecisionTree


def demo():
    print '... demo'
    myDat, featNames = createDataSet()
    print myDat

    shannonEnt = decisionTree.calcShannonEnt(myDat)
    print 'Shannon entropy of the current data set: ', shannonEnt

    print '... testing data set splitting'
    print decisionTree.splitDataSet(myDat, 0, 1)

    print '... testing best-feature selection'
    bestFeature = decisionTree.chooseBestFeatureToSplit(myDat)
    print 'The best feature to split on is %s' % bestFeature

    print '... testing decision tree construction'
    myTree = decisionTree.createTree(myDat, featNames)
    print 'The resulting decision tree is: \n', myTree

    print '... testing majorityCnt'
    classList = ['a', 'b', 'b', 'c', 'e']
    print decisionTree.majorityCnt(classList)

    # print '... testing tree-node plotting'
    # plotDecisionTree.createPlot()

    print '... testing decision tree plotting'
    myTree = plotDecisionTree.retrieveTree(0)
    leafNums = decisionTree.getNumLeafs(myTree)
    treeDepth = decisionTree.getTreeDepth(myTree)
    print myTree
    print 'number of leaves: %d, tree depth: %d' % (leafNums, treeDepth)

    # plotDecisionTree.createPlot(myTree)
    print '... prediction'
    featNames = ['no surfacing', 'flippers', 'fish']
    print decisionTree.classify(myTree, featNames, [1, 1])

    print '... testing decision tree storage and retrieval'
    decisionTree.storeTree(myTree, 'classfierStorage.txt')

    print decisionTree.grabTree('classfierStorage.txt')

    print '... testing storage and retrieval of a tree with Chinese labels'
    cnTree = plotDecisionTree.retrieveTree(2)
    print 'Decision tree before storing: \n', cnTree

    # With Chinese labels, printing the dict directly only shows the raw utf-8 escapes,
    # but printing the dictionary recursively (key by key) displays the Chinese text.
    decisionTree.storeTree(cnTree, 'cnTree.txt')
    print decisionTree.grabTree('cnTree.txt')
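The demo above leans on decisionTree.calcShannonEnt to measure how mixed the class labels in a data set are. Below is a minimal, hypothetical sketch of such a Shannon-entropy helper (calcShannonEntSketch is an illustrative name, not the repo's implementation), assuming the convention used throughout these examples that the class label is the last element of each row:

from math import log


def calcShannonEntSketch(dataSet):
    # Illustrative sketch only; the real helper lives in the decisionTree module.
    # Count how often each class label (last column) appears.
    labelCounts = {}
    for row in dataSet:
        label = row[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    # H = -sum(p * log2(p)) over all class labels.
    entropy = 0.0
    total = float(len(dataSet))
    for count in labelCounts.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy

For a five-sample set with two 'yes' and three 'no' labels, like the toy fish data used later in these examples, this works out to -0.4*log2(0.4) - 0.6*log2(0.6) ≈ 0.971.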
Example 2
import numpy as np


def test(inputTree, labels, data, prt=True):
    '''
    Evaluate the classifier's performance.
    :param inputTree: the trained decision tree (nested dict)
    :param labels: feature names, in column order
    :param data: 2-D list; the last element of each row is the true class label
    :param prt: print the accuracy and confusion matrix when True
    :return: accuracy, confusion matrix
    '''
    count = 0    # correctly classified samples
    escape = 0   # samples the tree could not classify (no matching branch)
    ac = 0       # samples whose actual class is ' <=50K'
    acpc = 0     # actual ' <=50K' and predicted ' <=50K'
    au = 0       # samples whose actual class is ' >50K'
    aupu = 0     # actual ' >50K' and predicted ' >50K'
    for i in range(len(data)):
        dataVec = list(data[i])
        predict = classify(inputTree, labels, dataVec)
        if predict is None:
            escape += 1
            continue
        if dataVec[-1] == ' >50K':
            au += 1
            if predict == ' >50K':
                aupu += 1
        else:
            ac += 1
            if predict == ' <=50K':
                acpc += 1
        if predict == dataVec[-1]:
            count += 1
    # unclassified (escaped) samples are counted as correct here
    accuracy = (count + escape) / float(len(data))
    print('escape: {}'.format(escape))
    # rows = actual [<=50K, >50K], columns = predicted [<=50K, >50K];
    # escaped samples are not included in the matrix
    confuse_matrix = np.array([[acpc, ac - acpc], [au - aupu, aupu]])
    if prt:
        print('accuracy: {}'.format(accuracy))
        print(confuse_matrix)
    return accuracy, confuse_matrix
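Since the confusion matrix returned above has actual classes on the rows and predicted classes on the columns, per-class precision and recall can be read straight off it. A hypothetical usage sketch; tree, labels and testData are placeholders for whatever the surrounding training code produces:

# tree, labels and testData are placeholders standing in for the real
# classifier, feature names and held-out data built elsewhere.
accuracy, cm = test(tree, labels, testData, prt=False)

# cm layout: rows = actual [<=50K, >50K], columns = predicted [<=50K, >50K]
tp = cm[1][1]   # actual >50K, predicted >50K
fp = cm[0][1]   # actual <=50K, predicted >50K
fn = cm[1][0]   # actual >50K, predicted <=50K
precision = tp / float(tp + fp) if (tp + fp) else 0.0
recall = tp / float(tp + fn) if (tp + fn) else 0.0
print('precision: {:.3f}, recall: {:.3f}'.format(precision, recall))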
Example 3
def main():
    dataSetX, dataSetY = readLensesData()
    featureList = ['age', 'prescript', 'astigmatic', 'tearRate']
    print(featureList)
    threshold = 0
    # create decisionTree
    tree = decisionTree.createTree(dataSetX, dataSetY, featureList, threshold)

    print(tree)

    # save (pickle requires binary file mode)
    import pickle
    with open('tree.file', 'wb') as fo:
        pickle.dump(tree, fo)

    # load and use
    inX = ['young', 'myope', 'yes', 'normal']
    with open('tree.file', 'rb') as fi:
        tree = pickle.load(fi)
    inY = decisionTree.classify(tree, featureList, inX)
    print(inX)
    print(inY)
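Example 1 goes through decisionTree.storeTree / grabTree, while this example inlines pickle directly. A minimal, hypothetical sketch of what such wrappers typically look like (illustrative helpers, not the repo's; note the binary file modes, which pickle requires on Python 3):

import pickle


def storeTreeSketch(tree, filename):
    # Serialize the nested-dict tree to disk.
    with open(filename, 'wb') as fo:
        pickle.dump(tree, fo)


def grabTreeSketch(filename):
    # Load a previously stored tree.
    with open(filename, 'rb') as fi:
        return pickle.load(fi)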
Example 4
def randomForestClassify(myForest, featLables, testVec):
    # classify() and searchTree() are expected to be defined alongside this function.
    classCount = {}
    for tree in myForest:
        firstStr = list(tree.keys())[0]  # the feature tested at this tree's root
        secondDict = tree[firstStr]
        # locate this feature's column in the original feature list
        featIndex = featLables.index(firstStr)
        classLabel = None
        for key in secondDict.keys():
            if testVec[featIndex] == key:
                if type(secondDict[key]).__name__ == 'dict':
                    classLabel = classify(secondDict[key], featLables, testVec)
                else:
                    classLabel = secondDict[key]
        if classLabel is None:  # no matching branch: fall back to this tree's majority class
            classList = {}
            searchTree(tree, classList)
            classLabel = sorted(classList.items(),
                                key=lambda item: item[1],
                                reverse=True)[0][0]
        classCount.setdefault(classLabel, 0)
        classCount[classLabel] += 1
    # majority vote across the forest
    return sorted(classCount.items(), key=lambda item: item[1],
                  reverse=True)[0][0]
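The fallback above relies on a searchTree helper that tallies the class label at every leaf under a node. A hypothetical sketch of such a helper, assuming the same nested-dict representation ({featureName: {featureValue: subtree or label}}):

def searchTreeSketch(node, classList):
    # Illustrative stand-in for the module's searchTree: walk the nested dict
    # and count every leaf label into classList (filled in place).
    for branch in node.values():
        if isinstance(branch, dict):
            searchTreeSketch(branch, classList)
        else:
            classList[branch] = classList.get(branch, 0) + 1

The fallback then sorts this tally and takes the most frequent leaf label, i.e. the tree's majority class.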
Example 5
#!/usr/bin/env python

import decisionTree as dt


def createDataset():
    dataset = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
    labels = ['yes', 'yes', 'no', 'no', 'no']
    return (dataset, labels)


if __name__ == '__main__':
    (dataset, labels) = createDataset()
    labelName = ["no surface", "flipper"]

    tree = dt.createTree(dataset, labels, labelName)
    print(tree)

    labelName = ["no surface", "flipper"]
    label = dt.classify(tree, labelName, [0, 0])
    print(label)
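dt.classify walks the nested-dict tree one tested feature at a time until it reaches a leaf. A hypothetical sketch of that traversal (classifySketch is illustrative, not the repo's dt.classify):

def classifySketch(tree, featNames, testVec):
    # The single key at this level names the feature being tested.
    featName = list(tree.keys())[0]
    branches = tree[featName]
    featIndex = featNames.index(featName)
    value = testVec[featIndex]
    if value not in branches:
        return None  # no branch for this value; the caller decides how to handle it
    subtree = branches[value]
    if isinstance(subtree, dict):
        return classifySketch(subtree, featNames, testVec)
    return subtree  # leaf: the predicted class label

With the tree built from the toy data above, classifySketch(tree, ["no surface", "flipper"], [0, 0]) should come back 'no', matching the dt.classify call.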