Exemplo n.º 1
0
def buildTree(
    dataset,
    attrs,
    treeNode=None,
):
    """Recursively grow a decision (sub)tree for ``dataset`` and return its root.

    The node becomes a leaf (via ``setlabel``) when no candidate attributes
    remain, when every row carries the same value in its last column, or when
    no attribute yields a strictly positive gain.  Otherwise the node is split
    on the attribute with the largest gain and one child is built for every
    key returned by ``splitdataset``.

    Args:
        dataset: Object providing ``getrows()`` and ``gettitle()``; the last
            element of each row is used as the label.
        attrs: Collection of candidate attribute indices (used to index
            ``dataset.gettitle()``).  It is NOT modified; each node works on
            its own derived set so sibling branches do not affect each other.
        treeNode: Node to populate.  A fresh ``TreeNode()`` is created when
            omitted.

    Returns:
        The populated ``treeNode``.
    """
    if treeNode is None:
        treeNode = TreeNode()

    # No attribute left to split on: label the node with findmaxlabel's pick.
    if not attrs:
        treeNode.setlabel(findmaxlabel(dataset))
        return treeNode

    # Purity check: stop scanning as soon as a second distinct label appears.
    labels = set()
    for row in dataset.getrows():
        labels.add(row[-1])
        if len(labels) > 1:
            break
    if len(labels) <= 1:
        treeNode.setlabel(list(labels)[0])
        return treeNode

    # Pick the attribute with the largest strictly positive gain.
    base_entropy = calentropy1(dataset)
    best_gain = 0.0
    best_attr = None
    for attr in attrs:
        gain = base_entropy - calentropy2(attr, dataset)
        if gain > best_gain:
            best_gain = gain
            best_attr = attr

    # No attribute changes the entropy: fall back to findmaxlabel's pick.
    if best_attr is None:
        treeNode.setlabel(findmaxlabel(dataset))
        return treeNode

    # Record the chosen split on this node.
    treeNode.setsplistattrindex(best_attr)
    treeNode.setsplitattr(dataset.gettitle()[best_attr])

    # Partition the rows by the chosen attribute.
    subdatasets = splitdataset(dataset, best_attr)

    # Drop this frame's reference to the parent dataset before recursing.
    del dataset

    # Attributes still available below this node.  Built as a new set so the
    # caller's collection (and therefore every sibling branch) is untouched.
    remaining = set(attrs)
    remaining.discard(best_attr)

    # Build children from the largest partition to the smallest.  sorted() is
    # stable, so equally sized partitions keep their original key order.
    ordered_keys = sorted(
        subdatasets.keys(),
        key=lambda k: subdatasets[k].getrowcount(),
        reverse=True,
    )
    for key in ordered_keys:
        child = TreeNode(treeNode)
        treeNode.children[key] = child
        buildTree(subdatasets[key], remaining, child)
    return treeNode