def buildTree(
    dataset,
    attrs,
    treeNode=None,
):
    """Recursively grow a decision (sub)tree for ``dataset`` and return its root.

    Args:
        dataset: Object exposing ``getrows()`` and ``gettitle()``; the last
            element of every row is read as that row's class label.
        attrs: Collection of attribute indices still available for splitting.
            It is left unmodified; every level works on its own reduced copy.
        treeNode: Node to populate. A fresh ``TreeNode()`` is created when
            omitted.

    Returns:
        The populated node (``treeNode`` itself when one was supplied).
    """
    if treeNode is None:
        treeNode = TreeNode()

    # Nothing left to split on: label the leaf with whatever findmaxlabel
    # picks (presumably the most frequent label -- confirm against helper).
    if not attrs:
        treeNode.setlabel(findmaxlabel(dataset))
        return treeNode

    # Gather class labels, stopping as soon as a second distinct one appears.
    labels = set()
    for row in dataset.getrows():
        labels.add(row[-1])
        if len(labels) > 1:
            break
    if len(labels) <= 1:
        # Pure node. A dataset without rows has no label to read, so it is
        # routed to findmaxlabel like the other leaf fallbacks.
        # NOTE(review): confirm findmaxlabel copes with an empty dataset.
        label = next(iter(labels)) if labels else findmaxlabel(dataset)
        treeNode.setlabel(label)
        return treeNode

    # Pick the attribute with the largest strictly positive gain, where gain
    # is calentropy1(dataset) minus calentropy2(attr, dataset).
    base_entropy = calentropy1(dataset)
    best_gain = 0.0
    attr_id = None
    for attr in attrs:
        gain = base_entropy - calentropy2(attr, dataset)
        if gain > best_gain:
            best_gain = gain
            attr_id = attr

    # No attribute improves on the baseline: fall back to a findmaxlabel leaf.
    if attr_id is None:
        treeNode.setlabel(findmaxlabel(dataset))
        return treeNode

    # Record the chosen split on this node.
    treeNode.setsplistattrindex(attr_id)
    treeNode.setsplitattr(dataset.gettitle()[attr_id])

    # Partition the rows by the chosen attribute, then drop the local
    # reference to the parent dataset before recursing.
    subdatasets = splitdataset(dataset, attr_id)
    del dataset

    # Work on a private copy so sibling subtrees do not lose attributes that
    # were consumed deeper inside another branch, and the caller's collection
    # is not altered.
    remaining = set(attrs)
    remaining.discard(attr_id)

    # Build the branches largest-first. sorted() is stable, so equally sized
    # branches keep the mapping's iteration order, and every key is visited
    # exactly once even when a sub-dataset reports zero rows.
    largest_first = sorted(
        subdatasets.items(),
        key=lambda item: item[1].getrowcount(),
        reverse=True,
    )
    for branch_key, subset in largest_first:
        # The current node is handed to the TreeNode constructor
        # (presumably recorded as the child's parent -- confirm).
        child = TreeNode(treeNode)
        treeNode.children[branch_key] = child
        buildTree(subset, remaining, child)

    return treeNode