def build_tree(part, scoref=entropy, beta=0):
    """Recursively build a binary decision tree from ``part``.

    Every value found in the candidate columns (all columns but the last,
    with the column count taken from the first row) is tried as a split
    point through ``divideset``.  The split with the largest gain

        scoref(part) - pr * scoref(set1) - pl * scoref(set2)

    is kept, where ``pr``/``pl`` are the fractions of rows that landed in
    ``set1``/``set2``.

    Args:
        part: Sequence of rows.  The last column of each row is never used
            as a split candidate (presumably it holds the label -- confirm
            against ``unique_counts``).
        scoref: Impurity function applied to a collection of rows.
        beta: A split is only made when the best gain is strictly greater
            than this threshold.

    Returns:
        A ``decisionnode``.  An empty ``part`` yields ``decisionnode()``.
        When a split is made the node carries the column index, the split
        value (as ``float`` when convertible), the two sub-trees (``tb``
        built from the first set, ``fb`` from the second) and the gain.
        Otherwise it is a leaf built from ``unique_counts(part)``.
    """
    if len(part) == 0:
        return decisionnode()

    best_gain = 0
    best_criteria = None
    best_sets = None

    # Loop invariants: neither the row count nor the score of the full
    # partition depends on the candidate split being evaluated.
    total = float(len(part))
    parent_score = scoref(part)
    columns = len(part[0]) - 1

    for elem in part:
        for i in range(columns):
            # Prefer a numeric split value; fall back to the raw value when
            # it cannot be parsed as a float.
            try:
                set1, set2 = divideset(part, i, float(elem[i]))
            except ValueError:
                set1, set2 = divideset(part, i, elem[i])

            pr = len(set1) / total
            pl = len(set2) / total
            gain = parent_score - pr * scoref(set1) - pl * scoref(set2)

            # Strict comparison: on ties the first candidate seen wins.
            if gain > best_gain:
                best_gain = gain
                best_criteria = (i, elem[i])
                best_sets = (set1, set2)

    # best_sets is None when no candidate produced a positive gain; without
    # this check a negative beta would index into None.
    if best_gain > beta and best_sets is not None:
        tree_r = build_tree(best_sets[0], scoref, beta)
        tree_l = build_tree(best_sets[1], scoref, beta)
        try:
            return decisionnode(best_criteria[0], float(best_criteria[1]),
                                tb=tree_r, fb=tree_l, gain=best_gain)
        except ValueError:
            return decisionnode(best_criteria[0], best_criteria[1],
                                tb=tree_r, fb=tree_l, gain=best_gain)

    return decisionnode(results=unique_counts(part), gain=best_gain)
def buildtree_iter(part, scoref=entropy, beta=0):
    """Build a binary decision tree from ``part`` without recursion.

    Partitions are processed breadth-first from a work queue.  For each
    partition every value in the candidate columns (all but the last column
    of each row) is tried as a split point through ``divideset`` and the
    split with the largest gain

        scoref(data_set) - pr * scoref(set1) - pl * scoref(set2)

    is kept.  If that gain is strictly greater than ``beta`` an internal
    node is created and both halves are queued; otherwise a leaf is created
    from ``unique_counts(data_set)``.

    Each new node is attached to its parent (``tb`` for the first set of
    the parent's split, ``fb`` for the second) as soon as it is created, so
    no separate linking pass over all nodes is needed.

    Args:
        part: Sequence of rows.
        scoref: Impurity function applied to a collection of rows.
        beta: Minimum gain (exclusive) required to split a partition.

    Returns:
        The root ``decisionnode``; ``decisionnode()`` when ``part`` is
        empty.
    """
    from collections import deque

    if len(part) == 0:
        return decisionnode()

    root = None
    # Queue entries are (rows, parent node, is_true_branch).
    pending = deque([(part, None, None)])

    while pending:
        data_set, father, side = pending.popleft()

        best_criteria = None
        best_sets = None
        best_gain = 0

        # Loop invariants for this partition.  The score is only computed
        # for non-empty partitions, which are the only ones that have
        # candidate splits to evaluate.
        total = float(len(data_set))
        parent_score = scoref(data_set) if total else 0.0

        for record in data_set:
            for column in range(len(record) - 1):
                # Prefer a numeric split value; fall back to the raw value
                # when it cannot be parsed as a float.
                try:
                    set1, set2 = divideset(data_set, column,
                                           float(record[column]))
                except ValueError:
                    set1, set2 = divideset(data_set, column, record[column])

                pr = len(set1) / total
                pl = len(set2) / total
                current_gain = (parent_score
                                - pr * scoref(set1)
                                - pl * scoref(set2))

                # Strict comparison: on ties the first candidate wins.
                if best_gain < current_gain:
                    best_gain = current_gain
                    best_sets = (set1, set2)
                    best_criteria = (column, record[column])

        # best_sets is None when no candidate produced a positive gain;
        # without this check a negative beta would index into None.
        if best_gain > beta and best_sets is not None:
            try:
                node = decisionnode(col=best_criteria[0],
                                    value=float(best_criteria[1]),
                                    gain=best_gain)
            except ValueError:
                node = decisionnode(col=best_criteria[0],
                                    value=best_criteria[1],
                                    gain=best_gain)
            pending.append((best_sets[0], node, True))
            pending.append((best_sets[1], node, False))
        else:
            node = decisionnode(results=unique_counts(data_set),
                                gain=best_gain)

        # Attach to the parent by identity; the first node processed has no
        # parent and is the root.
        if father is None:
            root = node
        elif side:
            father.tb = node
        else:
            father.fb = node

    return root
def buildtree(rows, scoref=entropy):
    """Recursively build a binary decision tree from ``rows``.

    For every candidate column (all but the last, with the column count
    taken from the first row) each distinct value is tried as a split point
    through ``divideset``.  The split with the highest information gain

        scoref(rows) - p * scoref(set1) - (1 - p) * scoref(set2)

    is kept, where ``p`` is the fraction of rows in ``set1``.  Splits that
    leave either side empty are never selected.

    Args:
        rows: Sequence of rows.
        scoref: Impurity function applied to a collection of rows.  The same
            function is used for every level of the tree.

    Returns:
        A ``decisionnode``: ``decisionnode()`` for empty input, an internal
        node with ``col``/``value``/``tb``/``fb`` when a split with positive
        gain exists, otherwise a leaf built from ``uniquecounts(rows)``.
    """
    if len(rows) == 0:
        return decisionnode()
    current_score = scoref(rows)

    # Track the best split found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Distinct values of this column, in first-seen order.
        column_values = dict.fromkeys(row[col] for row in rows)

        # Try dividing the rows on each distinct value.
        for value in column_values:
            set1, set2 = divideset(rows, col, value)

            # Information gain of this split.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the sub-branches.
    if best_gain > 0:
        # Pass scoref down so sub-trees use the caller's scoring function
        # instead of silently reverting to the default.
        trueBranch = buildtree(best_sets[0], scoref)
        falseBranch = buildtree(best_sets[1], scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))
def buildtree(rows, scoref=entropy):
    """Recursively build a binary decision tree from ``rows``.

    Each distinct value of each candidate column (every column except the
    last; the column count comes from the first row) is tried as a split
    point via ``divideset``.  The winning split is the one with the largest
    information gain

        scoref(rows) - p * scoref(set1) - (1 - p) * scoref(set2)

    with ``p`` the fraction of rows placed in ``set1``.  A split is only
    eligible when both resulting sets are non-empty.

    Args:
        rows: Sequence of rows.
        scoref: Impurity function applied to a collection of rows; it is
            propagated unchanged to every recursive call.

    Returns:
        ``decisionnode()`` for empty input; an internal ``decisionnode``
        (``col``, ``value``, ``tb``, ``fb``) when some split has positive
        gain; otherwise a leaf ``decisionnode`` whose ``results`` come from
        ``uniquecounts(rows)``.
    """
    if len(rows) == 0:
        return decisionnode()

    current_score = scoref(rows)
    row_count = len(rows)

    # Best split seen so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    for col in range(len(rows[0]) - 1):
        # Distinct values of this column, kept in first-seen order.
        distinct_values = dict.fromkeys(row[col] for row in rows)

        for value in distinct_values:
            set1, set2 = divideset(rows, col, value)

            # Information gain of splitting on (col, value).
            p = float(len(set1)) / row_count
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)

            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    if best_gain > 0:
        # Forward scoref: without it the sub-trees would be scored with the
        # default function regardless of what the caller asked for.
        trueBranch = buildtree(best_sets[0], scoref)
        falseBranch = buildtree(best_sets[1], scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)

    # No useful split: this partition becomes a leaf.
    return decisionnode(results=uniquecounts(rows))
def buildrandomtree(rows, kcandidates, nmin, pickcandidate=pick_candidate_gini):
    """Recursively build a randomized binary decision tree.

    At each node ``kcandidates`` candidate splits are drawn: a feature
    column is picked uniformly at random (the last column is excluded) and
    ``get_cutting_point`` supplies a value from that column's distinct
    values.  ``pickcandidate(candidates, rows)`` chooses one candidate and
    the rows are split on it with ``divideset``.  Each side becomes a leaf
    holding ``get_voting_result`` of its rows when it has at most ``nmin``
    rows, otherwise it is grown recursively.

    Args:
        rows: Sequence of rows; a shallow copy is taken on entry.
        kcandidates: Number of random candidate splits drawn per node.
        nmin: A side with ``len(side) <= nmin`` becomes a leaf.
        pickcandidate: Callable ``(candidates, rows)`` returning the chosen
            ``(column_index, value)`` pair.

    Returns:
        A ``decisionnode``.  Empty input yields ``decisionnode()``.  When
        more than ``nmin`` rows all carry identical feature values a leaf
        with their voting result is returned, because no split can
        separate them.
    """
    rows = rows[:]
    if len(rows) == 0:
        return decisionnode()

    column_count = len(rows[0]) - 1

    # Termination guard.  If every row has the same value in every feature
    # column, a split on (column, value) sends all rows to one side
    # (assuming divideset decides per row from row[col] -- confirm).  With
    # more than nmin rows that side would be passed to the recursive call
    # unchanged, recursing until RecursionError.  Stop with a leaf instead.
    if len(rows) > nmin:
        first = rows[0]
        if all(row[c] == first[c]
               for row in rows for c in range(column_count)):
            return decisionnode(results=get_voting_result(rows))

    # Draw k random candidates; each candidate is (column_index, value).
    candidates = []
    for i in range(0, kcandidates):
        random_index = random.randint(0, column_count - 1)

        # Distinct values of the chosen feature column, first-seen order.
        column_values = {}
        for row in rows:
            column_values[row[random_index]] = 1

        cutting_point = get_cutting_point(column_values)
        candidates.append((random_index, cutting_point))

    # Choose one candidate with the supplied strategy.
    chosen_candidate = pickcandidate(candidates, rows)
    col = chosen_candidate[0]
    value = chosen_candidate[1]

    # Split on the chosen feature/value: set1 feeds the true branch, set2
    # the false branch.
    (set1, set2) = divideset(rows, col, value)

    if len(set1) <= nmin:
        # Small enough: vote among the rows of set1 for this leaf.
        trueBranch = decisionnode(results=get_voting_result(set1))
    else:
        trueBranch = buildrandomtree(set1, kcandidates, nmin, pickcandidate)

    if len(set2) <= nmin:
        # Small enough: vote among the rows of set2 for this leaf.
        falseBranch = decisionnode(results=get_voting_result(set2))
    else:
        falseBranch = buildrandomtree(set2, kcandidates, nmin, pickcandidate)

    return decisionnode(col=col, value=value, tb=trueBranch, fb=falseBranch)
def buildrandomtree(rows, kcandidates, nmin, pickcandidate=pick_candidate_gini):
    """Recursively build a randomized binary decision tree.

    For each node, ``kcandidates`` candidate splits are generated by picking
    a random feature column (the last column is never picked) and asking
    ``get_cutting_point`` for a value based on that column's distinct
    values.  ``pickcandidate(candidates, rows)`` selects the split to use,
    ``divideset`` applies it, and each resulting side is either turned into
    a leaf (``get_voting_result`` of its rows, when it has at most ``nmin``
    rows) or grown by a recursive call.

    Args:
        rows: Sequence of rows; copied shallowly on entry.
        kcandidates: How many random candidates to draw at each node.
        nmin: Maximum size of a side that is turned directly into a leaf.
        pickcandidate: Callable ``(candidates, rows)`` that returns the
            selected ``(column_index, value)`` pair.

    Returns:
        A ``decisionnode``.  ``decisionnode()`` for empty input.  A leaf
        with the voting result of ``rows`` when there are more than ``nmin``
        rows but they are indistinguishable on every feature column, since
        splitting could never make progress in that case.
    """
    rows = rows[:]
    if len(rows) == 0:
        return decisionnode()

    column_count = len(rows[0]) - 1

    # Without this guard, more than nmin rows that agree on every feature
    # value would be handed to the recursive call unchanged on every level
    # (assuming divideset partitions by comparing row[col] with the value
    # -- confirm), ending in RecursionError.
    reference = rows[0]
    indistinguishable = all(
        row[c] == reference[c] for row in rows for c in range(column_count)
    )
    if len(rows) > nmin and indistinguishable:
        return decisionnode(results=get_voting_result(rows))

    # Each candidate is a (column_index, value) pair.
    candidates = []
    for _ in range(kcandidates):
        random_index = random.randint(0, column_count - 1)
        # Distinct values of the chosen column, in first-seen order.
        column_values = dict.fromkeys((row[random_index] for row in rows), 1)
        candidates.append((random_index, get_cutting_point(column_values)))

    col, value = pickcandidate(candidates, rows)[:2]

    # set1 becomes the true branch, set2 the false branch.
    set1, set2 = divideset(rows, col, value)

    def _grow(subset):
        # Leaf by voting when the subset is small enough, else recurse.
        if len(subset) <= nmin:
            return decisionnode(results=get_voting_result(subset))
        return buildrandomtree(subset, kcandidates, nmin, pickcandidate)

    # Grow the true branch first so random draws happen in the same order
    # as a plain top-to-bottom implementation.
    trueBranch = _grow(set1)
    falseBranch = _grow(set2)

    return decisionnode(col=col, value=value, tb=trueBranch, fb=falseBranch)