Example #1
def build_tree(part, scoref=entropy, beta=0):
    # With no rows left, return an empty leaf node.
    if len(part) == 0:
        return decisionnode()
    best_gain = 0
    best_criteria = None
    best_sets = None
    columns = len(part[0]) - 1  # the last column holds the class label
    # Try every observed (column, value) pair as a candidate split.
    for elem in part:
        for i in range(columns):
            try:
                # Numeric-looking values are split with a >= comparison...
                (set1, set2) = divideset(part, i, float(elem[i]))
            except ValueError:
                # ...anything else is treated as a categorical value.
                (set1, set2) = divideset(part, i, elem[i])
            total = len(part)
            pr = len(set1) / float(total)
            pl = len(set2) / float(total)
            # Information gain of this split under the chosen impurity measure.
            gain = scoref(part) - pr * scoref(set1) - pl * scoref(set2)
            if gain > best_gain:
                best_gain = gain
                best_criteria = (i, elem[i])
                best_sets = (set1, set2)

    # Keep splitting only while the best gain beats the beta threshold.
    if best_gain > beta:
        tree_r = build_tree(best_sets[0], scoref, beta)
        tree_l = build_tree(best_sets[1], scoref, beta)
        try:
            return decisionnode(best_criteria[0], float(best_criteria[1]),
                                tb=tree_r, fb=tree_l, gain=best_gain)
        except ValueError:
            return decisionnode(best_criteria[0], best_criteria[1],
                                tb=tree_r, fb=tree_l, gain=best_gain)
    else:
        # Otherwise this node becomes a leaf holding the class counts.
        return decisionnode(results=unique_counts(part), gain=best_gain)
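
These snippets all lean on helpers defined elsewhere in their modules (decisionnode, divideset, entropy, unique_counts/uniquecounts). A minimal sketch of plausible definitions, in the style of the treepredict code from Programming Collective Intelligence, is given below; the exact signatures are assumptions, e.g. the gain keyword accepted by decisionnode and the unique_counts spelling are only inferred from how the snippets call them.

from math import log

class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, gain=0.0):
        self.col = col          # index of the column tested at this node
        self.value = value      # value the column is compared against
        self.results = results  # {label: count} for leaf nodes, None otherwise
        self.tb = tb            # branch followed when the test is true
        self.fb = fb            # branch followed when the test is false
        self.gain = gain        # gain of the split chosen at this node

def divideset(rows, column, value):
    # Split rows on one column: >= for numeric values, == for everything else.
    if isinstance(value, (int, float)):
        split = lambda row: row[column] >= value
    else:
        split = lambda row: row[column] == value
    set1 = [row for row in rows if split(row)]
    set2 = [row for row in rows if not split(row)]
    return (set1, set2)

def uniquecounts(rows):
    # Count how often each class label (last column) occurs.
    results = {}
    for row in rows:
        results[row[-1]] = results.get(row[-1], 0) + 1
    return results

unique_counts = uniquecounts  # alias used by some of the snippets

def entropy(rows):
    # Shannon entropy of the class-label distribution.
    results = uniquecounts(rows)
    ent = 0.0
    for r in results:
        p = float(results[r]) / len(rows)
        ent -= p * log(p, 2)
    return ent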
Example #2
def buildtree_iter(part, scoref=entropy, beta=0):
    if len(part) == 0:
        return decisionnode()
    node_list = []
    # Work queue of [rows, parent node, side]; side is True for the parent's
    # true branch and False for its false branch.
    sets_list = [[part, None, None]]
    while sets_list:
        best_criteria = None
        best_sets = None
        best_gain = 0
        current_node = sets_list.pop(0)
        data_set = current_node[0]
        father = current_node[1]
        side = current_node[2]
        # Try every observed (column, value) pair of this subset as a split.
        for row in range(len(data_set)):
            for column in range(len(data_set[row]) - 1):
                try:
                    set1, set2 = divideset(data_set, column, float(data_set[row][column]))
                except ValueError:
                    set1, set2 = divideset(data_set, column, data_set[row][column])
                total = len(data_set)
                pr = len(set1) / float(total)
                pl = len(set2) / float(total)
                current_gain = scoref(data_set) - pr * scoref(set1) - pl * scoref(set2)
                if best_gain < current_gain:
                    best_gain = current_gain
                    best_sets = (set1, set2)
                    best_criteria = (column, data_set[row][column])
        if best_gain > beta:
            # Internal node: record it and queue both halves for further splitting.
            try:
                node = decisionnode(col=best_criteria[0], value=float(best_criteria[1]), gain=best_gain)
            except ValueError:
                node = decisionnode(col=best_criteria[0], value=best_criteria[1], gain=best_gain)
            node_list.append([node, father, side])
            sets_list.append([best_sets[0], node, True])
            sets_list.append([best_sets[1], node, False])
        else:
            # Leaf node: store the class counts of the remaining rows.
            node = decisionnode(results=unique_counts(data_set), gain=best_gain)
            node_list.append([node, father, side])

    # Attach every recorded node to its parent on the recorded side.
    for node1 in node_list:
        for node2 in node_list:
            if node1[0] == node2[1]:
                if node2[2]:
                    node1[0].tb = node2[0]
                else:
                    node1[0].fb = node2[0]

    # The first node processed is the root of the tree.
    return node_list[0][0]
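
A minimal usage sketch, assuming the helper definitions above; the small dataset is made up for illustration, with the class label in the last column as the snippets expect.

# Hypothetical rows: [referrer, pages read, subscription].
my_data = [
    ['slashdot', 18, 'None'],
    ['google', 23, 'Premium'],
    ['digg', 24, 'Basic'],
    ['kiwitobes', 19, 'None'],
    ['google', 21, 'Premium'],
    ['direct', 12, 'None'],
]

tree_rec = build_tree(my_data)       # recursive version (Example #1)
tree_iter = buildtree_iter(my_data)  # iterative version (Example #2)
print(tree_rec.col, tree_rec.value)  # column and value chosen at the root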
Example #3
def buildtree(rows, scoref=entropy):
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)
    #print 'entropy score =' + str(current_score)
    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    column_count = len(rows[0]) - 1
    for col in range(0, column_count):
        # Generate the list of different values in
        # this column
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
        # Now try dividing the rows up for each value
        # in this column
        for value in column_values.keys():
            (set1, set2) = divideset(rows, col, value)

            # Information gain
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    # Create the sub branches
    if best_gain > 0:
        trueBranch = buildtree(best_sets[0])
        falseBranch = buildtree(best_sets[1])
        return decisionnode(col=best_criteria[0],
                            value=best_criteria[1],
                            tb=trueBranch,
                            fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))
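
None of the examples show how a finished tree is queried; a small classify sketch under the same assumptions (not part of the original snippets) would look roughly like this.

def classify(observation, tree):
    # Walk down the tree until a leaf (a node carrying results) is reached.
    if tree.results is not None:
        return tree.results
    v = observation[tree.col]
    if isinstance(v, (int, float)):
        branch = tree.tb if v >= tree.value else tree.fb
    else:
        branch = tree.tb if v == tree.value else tree.fb
    return classify(observation, branch)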
Example #4
import random

def buildrandomtree(rows,
                    kcandidates,
                    nmin,
                    pickcandidate=pick_candidate_gini):
    rows = rows[:]
    if len(rows) == 0: return decisionnode()

    candidates = []

    column_count = len(rows[0]) - 1
    #print "number of columns = " + str(column_count)
    #pick k random candidates
    #candidate = (column_index,value)
    for i in range(0, kcandidates):
        random_index = random.randint(0, column_count - 1)

        #get all unique values for a specific feature (column)
        column_values = {}
        for row in rows:
            column_values[row[random_index]] = 1

        #get a cutting point
        cutting_point = get_cutting_point(column_values)
        #print "rand feature index ="+str(random_index)+ "\n cutting point="+str(cutting_point)
        #add to list of candidates
        candidates.append((random_index, cutting_point))

    #print candidates
    #choose a candidate based on function given
    chosen_candidate = pickcandidate(candidates, rows)

    #print chosen_candidate

    col = chosen_candidate[0]
    value = chosen_candidate[1]
    #split set based on the feature and value
    (set1, set2) = divideset(rows, col, value)

    #set1 = truebranch
    trueBranch = None
    #set2 = falsebranch
    falseBranch = None
    #print "item in leaf1 = " +str(len(set1))
    #check if set1 has the min size
    if (len(set1) <= nmin):
        #do voting on the elements of set1
        #set an answer for this true branch
        voting_result = get_voting_result(set1)
        trueBranch = decisionnode(results=voting_result)
    else:
        #it means we need to grow this
        trueBranch = buildrandomtree(set1, kcandidates, nmin, pickcandidate)
    #print "item in leaf2 = " +str(len(set2))
    #check if set2 has the min size
    if (len(set2) <= nmin):
        #do voting on the elements of set2
        #set an answer for this false branch
        voting_result = get_voting_result(set2)
        #uniquecounts
        falseBranch = decisionnode(results=voting_result)
    else:
        #it means we need to grow this
        falseBranch = buildrandomtree(set2, kcandidates, nmin, pickcandidate)

    return decisionnode(col=col, value=value, tb=trueBranch, fb=falseBranch)
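
The random-tree example additionally relies on get_cutting_point, get_voting_result and pick_candidate_gini, none of which are shown. A hedged reconstruction, reusing divideset and uniquecounts from the sketch above, might look like this; the exact behaviour of the originals is an assumption.

import random

def get_cutting_point(column_values):
    # Assumed: numeric features are cut at the midpoint of their observed
    # range, other features at a randomly chosen observed value.
    values = list(column_values.keys())
    try:
        nums = [float(v) for v in values]
        return (min(nums) + max(nums)) / 2.0
    except (TypeError, ValueError):
        return random.choice(values)

def get_voting_result(rows):
    # Assumed: majority vote over the label column, returned in the same
    # {label: count} shape that uniquecounts() produces.
    counts = uniquecounts(rows)
    if not counts:
        return counts
    winner = max(counts, key=counts.get)
    return {winner: counts[winner]}

def giniimpurity(rows):
    # Probability that two randomly drawn rows carry different labels.
    total = len(rows)
    if total == 0:
        return 0.0
    counts = uniquecounts(rows)
    return 1.0 - sum((float(c) / total) ** 2 for c in counts.values())

def pick_candidate_gini(candidates, rows):
    # Assumed: pick the (column, value) candidate whose split has the lowest
    # weighted Gini impurity.
    best, best_impurity = candidates[0], None
    for col, value in candidates:
        set1, set2 = divideset(rows, col, value)
        p = float(len(set1)) / len(rows)
        impurity = p * giniimpurity(set1) + (1 - p) * giniimpurity(set2)
        if best_impurity is None or impurity < best_impurity:
            best, best_impurity = (col, value), impurity
    return best

With these in place, a call such as buildrandomtree(my_data, kcandidates=3, nmin=1) grows one randomized tree; training several of them on bootstrap samples gives the usual random-forest-style ensemble.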