def GrowTree(dataset, attributes, level):
    """Recursively build a binary decision tree using variance-impurity gain.

    Args:
        dataset: dict-like table; ``dataset.get("Class")`` yields the label
            column and ``dataset.get(attr)`` the column of an attribute.
            A placeholder of the form ``{'NA': label}`` stands for an empty
            split and is turned directly into a leaf.
        attributes: names of the attributes still available for splitting.
            The list is left untouched; every subtree gets its own copy.
        level: depth of the node being built. Only used in the progress
            output; children are built with ``level + 1``.

    Returns:
        The BTreeNode that roots the (sub)tree built for ``dataset``.
    """
    print("Constructing Decision Tree %s" % (level,))
    print("Attributes  %s" % (attributes,))

    # An empty-split placeholder has no "Class" column, so it has to be
    # handled before any branch that reads the labels.
    if 'NA' in dataset:
        return BTreeNode(str(dataset.get('NA')))

    class_list = dataset.get("Class")

    # Nothing left to split on: label the leaf with the most common class.
    if not attributes:
        leaf = BTreeNode(str(get_max_val(class_list)))
        leaf.left = None
        leaf.right = None
        return leaf

    neg_count, pos_count = get_count(class_list)
    if pos_count == 0:
        print("class: all negative examples")
        return BTreeNode('0')
    if neg_count == 0:
        print("class: all positive examples")
        return BTreeNode('1')

    # The impurity of the whole set is computed from class_list alone, which
    # does not change across candidate attributes, so it is evaluated once.
    variance_set = Heuristics.Variance_Impurity_Set(class_list)

    best_gain = None
    best_attr = None
    zero_dataset = None
    one_dataset = None

    for attr in attributes:
        neg_dict, pos_dict = Dataset.split_dataset(dataset, attr)
        print("Variance Impurity set for  %s %s" % (attr, variance_set))

        member_list = dataset.get(attr)
        variance_member = Heuristics.Variance_Impurity_Members(
            dataset, neg_dict, pos_dict, member_list)
        print("Variance Impurity member for  %s %s" % (attr, variance_member))

        var_gain = Heuristics.gain(variance_set, variance_member)
        print("Variance Impurity gain for  %s %s" % (attr, var_gain))

        neg_empty = any(column == [] for column in neg_dict.values())
        pos_empty = any(column == [] for column in pos_dict.values())
        print("Bool value - zeros %s" % (neg_empty,))
        print("Bool value - ones %s" % (pos_empty,))

        # A side of the split with no rows is replaced by a placeholder that
        # carries the majority class of the parent dataset.
        if neg_empty:
            print("Sub values empty - zero dataset")
            neg_dict = {'NA': get_max_val(class_list)}
        elif pos_empty:
            print("Sub values empty - one dataset")
            pos_dict = {'NA': get_max_val(class_list)}

        # Keep the attribute with the highest gain (ties go to the later
        # attribute). The first candidate is always accepted so a split is
        # chosen even when every gain is zero or slightly negative.
        if best_gain is None or var_gain >= best_gain:
            best_gain = var_gain
            best_attr = attr
            zero_dataset = neg_dict
            one_dataset = pos_dict

    print("Maximum Information Gain:  %s" % (best_gain,))
    print("Node selected %s" % (best_attr,))
    print("Zero Dataset %s" % (zero_dataset,))
    print("One Dataset %s" % (one_dataset,))

    root = BTreeNode(best_attr)

    # Work on a copy: removing from the caller's list would let the left
    # subtree consume attributes that the right subtree still needs.
    remaining = list(attributes)
    remaining.remove(best_attr)

    root.left = GrowTree(zero_dataset, remaining, level + 1)
    root.right = GrowTree(one_dataset, remaining, level + 1)
    return root
def GrowTree(dataset, attributes):
    """Recursively build a binary decision tree for ``dataset``.

    The split criterion is selected by the module-level ``heuristic`` flag:
    ``0`` uses the entropy helpers, ``1`` the variance-impurity helpers. Any
    other value leaves every gain at ``0.0``.

    Every internal node created increments the module-level counter
    ``cnt_nonleaf_nodes``; the node stores the counter value in ``order`` and
    the dataset it was built from in ``subset``.

    Args:
        dataset: dict-like table; ``dataset.get("Class")`` yields the label
            column and ``dataset.get(attr)`` the column of an attribute.
            A placeholder of the form ``{'NA': label}`` stands for an empty
            split and is turned directly into a leaf.
        attributes: names of the attributes still available for splitting.
            The list is left untouched; every subtree gets its own copy.

    Returns:
        The BTreeNode that roots the (sub)tree built for ``dataset``.
    """
    global cnt_nonleaf_nodes

    # An empty-split placeholder has no "Class" column, so it has to be
    # handled before any branch that reads the labels.
    if 'NA' in dataset:
        return BTreeNode(str(dataset.get('NA')))

    class_list = dataset.get("Class")

    # Nothing left to split on: label the leaf with the most common class.
    if not attributes:
        leaf = BTreeNode(str(get_max_val(class_list)))
        leaf.left = None
        leaf.right = None
        return leaf

    neg_count, pos_count = get_count(class_list)
    if pos_count == 0:
        print("class: all negative examples")
        return BTreeNode('0')
    if neg_count == 0:
        print("class: all positive examples")
        return BTreeNode('1')

    # Resolve the heuristic once. The impurity of the whole set is computed
    # from class_list alone, which does not change across attributes.
    if heuristic == 0:
        set_impurity = Heuristics.Entropy_Set(class_list)
        members_impurity = Heuristics.Entropy_Members
    elif heuristic == 1:
        set_impurity = Heuristics.Variance_Impurity_Set(class_list)
        members_impurity = Heuristics.Variance_Impurity_Members
    else:
        set_impurity = None
        members_impurity = None

    best_gain = None
    best_attr = None
    zero_dataset = None
    one_dataset = None

    for attr in attributes:
        neg_dict, pos_dict = Dataset.split_dataset(dataset, attr)
        member_list = dataset.get(attr)

        if members_impurity is None:
            # Unrecognised heuristic value: no gain is computed.
            gain = 0.0
        else:
            gain = Heuristics.gain(
                set_impurity,
                members_impurity(dataset, neg_dict, pos_dict, member_list))
        print("gain for  %s %s" % (attr, gain))

        # A side of the split with no rows is replaced by a placeholder that
        # carries the majority class of the parent dataset.
        if any(column == [] for column in neg_dict.values()):
            print("Sub values empty - zero dataset")
            neg_dict = {'NA': get_max_val(class_list)}
        elif any(column == [] for column in pos_dict.values()):
            print("Sub values empty - one dataset")
            pos_dict = {'NA': get_max_val(class_list)}

        # Keep the attribute with the highest gain (ties go to the later
        # attribute). The first candidate is always accepted so a split is
        # chosen even if rounding pushes every gain slightly below zero.
        if best_gain is None or gain >= best_gain:
            best_gain = gain
            best_attr = attr
            zero_dataset = neg_dict
            one_dataset = pos_dict

    print("Maximum Information Gain:  %s" % (best_gain,))
    print("Node selected:  %s" % (best_attr,))
    print("Zero Dataset:  %s" % (zero_dataset.get("Class"),))
    print("One Dataset:  %s" % (one_dataset.get("Class"),))

    root = BTreeNode(best_attr)
    cnt_nonleaf_nodes += 1
    root.order = cnt_nonleaf_nodes
    root.subset = dataset

    # Work on a copy: removing from the caller's list would let the left
    # subtree consume attributes that the right subtree still needs.
    remaining = list(attributes)
    remaining.remove(best_attr)

    root.left = GrowTree(zero_dataset, remaining)
    root.right = GrowTree(one_dataset, remaining)
    return root