def GrowTree(dataset, attributes, level):
    """Recursively build a binary decision tree using variance-impurity gain.

    Args:
        dataset: Mapping read through ``.get``. Class labels are looked up
            under ``"Class"`` and each attribute's column under the
            attribute's own name. A mapping that contains the key ``'NA'``
            is a placeholder for a branch that received no rows; its value
            is the label to predict there.
        attributes: List of attribute names still available for splitting.
            The list is NOT modified; each child call receives its own
            reduced copy so sibling subtrees do not consume each other's
            attributes.
        level: Depth of this call. It is only used in the trace output;
            children are built with ``level + 1``.

    Returns:
        A ``BTreeNode``. Leaves are labelled with the predicted class as a
        string. Internal nodes are labelled with the chosen attribute;
        ``left`` is grown from the first dataset returned by
        ``Dataset.split_dataset`` and ``right`` from the second.
    """
    print("Constructing Decision Tree %s" % (level,))
    print("Attributes  %s" % (attributes,))

    # The placeholder must be recognised before anything reads "Class":
    # a placeholder mapping has no "Class" entry, so checking for exhausted
    # attributes first would hand None to get_max_val.
    if 'NA' in dataset:
        return BTreeNode(str(dataset.get('NA')))

    class_list = dataset.get("Class")

    if not attributes:
        # Nothing left to split on: predict the value get_max_val picks
        # (presumably the most common class -- confirm against the helper).
        leaf = BTreeNode(str(get_max_val(class_list)))
        leaf.left = None
        leaf.right = None
        return leaf

    # NOTE(review): get_count is assumed to return (negatives, positives),
    # matching how the two results were named and used originally.
    neg_count, pos_count = get_count(class_list)

    if pos_count == 0:
        print("class: all negative examples")
        return BTreeNode('0')

    if neg_count == 0:
        print("class: all positive examples")
        return BTreeNode('1')

    # Depends only on class_list, so compute it once for all attributes.
    # NOTE(review): assumes Variance_Impurity_Set has no side effects.
    set_impurity = Heuristics.Variance_Impurity_Set(class_list)

    best_gain = 0
    best_attr = None
    best_zero = None
    best_one = None

    for attr in attributes:
        zero_side, one_side = Dataset.split_dataset(dataset, attr)

        print("Variance Impurity set for  %s %s" % (attr, set_impurity))

        member_list = dataset.get(attr)
        member_impurity = Heuristics.Variance_Impurity_Members(
            dataset, zero_side, one_side, member_list)
        print("Variance Impurity member for  %s %s" % (attr, member_impurity))

        gain = Heuristics.gain(set_impurity, member_impurity)
        print("Variance Impurity gain for  %s %s" % (attr, gain))

        zero_empty = any(column == [] for column in zero_side.values())
        one_empty = any(column == [] for column in one_side.values())
        print("Bool value - zeros %s" % (zero_empty,))
        print("Bool value - ones %s" % (one_empty,))

        # A side with an empty column is replaced by a placeholder carrying
        # the parent's get_max_val label, which the recursive call turns
        # into a leaf.
        if zero_empty:
            print("Sub values empty - zero dataset")
            zero_side = {'NA': get_max_val(class_list)}
        elif one_empty:
            print("Sub values empty - one dataset")
            one_side = {'NA': get_max_val(class_list)}

        # Keep the split with the highest gain. The first attribute is
        # always accepted so a split is chosen even when every gain is zero
        # (or marginally negative from rounding); ties go to the later
        # attribute.
        if best_attr is None or gain >= best_gain:
            best_gain = gain
            best_attr = attr
            best_zero = zero_side
            best_one = one_side

    print("Maximum Information Gain:  %s" % (best_gain,))
    print("Node selected %s" % (best_attr,))
    print("Zero Dataset %s" % (best_zero,))
    print("One Dataset %s" % (best_one,))

    root = BTreeNode(best_attr)

    # Work on a copy: removing in place would mutate the caller's list and
    # let the left subtree use up attributes the right subtree still needs.
    remaining = list(attributes)
    remaining.remove(best_attr)

    root.left = GrowTree(best_zero, remaining, level + 1)
    root.right = GrowTree(best_one, remaining, level + 1)

    return root
Пример #2
0
def GrowTree(dataset, attributes):
    """Recursively build a binary decision tree with the selected heuristic.

    The module-level ``heuristic`` chooses the impurity measure: ``0`` uses
    ``Heuristics.Entropy_Set`` / ``Entropy_Members``, ``1`` uses
    ``Heuristics.Variance_Impurity_Set`` / ``Variance_Impurity_Members``.
    Any other value leaves every gain at 0.0.

    Args:
        dataset: Mapping read through ``.get``. Class labels are looked up
            under ``"Class"`` and each attribute's column under the
            attribute's own name. A mapping that contains the key ``'NA'``
            is a placeholder for a branch that received no rows; its value
            is the label to predict there.
        attributes: List of attribute names still available for splitting.
            The list is NOT modified; each child call receives its own
            reduced copy so sibling subtrees do not consume each other's
            attributes.

    Returns:
        A ``BTreeNode``. Leaves are labelled with the predicted class as a
        string. Internal nodes are labelled with the chosen attribute and
        additionally get ``order`` (their creation number, taken from the
        module-level ``cnt_nonleaf_nodes`` counter, which this function
        increments) and ``subset`` (the dataset they were built from).
        ``left`` is grown from the first dataset returned by
        ``Dataset.split_dataset`` and ``right`` from the second.
    """
    global cnt_nonleaf_nodes

    # The placeholder must be recognised before anything reads "Class":
    # a placeholder mapping has no "Class" entry, so checking for exhausted
    # attributes first would hand None to get_max_val.
    if 'NA' in dataset:
        return BTreeNode(str(dataset.get('NA')))

    class_list = dataset.get("Class")

    if not attributes:
        # Nothing left to split on: predict the value get_max_val picks
        # (presumably the most common class -- confirm against the helper).
        leaf = BTreeNode(str(get_max_val(class_list)))
        leaf.left = None
        leaf.right = None
        return leaf

    # NOTE(review): get_count is assumed to return (negatives, positives),
    # matching how the two results were named and used originally.
    neg_count, pos_count = get_count(class_list)

    if pos_count == 0:
        print("class: all negative examples")
        return BTreeNode('0')

    if neg_count == 0:
        print("class: all positive examples")
        return BTreeNode('1')

    # The impurity of the whole set depends only on class_list, so compute
    # it once rather than once per attribute.
    # NOTE(review): assumes the Heuristics *_Set helpers have no side effects.
    set_impurity = None
    if heuristic == 0:
        set_impurity = Heuristics.Entropy_Set(class_list)
    elif heuristic == 1:
        set_impurity = Heuristics.Variance_Impurity_Set(class_list)

    best_gain = 0.0
    best_attr = None
    best_zero = None
    best_one = None

    for attr in attributes:
        zero_side, one_side = Dataset.split_dataset(dataset, attr)
        member_list = dataset.get(attr)

        # An unrecognised heuristic keeps the gain at 0.0.
        gain = 0.0
        if heuristic == 0:
            member_impurity = Heuristics.Entropy_Members(
                dataset, zero_side, one_side, member_list)
            gain = Heuristics.gain(set_impurity, member_impurity)
        elif heuristic == 1:
            member_impurity = Heuristics.Variance_Impurity_Members(
                dataset, zero_side, one_side, member_list)
            gain = Heuristics.gain(set_impurity, member_impurity)
        print("gain for  %s %s" % (attr, gain))

        # A side with an empty column is replaced by a placeholder carrying
        # the parent's get_max_val label, which the recursive call turns
        # into a leaf.
        if any(column == [] for column in zero_side.values()):
            print("Sub values empty - zero dataset")
            zero_side = {'NA': get_max_val(class_list)}
        elif any(column == [] for column in one_side.values()):
            print("Sub values empty - one dataset")
            one_side = {'NA': get_max_val(class_list)}

        # Keep the split with the highest gain; ties go to the later
        # attribute. The first attribute is always accepted so the split
        # datasets are bound even if every gain compares below 0.0.
        if best_attr is None or gain >= best_gain:
            best_gain = gain
            best_attr = attr
            best_zero = zero_side
            best_one = one_side

    print("Maximum Information Gain:  %s" % (best_gain,))
    print("Node selected:  %s" % (best_attr,))
    print("Zero Dataset:  %s" % (best_zero.get("Class"),))
    print("One Dataset:  %s" % (best_one.get("Class"),))

    root = BTreeNode(best_attr)
    # Number internal nodes in creation order (parent before its children).
    cnt_nonleaf_nodes += 1
    root.order = cnt_nonleaf_nodes
    root.subset = dataset

    # Work on a copy: removing in place would mutate the caller's list and
    # let the left subtree use up attributes the right subtree still needs.
    remaining = list(attributes)
    remaining.remove(best_attr)

    root.left = GrowTree(best_zero, remaining)
    root.right = GrowTree(best_one, remaining)

    return root