Exemplo n.º 1
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Every column of ``self.features`` is scored with
        ``Util.Information_Gain``. Attributes whose gain is exactly zero are
        ignored, and ties on gain go to the attribute with more distinct
        values. When an attribute is selected, one child ``TreeNode`` is
        created per distinct value (with the chosen column removed) and split
        recursively while it reports ``splittable``. When no attribute
        qualifies, the node is marked as not splittable.

        Side effects: sets ``best_info_gain``, ``best_attribute_values``,
        ``dim_split``, ``feature_uniq_split`` (here: the number of branches of
        the chosen attribute), ``children`` and ``splittable``.
        """
        self.best_info_gain = float('-inf')
        self.best_attribute_values = []

        # The label -> count-column mapping does not depend on the attribute,
        # so build it once rather than once per column, and use a dict lookup
        # instead of list.index() for every sample.
        label_column = {
            label: position
            for position, label in enumerate(sorted(set(self.labels)))
        }

        for attribute_index in range(len(self.features[0])):
            column = self.features[:, attribute_index]
            # One row of per-class counts for each distinct attribute value.
            branch = {
                attribute_value: [0] * self.num_cls
                for attribute_value in sorted(set(column))
            }
            for attribute_value, label in zip(column, self.labels):
                branch[attribute_value][label_column[label]] += 1
            current_info_gain = Util.Information_Gain(
                self.entropy, list(branch.values()))

            if (current_info_gain != 0) and (
                    (current_info_gain > self.best_info_gain) or
                    (current_info_gain == self.best_info_gain
                     and len(branch) > self.feature_uniq_split)):
                self.best_info_gain = current_info_gain
                self.dim_split = attribute_index
                self.feature_uniq_split = len(branch)
                self.best_attribute_values = list(branch.keys())

        if self.best_info_gain == float('-inf'):
            # No attribute produced a non-zero gain.
            self.splittable = False
            return

        # Children never see the attribute that was just used for the split.
        child_feature_array = np.delete(self.features, self.dim_split, axis=1)
        split_column = self.features[:, self.dim_split]
        labels_array = np.array(self.labels)
        for attribute_value in self.best_attribute_values:
            in_branch = split_column == attribute_value
            child_labels = labels_array[in_branch].tolist()
            child = TreeNode(child_feature_array[in_branch].tolist(),
                             child_labels, len(set(child_labels)))
            self.children.append(child)
            if child.splittable:
                child.split()
Exemplo n.º 2
0
    def get_split_attribute(self):
        """Return the best attribute index and that attribute's column.

        Each attribute is scored with ``Util.Information_Gain`` against the
        entropy of this node's labels (``Util.get_entropy``). The attribute
        with the largest gain wins; on equal gain the attribute with more
        distinct values is preferred, and on a full tie the lowest index is
        kept.

        Returns:
            tuple: ``(A, split_features)`` where ``A`` is the chosen column
            index and ``split_features`` is the list of that column's values,
            one per sample. ``(0, [])`` when there are no attributes or every
            gain is negative.
        """
        ent = [self.labels.count(x) for x in set(self.labels)]
        S = Util.get_entropy(ent)
        A = 0
        max_ig = 0
        best_value_count = 0
        split_features = []
        for a in range(len(self.features[0])):
            column = [row[a] for row in self.features]
            # Group the labels by the value this attribute takes.
            branches = {}
            for value, label in zip(column, self.labels):
                branches.setdefault(value, []).append(label)

            counts = [[group.count(x) for x in set(group)]
                      for group in branches.values()]
            ig = Util.Information_Gain(S, counts)
            # The tie-break must compare the number of distinct values. The
            # column length is the same for every attribute, so comparing it
            # could never prefer one attribute over another.
            if ig > max_ig or (ig == max_ig
                               and len(branches) > best_value_count):
                max_ig = ig
                A = a
                best_value_count = len(branches)
                split_features = column
        return A, split_features
Exemplo n.º 3
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        The node is marked as not splittable (and nothing else happens) when
        it has no samples, no labels or no attributes left. Otherwise every
        attribute is scored with ``Util.Information_Gain``; ties on gain are
        resolved with ``self.all_attrib_values`` (larger wins) and then by the
        lowest attribute index. One child ``TreeNode`` is created per distinct
        value of the chosen attribute, with that attribute removed from the
        child's features, and splittable children are split recursively.

        Side effects: sets ``splittable``, ``dim_split``,
        ``feature_uniq_split`` (sorted branch values) and appends to
        ``children``. Returns ``None``.
        """
        # Short-circuit so self.features[0] is only evaluated when at least
        # one row exists; separate checks raised IndexError on empty features.
        if (len(self.features) == 0 or len(self.labels) == 0
                or len(self.features[0]) == 0):
            self.splittable = False

        if not self.splittable:
            return

        _, counts = np.unique(self.labels, return_counts=True)
        current_entropy = Util.get_entropy(counts)

        max_info = None
        for index in range(len(self.features[0])):
            branches = self.make_br(index)
            info = Util.Information_Gain(current_entropy, branches)
            if max_info is None or info > max_info:
                max_info = info
                self.dim_split = index
            elif info == max_info:
                current_index_values = self.all_attrib_values(index)
                best_index_values = self.all_attrib_values(self.dim_split)
                # On a complete tie the earlier (smaller) index is kept, which
                # is already the case because columns are visited in order.
                if current_index_values > best_index_values:
                    self.dim_split = index

        # Group the samples by their value on the chosen attribute:
        # value -> [list of feature rows without that attribute, labels].
        attribute_val_dict = {}
        for row, label in zip(self.features, self.labels):
            group = attribute_val_dict.setdefault(row[self.dim_split],
                                                  [[], []])
            group[0].append(np.delete(row, [self.dim_split]))
            group[1].append(label)

        # Order the branch values deterministically: everything that is not a
        # plain int/float first, then ints/floats, each group sorted by value.
        self.feature_uniq_split = sorted(
            attribute_val_dict.keys(),
            key=lambda e: ({
                int: 1,
                float: 1,
                str: 0
            }.get(type(e), 0), e))

        for key in self.feature_uniq_split:
            child_features, child_labels = attribute_val_dict[key]
            childn = TreeNode(child_features, child_labels,
                              np.unique(child_labels))
            if childn.splittable:
                childn.split()
            self.children.append(childn)
Exemplo n.º 4
0
    def split(self):
        """Split this node on the feature with the highest information gain.

        For every feature column a (value x label) count table is built over
        the values listed in ``self.features_unique`` and scored with
        ``Util.Information_Gain``. If every gain is exactly 0.0 the node is
        marked as not splittable. Otherwise the feature with the largest
        ``(gain, number of distinct values present)`` pair is chosen (first
        such index on a full tie), and one child is appended per value in
        ``self.features_unique[self.dim_split]``: a real child for values that
        occur at this node, a placeholder child for values that do not.

        Side effects: sets ``splittable`` or ``dim_split`` /
        ``feature_uniq_split`` and appends to ``children``. Returns ``None``.
        """
        feature_information_gains = []
        unique_labels, unique_label_count = np.unique(self.labels,
                                                      return_counts=True)

        for f in range(len(np.array(self.features).T)):
            # Count table for feature f: one row per value in
            # self.features_unique[f], one column per label in unique_labels.
            feature_class_count = [[
                len([
                    i for i, j in zip(
                        np.array(self.features)[:, f], np.array(self.labels))
                    if i == feature and j == label
                ]) for label in unique_labels
            ] for feature in self.features_unique[f]]
            # Entropy of this node's labels. It does not depend on f but is
            # recomputed on every iteration.
            Entropy = sum([(-1) * (float(x) / sum(unique_label_count)) *
                           np.log2(float(x) / sum(unique_label_count))
                           for x in unique_label_count])
            # Store (gain, number of distinct values present at this node);
            # the second element is the tie-breaker used below.
            feature_information_gains.append(
                (Util.Information_Gain(Entropy, feature_class_count),
                 len(np.unique(np.array(self.features)[:, f]))))

        information_gains = np.array([i[0] for i in feature_information_gains])
        # No feature improves on the parent: stop splitting here.
        if all(information_gains == 0.0):
            self.splittable = False
            return

        # max() keeps the first maximal tuple and list.index() finds its first
        # occurrence, so complete ties resolve to the lowest feature index.
        self.dim_split = feature_information_gains.index(
            max(feature_information_gains, key=lambda x: (x[0], x[1])))
        # Append the labels as a last column so rows and labels are sorted and
        # partitioned together.
        feature_labels = np.column_stack((self.features, self.labels)).tolist()

        # Sort rows by the chosen feature; np.unique(..., return_index=True)
        # then gives the first row of each distinct value, which are the cut
        # points passed to np.split.
        feature_labels.sort(key=lambda x: x[self.dim_split])
        feature_unique, unique_index = np.unique(
            np.array(feature_labels)[:, self.dim_split], return_index=True)
        feature_class_split = np.split(feature_labels, unique_index[1:])
        self.feature_uniq_split = self.features_unique[self.dim_split]

        # NOTE(review): .tolist() implies self.features_unique[dim_split] is a
        # NumPy array; confirm against the constructor.
        self.feature_uniq_split = self.feature_uniq_split.tolist()
        self.feature_uniq_split.sort()
        feature_unique = feature_unique.tolist()

        for i in range(len(self.feature_uniq_split)):
            if not self.feature_uniq_split[i] in feature_unique:
                # Value never observed at this node: add a placeholder child
                # that carries this node's labels and cls_max. It is never
                # split here.
                new_child = TreeNode([[]], self.labels, [[]])
                new_child.cls_max = self.cls_max
                self.children.append(new_child)
            else:
                index = feature_unique.index(self.feature_uniq_split[i])
                # Last column holds the labels appended by column_stack above.
                child_labels = feature_class_split[index][:, -1]
                # Drop the label column, then the column that was split on.
                child_features = np.delete(feature_class_split[index], -1, 1)
                child_features = np.delete(child_features, self.dim_split, 1)
                # Keep the per-feature value lists aligned with the remaining
                # feature columns.
                child_features_unique = np.delete(self.features_unique,
                                                  self.dim_split, 0)
                # Labels are cast back to int after travelling through the
                # stacked array (presumably as floats or strings; depends on
                # the feature dtype).
                new_child = TreeNode(child_features.tolist(),
                                     child_labels.astype(int).tolist(),
                                     child_features_unique)
                self.children.append(new_child)
                if new_child.splittable:
                    new_child.split()
Exemplo n.º 5
0
    def split(self):
        """Choose the best attribute, create one child per value and recurse.

        Attributes are scored with ``Util.Information_Gain`` against the
        value returned by ``Util.Weighted_Average_Entropy`` for this node's
        class amounts. Higher gain wins; equal gain goes to the attribute
        with more branches. Children are created in ascending order of the
        attribute value, get a ``debug_path`` extending this node's path, and
        are split recursively when they report ``splittable``.

        Side effects: sets ``dim_split``, ``feature_uniq_split`` (sorted
        branch values) and replaces ``children``.
        """
        s = Util.Weighted_Average_Entropy(Util.get_amount_cls(self.labels))
        best_gain_value = -1.0 * np.inf
        best_branches = {}
        best_a = 0
        best_class_amounts = []
        # Score every attribute and remember the winner.
        for a in range(len(self.features[0])):
            candidate = Util.get_branches(self.features, self.labels, a)
            candidate_amounts = [
                Util.get_amount_cls(data[1]) for data in candidate.values()
            ]
            gain_value = Util.Information_Gain(s, candidate_amounts)
            if gain_value > best_gain_value or (
                    gain_value == best_gain_value
                    and len(candidate.keys()) > len(best_branches.keys())):
                best_a, best_branches = a, candidate
                best_gain_value = gain_value
                best_class_amounts = candidate_amounts

        self.dim_split = best_a
        self.feature_uniq_split = list(best_branches.keys())

        # Sort branch value, class amounts and (features, labels) together by
        # branch value so the three stay aligned.
        ordered = sorted(
            zip(self.feature_uniq_split, best_class_amounts,
                best_branches.values()),
            key=lambda entry: entry[0])
        self.feature_uniq_split = [value for value, _, _ in ordered]

        self.children = [
            TreeNode(data[0], data[1], len(class_amount))
            for _, class_amount, data in ordered
        ]
        for position, child in enumerate(self.children):
            child.debug_path = self.debug_path + [position]
            if child.splittable:
                child.split()
Exemplo n.º 6
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Columns that contain ``None`` are skipped; a used attribute is blanked
        out with ``None`` in the children rather than removed, so column
        indices stay stable down the tree. Each remaining column is scored
        with ``Util.Information_Gain``; the first column with the strictly
        largest gain wins. One child ``TreeNode`` is created per distinct
        value of the chosen column. A child is marked not splittable when it
        has no data or when every attribute in its first row is ``None``.
        Splittable children are then split recursively.

        Side effects: sets ``dim_split`` and ``feature_uniq_split`` and
        appends to ``children``.
        """
        features = np.array(self.features)

        # Entropy of this node's labels. It is the same for every candidate
        # attribute, so it is computed once.
        e = 0
        for x in np.unique(self.labels):
            p = float(np.count_nonzero(self.labels == x)) / len(self.labels)
            e += p * np.log2(1 / p)

        max_entropy = -1
        for idx_dim in range(len(self.features[0])):
            xi = features[:, idx_dim]
            if None in xi:
                # Attribute already consumed by an ancestor.
                continue
            branch_values = np.unique(xi)
            # branches[v, c]: samples with value v and class c.
            branches = np.zeros((len(branch_values), self.num_cls + 1))
            for i, val in enumerate(branch_values):
                y = np.array(self.labels)[np.where(xi == val)]
                for yi in y:
                    branches[i, yi] += 1

            info_gain_current = Util.Information_Gain(e, branches)
            if info_gain_current > max_entropy:
                max_entropy = info_gain_current
                self.dim_split = idx_dim
                self.feature_uniq_split = branch_values.tolist()

        # Build the children: blank out the chosen column and partition the
        # rows by their value in that column.
        xi = features[:, self.dim_split]
        x = np.array(self.features, dtype=object)
        x[:, self.dim_split] = None
        for val in self.feature_uniq_split:
            indexes = np.where(xi == val)
            x_new = x[indexes].tolist()
            y_new = np.array(self.labels)[indexes].tolist()
            child = TreeNode(x_new, y_new, self.num_cls)
            if np.array(x_new).size == 0 or all(v is None for v in x_new[0]):
                child.splittable = False
            self.children.append(child)

        # Recurse only after all children exist.
        for child in self.children:
            if child.splittable:
                child.split()

        return
Exemplo n.º 7
0
 def split(self):
     """Split this node on the attribute with the highest information gain.

     Does nothing when the node is not splittable or has no attributes left
     (in which case it is also marked not splittable). Otherwise
     ``self.features`` is converted to a NumPy array, every column is scored
     with ``Util.Information_Gain(0, counts)``, and the column with the
     largest score is chosen. Ties go to the column with more distinct values,
     then to the lowest index. One child ``TreeNode`` is appended per distinct
     value of the chosen column (with that column removed), and ``split`` is
     then called on every child.

     Side effects: sets ``features``, ``splittable``, ``dim_split`` and
     ``feature_uniq_split`` and appends to ``children``.
     """
     if len(self.features[0]) == 0:
         self.splittable = False
     if not self.splittable:
         return

     self.features = np.array(self.features)
     n_attributes = len(self.features[0])
     labels = np.asarray(self.labels)
     classes = np.unique(labels)

     # Score each attribute from its (value x class) count table.
     gains = []
     for j in range(n_attributes):
         column = self.features[:, j]
         counts = [
             [int(np.count_nonzero(labels[column == value] == cls))
              for cls in classes]
             for value in np.unique(column)
         ]
         gains.append(Util.Information_Gain(0, counts))

     gains = np.asarray(gains)
     candidates = np.flatnonzero(gains == gains.max())
     if len(candidates) != 1:
         # Tie on gain: prefer more distinct values. argmax returns the first
         # maximum, so a remaining tie keeps the lowest index.
         widths = [len(np.unique(self.features[:, j])) for j in candidates]
         self.dim_split = candidates[int(np.argmax(widths))]
     else:
         self.dim_split = candidates[0]

     split_column = self.features[:, self.dim_split]
     self.feature_uniq_split = np.unique(split_column)
     for value in self.feature_uniq_split:
         in_branch = split_column == value
         child_features = list(
             np.delete(self.features[in_branch], self.dim_split, axis=1))
         child_labels = [
             label for label, hit in zip(self.labels, in_branch) if hit
         ]
         # NOTE(review): the third constructor argument is the number of
         # branch values, kept as in the original; confirm it is not meant to
         # be the number of classes.
         child = TreeNode(child_features, child_labels,
                          len(self.feature_uniq_split))
         self.children.append(child)
     for ch in self.children:
         ch.split()
     return
Exemplo n.º 8
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Every column is scored with ``Util.Information_Gain`` using
        ``self.getparententropy`` and ``self.getbranches``. The largest gain
        wins; ties go to the attribute with more distinct values, then to the
        lowest index. If there are no attributes, or the best gain is exactly
        0.0, the node is marked not splittable. Otherwise one child
        ``TreeNode`` is appended per distinct value of the chosen column (in
        sorted order, with that column removed). Children with no rows or no
        remaining attributes are marked not splittable; the others are split
        recursively.

        Side effects: sets ``splittable`` or ``dim_split`` /
        ``feature_uniq_split`` and appends to ``children``.
        """
        if len(self.features[0]) == 0:
            # Nothing left to split on; previously max() failed on the empty
            # gain list here.
            self.splittable = False
            return

        features = np.array(self.features)
        labels = np.array(self.labels)
        label_values = np.unique(labels)
        # The parent entropy depends only on the labels, so compute it once
        # (assumes getparententropy has no side effects).
        parent = self.getparententropy(self.labels)

        max_ig = float('-inf')
        local_feature = []
        ig_array = []
        for i in range(features.shape[1]):
            values = np.unique(features[:, i])
            branches = self.getbranches(values, i, self.features, self.labels,
                                        label_values)
            ig = Util.Information_Gain(parent, branches)
            ig_array.append(ig)
            # Columns are visited in ascending order, so a complete tie keeps
            # the earlier (lower) index without an explicit comparison.
            if ig > max_ig or (ig == max_ig
                               and len(values) > len(local_feature)):
                self.dim_split = i
                max_ig = ig
                local_feature = values.tolist()

        if max(ig_array) == 0.0:
            self.splittable = False
            return

        self.feature_uniq_split = local_feature
        feature_selected = features[:, self.dim_split]
        modf_feature = np.delete(features, self.dim_split, 1)
        for fs in np.sort(self.feature_uniq_split):
            indexes = np.where(fs == feature_selected)
            x_fs = modf_feature[indexes].tolist()
            l_fs = labels[indexes].tolist()
            child = TreeNode(x_fs, l_fs, self.num_cls)
            self.children.append(child)
            if len(x_fs) == 0 or len(x_fs[0]) == 0:
                child.splittable = False
        for child in self.children:
            if child.splittable:
                child.split()
        return
Exemplo n.º 9
0
 def split(self):
     """Split this node on the attribute with the highest information gain.

     The entropy of this node's labels is computed from ``np.bincount``.
     Each attribute is scored with ``Util.Information_Gain`` from the
     per-value label bincounts, and ``np.argmax`` picks the first attribute
     with the largest score. One child ``TreeNode`` is built per distinct
     value of that attribute (with the attribute removed from each row),
     split recursively if it reports ``splittable``, and appended to
     ``children``.

     Side effects: sets ``dim_split`` and ``feature_uniq_split`` and appends
     to ``children``.
     """
     # Entropy of the labels at this node.
     class_totals = np.bincount(self.labels)
     total = np.sum(class_totals)
     h = 0.0
     if total != 0:
         for k in class_totals:
             if k != 0:
                 h += -k/total*np.log2(k/total)

     # Information gain of every attribute.
     n_attributes = len(self.features[0])
     columns = np.transpose(self.features)
     IG = []
     for i in range(n_attributes):
         column = columns[i]
         per_value_counts = []
         for value in np.unique(column):
             matched = [
                 self.labels[row] for row, cell in enumerate(column)
                 if value == cell
             ]
             per_value_counts.append(np.bincount(matched).tolist())
         IG.append(Util.Information_Gain(h, per_value_counts))

     self.dim_split = np.argmax(IG)
     split_values = np.unique(columns[self.dim_split])
     self.feature_uniq_split = split_values.tolist()

     # One child per value of the chosen attribute.
     for value in split_values:
         feature = []
         label = []
         for row, row_label in zip(self.features, self.labels):
             if value == row[self.dim_split]:
                 trimmed = list(row)
                 del trimmed[self.dim_split]
                 feature.append(trimmed)
                 label.append(row_label)
         node = TreeNode(feature, label, len(np.unique(label)))
         if node.splittable:
             node.split()
         self.children.append(node)
     return
Exemplo n.º 10
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        The features are loaded into a DataFrame with an extra ``labels``
        column. If the frame is empty the node is marked not splittable.
        Otherwise each feature column is scored with ``Util.Information_Gain``
        from its (value x label) count table, and the first column with the
        strictly largest gain is chosen. One child ``TreeNode`` is appended
        per distinct value of that column (in sorted order, with the column
        removed). A child is marked not splittable when it has no remaining
        attributes or at most one sample; the others are split recursively.

        Side effects: sets ``splittable`` or ``dim_split`` /
        ``feature_uniq_split`` and appends to ``children``.
        """
        Sn = Util.entropy(self.labels)
        max_gain = -np.inf
        df = pd.DataFrame(self.features)
        if df.empty:
            self.splittable = False
            return

        df['labels'] = self.labels
        for col in df.drop(columns='labels'):
            # Rows: values of this column; columns: labels; cells: counts.
            branches = np.nan_to_num(
                df[[col, 'labels']].groupby(by=[col, 'labels']).size()
                .unstack().fillna(value=0).values)
            gain = Util.Information_Gain(Sn, branches.tolist())
            if gain > max_gain:
                max_gain = gain
                self.dim_split = col
                self.feature_uniq_split = sorted(df[col].unique().tolist())

        # Group by the scalar column label: grouping by a length-1 list makes
        # get_group() with a scalar key deprecated in recent pandas.
        split_df = df.groupby(by=self.dim_split)

        for feature_val in self.feature_uniq_split:
            if feature_val not in split_df.groups:
                continue

            child = split_df.get_group(feature_val).drop(
                columns=self.dim_split)
            child_features = child.drop(columns=['labels'])

            new_node = TreeNode(child_features.values.tolist(),
                                child['labels'].values.tolist(), self.num_cls)
            # The emptiness test used to be `values == []`, which is not an
            # emptiness check on an ndarray and raises on current NumPy when
            # the shapes cannot be broadcast.
            if child_features.empty or len(new_node.features) <= 1:
                new_node.splittable = False
            self.children.append(new_node)

        for child in self.children:
            if child.splittable:
                child.split()

        return
Exemplo n.º 11
0
 def calculate_information_gain(self, features_array, label_array, thisdict,
                                S):
     """Score every feature column and return the scores best-first.

     Args:
         features_array: 2-D NumPy array of samples (indexed as
             ``features_array[:, i]``).
         label_array: NumPy array of labels, boolean-masked per feature
             value.
         thisdict: mapping from a label to its position in a branch's
             count list.
         S: entropy value passed through to ``Util.Information_Gain``.

     Returns:
         list of ``(gain, unique_values, column_index)`` tuples sorted by
         descending gain, then descending number of unique values, then
         ascending column index.
     """
     scored = []
     for column in range(len(features_array[0])):
         values = np.unique(features_array[:, column])
         branches = []
         for value in values:
             labels_here = label_array[features_array[:, column] == value]
             # Per-class counts for this value; classes that do not occur
             # keep a count of zero.
             class_counts = [0] * len(thisdict)
             present, occurrences = np.unique(labels_here,
                                              return_counts=True)
             for element, times in zip(present, occurrences):
                 class_counts[thisdict[element]] = int(times)
             branches.append(class_counts)
         scored.append((Util.Information_Gain(S, branches), values, column))
     return sorted(scored,
                   key=lambda entry: (-entry[0], -len(entry[1]), entry[2]))
    def split(self):
        """Select the best feature, create the children and split them.

        Does nothing when the node is not splittable. Otherwise each feature
        column is scored with ``Util.Information_Gain`` from its
        (value x class) count table. The candidates are narrowed by
        ``Util.filter_ties`` after sorting by gain, narrowed again after
        sorting by the list of unique values, and finally sorted by column
        index. The first remaining candidate is handed to
        ``self.assign_selected_feature``. One child ``TreeNode`` is appended
        per value in ``self.feature_uniq_split`` (sorted in place), and
        ``split`` is called on every child.
        """
        if not self.splittable:
            return

        labels_split = attribute_split_count(self.labels)
        feat_transpose = Util.transpose_list(self.features)

        # Entropy of this node.
        S = Util.calc_entropy(labels_split, sum(labels_split))

        info_gain_all_features = []
        for col_index, feat_col in enumerate(feat_transpose):
            unique_feat_vals = np.unique(feat_col).tolist()
            # value -> {class -> count}, every count starting at zero.
            branches = {
                item: {cls: 0 for cls in self.classes}
                for item in unique_feat_vals
            }
            for feat_val, feat_label in zip(feat_col, self.labels):
                branches[feat_val][feat_label] += 1

            # Keep only the counts, as a 2-D list.
            branches_2d_array = [
                list(branch.values()) for branch in branches.values()
            ]
            info_gain_all_features.append(
                (Util.Information_Gain(S, branches_2d_array),
                 unique_feat_vals, col_index))
        info_gain_all_features.sort(key=lambda entry: entry[0], reverse=True)

        # Base case: no features left.
        if not info_gain_all_features:
            if self.dim_split is None:
                self.splittable = False
            return

        # Narrow down ties step by step, then take the lowest column index.
        info_gain_all_features = Util.filter_ties(info_gain_all_features)
        info_gain_all_features.sort(key=lambda entry: entry[1], reverse=True)
        info_gain_all_features = Util.filter_ties(info_gain_all_features)
        info_gain_all_features.sort(key=lambda entry: entry[2])

        self.assign_selected_feature(info_gain_all_features[0])

        # One child per value of the selected feature, in sorted order; the
        # selected column is removed from each child's rows.
        self.feature_uniq_split.sort()
        for feat_val_extract in self.feature_uniq_split:
            extract_feat = []
            extract_labels = []
            for row_feat, row_label in zip(self.features, self.labels):
                if feat_val_extract == row_feat[self.dim_split]:
                    trimmed = row_feat[:]
                    del trimmed[self.dim_split]
                    extract_feat.append(trimmed)
                    extract_labels.append(row_label)
            child = TreeNode(extract_feat, extract_labels,
                             np.unique(extract_labels).size)
            self.children.append(child)
        for node in self.children:
            node.split()
Exemplo n.º 13
0
    def split(self):
        """Split this node on the unused feature with the highest score.

        In this variant ``self.feature_uniq_split`` holds the indices of the
        features that have already been used; rows keep all their columns.
        If every feature has been used the node is marked not splittable.
        Otherwise each unused feature is scored with
        ``Util.Information_Gain``; the highest score wins and equal positive
        scores go to the feature with more branches. One child ``TreeNode``
        is created per value of the chosen feature and split recursively.

        Side effects: sets ``splittable``, ``dim_split`` and ``child_class``,
        appends to ``feature_uniq_split`` and ``children``, and prints
        debugging output.

        Raises:
            NotImplementedError: when no feature has a positive score.
        """
        max_score = 0
        best_child_list = []
        best_label_list = []
        best_class_list = []
        temp_split = -1
        # Every feature index has already been used on the path to this node.
        if len(self.features[0]) == len(self.feature_uniq_split):
            self.splittable = False
            return
        else:
            # find best feature to split
            # best_feature = self.features[0]

            # transpose
            transpose_feature = np.transpose(self.features)
            # feature index
            for feature_index in range(len(self.features[0])):
                # Skip features whose index is recorded as already used.
                can_split = True
                for already_split in self.feature_uniq_split:
                    if feature_index == already_split:
                        can_split = False
                if can_split:
                    # print('feature_index',len(self.features[0]))
                    # Per-branch rows, labels, class counts and branch values
                    # for this candidate feature.
                    temp_child_list = []
                    temp_label_list = []
                    temp_count_list = []
                    temp_class_list = []
                    # feature class
                    for feature_class in np.unique(
                            transpose_feature[feature_index]):
                        # child_num is assigned but never read.
                        child_num = np.unique(
                            transpose_feature[feature_index]).size
                        temp_child = []
                        temp_label = []
                        temp_count = []
                        temp_class = feature_class
                        # feature number
                        for feature_num in range(len(self.features)):
                            if self.features[feature_num][
                                    feature_index] == feature_class:
                                # The full row is kept; the used column is
                                # not removed.
                                temp_child.append(self.features[feature_num])
                                temp_label.append(self.labels[feature_num])

                        # count label number and calculate IG
                        for label_class in np.unique(self.labels):
                            count_temp_label_num = 0
                            for temp_label_index in range(len(temp_label)):
                                if temp_label[temp_label_index] == label_class:
                                    count_temp_label_num += 1
                            temp_count.append(count_temp_label_num)
                        # print('temp_count', temp_count)
                        temp_child_list.append(temp_child)
                        temp_label_list.append(temp_label)
                        temp_count_list.append(temp_count)
                        temp_class_list.append(temp_class)
                    # Per-class totals over all branches, i.e. the class
                    # counts of this node.
                    parentlist = []
                    # parent_score = 0
                    for child in range(len(temp_count_list[0])):
                        num = 0
                        for index in range(len(temp_count_list)):
                            num += temp_count_list[index][child]
                        parentlist.append(num)
                    # count_parent = sum(parentlist)
                    # Presumably Information_Gain(0, [counts]) returns minus
                    # the entropy of counts, so negating it yields the parent
                    # entropy; verify against Util.
                    parent_score = -1 * Util.Information_Gain(0, [parentlist])
                    # for j in range(len(parentlist)):
                    #     if parentlist[j] != 0:
                    #         parent_score -= (parentlist[j] / count_parent) * (np.log2((parentlist[j] / count_parent)))
                    score = Util.Information_Gain(parent_score,
                                                  temp_count_list)
                    # Treat tiny (and negative) scores as exactly zero.
                    if score < 0.000000000000001:
                        score = 0.0
                    # NOTE(review): debugging output left in the training path.
                    print(score > 0.0)
                    print('score', score, temp_label_list)
                    if score > max_score:
                        max_score = score
                        best_child_list = temp_child_list
                        best_label_list = temp_label_list
                        best_class_list = temp_class_list
                        temp_split = feature_index
                    elif score == max_score and score > 0:
                        # Tie on a positive score: prefer more branches.
                        if len(temp_label_list) > len(best_label_list):
                            best_child_list = temp_child_list
                            best_label_list = temp_label_list
                            best_class_list = temp_class_list
                            temp_split = feature_index
                            # self.dim_split = feature_index
            # self.dim_split = temp_split
            if max_score == 0:
                # NOTE(review): this raise makes the three statements below
                # unreachable, so a node with no positive score aborts
                # training instead of becoming a leaf. Confirm which
                # behavior is intended.
                raise NotImplementedError
                self.splittable = False
                self.cls_max = self.labels[0]
                return
            print('max_score', max_score, best_label_list)
            self.dim_split = temp_split
            self.feature_uniq_split.append(temp_split)
            self.child_class = best_class_list
            # end for loop

            for child_index in range(len(best_label_list)):
                child_num_cls = np.unique(best_label_list[child_index]).size
                if len(best_child_list[child_index]) == 0:
                    return
                else:
                    child_node = TreeNode(best_child_list[child_index],
                                          best_label_list[child_index],
                                          child_num_cls)
                    # NOTE(review): the child receives the same list object,
                    # not a copy, so indices appended while splitting one
                    # child are also seen by this node and by later siblings.
                    # Confirm this sharing is intended.
                    child_node.feature_uniq_split = self.feature_uniq_split
                if child_node.splittable:
                    child_node.split()
                self.children.append(child_node)
            return
Exemplo n.º 14
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Attribute columns are evaluated in ascending index order.  A candidate
        replaces the current best only when its gain is strictly higher, or
        equal with strictly more distinct values, so any remaining tie keeps
        the lowest column index.

        On success ``self.children``, ``self.dim_split`` and
        ``self.feature_uniq_split`` describe the chosen attribute and every
        splittable child is split recursively.  When there are no samples or
        no attribute columns the node is only marked unsplittable.  When the
        best gain is exactly ``0.0`` the node is marked unsplittable and the
        three split attributes are reset to ``[]`` / ``None`` / ``None``.
        """
        features = np.array(self.features)

        # No samples at all (1-D empty array) or no attribute columns left.
        if features.ndim < 2 or features.shape[1] == 0:
            self.splittable = False
            return

        class_values = np.unique(self.labels)

        # The node entropy is obtained by negating Information_Gain(0, [counts]).
        # NOTE(review): this relies on Util.Information_Gain returning
        # "S minus weighted branch entropy" -- confirm against Util.
        root_counts = [self.labels.count(label) for label in class_values]
        entropy_root = -1 * Util.Information_Gain(0, [root_counts])

        max_info_gain = -1
        max_unique_values = 0

        for attr_number in range(features.shape[1]):
            # np.unique already returns the distinct values sorted.
            unique_values = np.unique(features[:, attr_number])
            branches = []
            children = []

            for unique_value in unique_values:
                branch_feat = []
                branch_label = []
                for pos, feature in enumerate(features):
                    if feature[attr_number] == unique_value:
                        # The child no longer sees the attribute it was split on.
                        branch_feat.append(np.delete(feature, attr_number))
                        branch_label.append(self.labels[pos])

                branches.append(
                    [branch_label.count(label) for label in class_values])

                # Candidate children are built for every attribute; only the
                # set belonging to the winning attribute is kept on the node.
                child = TreeNode(branch_feat, branch_label,
                                 np.unique(branch_label).size)
                child.parent = self
                children.append(child)

            info_gain = Util.Information_Gain(entropy_root, branches)

            # Ties on gain are broken by the larger number of distinct values;
            # a tie on both keeps the earlier (lower-index) attribute.
            if info_gain > max_info_gain or (
                    info_gain == max_info_gain
                    and unique_values.size > max_unique_values):
                max_info_gain = info_gain
                max_unique_values = unique_values.size
                self.children = children
                self.dim_split = attr_number
                self.feature_uniq_split = unique_values

        if max_info_gain == 0.0:
            # No attribute separates the classes: undo the tentative split.
            self.children = []
            self.dim_split = None
            self.feature_uniq_split = None
            self.splittable = False
            return

        for child in self.children:
            if child.splittable is True:
                child.split()
Exemplo n.º 15
0
 def split(self):
     """Split this node on the attribute with the highest information gain.

     The gain of every attribute column is computed against this node's
     entropy.  Among the columns sharing the maximum gain, the first one
     with the largest number of distinct values is chosen.  One child is
     created per distinct value (in sorted order) with the chosen column
     removed, and every splittable child is split recursively.

     If there is no attribute column, or the best gain is ``<= -1``, the
     node is marked unsplittable and ``dim_split`` / ``feature_uniq_split``
     are set to ``None``.
     """
     # Entropy of the label distribution at this node.
     _, class_counts = np.unique(self.labels, return_counts=True)
     class_probs = class_counts / len(self.labels)
     root_entropy = sum(-1 * p * np.log2(p) for p in class_probs)

     gains = []
     value_counts = []  # [number of distinct values, column index]
     value_lists = []   # distinct values of each column, as plain lists

     for col in range(len(self.features[0])):
         column = np.array(self.features)[:, col]
         distinct = np.unique(column)

         # Row index = attribute value, column index = class label.
         row_of = {value: pos for pos, value in enumerate(distinct)}
         col_of = {
             label: pos
             for pos, label in enumerate(np.unique(self.labels))
         }
         counts = [[0] * self.num_cls for _ in range(len(distinct))]
         for i in range(len(column)):
             counts[row_of[column[i]]][col_of[self.labels[i]]] += 1

         gain = Util.Information_Gain(root_entropy, counts)
         value_counts.append([len(distinct), col])
         gains.append(gain)
         value_lists.append(distinct.tolist())

     if not gains:
         self.dim_split = None
         self.feature_uniq_split = None
         self.splittable = False
         return

     # Among the columns with maximal gain, prefer the one with the most
     # distinct values; the strict comparison keeps the first on a tie.
     top_gain = max(gains)
     chosen = -1
     most_values = -1
     split_col = -1
     split_values = []
     for pos, gain in enumerate(gains):
         if gain == top_gain and most_values < value_counts[pos][0]:
             chosen = pos
             most_values, split_col = value_counts[pos]
             split_values = value_lists[pos]

     best_gain = gains[chosen]
     self.dim_split = split_col
     self.feature_uniq_split = split_values
     if best_gain <= -1:
         self.dim_split = None
         self.feature_uniq_split = None
         self.splittable = False
         return

     print("feature_uniq_split == ", self.feature_uniq_split)
     self.feature_uniq_split.sort()
     if not self.feature_uniq_split:
         return

     # One child per distinct value of the chosen column.
     for val in self.feature_uniq_split:
         matching = [(row, self.labels[idx])
                     for idx, row in enumerate(self.features)
                     if row[self.dim_split] == val]
         labels_new = [label for _, label in matching]
         features_new = np.delete([row for row, _ in matching],
                                  self.dim_split,
                                  axis=1)
         distinct_labels = np.unique(labels_new)
         self.children.append(
             TreeNode(features_new.tolist(), labels_new,
                      len(distinct_labels)))

     for child in self.children:
         if child.splittable:
             child.split()
     return
Exemplo n.º 16
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Ties on gain are broken in favour of the attribute with more distinct
        values.  ``self.dim_split`` and ``self.feature_uniq_split`` are set to
        the chosen column and its sorted distinct values.

        The node is marked unsplittable, and no children are created, when it
        has no samples, no attribute columns, or when the best gain is zero.
        Otherwise one child is created per distinct value (chosen column
        removed) and every splittable child is split recursively.
        """
        # Nothing to split on: no samples or no attribute columns left.
        if len(self.features) == 0 or len(self.features[0]) == 0:
            self.splittable = False
            return

        features = np.array(self.features)
        labels = np.array(self.labels)

        unique_labels, label_counts = np.unique(labels, return_counts=True)
        label_pos = {label: pos for pos, label in enumerate(unique_labels)}

        # Entropy of this node; it does not depend on the candidate column.
        total = len(self.labels)
        S = 0
        for count in label_counts:
            S += (-1) * (count / total * np.log2(count / total))

        max_IG = 0
        selected_index = None
        unique_feature_split = np.array([])
        for k in range(len(self.features[0])):
            column = features[:, k]
            unique_features = np.unique(column)
            feature_pos = {
                value: pos for pos, value in enumerate(unique_features)
            }

            # branches[v][c] = number of samples with value v and class c
            branches = [[0] * len(unique_labels) for _ in unique_features]
            for value, label in zip(column, labels):
                branches[feature_pos[value]][label_pos[label]] += 1

            IG = Util.Information_Gain(S, branches)

            if IG > max_IG or (IG == max_IG and len(unique_features) >
                               len(unique_feature_split)):
                max_IG = IG
                unique_feature_split = unique_features
                selected_index = k

        self.dim_split = selected_index
        self.feature_uniq_split = unique_feature_split.tolist()

        if len(self.feature_uniq_split) == 0 or max_IG == 0:
            # No useful attribute: stop here instead of building children
            # under a node that has just been declared a leaf.
            self.splittable = False
            return

        for value in self.feature_uniq_split:
            mask = features[:, selected_index] == value
            child_features = np.delete(features[mask], selected_index,
                                       axis=1).tolist()
            child_labels = labels[mask].tolist()
            # The third argument is the number of distinct classes in the
            # child, not the number of samples.
            child = TreeNode(child_features, child_labels,
                             len(set(child_labels)))
            self.children.append(child)

        for child in self.children:
            if child.splittable:
                child.split()
Exemplo n.º 17
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Does nothing when the node is already marked unsplittable.  A column
        becomes the new best when its gain exceeds the current best by more
        than ``1e-5``; on an exactly equal gain the column with more distinct
        values wins.  ``self.dim_split`` and ``self.feature_uniq_split`` are
        updated while scanning, even if the node ends up unsplittable.

        The node is marked unsplittable when it has no samples, when the best
        gain is below ``1e-5``, when it holds a single class, or when no
        split values were recorded.  Otherwise one child is built per
        distinct value of the chosen column, with that column removed, and
        splittable children are split recursively.
        """
        if not self.splittable:
            return

        # Guard before any indexing into self.features.
        if self.features is None or len(self.features) == 0:
            self.splittable = False
            return

        features = np.array(self.features)
        n = len(self.features[0])

        # Entropy of the label distribution at this node.
        label_total = len(self.labels)
        baseEn = 0.0
        for label in set(self.labels):
            a = float(self.labels.count(label)) / float(label_total)
            if a > 0:
                baseEn += -(a * np.log2(a))

        bestIG = -1.0
        bestcc = 0
        for i in range(n):
            column = [row[i] for row in self.features]
            distinct = list(set(column))
            cc = len(distinct)

            branchs = []
            for attr in distinct:
                # Per-class sample counts for this attribute value, padded
                # with zeros up to num_cls entries.
                tally = {}
                for value, label in zip(column, self.labels):
                    if value == attr:
                        tally[label] = tally.get(label, 0) + 1
                counts = list(tally.values())
                counts.extend([0] * (self.num_cls - len(counts)))
                branchs.append(counts)

            Info = Util.Information_Gain(baseEn, branchs)

            if Info - bestIG > 1e-5:
                bestcc = cc
                bestIG = Info
                self.dim_split = i
                self.feature_uniq_split = np.unique(features[:, i]).tolist()
            elif Info == bestIG and cc > bestcc:
                bestcc = cc
                self.dim_split = i
                self.feature_uniq_split = np.unique(features[:, i]).tolist()

        if bestIG < 1e-5:
            self.splittable = False
            return

        if self.num_cls == 1:
            self.splittable = False
            return

        if self.feature_uniq_split is None:
            self.splittable = False
            return

        c = self.dim_split
        # With a single attribute column the children have nothing left to
        # split on.
        last_attribute = (n == 1)
        for m in self.feature_uniq_split:
            res = []
            l = []
            for i in range(len(features)):
                if m == features[i][c]:
                    l.append(self.labels[i])
                    a = list(features[i])
                    # Remove the split column by position.  Removing by value
                    # would drop the wrong cell whenever an earlier column
                    # holds the same value.
                    del a[c]
                    res.append(a)

            child = TreeNode(res, l, len(set(l)))
            if last_attribute or len(set(l)) == 1:
                child.splittable = False
            self.children.append(child)

        for child in self.children:
            if child.splittable:
                child.split()
        return
Exemplo n.º 18
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Computes the entropy of ``self.labels``, then the information gain of
        every attribute column via ``Util.Information_Gain``.  If there is no
        column, or the maximum gain is 0, the node is marked unsplittable and
        ``dim_split`` / ``feature_uniq_split`` are set to ``None``.  Otherwise
        the first column with maximum gain and the most distinct values is
        chosen, one child ``TreeNode`` is appended per distinct value, and
        every splittable child is split recursively.

        Intermediate values are printed to stdout along the way.
        """
        self.feature_uniq_split = []
        S = 0  # entropy of this node's labels, accumulated below
        np_features = np.array(self.features)
        # One row per attribute column.
        features_T = np.transpose(np_features)
        np_labels = np.array(self.labels)
        print("features transpose", features_T)
        print("labels", np_labels)

        unique_labels = list(np.unique(np_labels))

        # Entropy: S = -sum(p * log2(p)) over the distinct labels.
        for label in np.unique(self.labels):
            p_of_label = self.labels.count(label) / len(self.labels)
            S -= p_of_label * np.log2(p_of_label)
        print(S)

        # Information gain of each attribute, indexed by column.
        list_info_gain = []
        for i in range(len(features_T)):
            attribute_value = list(np.unique(features_T[i]))
            current_attribute = features_T[i]
            # branches[idx][j]: number of samples whose attribute equals
            # attribute_value[idx] and whose label equals unique_labels[j].
            branches = []
            for idx in range(len(attribute_value)):
                count_of_labels = []
                # NOTE(review): indexes unique_labels with range(self.num_cls),
                # so this assumes num_cls <= number of distinct labels -- confirm.
                for j in range(self.num_cls):
                    counter = 0
                    for attr_val, k in zip(current_attribute, np_labels):
                        if attr_val == attribute_value[
                                idx] and k == unique_labels[j]:
                            counter += 1
                    count_of_labels.append(counter)
                branches.append(count_of_labels)
            print(branches)
            list_info_gain.append(Util.Information_Gain(S, branches))
        print(list_info_gain)
        # No attribute columns at all.
        if (list_info_gain == []):
            self.dim_split = None
            self.feature_uniq_split = None
            self.splittable = False
            return

        # Best attribute yields a gain of exactly 0.
        if (max(list_info_gain) == 0):
            self.dim_split = None
            self.feature_uniq_split = None
            self.splittable = False
            return

        # Provisional choice; overwritten after the tie-break below.
        selected_attr_to_split = list_info_gain.index(max(list_info_gain))

        list_max_info_gain = []  # column indices sharing the maximum gain
        count_attr_clash = []  # number of distinct values of each of them

        for index, info_gain_value in enumerate(list_info_gain):
            if (info_gain_value == max(list_info_gain)):
                list_max_info_gain.append(index)

        for index in list_max_info_gain:
            count_attr_clash.append(np.unique(features_T[index]).size)

        # list.index returns the first maximum, so among tied columns the one
        # with the most distinct values and the lowest index is selected.
        selected_attr_clash = count_attr_clash.index(max(count_attr_clash))
        selected_attr_to_split = list_max_info_gain[selected_attr_clash]

        print(selected_attr_to_split)
        self.dim_split = selected_attr_to_split
        # np.unique returns the distinct values in sorted order.
        self.feature_uniq_split = list(np.unique(features_T[self.dim_split]))
        print("dim split", self.dim_split)
        print("feature unique split", self.feature_uniq_split)
        # Build one child per distinct value of the chosen attribute.
        for selected_attr_value in self.feature_uniq_split:
            new_features = []
            new_labels = []
            print(selected_attr_value)
            print(np_features.shape[0])
            for i in range(np_features.shape[0]):
                if (np_features[i][self.dim_split] == selected_attr_value):
                    # Row without the split column; the slices are joined
                    # with "+", so rows are presumably plain lists.
                    new_features.append(self.features[i][0:self.dim_split] +
                                        self.features[i][self.dim_split + 1:])
                    new_labels.append(self.labels[i])
                # Runs on every iteration and drops falsy (empty) rows: when
                # the split column was the only one, every reduced row is []
                # and the child receives an empty feature list while its
                # labels are kept.
                new_features = [x for x in new_features if x]

            new_num_classes = np.unique(new_labels).size
            print(new_features, " size is ", len(new_features))
            print(new_features, " these are new features")
            print(new_labels, " size is", len(new_labels))
            print(new_labels, " these are new labels")
            self.children.append(
                TreeNode(new_features, new_labels, new_num_classes))

        # Recurse into the children that can still be split.
        for child in self.children:
            if (child.splittable):
                child.split()
def information_gain_test():
    """Print the gain computed for the sample branch data next to a reference value."""
    sample_branches = data.sample_branch_data()
    print('Your information gain: ', Utils.Information_Gain(0, sample_branches))
    print('My information gain: ', -0.91829583405448956)
    def split(self):
        """Split this node on the best attribute that has not been used yet.

        Does nothing when the node is not splittable.  Attributes listed in
        ``self.used_attr`` are skipped; among the rest the one with the
        highest information gain wins, ties going to the attribute with more
        distinct values.  Children keep the full feature rows and record the
        consumed attribute in their own ``used_attr``; a child that has used
        every attribute is marked unsplittable.  Splittable children are then
        split recursively.
        """
        if not self.splittable:
            return

        # Entropy of the label distribution at this node.
        node_entropy = 0
        for cls in np.unique(self.labels):
            fraction = self.labels.count(cls) / len(self.labels)
            node_entropy += -fraction * np.log2(fraction)

        class_values = np.unique(self.labels)
        candidate_cols = [
            col for col in range(len(self.features[0]))
            if col not in self.used_attr
        ]

        feature_matrix = np.array(self.features)
        label_array = np.array(self.labels)

        best_gain = -1
        best_width = -1
        for col in candidate_cols:
            column = feature_matrix[:, col]
            distinct = np.unique(column)

            # One row of per-class counts for each distinct attribute value.
            counts = []
            for value in distinct:
                members = label_array[np.where(column == value)].tolist()
                counts.append([members.count(cls) for cls in class_values])

            gain = Util.Information_Gain(node_entropy, counts)
            if gain > best_gain or \
                    (gain == best_gain and len(distinct) > best_width):
                best_gain = gain
                self.dim_split = col
                best_width = len(distinct)

        # Column of the chosen attribute and its sorted distinct values.
        split_column = feature_matrix[:, self.dim_split]
        self.feature_uniq_split = np.unique(split_column).tolist()

        for value in np.unique(split_column):
            row_ids = np.where(feature_matrix[:, self.dim_split] == value)[0]
            child_features = [self.features[row] for row in row_ids]
            child_labels = [self.labels[row] for row in row_ids]

            child = TreeNode(child_features, child_labels,
                             np.unique(child_labels).size)
            child.used_attr.extend(self.used_attr)
            child.used_attr.append(self.dim_split)
            # Every attribute consumed: nothing left to split on.
            if len(child.used_attr) == len(self.features[0]):
                child.splittable = False
            self.children.append(child)

        for child in self.children:
            if child.splittable:
                child.split()

        return
Exemplo n.º 21
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Each attribute column is scored with ``Util.Information_Gain`` against
        the entropy of this node's labels.  The first column with the largest
        strictly positive gain is chosen; ``self.dim_split`` is its index and
        ``self.feature_uniq_split`` its sorted distinct values.  One child is
        appended per distinct value, built from copies of the matching rows
        with the chosen column removed, and splittable children are split
        recursively.

        If no column has a positive gain (including the case of no columns or
        no samples) the node is marked unsplittable and nothing else changes.
        """
        featuresT = np.transpose(self.features)
        class_values = np.unique(self.labels)

        # Entropy of this node; identical for every candidate column.
        num_examples = len(self.labels)
        p_ent = 0
        for label in class_values:
            fraction = float(self.labels.count(label)) / num_examples
            p_ent += -fraction * np.log2(fraction)

        max_gain = 0
        selected_feature_index = None
        selected_values = None
        for column_index, column in enumerate(featuresT):
            values = np.unique(column)
            print("Type of values is ", type(values))
            print("Num of unique values: ", len(values))  # child nodes
            print(values)  # attribute values

            # branches[value][label] = number of samples with that pair
            branches = {}
            for value, label in zip(column, self.labels):
                per_label = branches.setdefault(value, {})
                per_label[label] = per_label.get(label, 0) + 1

            # Count against the labels actually present at this node rather
            # than assuming they are the integers 0..num_cls-1.
            branchesList = [[per_label.get(label, 0) for label in class_values]
                            for per_label in branches.values()]

            inf_gain = Util.Information_Gain(p_ent, branchesList)
            if inf_gain > max_gain:
                max_gain = inf_gain
                # Remember the values of the winning column itself, not of
                # whichever column happens to be scanned last.
                selected_feature_index = column_index
                selected_values = values

        if selected_feature_index is None:
            # No attribute improves on the current node.
            self.splittable = False
            return

        self.dim_split = selected_feature_index
        self.feature_uniq_split = np.sort(selected_values)

        split_column = featuresT[selected_feature_index]
        for attribute in selected_values:
            attribute_label = []
            featureList = []
            for row, label, cell in zip(self.features, self.labels,
                                        split_column):
                if cell == attribute:
                    attribute_label.append(label)
                    # Work on a copy so this node's own rows stay intact.
                    reduced_row = list(row)
                    del reduced_row[selected_feature_index]
                    featureList.append(reduced_row)
            num_cls = np.unique(attribute_label).size
            self.children.append(
                TreeNode(featureList, attribute_label, num_cls))

        for child in self.children:
            if child.splittable:
                child.split()
Exemplo n.º 22
0
    def split(self):
        """Split this node on the attribute with the highest information gain.

        Does nothing when the node is already unsplittable.  A node without
        samples or without attribute columns is marked unsplittable.  Ties on
        gain go to the attribute with more distinct values.  If the gains of
        all attributes sum to zero the node is marked unsplittable.

        Otherwise one child is appended per distinct value of the chosen
        column, with that column removed.  A child left without attribute
        columns is marked unsplittable and gets its majority label as
        ``cls_max``.  ``split`` is then called on each of the first
        ``len(feature_uniq_split)`` children.
        """
        if not self.splittable:
            return

        # Check for "no samples" before touching self.features[0].
        if len(self.features) == 0 or len(self.features[0]) == 0:
            self.splittable = False
            return

        labels = self.labels
        unique_labels = np.unique(labels)
        dict_labels = {label: pos for pos, label in enumerate(unique_labels)}

        # Entropy of the label distribution at this node.
        S = 0.0
        for label in unique_labels:
            frac = labels.count(label) / len(labels)
            S -= frac * np.log2(frac)

        igmax = -1
        more_attributes = -1
        igSum = 0.0
        for i in range(len(self.features[0])):
            feature_column = [row[i] for row in self.features]
            unique_features = np.unique(feature_column)
            dict_features = {
                value: pos for pos, value in enumerate(unique_features)
            }

            # branches[v][c] = number of samples with value v and class c
            branches = [[0] * len(unique_labels) for _ in unique_features]
            for value, label in zip(feature_column, labels):
                if value in dict_features:
                    branches[dict_features[value]][dict_labels[label]] += 1

            ig = Util.Information_Gain(S, branches)
            igSum += ig

            if ig > igmax or (ig == igmax
                              and len(unique_features) > more_attributes):
                igmax = ig
                self.dim_split = i
                self.feature_uniq_split = unique_features
                more_attributes = len(unique_features)

        if igSum == 0:
            self.splittable = False
            return

        feature_column = [row[self.dim_split] for row in self.features]
        unique_features = self.feature_uniq_split

        for value in unique_features:
            new_features_list = []
            new_labels_list = []
            for row, label, cell in zip(self.features, self.labels,
                                        feature_column):
                if value == cell:
                    new_features_list.append(row)
                    new_labels_list.append(label)

            # Drop the column that was just used for the split.
            new_features_list = np.delete(np.asarray(new_features_list),
                                          self.dim_split,
                                          axis=1).tolist()

            child = TreeNode(new_features_list, new_labels_list,
                             len(np.unique(new_labels_list)))
            if len(new_features_list) == 0:
                child.cls_max = self.cls_max
                child.splittable = False
            elif len(new_features_list[0]) == 0:
                # No attributes left: the child predicts its majority label
                # (the first one in sorted order on a tie).
                count_max = 0
                for label in np.unique(new_labels_list):
                    if new_labels_list.count(label) > count_max:
                        count_max = new_labels_list.count(label)
                        child.cls_max = label
                child.splittable = False
            self.children.append(child)

        # Each child's split() returns immediately if it is unsplittable.
        for i in range(len(unique_features)):
            self.children[i].split()
Exemplo n.º 23
0
    def split(self):
        """Choose the attribute to split on and recursively build the children.

        Returns immediately when the node is not splittable or when no
        attribute columns remain.  Otherwise every remaining attribute is
        scored with ``Util.Information_Gain``; the highest score wins, ties
        go to the attribute with the most distinct values and then to the
        lowest column index.  One child ``TreeNode`` is created per distinct
        value of the chosen attribute (ascending order), with that column
        removed from its rows, and ``split()`` is called on each child.

        Side effects: sets ``self.dim_split`` and appends, in parallel, to
        ``self.children`` and ``self.feature_uniq_split``.
        """
        if not self.splittable:
            # Nothing to do for a node that has been marked unsplittable.
            return

        if len(self.features[0]) == 0:
            # No attribute columns are left, whatever num_cls is.
            return

        # Row i of `transposed` holds every sample's value for attribute i.
        transposed = np.array(self.features).transpose()
        # np.unique already returns sorted, de-duplicated values, so no
        # separate sort is needed.
        candidate_value_list = [np.unique(column) for column in transposed]

        # Fixed class order so every branch reports its counts the same way.
        class_labels = sorted(np.unique(self.labels))

        gains = []
        for attr_index, attr_values in enumerate(candidate_value_list):
            branches = []
            for current_value in attr_values:
                class_counts = dict.fromkeys(class_labels, 0)
                for row, label in zip(self.features, self.labels):
                    if row[attr_index] == current_value:
                        class_counts[label] += 1
                branches.append(list(class_counts.values()))
            gains.append(
                Util.Information_Gain(self.entropy_root(), branches))

        # Columns tied for the best gain, in ascending index order.  The
        # maximum is computed once instead of once per comparison.
        tied = []
        if gains:
            best_gain = max(gains)
            tied = [i for i, gain in enumerate(gains) if gain == best_gain]

        if tied:
            # max() returns the first maximal element, so among tied columns
            # the one with the most distinct values wins and the lowest index
            # breaks any remaining tie.
            self.dim_split = max(
                tied, key=lambda i: len(candidate_value_list[i]))

        # One child per distinct value of the chosen column, smallest first.
        feature_values = sorted(np.unique(transposed[self.dim_split]))
        for value_index, cur_value in enumerate(feature_values):
            children_features = []
            children_labels = []
            for row, label in zip(self.features, self.labels):
                if row[self.dim_split] == cur_value:
                    # Children never see the column they were split on.
                    remaining = list(row)
                    remaining.pop(self.dim_split)
                    children_features.append(remaining)
                    children_labels.append(label)

            # The child's index is the parent's index extended by the
            # position of its branch.
            new_index = self.index.copy()
            new_index.append(value_index)

            child = TreeNode(children_features, children_labels,
                             len(np.unique(children_labels)), new_index)
            child.split()

            self.children.append(child)
            self.feature_uniq_split.append(cur_value)
Exemplo n.º 24
0
    def split(self):
        """Split this node on its best attribute and return the node itself.

        ``self.features`` is converted to a numpy array in place.  The node
        is marked unsplittable (and returned unchanged) when it holds a
        single class, when there is no feature data, or when every attribute
        has zero information gain.  Otherwise attributes are ranked by
        (information gain, number of distinct values, lowest column index),
        ``self.dim_split`` is set to the winner, and one child is created per
        distinct value of that column with the column removed.  Each child
        has its ``feature_uniq_split`` set to the value that leads to it and
        is split recursively.

        Returns:
            ``self``, so the call can be chained when appending children.
        """
        # NOTE(review): "lables" looks like a typo for "labels".  The
        # assignment is kept so the attribute still exists for any code that
        # reads it; self.labels itself is left untouched.
        self.lables = np.array(self.labels)
        self.features = np.array(self.features)

        if self.num_cls == 1:
            self.splittable = False
            return self
        if self.features.size == 0:
            self.splittable = False
            return self

        feat = self.features
        lab = self.labels
        total_label = len(lab)

        # Entropy of the labels at this node.  np.unique only reports labels
        # that occur, so every fraction is strictly positive.
        class_labels, class_counts = np.unique(lab, return_counts=True)
        entropy = 0
        for count in class_counts:
            fraction = count / total_label
            entropy += -fraction * np.log2(fraction)

        # One [gain, distinct-value count, column index] entry per attribute.
        candidates = []
        for i in range(feat.shape[1]):
            column = feat[:, i]
            column_values = np.unique(column)

            branches = []
            for value in column_values:
                branch_labels = Counter(
                    lab[index] for index in np.where(column == value)[0])
                # Counter returns 0 for classes missing from this branch.
                branches.append(
                    [branch_labels[label] for label in class_labels])

            gain = Util.Information_Gain(entropy, branches)
            candidates.append([gain, column_values.size, i])

        # Highest gain first, then most distinct values, then lowest index.
        candidates.sort(key=lambda c: (c[0], c[1], -c[2]), reverse=True)
        self.dim_split = candidates[0][2]

        if all(candidate[0] == 0 for candidate in candidates):
            self.splittable = False
            return self

        split_column = feat[:, self.dim_split]
        for value in np.unique(split_column):
            mask = split_column == value

            # Rows that take this branch, minus the column just consumed.
            child_features = np.delete(feat[mask].tolist(), self.dim_split,
                                       axis=1)
            child_labels = [lab[index] for index in np.where(mask)[0]]

            tree_node = TreeNode(child_features, child_labels,
                                 np.unique(child_labels).size)
            tree_node.feature_uniq_split = value
            self.children.append(tree_node.split())

        return self
import data
import hw1_dt as decision_tree
import utils as Utils
from sklearn.metrics import accuracy_score
import numpy as np

# Smoke test for the entropy / information-gain helpers on hand-made counts.
# `root` and each row of `branches` look like per-class sample counts --
# TODO confirm against the Utils implementation.
root = [8, 12]
branches = [[5, 2], [3, 10]]
# NOTE: despite the name and the "IG root" label printed below, igRoot holds
# the value returned by Utils.get_entropy(root).
igRoot = Utils.get_entropy(root)
print("IG root", igRoot)
print("IG branches", Utils.Information_Gain(igRoot, branches))

# Load the sample training data and echo it.
features, labels = data.sample_decision_tree_data()
print(features)
print(labels)

#
# Load the sample test data and echo it.
X_test, y_test = data.sample_decision_tree_test()
print(X_test)
print(y_test)

# Build the tree from the training data.
dTree = decision_tree.DecisionTree()
dTree.train(features, labels)
# Print the trained tree.
Utils.print_tree(dTree)

# Predict on the test set.
# NOTE(review): the predictions are stored but never compared with y_test in
# the visible code; accuracy_score is imported above but not used here.
y_est_test = dTree.predict(X_test)
Exemplo n.º 26
0
    def split(self):
        """Pick the highest-gain attribute, create one child per value, recurse.

        Attributes are scored with ``Util.Information_Gain`` against the
        entropy of this node's labels.  On equal gain the attribute with more
        distinct values replaces the current choice.  ``self.dim_split`` and
        ``self.feature_uniq_split`` record the choice; children are appended
        to ``self.children`` and every splittable child is split in turn.
        """
        # Entropy of the labels reaching this node.
        _, class_counts = np.unique(self.labels, return_counts=True)
        sample_count = np.sum([int(count) for count in class_counts])
        fractions = [int(count) / sample_count for count in class_counts]
        entropy_parent = np.sum([-1 * p * np.log2(p) for p in fractions])

        # Class order shared by every branch-count table.
        unique_labels = np.unique(self.labels)
        class_order = list(unique_labels)

        max_gain = -1
        for column in range(len(self.features[0])):
            column_values = [row[column] for row in self.features]
            split_unique = np.unique(column_values)
            value_order = list(split_unique)

            # branches[v][c] = number of samples with value v and class c.
            branches = np.zeros((len(split_unique), len(unique_labels)),
                                dtype='int')
            for position, value in enumerate(column_values):
                value_row = value_order.index(value)
                class_col = class_order.index(self.labels[position])
                branches[value_row][class_col] += 1

            gain = Util.Information_Gain(entropy_parent, branches)

            if gain > max_gain:
                max_gain = gain
                self.dim_split = column
                self.feature_uniq_split = split_unique
            elif gain == max_gain:
                # Tie on gain: prefer the attribute with more distinct values.
                if len(split_unique) > len(self.feature_uniq_split):
                    self.dim_split = column
                    self.feature_uniq_split = split_unique

        # Build one child per distinct value of the chosen attribute.
        chosen = self.dim_split
        for branch_value in self.feature_uniq_split:
            sub_data = []
            sub_label = []
            for position, row in enumerate(self.features):
                if row[chosen] == branch_value:
                    # Drop the consumed column from the child's rows.
                    sub_data.append(row[:chosen] + row[chosen + 1:])
                    sub_label.append(self.labels[position])

            child = TreeNode(sub_data, sub_label, self.num_cls)

            # True when the child's first row has no entries at all, or only
            # None entries.
            if all(entry == None for entry in child.features[0]):
                child.splittable = False

            self.children.append(child)

        # Recurse only after all children exist.
        for child in self.children:
            if child.splittable:
                child.split()
        return
Exemplo n.º 27
0
    def split(self):
        """Split on the attribute with the highest information gain and recurse.

        Columns that contain ``None`` are treated as already consumed and are
        skipped.  Ties on gain go to the column with more distinct values.
        The chosen column is blanked out with ``None`` (not removed) in the
        rows handed to the children, so column indices stay stable down the
        tree.  A child whose rows are empty or entirely ``None`` is marked
        unsplittable; every other splittable child is split recursively.

        Labels are used directly as column indices into a
        ``(n_values, num_cls + 1)`` count table, so this assumes labels are
        non-negative integers no larger than ``num_cls`` -- TODO confirm.
        """
        features = np.array(self.features)
        labels = np.array(self.labels)

        # Entropy of this node's labels.  It does not depend on the column
        # being scored, so it is computed once up front.
        total_entropy = 0
        for c in np.unique(self.labels):
            p = float(np.count_nonzero(labels == c)) / len(self.labels)
            total_entropy += p * np.log2(1 / p)

        max_gain = -9999
        best_branch_count = -1

        for current_dim in range(len(self.features[0])):
            current_x = features[:, current_dim]
            if None in current_x:
                # Column was consumed by an ancestor split.
                continue
            branch_values = np.unique(current_x)

            # branches[v, y] = number of samples with value v and label y.
            branches = np.zeros((len(branch_values), self.num_cls + 1))
            for i, val in enumerate(branch_values):
                for current_y in labels[np.where(current_x == val)]:
                    branches[i, current_y] += 1

            gain = Util.Information_Gain(total_entropy, branches)

            is_better = gain > max_gain
            is_wider_tie = (gain == max_gain
                            and branch_values.shape[0] > best_branch_count)
            if is_better or is_wider_tie:
                max_gain = gain
                self.dim_split = current_dim
                self.feature_uniq_split = branch_values.tolist()
                best_branch_count = branch_values.shape[0]

        current_x = features[:, self.dim_split]
        # Object dtype so the consumed column can hold None.
        x = np.array(self.features, dtype=object)
        x[:, self.dim_split] = None

        for value in self.feature_uniq_split:
            index = np.where(current_x == value)
            x_child = x[index].tolist()
            y_child = labels[index].tolist()
            child = TreeNode(x_child, y_child, self.num_cls)
            if np.array(x_child).size == 0 or all(entry is None
                                                  for entry in x_child[0]):
                child.splittable = False
            self.children.append(child)

        for child in self.children:
            if child.splittable:
                child.split()

        return
Exemplo n.º 28
0
    def split(self):
        """Split on the best attribute and recurse into the children.

        The node is marked unsplittable when features or labels are missing,
        when there are no rows or no columns, or when no column has more than
        one distinct value.  Otherwise each column with at least two distinct
        values is scored with ``Util.Information_Gain``; the highest gain
        wins, ties go to the column with more distinct values, and remaining
        ties keep the lower column index.  ``self.dim_split`` and
        ``self.feature_uniq_split`` record the choice.  One child is appended
        per distinct value (ascending), with the chosen column deleted, and
        every splittable child is split in turn.
        """
        if self.features is None or self.labels is None:
            self.splittable = False
            return

        # Check for "no rows" before touching features[0]; the reverse order
        # raises IndexError on an empty feature list.
        if len(self.features) == 0 or len(self.features[0]) == 0:
            self.splittable = False
            return

        num_columns = len(self.features[0])
        total_labels = len(self.labels)

        # Entropy of the labels at this node.  np.unique returns the classes
        # in sorted order together with their (non-zero) counts.
        class_labels, class_counts = np.unique(np.array(self.labels),
                                               return_counts=True)
        entropy = 0
        for class_count in class_counts:
            fraction = class_count / total_labels
            entropy = entropy - fraction * np.log2(fraction)

        columns = np.array(self.features)
        max_gain = -np.inf

        for i in range(num_columns):
            # np.unique yields the column's distinct values in sorted order.
            attr_values = np.unique(columns[:, i])
            if len(attr_values) == 1:
                # A constant column cannot separate anything.
                continue

            # One row of per-class counts for each distinct value.
            branches = []
            for value in attr_values:
                branch_labels = [
                    label
                    for row, label in zip(self.features, self.labels)
                    if row[i] == value and row[i] is not None
                ]
                branches.append(
                    [branch_labels.count(k) for k in class_labels])

            gain = Util.Information_Gain(entropy, branches)

            # Columns are visited in ascending order, so when both gain and
            # distinct-value count tie, the earlier column is simply kept.
            is_better = gain > max_gain
            is_wider_tie = (gain == max_gain
                            and len(self.feature_uniq_split) < len(attr_values))
            if is_better or is_wider_tie:
                max_gain = gain
                self.dim_split = i
                self.feature_uniq_split = attr_values

        can_split = (self.feature_uniq_split is not None
                     and len(self.feature_uniq_split) > 1
                     and self.labels is not None
                     and self.dim_split is not None)
        if not can_split:
            # No usable column: treat the node as a leaf, consistent with the
            # other bail-outs above.
            self.splittable = False
            return

        for value in self.feature_uniq_split:
            child_rows = []
            child_labels = []
            for row, label in zip(self.features, self.labels):
                if row[self.dim_split] == value:
                    child_rows.append(row)
                    child_labels.append(label)

            # Children never see the column they were split on.
            child_features = np.delete(np.array(child_rows), self.dim_split, 1)
            child = TreeNode(child_features, child_labels,
                             len(np.unique(child_labels)))
            self.children.append(child)

        for child in self.children:
            if child.splittable:
                child.split()

        return
Exemplo n.º 29
0
    def split(self):
        """Split on the best attribute and recursively build the children.

        Does nothing when the node is not splittable.  Each column is scored
        with ``Util.Information_Gain``; the highest gain wins, ties go to the
        column with more distinct values, and remaining ties to the lowest
        column index.  ``self.dim_split`` is set to the winner and
        ``self.feature_uniq_split`` to its distinct values in first-seen
        order.  Children are appended to ``self.children`` in that same
        order, each built from the matching rows with the chosen column
        removed.  A child with no columns left is marked unsplittable;
        otherwise it is split recursively.
        """
        if not self.splittable:
            return

        featuresT = np.array(self.features).T.tolist()
        D = len(featuresT)
        N = len(self.features)

        if D == 0:
            # No columns to split on; without this guard the best-column
            # lookup below would fail on an empty candidate list.
            self.splittable = False
            return

        # Entropy of the labels at this node.
        counts = np.unique(self.labels, return_counts=True)[1]
        entropy = 0
        for count in counts:
            possibility = count / N
            if possibility != 0:
                entropy -= possibility * np.log2(possibility)

        def split_by(d):
            """Group rows and labels by their value in column d.

            Groups are returned in the order each value is first seen.
            Per-branch counts come from np.unique, so they only cover the
            classes present in that branch.
            """
            rows_by_value = {}
            labels_by_value = {}
            for point, label, value in zip(self.features, self.labels,
                                           featuresT[d]):
                rows_by_value.setdefault(value, []).append(point)
                labels_by_value.setdefault(value, []).append(label)
            branches_labels = list(labels_by_value.values())
            branches_count = [
                np.unique(branch_labels, return_counts=True)[1].tolist()
                for branch_labels in branches_labels
            ]
            return (list(rows_by_value.values()), branches_labels,
                    branches_count)

        # Score every column: information gain plus number of branches.
        gains = []
        branch_totals = []
        for d in range(D):
            branches_features, _, branches_count = split_by(d)
            gains.append(Util.Information_Gain(entropy, branches_count))
            branch_totals.append(len(branches_features))

        # max() keeps the first maximal element, so the lowest column index
        # wins when both gain and branch count tie.
        best_d = max(range(D), key=lambda d: (gains[d], branch_totals[d]))

        best_features, best_labels, _ = split_by(best_d)
        self.dim_split = best_d
        # Distinct values in first-seen order, matching the child order.
        self.feature_uniq_split = list(dict.fromkeys(featuresT[best_d]))

        for child_rows, child_labels in zip(best_features, best_labels):
            child_features = np.delete(np.array(child_rows),
                                       best_d,
                                       axis=1).tolist()
            # num_cls is the number of distinct classes, not the number of
            # samples in the branch.
            child = TreeNode(features=child_features,
                             labels=child_labels,
                             num_cls=len(np.unique(child_labels)))
            if not child_features:  # samples run out
                child.splittable = False
                child.cls_max = self.cls_max
            elif not child_features[0]:  # features run out
                child.splittable = False
            else:
                child.split()
            self.children.append(child)
Exemplo n.º 30
0
    def __init__(self, features, labels, num_cls):
        """Build a tree node, choose its split attribute and split if useful.

        Args:
            features: List[List[any]], one row of attribute values per sample.
            labels: List[int], the class label of each sample.
            num_cls: int, length of the per-class count vectors built here.

        The constructor records the majority label in ``cls_max``, marks the
        node unsplittable when fewer than two distinct labels are present or
        when there are no rows or no columns, and otherwise scores every
        attribute with ``Util.Information_Gain``.  The best attribute index
        is stored in ``dim_split`` and its sorted distinct values in
        ``feature_uniq_split``; ties on gain go to the attribute with more
        distinct values.  ``self.split()`` is called when the best gain is
        positive and the node is splittable.
        """
        # features: List[List[any]], labels: List[int], num_cls: int
        self.features = features
        self.labels = labels
        self.children = []
        self.num_cls = num_cls
        self.children_with_attributes = None
        self.dim_split = None
        self.attribute_val = None
        # Defined up front so leaf nodes that return early still have it.
        self.feature_uniq_split = None

        # attributes for pruning
        self.expectedLabels = []
        self.expectedLabelMap = dict()
        self.trainingLabelsCountMap = dict()
        self.currentExpectedLabel = None
        self.correct_predictions = 0
        self.parentNode = None

        # Most common label in this node; argmax keeps the first label that
        # reaches the highest count.  Stays None when there are no labels.
        unique_labels, unique_counts = np.unique(labels, return_counts=True)
        self.cls_max = None
        if len(unique_labels) > 0:
            self.cls_max = unique_labels[np.argmax(unique_counts)]

        # A node is only worth splitting when it mixes at least two classes.
        self.splittable = len(unique_labels) >= 2

        # Check for "no rows" first: features[0] does not exist otherwise.
        if len(self.features) == 0 or len(self.features[0]) == 0:
            self.splittable = False
            return

        indexMap = self.getIndexMap(unique_labels)

        # Per-class sample counts, both as a vector (for the entropy helper)
        # and as a label -> count map (kept for pruning).
        listOfLabelCounts = [0] * num_cls
        for label in labels:
            listOfLabelCounts[indexMap.get(label)] += 1
            self.trainingLabelsCountMap[label] = (
                self.trainingLabelsCountMap.get(label, 0) + 1)

        entropy = Util.entropy(len(labels), listOfLabelCounts)

        max_ig = -1
        feature_index = None
        max_num_attributes = []
        for attribute in range(len(features[0])):
            # values[v][label] = number of samples with value v and that
            # label, e.g. {'a': {0: 1}, 'b': {0: 1, 1: 1}, 'c': {1: 1}}
            values = dict()
            for row, label in zip(features, labels):
                counts_for_value = values.setdefault(row[attribute], dict())
                counts_for_value[label] = counts_for_value.get(label, 0) + 1

            num_attributes = np.sort(list(values.keys()))

            # One per-class count vector per distinct value.
            branches = []
            for counts_for_value in values.values():
                branch = [0] * num_cls
                for label, count in counts_for_value.items():
                    branch[indexMap.get(label)] = count
                branches.append(branch)

            ig = Util.Information_Gain(entropy, branches)
            if ig > max_ig or (ig == max_ig and
                               len(num_attributes) > len(max_num_attributes)):
                max_ig = ig
                feature_index = attribute
                max_num_attributes = num_attributes

        self.dim_split = feature_index  # the index of the feature to be split
        self.feature_uniq_split = max_num_attributes

        if max_ig > 0 and self.splittable:
            self.split()
        return