def build(self, X, Y, selected):
    cur = self.Node(None, Y)
    if self.verbose:
        print("Cur selected columns:", selected)
        print("Cur data:")
        pprint(X)
        print(Y)
    split = False
    # Check whether there are attributes left to choose
    # and whether a split is needed at all.
    if len(selected) != self.column_cnt and len(set(Y)) > 1:
        left_columns = list(set(range(self.column_cnt)) - selected)
        col_ind, best_information_gain = argmax(
            left_columns, key=lambda col: information_gain(X, Y, col))
        col = left_columns[col_ind]
        # Split only if it is better than not splitting.
        if best_information_gain > self.information_gain_threshold:
            if self.verbose:
                print(f"Split by {col}th column")
            split = True
            cur.col = col
            for val in set(x[col] for x in X):
                ind = [x[col] == val for x in X]
                child_X = [x for i, x in zip(ind, X) if i]
                child_Y = [y for i, y in zip(ind, Y) if i]
                cur.children[val] = self.build(child_X, child_Y, selected | {col})
    if not split and self.verbose:
        print("No split")
    return cur
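# A minimal sketch of the helpers build() assumes (argmax and information_gain).
# Their signatures are inferred from the call sites above and from the test
# asserts further down; the real utils module may differ in detail.
from collections import Counter
from math import log2

def argmax(items, key):
    """Return (index_of_best_item, best_key_value), as build() unpacks it."""
    scores = [key(item) for item in items]
    best = max(range(len(scores)), key=lambda i: scores[i])
    return best, scores[best]

def entropy(counts):
    """Entropy in bits of a distribution given as raw counts."""
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c)

def information_gain(X, Y, col):
    """ID3 information gain from splitting rows X with labels Y on column col."""
    base = entropy(Counter(Y).values())
    cond = 0.0
    for val in set(x[col] for x in X):
        sub = [y for x, y in zip(X, Y) if x[col] == val]
        cond += len(sub) / len(Y) * entropy(Counter(sub).values())
    return base - cond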
def create_tree(data, attributeList, depth):
    print("Pre-order Traversal Depth:", depth, "NumberOfDataPoints", len(data))
    reviews = [review[1] for review in data]
    sumOfReviews = sum(reviews)
    # Stop at the maximum depth or when no attributes remain; label by majority vote.
    if depth == 19 or not attributeList:
        leaf = DecisionTree()
        leaf.node_label = 1 if sumOfReviews > len(reviews) / 2 else 0
        return leaf
    # All reviews are positive.
    if sumOfReviews == len(reviews):
        leaf = DecisionTree()
        leaf.node_label = 1
        return leaf
    # All reviews are negative.
    if sumOfReviews == 0:
        leaf = DecisionTree()
        leaf.node_label = 0
        return leaf
    maxInfoGain = 0
    maxInfoGainIndex = -1
    for i, attribute in enumerate(attributeList):
        withAttribute = []
        withOutAttribute = []
        for datum in data:
            if attribute in datum[0]:
                withAttribute.append(datum[1])
            else:
                withOutAttribute.append(datum[1])
        # An attribute that does not divide the data carries no information.
        if not withAttribute or not withOutAttribute:
            infoGain = 0
        else:
            infoGain = utils.information_gain(reviews, withAttribute, withOutAttribute)
        if maxInfoGain < infoGain:
            maxInfoGain = infoGain
            maxInfoGainIndex = i
    leftChildReviews = []
    rightChildReviews = []
    for review in data:
        if attributeList[maxInfoGainIndex] in review[0]:
            leftChildReviews.append(review)
        else:
            rightChildReviews.append(review)
    node = DecisionTree()
    node.node_word = attributeList[maxInfoGainIndex]
    attributeList.remove(attributeList[maxInfoGainIndex])
    node.left = create_tree(leftChildReviews, attributeList, depth + 1)
    node.right = create_tree(rightChildReviews, attributeList, depth + 1)
    return node
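# A minimal sketch of utils.information_gain as create_tree() calls it: the
# parent's binary 0/1 labels plus the two child label lists. This
# three-argument form is an assumption read off the call site above,
# not the repo's actual code.
from math import log2

def information_gain(parent, left, right):
    def entropy(labels):
        p = sum(labels) / len(labels)
        if p in (0.0, 1.0):
            return 0.0
        return -p * log2(p) - (1 - p) * log2(1 - p)
    n = len(parent)
    return (entropy(parent)
            - len(left) / n * entropy(left)
            - len(right) / n * entropy(right))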
def create_decision_tree(data, attributes):
    root = DecisionTree()
    # Termination conditions: check() handles the cases where all labels
    # are the same or no attributes remain.
    checkint = check(data, attributes)
    if checkint == 0:
        root.node_label = 0
        print(root.node_label)
        return root
    elif checkint == 1:
        root.node_label = 1
        print(root.node_label)
        return root
    # Pick the attribute with maximum information gain.
    infogain = 0.0
    attribute = ''
    for ele in attributes:
        newinfogain = utils.information_gain(data, ele)
        if newinfogain > infogain:
            infogain = newinfogain
            attribute = ele
    # If no attribute divides the data, label the root by majority polling.
    if infogain == 0:
        checkint = check(data, attribute)
        root.node_label = checkint
        print(root.node_label)
        return root
    root.value = attribute
    print(root.value)
    # Divide the dataset into left and right halves and recurse.
    data_left, data_right = divide_dataset(data, attribute)
    attributes.remove(attribute)
    root.left = create_decision_tree(data_left, attributes)
    root.right = create_decision_tree(data_right, attributes)
    return root
def find_split(x, y, feature_indices):
    best_gain = 0
    best_feature_index = 0
    best_threshold = 0
    for feature_index in feature_indices:
        values = sorted(set(x[:, feature_index]))
        # Try the midpoint between each pair of adjacent values as a threshold.
        for j in range(len(values) - 1):
            threshold = (values[j] + values[j + 1]) / 2
            x_true, y_true, x_false, y_false = split(x, y, feature_index, threshold)
            gain = information_gain(y, y_true, y_false)
            if gain > best_gain:
                best_gain = gain
                best_feature_index = feature_index
                best_threshold = threshold
    return best_feature_index, best_threshold
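# A minimal sketch of the split() helper find_split() relies on. The signature
# and return order are inferred from the call site; the inputs are assumed to
# be numpy arrays, and the "true" branch is assumed to mean feature <= threshold,
# which the real helper may invert.
def split(x, y, feature_index, threshold):
    """Partition (x, y) by whether the chosen feature is <= threshold."""
    mask = x[:, feature_index] <= threshold
    return x[mask], y[mask], x[~mask], y[~mask]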
def __tdidt(self, samples, atts, node):
    """Top-down induction of decision trees (TDIDT)."""
    labels = self.training[:, -1]
    # Best attribute found so far, as (attribute_name, gain).
    best_gain = ('', 0)
    # Calculate the information gain for each column,
    # including the continuous-valued attributes.
    for col, sample in enumerate(self.training.T[:-1]):
        ig = information_gain(sample, labels)
        if ig[1] > best_gain[1]:
            best_gain = (self.attributes[col], ig[1])
    left_node = Knoten()
    right_node = Knoten()
    # Recursive calls for both children.
    self.__tdidt(samples, atts, left_node)
    self.__tdidt(samples, atts, right_node)
def find_split(self, X, y, feature_indices, weights):
    best_gain = -float('inf')
    best_feature_index = -1
    best_value = [0]
    # For each feature to be considered...
    for feature_index in sorted(feature_indices):
        # Keep only the rows with known (non-NaN) values for this feature.
        not_nan_rows = [a for a in range(X.shape[0])
                        if not utils.isnan(X[:, feature_index][a])]
        Xnotnan = X[not_nan_rows, :]
        ynotnan = y[not_nan_rows]
        # If no instance has a known value for the feature, go to the next one.
        if Xnotnan.shape[0] == 0:
            continue
        # All possible values for the feature.
        values = sorted(set(Xnotnan[:, feature_index]))
        if utils.isnum(Xnotnan[0, feature_index]):
            # Numeric feature: split the data at each candidate value.
            for j in range(len(values) - 1):
                # C4.5 chooses the largest value in the training set that does
                # not exceed the midpoint (value = (values[j] + values[j+1]) / 2),
                # so every threshold appearing in the tree occurs in the data.
                value = values[j]
                Xs, ys, d = utils.split_num(Xnotnan, ynotnan, feature_index, value)
                # Weight the gain by the rate of known values:
                # the bigger the rate of missing values, the smaller the gain.
                gain = (len(ynotnan) / len(y)) * utils.information_gain(ynotnan, ys)
                # (A gain-ratio tie-break between equal gains was prototyped
                # here but is disabled.)
                if gain >= best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_value = [values[j]]
        else:
            # Categorical feature: split the data on all of its values at once.
            Xs, ys, d = utils.split_categ(Xnotnan, ynotnan, feature_index, values)
            gain = (len(ynotnan) / len(y)) * utils.information_gain(ynotnan, ys)
            if gain >= best_gain:
                best_gain = gain
                best_feature_index = feature_index
                best_value = values
    return best_feature_index, best_value
def fit(self, X, y):
    """Train the decision tree (self.tree) using the samples X and labels y.

    X should be a 2D numpy array (N x M): N instances, M features.
    y should be a 1D numpy array of length N.
    'max_depth' is the maximum depth of the tree, 'min_gain' the minimum
    information gain; keys 'left' and 'right' hold the child subtrees.
    """
    X = np.asarray(X).astype(float)
    y = np.asarray(y).astype(int)
    # The current feature set is empty: label by majority vote.
    if self.tree['current_features'].shape[0] == 0:
        self.tree['label'] = np.argmax(np.bincount(y))
        return
    # All instances belong to the same class.
    if len(set(y)) == 1:
        self.tree['label'] = y[0]
        return
    # Maximum depth reached.
    if self.tree['max_depth'] > 0 and self.tree['depth'] == self.tree['max_depth']:
        self.tree['label'] = np.argmax(np.bincount(y))
        return
    current_features = self.tree['current_features']
    max_information_gain_list = []
    max_split_val_list = []
    for split_attribute in current_features:
        X_select = list(set(x[split_attribute] for x in X))
        max_information_gain = 0
        max_split_val = X_select[0]
        for split_val in X_select:
            _, _, y_left, y_right = partition_classes(X, y, split_attribute, split_val)
            current_information_gain = information_gain(y, [y_left, y_right])
            if current_information_gain > max_information_gain:
                max_information_gain = current_information_gain
                max_split_val = split_val
        max_information_gain_list.append(max_information_gain)
        max_split_val_list.append(max_split_val)
    # Index of the best split attribute within current_features.
    index = np.argmax(max_information_gain_list)
    # The best information gain is below the threshold: label by majority vote.
    if max_information_gain_list[index] <= self.tree['min_gain']:
        self.tree['label'] = np.argmax(np.bincount(y))
        return
    self.tree['split_attribute'] = current_features[index]
    self.tree['split_val'] = max_split_val_list[index]
    # Split the node and recurse into the children.
    X_left, X_right, y_left, y_right = partition_classes(
        X, y, self.tree['split_attribute'], self.tree['split_val'])
    left_tree = ID3(X_left, y_left, self.tree['max_depth'], self.tree['min_gain'])
    right_tree = ID3(X_right, y_right, self.tree['max_depth'], self.tree['min_gain'])
    current_features = np.delete(current_features, index)
    left_tree.tree['current_features'] = current_features
    right_tree.tree['current_features'] = current_features
    left_tree.tree['depth'] = self.tree['depth'] + 1
    right_tree.tree['depth'] = self.tree['depth'] + 1
    left_tree.fit(X_left, y_left)
    right_tree.fit(X_right, y_right)
    self.tree['left'] = left_tree
    self.tree['right'] = right_tree
    return
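# A minimal sketch of the helpers fit() assumes. Signatures are inferred from
# the call sites above (partition_classes returns the four split arrays;
# information_gain takes the parent labels and a list of child label arrays);
# the split direction <= split_val is an assumption.
import numpy as np

def partition_classes(X, y, split_attribute, split_val):
    """Split (X, y) on X[:, split_attribute] <= split_val."""
    mask = X[:, split_attribute] <= split_val
    return X[mask], X[~mask], y[mask], y[~mask]

def information_gain(previous_y, current_y):
    """Entropy reduction from splitting previous_y into the parts of current_y."""
    def entropy(labels):
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return float(-(p * np.log2(p)).sum())
    n = len(previous_y)
    weighted = sum(len(part) / n * entropy(part) for part in current_y if len(part))
    return entropy(previous_y) - weighted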
from utils import information_gain, entropy
from collections import Counter
from math import fabs

eps = 1e-3

# Loan-approval toy dataset (values kept in the original Chinese):
# column 0: age (青年 = youth, 中年 = middle-aged, 老年 = elderly);
# column 1: has a job (是 = yes, 否 = no);
# column 2: owns a house (是 = yes, 否 = no);
# column 3: credit rating (一般 = fair, 好 = good, 非常好 = excellent).
X = [
    ['青年', '否', '否', '一般'],
    ['青年', '否', '否', '好'],
    ['青年', '是', '否', '好'],
    ['青年', '是', '是', '一般'],
    ['青年', '否', '否', '一般'],
    ['中年', '否', '否', '一般'],
    ['中年', '否', '否', '好'],
    ['中年', '是', '是', '好'],
    ['中年', '否', '是', '非常好'],
    ['中年', '否', '是', '非常好'],
    ['老年', '否', '是', '非常好'],
    ['老年', '否', '是', '好'],
    ['老年', '是', '否', '好'],
    ['老年', '是', '否', '非常好'],
    ['老年', '否', '否', '一般'],
]
# Label: loan approved (是 = yes, 否 = no).
Y = ['否', '否', '是', '是', '否',
     '否', '否', '是', '是', '是',
     '是', '是', '是', '是', '否']

assert fabs(entropy(Counter(Y).values()) - .971) < eps
assert fabs(information_gain(X, Y, 0) - .083) < eps
assert fabs(information_gain(X, Y, 1) - .324) < eps
assert fabs(information_gain(X, Y, 2) - .420) < eps
assert fabs(information_gain(X, Y, 3) - .363) < eps
    ['OVERCAST', 81, 75, 'F', 'PLAY'],
    ['RAIN', 71, 80, 'T', "DON'T PLAY"],
    ['RAIN', 65, 70, 'T', "DON'T PLAY"],
    ['RAIN', 75, 80, 'F', 'PLAY'],
    ['RAIN', 68, 80, 'F', 'PLAY'],
    ['RAIN', 70, 96, 'F', 'PLAY']],
    dtype='object', columns=original_attributes)

X = data[data.columns[:-1]].values
y = data['Class'].values

print('Testing entropy, information gain, gain ratio...')
assert utils.entropy([1, 0, 0, 1, 0, 1]) == 1
assert utils.entropy([1, 1, 1]) == 0
assert utils.entropy([0]) == 0

outlook_index = np.where(original_attributes == 'Outlook')[0][0]
Xs, ys, d = utils.split_categ(X, y, outlook_index,
                              list(set(X[:, outlook_index])))
assert np.isclose(utils.information_gain(y, ys), 0.246, rtol=1e-2)
assert np.isclose(utils.gain_ratio(y, ys, y), 0.156, rtol=1e-2)

print('Testing gini index...')
assert utils.gini_impurity([1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0]) == 0.5
assert utils.gini_impurity([0, 0, 0, 0, 0]) == 0

print('Testing gini...')
assert utils.gini([0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1],
                  [[0, 1, 0, 0], [1, 1, 0, 0, 1, 1, 0, 1]]) == 0.0625

print('Testing Decision Tree...')
m = dt.DecisionTreeClassifier(missing_branch=False)
m.fit(X, y)
m.to_pdf(original_attributes, out='tree1.pdf')
assert m.predict(['OVERCAST', 80, 90, 'T']) == 'PLAY'
assert m.predict(['RAIN', 80, 50, 'F']) == 'PLAY'