def read_tree(self, parent_line, token_line):
    """Reconstruct a tree from a parent-pointer encoding.

    ``parent_line`` holds whitespace-separated 1-based parent indices
    (a value of 0 marks the root); ``token_line`` holds one token per
    position.  Returns the root TreeNode, with ``.idx`` and ``.token``
    set on every created node.
    """
    # Convert to 0-based parent ids; -1 now marks the root.
    parent_of = [int(tok) - 1 for tok in parent_line.split()]
    words = token_line.strip().split()
    built = {}
    root = None
    for start in range(len(parent_of)):
        if start in built:
            continue
        # Walk up the ancestor chain from `start`, creating nodes until
        # we hit an already-built ancestor or the root marker (-1).
        pending_child = None
        node_id = start
        while True:
            parent_id = parent_of[node_id]
            node = TreeNode()
            if pending_child is not None:
                node.add_child(pending_child)
            built[node_id] = node
            node.idx = node_id
            node.token = words[node_id]
            if parent_id in built:
                # Ancestor already exists: attach and stop climbing.
                built[parent_id].add_child(node)
                break
            if parent_id == -1:
                root = node
                break
            pending_child = node
            node_id = parent_id
    return root
def __decision_tree(self, X, Y, features, level, metric, classes):
    """Recursively build a decision tree over (X, Y) and return its root TreeNode.

    Node statistics are printed to stdout in preorder as the tree is built.

    Args:
        X: feature matrix — assumes a 2-D numpy array (indexed as X[:, f]) — TODO confirm.
        Y: class labels aligned with the rows of X.
        features: list of column indices still available for splitting; mutated
            in place during recursion but restored before returning.
        level: current depth, used only for the printed report.
        metric: "gain_ratio" or "gini_index"; selects the split criterion.
        classes: all class labels of the problem (printed even when absent).
    """
    # returns the root of the Decision Tree(which consists of TreeNodes) built after fitting the training data
    # Here Nodes are printed as in PREORDER traversl
    # classes represents the different classes present in the classification problem
    # metric can take value gain_ratio or gini_index
    # level represents depth of the tree
    # We split a node on a particular feature only once (in a given root to leaf node path)

    # Base case 1: the node is pure (only 1 class present) -> leaf.
    if len(set(Y)) == 1:
        print("Level", level)
        output = None
        for i in classes:
            if i in Y:
                output = i
                print("Count of", i, "=", len(Y))
            else:
                print("Count of", i, "=", 0)
        # A pure node has zero entropy / zero Gini impurity by definition.
        if metric == "gain_ratio":
            print("Current Entropy is = 0.0")
        elif metric == "gini_index":
            print("Current Gini Index is = 0.0")
        print("Reached leaf Node")
        print()
        # Leaf: no split feature, predicts `output`.
        return TreeNode(None, output)

    # Base case 2: we have run out of features to split upon.
    # Output the majority class among Y.
    if len(features) == 0:
        print("Level", level)
        freq_map = self.__count_unique(Y)
        output = None
        max_count = -math.inf
        for i in classes:
            if i not in freq_map:
                print("Count of", i, "=", 0)
            else:
                if freq_map[i] > max_count:
                    output = i
                    max_count = freq_map[i]
                print("Count of", i, "=", freq_map[i])
        if metric == "gain_ratio":
            print("Current Entropy is =", self.__entropy(Y))
        elif metric == "gini_index":
            print("Current Gini Index is =", self.__gini_index(Y))
        print("Reached leaf Node")
        print()
        return TreeNode(None, output)

    # Recursive case: find the best feature to split upon according to `metric`.
    # NOTE(review): if metric is neither recognized value, current_gain is
    # unbound here and this raises NameError — callers presumably validate.
    max_gain = -math.inf
    final_feature = None
    for f in features:
        if metric == "gain_ratio":
            current_gain = self.__gain_ratio(X, Y, f)
        elif metric == "gini_index":
            current_gain = self.__gini_gain(X, Y, f)
        if current_gain > max_gain:
            max_gain = current_gain
            final_feature = f

    # Report this internal node: class counts, impurity, and chosen split.
    # `output` (majority class) is also stored on the node below.
    print("Level", level)
    freq_map = self.__count_unique(Y)
    output = None
    max_count = -math.inf
    for i in classes:
        if i not in freq_map:
            print("Count of", i, "=", 0)
        else:
            if freq_map[i] > max_count:
                output = i
                max_count = freq_map[i]
            print("Count of", i, "=", freq_map[i])
    if metric == "gain_ratio":
        print("Current Entropy is =", self.__entropy(Y))
        print("Splitting on feature X[", final_feature, "] with gain ratio ", max_gain, sep="")
        print()
    elif metric == "gini_index":
        print("Current Gini Index is =", self.__gini_index(Y))
        print("Splitting on feature X[", final_feature, "] with gini gain ", max_gain, sep="")
        print()

    unique_values = set(
        X[:, final_feature]
    )  # unique_values represents the unique values of the feature selected
    df = pd.DataFrame(X)
    # Adding Y values as the last column in the dataframe
    df[df.shape[1]] = Y

    current_node = TreeNode(final_feature, output)

    # Now removing the selected feature from the list as we do not want to split on one feature more than once(in a given root to leaf node path)
    # (restored below so the caller's `features` list is unchanged on return)
    index = features.index(final_feature)
    features.remove(final_feature)
    for i in unique_values:
        # Creating a new dataframe with value of selected feature = i
        df1 = df[df[final_feature] == i]
        # Segregating the X and Y values and recursively calling on the splits
        node = self.__decision_tree(df1.iloc[:, 0:df1.shape[1] - 1].values,
                                    df1.iloc[:, df1.shape[1] - 1].values,
                                    features, level + 1, metric, classes)
        current_node.add_child(i, node)

    # Add the removed feature
    features.insert(index, final_feature)
    return current_node
def __decision_tree(self,X,Y,features,level,metric,classes):
    """Recursively build a decision tree over (X, Y) and return its root TreeNode.

    Node statistics are printed to stdout in preorder as the tree is built.

    Args:
        X: feature matrix — assumes a 2-D numpy array (indexed as X[:,f]) — TODO confirm.
        Y: class labels aligned with the rows of X.
        features: list of column indices still available for splitting; mutated
            in place during recursion but restored before returning.
        level: current depth, used only for the printed report.
        metric: "gain_ratio" or "gini_index"; selects the split criterion.
        classes: all class labels of the problem (printed even when absent).
    """
    # returns the root of the Decision Tree(which consists of TreeNodes) built after fitting the training data
    # Here Nodes are printed as in PREORDER traversl
    # classes represents the different classes present in the classification problem
    # metric can take value gain_ratio or gini_index
    # level represents depth of the tree
    # We split a node on a particular feature only once (in a given root to leaf node path)

    # Base case 1: the node is pure (only 1 class present) -> leaf.
    if len(set(Y)) == 1:
        print("Level",level)
        output = None
        for i in classes:
            if i in Y:
                output = i
                print("Count of",i,"=",len(Y))
            else :
                print("Count of",i,"=",0)
        # A pure node has zero entropy / zero Gini impurity by definition.
        if metric == "gain_ratio":
            print("Current Entropy is = 0.0")
        elif metric == "gini_index":
            print("Current Gini Index is = 0.0")
        print("Reached leaf Node")
        print()
        # Leaf: no split feature, predicts `output`.
        return TreeNode(None,output)

    # Base case 2: we have run out of features to split upon.
    # Output the majority class among Y.
    if len(features) == 0:
        print("Level",level)
        freq_map = self.__count_unique(Y)
        output = None
        max_count = -math.inf
        for i in classes:
            if i not in freq_map:
                print("Count of",i,"=",0)
            else :
                if freq_map[i] > max_count :
                    output = i
                    max_count = freq_map[i]
                print("Count of",i,"=",freq_map[i])
        if metric == "gain_ratio":
            print("Current Entropy is =",self.__entropy(Y))
        elif metric == "gini_index":
            print("Current Gini Index is =",self.__gini_index(Y))
        print("Reached leaf Node")
        print()
        return TreeNode(None,output)

    # Recursive case: find the best feature to split upon according to `metric`.
    # NOTE(review): if metric is neither recognized value, current_gain is
    # unbound here and this raises NameError — callers presumably validate.
    max_gain = -math.inf
    final_feature = None
    for f in features :
        if metric == "gain_ratio":
            current_gain = self.__gain_ratio(X,Y,f)
        elif metric =="gini_index":
            current_gain = self.__gini_gain(X,Y,f)
        if current_gain > max_gain:
            max_gain = current_gain
            final_feature = f

    # Report this internal node: class counts, impurity, and chosen split.
    # `output` (majority class) is also stored on the node below.
    print("Level",level)
    freq_map = self.__count_unique(Y)
    output = None
    max_count = -math.inf
    for i in classes:
        if i not in freq_map:
            print("Count of",i,"=",0)
        else :
            if freq_map[i] > max_count :
                output = i
                max_count = freq_map[i]
            print("Count of",i,"=",freq_map[i])
    if metric == "gain_ratio" :
        print("Current Entropy is =",self.__entropy(Y))
        print("Splitting on feature X[",final_feature,"] with gain ratio ",max_gain,sep="")
        print()
    elif metric == "gini_index":
        print("Current Gini Index is =",self.__gini_index(Y))
        print("Splitting on feature X[",final_feature,"] with gini gain ",max_gain,sep="")
        print()

    unique_values = set(X[:,final_feature]) # unique_values represents the unique values of the feature selected
    df = pd.DataFrame(X)
    # Adding Y values as the last column in the dataframe
    df[df.shape[1]] = Y

    current_node = TreeNode(final_feature,output)

    # Now removing the selected feature from the list as we do not want to split on one feature more than once(in a given root to leaf node path)
    # (restored below so the caller's `features` list is unchanged on return)
    index = features.index(final_feature)
    features.remove(final_feature)
    for i in unique_values:
        # Creating a new dataframe with value of selected feature = i
        df1 = df[df[final_feature] == i]
        # Segregating the X and Y values and recursively calling on the splits
        node = self.__decision_tree(df1.iloc[:,0:df1.shape[1]-1].values,df1.iloc[:,df1.shape[1]-1].values,features,level+1,metric,classes)
        current_node.add_child(i,node)

    # Add the removed feature
    features.insert(index,final_feature)
    return current_node
from treenode import TreeNode #TEST 1 dile = TreeNode("Dile") edo = TreeNode("Edo") raffo = TreeNode("Raffo", [dile, edo]) cami = TreeNode("Cami") stella = TreeNode("Stella", [TreeNode("Jess")]) gabry = TreeNode("Gabry", [TreeNode("Saba"), TreeNode("Luca"), stella]) ale = TreeNode("Ale", [TreeNode("Dave"), gabry, TreeNode("Greg")]) enzo = TreeNode("Enzo", [TreeNode("Diodato")]) fra = TreeNode("Fra", [enzo]) fede = TreeNode("Fede", [ale, fra]) cami.add_child(raffo) cami.add_child(fede) peppe = TreeNode( "Peppe", [TreeNode("Marco"), TreeNode("Gio"), TreeNode("Marghe")]) cami.add_child(peppe) print(cami) print("\n") print(cami.give_software()) #TEST 2 tredici = TreeNode("13", [TreeNode("16"), TreeNode("17")]) otto = TreeNode("8", [TreeNode("12"), tredici, TreeNode("14")]) quattro = TreeNode("4", [TreeNode("9"), TreeNode("10")]) sette = TreeNode("7", [TreeNode("11", [TreeNode("15")])])