def find_threshold(self, data_attr, data_target):
    ''' Find the best threshold for continuous data_attr with respect to data_target '''
    attr_name = data_attr.columns[0]
    target_name = data_target.columns[0]
    data = pd.concat([data_attr, data_target], axis=1)
    data = data.sort_values(attr_name).reset_index(drop=True)

    # Retrieve all indexes where the target label changes between adjacent rows;
    # only these boundaries can yield the optimal split point
    diff_index = list()
    for i in range(len(data) - 1):
        if data[target_name].iloc[i] != data[target_name].iloc[i + 1]:
            diff_index.append(i)

    # Score every candidate boundary with either gain ratio or information gain
    best_point = 0
    best_idx = -1
    for i in diff_index:
        if self.gain_ratio:
            point = Calculate.gain_ratio(data[attr_name], data[target_name],
                                         is_continue=True, split_index=i)
        else:
            point = Calculate.info_gain(data[attr_name], data[target_name],
                                        is_continue=True, split_index=i)
        if point > best_point:
            best_point = point
            best_idx = i

    # The threshold is the midpoint between the two rows around the best boundary
    best_splitter = (data[attr_name].iloc[best_idx]
                     + data[attr_name].iloc[best_idx + 1]) / 2
    return [best_splitter, best_point]
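# --- Illustrative sketch (not part of the class above) ---------------------
# A minimal, self-contained version of the same continuous-split technique:
# sort by the attribute, consider only boundaries where the class label
# changes, score each boundary by information gain, and return the midpoint.
# The names below (find_threshold_sketch, _entropy_sketch) are hypothetical
# and do not rely on the project's Calculate helpers.
import numpy as np
import pandas as pd


def _entropy_sketch(labels):
    # Shannon entropy (in bits) of a pandas Series of labels
    probs = labels.value_counts(normalize=True)
    return float(-(probs * np.log2(probs)).sum())


def find_threshold_sketch(values, labels):
    # Return (threshold, info_gain) for the best binary split of a
    # continuous attribute
    df = pd.DataFrame({'value': values, 'label': labels})
    df = df.sort_values('value').reset_index(drop=True)

    base_entropy = _entropy_sketch(df['label'])
    best_gain, best_threshold = 0.0, None

    for i in range(len(df) - 1):
        # Candidate split points lie between rows whose labels differ
        if df['label'].iloc[i] == df['label'].iloc[i + 1]:
            continue
        left, right = df.iloc[:i + 1], df.iloc[i + 1:]
        remainder = (len(left) * _entropy_sketch(left['label'])
                     + len(right) * _entropy_sketch(right['label'])) / len(df)
        gain = base_entropy - remainder
        if gain > best_gain:
            best_gain = gain
            best_threshold = (df['value'].iloc[i] + df['value'].iloc[i + 1]) / 2

    return best_threshold, best_gain


# Example usage of the sketch on a toy continuous attribute:
# find_threshold_sketch(pd.Series([64, 65, 68, 69, 70, 71]),
#                       pd.Series(['no', 'no', 'yes', 'yes', 'yes', 'no']))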
def fit(self, data, attributes, target_name):
    ''' Build and return a decision tree using the ID3 algorithm '''
    data_target = data[target_name]
    entropy_data_target = Calculate.entropy(data_target)

    # Base case: the data is pure (all rows share one target label)
    if entropy_data_target == 0:
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        # Leaf node: current_node, info_gain, values, predicted label
        tree = Tree(
            Node(None, entropy_data_target, value_dict,
                 result=data_target.iloc[0], is_leaf=True))
        return tree

    # Base case: no attribute left to split on, predict the majority label
    if len(attributes) == 0:
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        tree = Tree(
            Node(None, entropy_data_target, value_dict,
                 result=Calculate.most_label(data_target), is_leaf=True))
        return tree

    # Find the best attribute to split on, using either info gain or gain ratio
    best_attr = ''
    best_point = 0
    for attr in attributes:
        if self.gain_ratio:
            point = Calculate.gain_ratio(data[attr], data_target)
        else:
            point = Calculate.info_gain(data[attr], data_target)
        if point > best_point:
            best_point = point
            best_attr = attr

    value_list = Calculate.get_unique_data(data, target_name)
    value_dict = dict()
    for key, value in value_list.items():
        value_dict[key] = len(value_list[key])

    # Internal node for the chosen attribute
    dtree = Tree(Node(best_attr, best_point, value_dict))

    # Exclude the chosen attribute from the candidates passed to the subtrees;
    # give each child its own copy so sibling branches do not share mutations
    remaining_attributes = [attr for attr in attributes if attr != best_attr]

    # Build one subtree per distinct value of the chosen attribute
    list_attribute = Calculate.get_unique_data(data, best_attr)
    for i, attribute in enumerate(list_attribute):
        subset = pd.DataFrame(
            data=list_attribute[attribute]).reset_index(drop=True)
        subset.drop(best_attr, axis=1, inplace=True)
        dtree.add_child(self.fit(subset, list(remaining_attributes), target_name))
        dtree.children[i].value.edge = attribute
    return dtree
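# --- Illustrative sketch (not part of the class above) ---------------------
# A minimal, self-contained version of the same recursion pattern used by
# fit(): stop on a pure node or when no attributes remain, otherwise split on
# the attribute with the highest information gain and recurse on each subset.
# The names below (id3_sketch, info_gain_sketch) are hypothetical and return
# a nested dict instead of the project's Tree/Node objects.
import numpy as np
import pandas as pd


def info_gain_sketch(data, attr, target_name):
    # Information gain of splitting `data` on categorical attribute `attr`
    def entropy(labels):
        probs = labels.value_counts(normalize=True)
        return float(-(probs * np.log2(probs)).sum())

    remainder = sum(
        len(subset) / len(data) * entropy(subset[target_name])
        for _, subset in data.groupby(attr))
    return entropy(data[target_name]) - remainder


def id3_sketch(data, attributes, target_name):
    # Return a nested-dict tree: {attribute: {value: subtree_or_label}}
    labels = data[target_name]
    # Base cases: pure node, or no attributes left (return majority label)
    if labels.nunique() == 1 or not attributes:
        return labels.mode().iloc[0]
    # Choose the attribute with the highest information gain
    best_attr = max(attributes,
                    key=lambda a: info_gain_sketch(data, a, target_name))
    remaining = [a for a in attributes if a != best_attr]
    tree = {best_attr: {}}
    for value, subset in data.groupby(best_attr):
        branch = subset.drop(columns=best_attr).reset_index(drop=True)
        tree[best_attr][value] = id3_sketch(branch, remaining, target_name)
    return tree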