def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
    """Pick the highest-merit split candidate for this nominal attribute.

    Parameters
    ----------
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    pre_split_dist
        Distribution observed before splitting; handed to ``criterion`` as is.
    att_idx
        Attribute index forwarded to the generated split tests.
    binary_only
        When false, a multiway split over all observed values is evaluated
        first and used as the initial candidate.

    Returns
    -------
    AttributeSplitSuggestion or None
        The best candidate found (ties keep the earlier candidate), or
        ``None`` when nothing was evaluated.
    """
    # Union of every attribute value recorded under any class, sorted.
    observed_values = sorted(set().union(*self._att_val_dist_per_class.values()))

    winner = None
    if not binary_only:
        multiway_dist = self.get_class_dist_from_multiway_split()
        multiway_merit = criterion.get_merit_of_split(pre_split_dist, multiway_dist)
        # Each observed value gets its position in the sorted order as branch id.
        value_to_branch = dict(zip(observed_values, range(len(observed_values))))
        winner = AttributeSplitSuggestion(
            NominalAttributeMultiwayTest(att_idx, value_to_branch),
            multiway_dist,
            multiway_merit,
        )

    for value in observed_values:
        binary_dist = self.get_class_dist_from_binary_split(value)
        binary_merit = criterion.get_merit_of_split(pre_split_dist, binary_dist)
        if winner is None or binary_merit > winner.merit:
            winner = AttributeSplitSuggestion(
                NominalAttributeBinaryTest(att_idx, value),
                binary_dist,
                binary_merit,
            )
    return winner
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only=True):
    """Search the observer's tree for the best binary split.

    The search context (criterion, pre-split distribution, attribute index
    and running accumulators) is stored on ``self`` for the duration of the
    ``_find_best_split`` traversal and cleared afterwards.

    Parameters
    ----------
    criterion
        Split criterion stored in ``self._criterion`` for the traversal.
    pre_split_dist
        Indexable statistics; entries ``1`` and ``2`` determine the shape of
        the running sum accumulators.
    att_idx
        Attribute index stored in ``self._att_idx`` for the traversal.
    binary_only
        Not used by this implementation.

    Returns
    -------
    AttributeSplitSuggestion
        Whatever ``_find_best_split`` returns, starting from a placeholder
        candidate with merit ``-inf``.
    """
    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._att_idx = att_idx
    self._aux_sum_weight = 0

    # Scalar statistics (single target) use plain floats, anything else gets
    # zero-filled arrays of matching shape (multi-target).
    is_scalar_target = np.ndim(pre_split_dist[1]) == 0
    if is_scalar_target:
        self._aux_sum, self._aux_sum_sq = 0.0, 0.0
    else:
        self._aux_sum = np.zeros_like(pre_split_dist[1])
        self._aux_sum_sq = np.zeros_like(pre_split_dist[2])

    placeholder = AttributeSplitSuggestion(None, [{}], -float('inf'))
    result = self._find_best_split(self._root, placeholder)

    # Drop the per-search state so nothing leaks into the next call.
    for attr_name in ('_criterion', '_pre_split_dist', '_att_idx',
                      '_aux_sum_weight', '_aux_sum', '_aux_sum_sq'):
        setattr(self, attr_name, None)

    return result
def search_for_best_binary_split_option(self, current_node, current_best_option, criterion, att_idx):
    """Evaluate "this value vs. the rest" splits along a chain of nodes.

    Fixes over the previous version:

    * the distributions are built as ``{0: count, 1: sum, 2: sum_sq}`` dicts;
      the former ``{a, b, c}`` literals were *sets*, which drop duplicates
      and have no positional meaning;
    * ``pre_split_dist`` is a single distribution rather than a list
      wrapping a set;
    * recursion follows ``_child`` (the attribute that is actually checked)
      instead of ``_left``.

    Parameters
    ----------
    current_node
        Node exposing ``_statistics`` (keys 0/1/2), ``_cut_point`` and
        ``_child``; ``None`` ends the search.
    current_best_option
        Best suggestion so far (needs a ``merit`` attribute) or ``None``.
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    att_idx
        Attribute index forwarded to the generated split test.

    Returns
    -------
    The best suggestion after considering ``current_node`` and its children.

    Notes
    -----
    Updates ``self._sum_one``, ``self._sum_rest``, ``self._sum_sq_one``,
    ``self._sum_sq_rest``, ``self._count_one`` and ``self._count_rest`` as a
    side effect.
    """
    if current_node is None or self._count_rest == 0:
        return current_best_option

    # Children are evaluated first, then this node.
    if current_node._child is not None:
        current_best_option = self.search_for_best_binary_split_option(
            current_node._child, current_best_option, criterion, att_idx)

    node_stats = current_node._statistics
    self._sum_one = node_stats.get(1)
    self._sum_rest = self._sum_total - self._sum_one
    self._sum_sq_one = node_stats.get(2)
    self._sum_sq_rest = self._sum_sq_total - self._sum_sq_one
    self._count_one = node_stats.get(0)
    self._count_rest = self._count - self._count_one

    one_dist = {0: self._count_one, 1: self._sum_one, 2: self._sum_sq_one}
    rest_dist = {0: self._count_rest, 1: self._sum_rest, 2: self._sum_sq_rest}
    post_split_dists = [one_dist, rest_dist]
    pre_split_dist = {0: self._count, 1: self._sum_total, 2: self._sum_sq_total}

    merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)
    if current_best_option is None or merit > current_best_option.merit:
        nom_att_binary_test = NominalAttributeBinaryTest(
            att_idx, current_node._cut_point)
        current_best_option = AttributeSplitSuggestion(
            nom_att_binary_test, post_split_dists, merit)

    return current_best_option
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
    """Suggest a binary numeric split scored by Hellinger distance.

    The merit of every candidate is the Hellinger distance between the
    estimators stored for class ``0`` and class ``1``; when either is
    missing the merit is ``-inf``.

    The previous version tested membership with the *string* keys ``'0'``
    and ``'1'`` but then indexed with the *integer* keys ``0`` and ``1``, so
    the distance was either never computed or the lookup raised
    ``KeyError``. Presence check and access now use the same integer keys.

    Parameters
    ----------
    criterion
        Not consulted; the merit is computed with
        ``GaussianHellingerDistanceCriterion.compute_hellinger``.
    pre_split_dist
        Not used by this implementation.
    att_idx
        Attribute index forwarded to the generated split test.
    binary_only
        Not used by this implementation.

    Returns
    -------
    AttributeSplitSuggestion or None
        Best candidate (ties keep the earlier split point), or ``None`` when
        there are no suggested split points.
    """
    best_suggestion = None
    for split_value in self.get_split_point_suggestions():
        post_split_dist = self.get_class_dists_from_binary_split(split_value)

        merit = -np.inf
        # NOTE(review): the n_/p_ prefixes presumably stand for the
        # negative (0) and positive (1) class -- confirm against callers.
        n_estimator = self._att_val_dist_per_class.get(0)
        p_estimator = self._att_val_dist_per_class.get(1)
        if n_estimator is not None and p_estimator is not None:
            n_mean = n_estimator.get_mean()
            n_variance = n_estimator.get_variance()
            p_mean = p_estimator.get_mean()
            p_variance = p_estimator.get_variance()
            merit = GaussianHellingerDistanceCriterion.compute_hellinger(
                p_mean, p_variance, n_mean, n_variance)

        if best_suggestion is None or merit > best_suggestion.merit:
            num_att_binary_test = NumericAttributeBinaryTest(
                att_idx, split_value, True)
            best_suggestion = AttributeSplitSuggestion(
                num_att_binary_test, post_split_dist, merit)
    return best_suggestion
def get_best_split_suggestions(self, criterion, ht):
    """Collect the split candidates proposed by every attribute observer.

    Parameters
    ----------
    criterion: SplitCriterion
        The splitting criterion to be used.
    ht: HoeffdingTreeClassifier
        Tree whose ``no_preprune`` and ``binary_split`` settings are read.

    Returns
    -------
    list
        Split candidates. Unless ``ht.no_preprune`` is set, the first entry
        is the null split (no split at all).
    """
    observed_dist = self._observed_class_distribution
    suggestions = []

    if not ht.no_preprune:
        # The "do nothing" option competes with the real candidates.
        null_merit = criterion.get_merit_of_split(observed_dist, [observed_dist])
        suggestions.append(AttributeSplitSuggestion(None, [{}], null_merit))

    per_attribute = (
        observer.get_best_evaluated_split_suggestion(
            criterion, observed_dist, idx, ht.binary_split)
        for idx, observer in self._attribute_observers.items()
    )
    # Observers with nothing to propose return None and are skipped.
    suggestions.extend(s for s in per_attribute if s is not None)
    return suggestions
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
    """Pick the highest-merit split candidate for this nominal attribute.

    Parameters
    ----------
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    pre_split_dist
        Distribution observed before splitting; handed to ``criterion`` as is.
    att_idx
        Attribute index forwarded to the generated split tests.
    binary_only
        When false, a multiway split is evaluated first and used as the
        initial candidate.

    Returns
    -------
    AttributeSplitSuggestion or None
        The best candidate found (ties keep the earlier candidate), or
        ``None`` when nothing was evaluated.
    """
    winner = None

    if not binary_only:
        multiway_dist = self.get_class_dist_from_multiway_split()
        multiway_merit = criterion.get_merit_of_split(pre_split_dist, multiway_dist)
        winner = AttributeSplitSuggestion(
            NominalAttributeMultiwayTest(att_idx), multiway_dist, multiway_merit)

    # NOTE(review): the binary candidates are the *keys* of
    # _att_val_dist_per_class -- confirm these are attribute values rather
    # than class labels.
    for key in self._att_val_dist_per_class:
        binary_dist = self.get_class_dist_from_binary_split(key)
        binary_merit = criterion.get_merit_of_split(pre_split_dist, binary_dist)
        if winner is None or binary_merit > winner.merit:
            winner = AttributeSplitSuggestion(
                NominalAttributeBinaryTest(att_idx, key), binary_dist, binary_merit)

    return winner
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
    """Return the best binary split over the suggested numeric split points.

    Parameters
    ----------
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    pre_split_dist
        Distribution observed before splitting; handed to ``criterion`` as is.
    att_idx
        Attribute index forwarded to the generated split test.
    binary_only
        Not used by this implementation.

    Returns
    -------
    AttributeSplitSuggestion or None
        The candidate with the highest merit (ties keep the earlier split
        point), or ``None`` when no split point is suggested.
    """
    winner = None
    for threshold in self.get_split_point_suggestions():
        resulting_dists = self.get_class_dists_from_binary_split(threshold)
        score = criterion.get_merit_of_split(pre_split_dist, resulting_dists)
        if winner is None or score > winner.merit:
            winner = AttributeSplitSuggestion(
                NumericAttributeBinaryTest(att_idx, threshold, True),
                resulting_dists,
                score,
            )
    return winner
def get_best_split_suggestions(self, criterion, hot):
    """Collect the null split plus every attribute observer's best candidate.

    Parameters
    ----------
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    hot
        Object whose ``binary_split`` setting is forwarded to the observers.

    Returns
    -------
    list
        Split candidates; the first entry is always the null split.
    """
    observed_dist = self._observed_class_distribution

    # The "do nothing" option always competes with the real candidates.
    null_merit = criterion.get_merit_of_split(observed_dist, [observed_dist])
    suggestions = [AttributeSplitSuggestion(None, [{}], null_merit)]

    per_attribute = (
        observer.get_best_evaluated_split_suggestion(
            criterion, observed_dist, idx, hot.binary_split)
        for idx, observer in self._attribute_observers.items()
    )
    # Observers with nothing to propose return None and are skipped.
    suggestions.extend(s for s in per_attribute if s is not None)
    return suggestions
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
    """Pick the highest-merit split for a nominal attribute from per-value statistics.

    Parameters
    ----------
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    pre_split_dist
        Statistics before splitting, indexable by ``0``, ``1`` and ``2``.
    att_idx
        Attribute index forwarded to the generated split tests.
    binary_only
        When false, a multiway split over all observed values is evaluated
        first and used as the initial candidate.

    Returns
    -------
    AttributeSplitSuggestion or None
        The best candidate found (ties keep the earlier candidate), or
        ``None`` when nothing was evaluated.
    """
    feature_values = sorted(self._statistics)
    winner = None

    if not binary_only:
        per_value_dists = [self._statistics[value] for value in feature_values]
        multiway_merit = criterion.get_merit_of_split(pre_split_dist, per_value_dists)
        # Each value gets its position in the sorted order as branch id.
        value_to_branch = dict(zip(feature_values, range(len(feature_values))))
        winner = AttributeSplitSuggestion(
            NominalAttributeMultiwayTest(att_idx, value_to_branch),
            per_value_dists,
            multiway_merit,
        )

    for value in feature_values:
        value_dist = self._statistics[value]
        # Everything not in this value's branch: total minus this value.
        rest_dist = {k: pre_split_dist[k] - value_dist[k] for k in (0, 1, 2)}
        binary_dists = [value_dist, rest_dist]
        binary_merit = criterion.get_merit_of_split(pre_split_dist, binary_dists)
        if winner is None or binary_merit > winner.merit:
            winner = AttributeSplitSuggestion(
                NominalAttributeBinaryTest(att_idx, value),
                binary_dists,
                binary_merit,
            )

    return winner
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only=True):
    """Search the observer's tree for the best binary split.

    Stores the search context on ``self`` (criterion, pre-split
    distribution, attribute index and zeroed accumulators) and delegates to
    ``_find_best_split`` starting at ``self._root``.

    Parameters
    ----------
    criterion
        Split criterion stored in ``self._criterion``.
    pre_split_dist
        Indexable statistics; entries ``1`` and ``2`` give the shape of the
        running sum accumulators.
    att_idx
        Attribute index stored in ``self._att_idx``.
    binary_only
        Not used by this implementation.

    Returns
    -------
    AttributeSplitSuggestion
        Whatever ``_find_best_split`` returns, starting from a placeholder
        candidate with merit ``-inf``.
    """
    self._criterion, self._pre_split_dist, self._att_idx = (
        criterion, pre_split_dist, att_idx)

    # Accumulators start at zero.
    self._aux_k = 0
    self._aux_sum = np.zeros_like(pre_split_dist[1])
    self._aux_sq_sum = np.zeros_like(pre_split_dist[2])

    placeholder = AttributeSplitSuggestion(None, [{}], -float('inf'))
    return self._find_best_split(self._root, placeholder)
def search_for_best_split_option(self, current_node, current_best_option, criterion, att_idx):
    """In-order walk that evaluates a binary split at every node's cut point.

    Running left/right totals kept on ``self`` are shifted by the node's
    ``_left_statistics`` before the node is evaluated, and shifted back once
    the node's right subtree has been processed.

    Parameters
    ----------
    current_node
        Node exposing ``_left``, ``_right``, ``_cut_point`` and
        ``_left_statistics`` (keys 0/1/2); ``None`` ends the search.
    current_best_option
        Best suggestion so far (needs a ``merit`` attribute) or ``None``.
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    att_idx
        Attribute index forwarded to the generated split test.

    Returns
    -------
    The best suggestion after considering ``current_node`` and its subtrees.
    """
    if current_node is None or self._count_right_total == 0:
        return current_best_option

    if current_node._left is not None:
        current_best_option = self.search_for_best_split_option(
            current_node._left, current_best_option, criterion, att_idx)

    # Move this node's statistics from the right side to the left side.
    stats = current_node._left_statistics
    self._sum_total_left += stats[1]
    self._sum_total_right -= stats[1]
    self._sum_sq_total_left += stats[2]
    self._sum_sq_total_right -= stats[2]
    self._count_right_total -= stats[0]
    self._count_left_total += stats[0]

    left_side = {
        0: self._count_left_total,
        1: self._sum_total_left,
        2: self._sum_sq_total_left,
    }
    right_side = {
        0: self._count_right_total,
        1: self._sum_total_right,
        2: self._sum_sq_total_right,
    }
    post_split_dists = [left_side, right_side]
    # The pre-split statistics are the two sides added back together.
    pre_split_dist = [left_side[key] + right_side[key] for key in (0, 1, 2)]

    merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)
    if current_best_option is None or merit > current_best_option.merit:
        current_best_option = AttributeSplitSuggestion(
            NumericAttributeBinaryTest(att_idx, current_node._cut_point, True),
            post_split_dists,
            merit,
        )

    if current_node._right is not None:
        current_best_option = self.search_for_best_split_option(
            current_node._right, current_best_option, criterion, att_idx)

    # Undo the shift so the caller sees the totals it had before this node.
    stats = current_node._left_statistics
    self._sum_total_left -= stats.get(1)
    self._sum_total_right += stats.get(1)
    self._sum_sq_total_left -= stats.get(2)
    self._sum_sq_total_right += stats.get(2)
    self._count_left_total -= stats.get(0)
    self._count_right_total += stats.get(0)

    return current_best_option
def get_null_split(self, criterion):
    """Build the null split, i.e. the option of not splitting at all.

    Parameters
    ----------
    criterion: SplitCriterion
        The splitting criterion to be used.

    Returns
    -------
    AttributeSplitSuggestion
        A suggestion with no split test, whose merit is the criterion's
        score for "splitting" the observed distribution into itself.
    """
    observed_dist = self._observed_class_distribution
    null_merit = criterion.get_merit_of_split(observed_dist, [observed_dist])
    return AttributeSplitSuggestion(None, [{}], null_merit)
def _find_best_split(self, node, candidate):
    """In-order walk that evaluates a binary split at every node's value.

    Uses the search context stored on ``self`` (``_criterion``,
    ``_pre_split_dist``, ``_att_idx``) and the ``_aux_*`` accumulators,
    which carry the statistics of ancestors already passed on the way into
    a right subtree.

    Parameters
    ----------
    node
        Node exposing ``_left``, ``_right``, ``att_val``, ``k``,
        ``sum_target`` and ``sum_sq_target``.
    candidate
        Best suggestion found so far (needs a ``merit`` attribute).

    Returns
    -------
    The best suggestion after considering ``node`` and its subtrees.
    """
    if node._left is not None:
        candidate = self._find_best_split(node._left, candidate)

    # Left branch: this node's statistics plus what the accumulators hold.
    left_side = {
        0: node.k + self._aux_k,
        1: node.sum_target + self._aux_sum,
        2: node.sum_sq_target + self._aux_sq_sum,
    }
    # Right branch: whatever remains of the pre-split totals.
    right_side = {key: self._pre_split_dist[key] - left_side[key] for key in (0, 1, 2)}
    post_split_dists = [left_side, right_side]

    merit = self._criterion.get_merit_of_split(self._pre_split_dist, post_split_dists)
    if merit > candidate.merit:
        candidate = AttributeSplitSuggestion(
            NumericAttributeBinaryTest(self._att_idx, node.att_val, True),
            post_split_dists,
            merit,
        )

    if node._right is None:
        return candidate

    # Carry this node's statistics into the right subtree...
    self._aux_k += node.k
    self._aux_sum += node.sum_target
    self._aux_sq_sum += node.sum_sq_target

    from_right = self._find_best_split(node._right, candidate)
    if from_right.merit > candidate.merit:
        candidate = from_right

    # ...and take them back out before returning to the parent.
    self._aux_k -= node.k
    self._aux_sum -= node.sum_target
    self._aux_sq_sum -= node.sum_sq_target

    return candidate
def search_for_best_multiway_split_option(self, current_node, current_best_option, criterion, att_idx):
    """Evaluate the multiway split formed by a chain of value nodes.

    Starting at ``current_node`` and following ``_child`` links, one row of
    ``(count, sum, sum of squares)`` is collected per possible value and the
    resulting split is scored with ``criterion``.

    Fixes over the previous version: ``pre_split_dist`` is now a
    ``{0: count, 1: sum, 2: sum_sq}`` dict; it used to be a list wrapping a
    *set* literal, which drops duplicate values and has no positional
    meaning. The result array is also only allocated once the guard has
    passed.

    Parameters
    ----------
    current_node
        First node of the chain, exposing ``_statistics`` (keys 0/1/2) and
        ``_child``; ``None`` returns ``current_best_option`` unchanged.
    current_best_option
        Best suggestion so far (needs a ``merit`` attribute) or ``None``.
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    att_idx
        Attribute index forwarded to the generated split test.

    Returns
    -------
    The better of ``current_best_option`` and the multiway split.
    """
    if current_node is None or self._count_rest == 0:
        return current_best_option

    n_values = self._number_of_possible_values
    post_split_dists = np.zeros([n_values, 3])
    # NOTE(review): assumes the chain holds at least n_values nodes.
    for row in range(n_values):
        stats = current_node._statistics
        post_split_dists[row, 0] = stats.get(0)
        post_split_dists[row, 1] = stats.get(1)
        post_split_dists[row, 2] = stats.get(2)
        current_node = current_node._child

    pre_split_dist = {0: self._count, 1: self._sum_total, 2: self._sum_sq_total}
    merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)

    if current_best_option is None or merit > current_best_option.merit:
        nom_att_multiway_test = NominalAttributeMultiwayTest(att_idx)
        current_best_option = AttributeSplitSuggestion(
            nom_att_multiway_test, post_split_dists, merit)

    return current_best_option
def search_for_best_split_option(self, current_node, current_best_option, actual_parent_left, parent_left,
                                 parent_right, left_child, criterion, pre_split_dist, att_idx):
    """Pre-order walk that evaluates a binary split at every node's cut point.

    The left/right class distributions for a node are derived from the
    distributions computed for its parent, adjusted by the node's own
    ``_class_count_left`` / ``_class_count_right`` counters.

    Parameters
    ----------
    current_node
        Node exposing ``_class_count_left``, ``_class_count_right``,
        ``_cut_point``, ``_left`` and ``_right``; ``None`` ends the search.
    current_best_option
        Best suggestion so far (needs a ``merit`` attribute) or ``None``.
    actual_parent_left
        The parent's own ``_class_count_left`` (read only for left children).
    parent_left, parent_right
        Post-split distributions computed at the parent; ``parent_left`` is
        ``None`` for the first call.
    left_child
        Whether ``current_node`` is its parent's left child.
    criterion
        Object exposing ``get_merit_of_split(pre_split_dist, post_split_dist)``.
    pre_split_dist
        Distribution before splitting; handed to ``criterion`` as is.
    att_idx
        Attribute index forwarded to the generated split test.

    Returns
    -------
    The best suggestion after considering ``current_node`` and its subtrees.
    """
    if current_node is None:
        return current_best_option

    # NOTE(review): Counter arithmetic drops non-positive counts and
    # dict.update never removes keys, so a class whose count would reach
    # zero keeps its previous value -- confirm this is intended.
    def _merge(target, counts):
        target.update(dict(Counter(target) + Counter(counts)))

    def _remove(target, counts):
        target.update(dict(Counter(target) - Counter(counts)))

    left_dist = {}
    right_dist = {}
    if parent_left is None:
        # First call: the node's own counters are the distributions.
        _merge(left_dist, current_node._class_count_left)
        _merge(right_dist, current_node._class_count_right)
    else:
        _merge(left_dist, parent_left)
        _merge(right_dist, parent_right)
        if left_child:
            # Statistics belonging exactly to the parent's value.
            exact_parent_dist = {}
            _merge(exact_parent_dist, actual_parent_left)
            _remove(exact_parent_dist, current_node._class_count_left)
            _remove(exact_parent_dist, current_node._class_count_right)
            # Move this node's right subtree to the right side.
            _remove(left_dist, current_node._class_count_right)
            _merge(right_dist, current_node._class_count_right)
            # Move the parent's exact value to the right side.
            _merge(right_dist, exact_parent_dist)
            _remove(left_dist, exact_parent_dist)
        else:
            _merge(left_dist, current_node._class_count_left)
            _remove(right_dist, current_node._class_count_left)

    post_split_dists = [left_dist, right_dist]
    merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)

    if current_best_option is None or merit > current_best_option.merit:
        binary_test = NumericAttributeBinaryTest(
            att_idx=att_idx,
            att_value=current_node._cut_point,
            equal_passes_test=True,
        )
        current_best_option = AttributeSplitSuggestion(
            split_test=binary_test,
            resulting_class_distributions=post_split_dists,
            merit=merit,
        )

    # Left subtree first, then right; both start from this node's result.
    for subtree, is_left in ((current_node._left, True), (current_node._right, False)):
        current_best_option = self.search_for_best_split_option(
            current_node=subtree,
            current_best_option=current_best_option,
            actual_parent_left=current_node._class_count_left,
            parent_left=post_split_dists[0],
            parent_right=post_split_dists[1],
            left_child=is_left,
            criterion=criterion,
            pre_split_dist=pre_split_dist,
            att_idx=att_idx,
        )

    return current_best_option