import numpy as np

import util


def get_distributions(tree, instances):
    """
    Calculate piecewise distributions of good, bad, and unlabeled instances.

    Parameters
    ----------
    tree : TreeNode
        The decision tree.
    instances : DataFrame
        Labeled instances used to train the decision tree. Must have a
        column for each attribute in the tree, and a 'class' column with
        values 'good', 'bad', or NaN (unlabeled).

    Returns
    -------
    dict
        key: parameter name
        value: list of tuples, each of which describes a range of values
            for the parameter, delimited by a pair of split points in the
            tree: (low, high, good_count, bad_count, unlabeled_count)
    """
    # key: parameter name
    # value: list of split values
    splits = {}
    for node in tree.get_internal_nodes():
        if node.split_attribute not in splits:
            splits[node.split_attribute] = [node.split_value]
        else:
            splits[node.split_attribute].append(node.split_value)

    # key: parameter name
    # value: list of tuples: (low, high, good_count, bad_count, unlabeled_count)
    distributions = {}
    for param, split_values in splits.items():
        # Bracket the sorted split values with the observed min and max so
        # the segments cover the parameter's full range.
        split_values.sort()
        min_param_value = instances[param].min()
        max_param_value = instances[param].max()
        split_values.insert(0, min_param_value)
        split_values.append(max_param_value)

        # FIXME: There must be a more efficient way to use pandas for this
        df = instances
        segments = []
        for i in range(len(split_values) - 1):
            low = split_values[i]
            high = split_values[i + 1]
            # The first segment is closed on both ends; later segments are
            # half-open (low, high] so each instance is counted exactly once.
            if i == 0:
                df2 = df[df[param] <= high]
            else:
                df2 = df[(df[param] > low) & (df[param] <= high)]
            counts = df2['class'].value_counts(dropna=False)
            good = counts.loc['good'] if 'good' in counts.index else 0
            bad = counts.loc['bad'] if 'bad' in counts.index else 0
            unlabeled = counts.loc[np.nan] if np.nan in counts.index else 0
            segments.append((low, high, int(good), int(bad), int(unlabeled)))
        distributions[param] = segments

    return distributions
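# Hedged usage sketch for get_distributions(). _ExampleNode and _ExampleTree
# are hypothetical stand-ins, not part of this codebase; they assume only
# that a tree exposes get_internal_nodes() and that internal nodes carry
# 'split_attribute' and 'split_value', as the function above requires.
def _example_get_distributions():
    import pandas as pd

    class _ExampleNode:
        def __init__(self, attr, value):
            # Minimal node: just the two attributes get_distributions() reads.
            self.split_attribute = attr
            self.split_value = value

    class _ExampleTree:
        def get_internal_nodes(self):
            # A single split on parameter 'x' at 5.0.
            return [_ExampleNode('x', 5.0)]

    frame = pd.DataFrame({
        'x': [1.0, 4.0, 6.0, 9.0],
        'class': ['good', 'bad', 'good', np.nan],
    })
    # Expected result, given the half-open segment logic above:
    # {'x': [(1.0, 5.0, 1, 1, 0), (5.0, 9.0, 1, 0, 1)]}
    return get_distributions(_ExampleTree(), frame)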
def get_parameter_distributions(tree):
    """
    Calculate parameter distributions based on the given decision tree.

    For each parameter/attribute, a piecewise distribution is computed.
    Each segment of this distribution has two endpoints (on the 'x' axis)
    and a constant weight (on the 'y' axis). The weight of a segment is
    the number of "good" instances within its range minus the number of
    "bad" instances. Each leaf of the tree may contribute a segment to
    this distribution, based on the attribute split points along the path
    from the leaf to the root.

    Parameters
    ----------
    tree : TreeNode
        The root node of the tree.

    Returns
    -------
    dict
        key: parameter name
        value: list of non-overlapping weighted segments, each of which is
            a tuple in the form (low, high, weight)
    """
    # upper and lower limits for parameter ranges
    # key: parameter name
    # value: (low, high)
    limits = {}

    # Assemble the parameter ranges for each leaf into a list of weighted
    # segments for each parameter.
    # key: parameter name
    # value: list of (low, high, weight) segments for the parameter
    segments = {}
    for leaf in tree.get_leaves():
        weight = leaf.instance_count - leaf.misclassified_count
        if leaf.class_label == 'bad':
            weight = -weight
        for param, (low, high) in get_ranges_for_leaf(leaf).items():
            # Ranges can have None for low or high if there is no lower or
            # upper bound, respectively. Replace None with the lowest or
            # highest valid value for the parameter. Compare against None
            # explicitly: a bound of 0 is a real bound, not a missing one.
            if param not in limits:
                param_base_name = util.remove_trailing_digits(param)
                limits[param] = valid_param_ranges[param_base_name]
            if low is None:
                low = limits[param][0]
            if high is None:
                high = limits[param][1]
            seg = (low, high, weight)
            if param not in segments:
                segments[param] = [seg]
            else:
                segments[param].append(seg)

    # For each parameter, combine the weighted segments into a single
    # piecewise distribution of non-overlapping segments.
    # key: parameter name
    # value: list of combined segments for the parameter
    combined_segments = {}
    for param, param_segments in segments.items():
        combined_segments[param] = combine_weighted_segments(param_segments)
    return combined_segments
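# Hedged sketch of the segment-combining step. combine_weighted_segments is
# defined elsewhere in this codebase; the function below is an illustrative
# stand-in, not the actual implementation, and it assumes simple endpoint
# semantics (each adjacent pair of sorted endpoints bounds one output
# segment). It shows the general technique: sweep the endpoints and, for
# each resulting interval, sum the weights of all input segments covering
# it, producing non-overlapping (low, high, weight) tuples.
def _combine_weighted_segments_sketch(weighted_segments):
    # Collect every distinct endpoint; adjacent pairs bound output segments.
    points = sorted({p for low, high, _ in weighted_segments
                     for p in (low, high)})
    combined = []
    for low, high in zip(points, points[1:]):
        # Sum the weights of all input segments that fully cover [low, high].
        weight = sum(w for seg_low, seg_high, w in weighted_segments
                     if seg_low <= low and high <= seg_high)
        combined.append((low, high, weight))
    return combined


# For example, under these assumptions:
#     _combine_weighted_segments_sketch([(0, 10, 3), (5, 15, -2)])
# would yield [(0, 5, 3), (5, 10, 1), (10, 15, -2)].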