def decision_tree_learning(examples, attrs, parent_examples=()):
    # Note: `target` and `dataset` are free variables here; in the full module
    # this function is nested inside DecisionTreeLearner(dataset), which binds both.
    if len(examples) == 0:
        return plurality_value(parent_examples)
    if all_same_class(examples):
        return DecisionLeaf(examples[0][target])
    if len(attrs) == 0:
        return plurality_value(examples)
    A = choose_attribute(attrs, examples)
    tree = DecisionFork(A, dataset.attr_names[A], plurality_value(examples))
    for (v_k, exs) in split_by(A, examples):
        subtree = decision_tree_learning(exs, remove_all(A, attrs), examples)
        tree.add(v_k, subtree)
    return tree
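# Illustrative sketch only: the helpers above (plurality_value, choose_attribute,
# split_by, DecisionFork, ...) live elsewhere in the module, so below is a
# self-contained toy version of the same recursion. Every name here (learn,
# entropy, remainder, the dict-based tree) is a hypothetical stand-in, not
# this module's API.
from collections import Counter
import math


def entropy(labels):
    """H(Y) in bits for a list of class labels."""
    counts = Counter(labels)
    total = len(labels)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())


def learn(examples, attrs, target, parent_examples):
    if not examples:                      # no data left: majority of parent
        return Counter(e[target] for e in parent_examples).most_common(1)[0][0]
    labels = [e[target] for e in examples]
    if len(set(labels)) == 1:             # pure node: return that class
        return labels[0]
    if not attrs:                         # no attributes left: majority vote
        return Counter(labels).most_common(1)[0][0]

    def remainder(a):
        """Expected entropy of the target after splitting on attribute a."""
        groups = {}
        for e in examples:
            groups.setdefault(e[a], []).append(e[target])
        return sum(len(g) / len(examples) * entropy(g) for g in groups.values())

    # Pick the attribute with maximum information gain, then recurse per value.
    best = max(attrs, key=lambda a: entropy(labels) - remainder(a))
    rest = [a for a in attrs if a != best]
    return {best: {v: learn([e for e in examples if e[best] == v],
                            rest, target, examples)
                   for v in {e[best] for e in examples}}}


# Toy run on hypothetical weather data:
data = [
    {'outlook': 'sunny', 'windy': False, 'play': 'no'},
    {'outlook': 'sunny', 'windy': True, 'play': 'no'},
    {'outlook': 'overcast', 'windy': False, 'play': 'yes'},
    {'outlook': 'rainy', 'windy': False, 'play': 'yes'},
    {'outlook': 'rainy', 'windy': True, 'play': 'no'},
]
print(learn(data, ['outlook', 'windy'], 'play', data))
# -> a nested dict split on 'outlook' first, then on 'windy' under 'rainy'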
def set_problem(self, target, inputs=None, exclude=()):
    """
    Set (or change) the target and/or inputs.
    This way, one DataSet can be used multiple ways. inputs, if specified,
    is a list of attributes; alternatively, specify exclude as a list of
    attributes to not use in inputs. Attributes can be -n .. n, or an
    attr_name. Also computes the list of possible values, if that wasn't
    done yet.
    """
    self.target = self.attr_num(target)
    exclude = list(map(self.attr_num, exclude))
    if inputs:
        self.inputs = remove_all(self.target, inputs)
    else:
        self.inputs = [a for a in self.attrs
                       if a != self.target and a not in exclude]
    if not self.values:
        self.update_values()
    self.check_me()
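# A minimal sketch of the attribute-resolution and input-selection logic that
# set_problem relies on. MiniDataSet is a stripped-down stand-in written for
# illustration, not the module's DataSet class; it assumes attributes are
# column indices 0..n-1 addressable by name or by (possibly negative) index.
class MiniDataSet:
    def __init__(self, attr_names):
        self.attr_names = attr_names
        self.attrs = list(range(len(attr_names)))

    def attr_num(self, attr):
        """Accept an attribute name or a -n .. n index; return a plain index."""
        if isinstance(attr, str):
            return self.attr_names.index(attr)
        if attr < 0:
            return len(self.attrs) + attr
        return attr

    def set_problem(self, target, inputs=None, exclude=()):
        self.target = self.attr_num(target)
        exclude = [self.attr_num(a) for a in exclude]
        if inputs:
            self.inputs = [a for a in inputs if a != self.target]
        else:
            self.inputs = [a for a in self.attrs
                           if a != self.target and a not in exclude]


ds = MiniDataSet(['outlook', 'humidity', 'windy', 'play'])
ds.set_problem(target=-1, exclude=['windy'])   # -1 resolves to the last column
print(ds.target, ds.inputs)                    # 3 [0, 1]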
import math  # needed for math.log2


def information_content(values):
    """Number of bits to represent the probability distribution in values."""
    # remove_all drops zero counts (log2(0) is undefined); normalize rescales
    # the remaining counts so they sum to 1. Both are utility helpers from
    # elsewhere in the codebase.
    probabilities = normalize(remove_all(0, values))
    return sum(-p * math.log2(p) for p in probabilities)
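# Quick sanity check of the formula, with inline stand-ins for the
# normalize/remove_all helpers (illustrative only): a fair coin needs 1 bit,
# a 90/10 split far less, and a certain outcome none.
import math


def info_content(values):
    total = sum(v for v in values if v != 0)          # drop zeros, then normalize
    probabilities = [v / total for v in values if v != 0]
    return sum(-p * math.log2(p) for p in probabilities)


print(info_content([1, 1]))   # 1.0 bit   (fair coin)
print(info_content([9, 1]))   # ~0.469 bits (90/10 split)
print(info_content([4, 0]))   # -0.0: a sure outcome needs no bits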