def learn_tree(self, input_features, data_subset):
    """returns a decision tree
    input_features is a set of possible conditions
    data_subset is a subset of the data used to build this (sub)tree

    where a decision tree is a function that takes an example and
    makes a prediction on the target feature
    """
    if (input_features and len(data_subset) >= self.min_number_examples):
        first_target_val = self.target(data_subset[0])
        allagree = all(self.target(inst)==first_target_val for inst in data_subset)
        if not allagree:
            split, partn = self.select_split(input_features, data_subset)
            if split:   # the split succeeded in splitting the data
                false_examples, true_examples = partn
                rem_features = [fe for fe in input_features if fe != split]
                self.display(2,"Splitting on",split.__doc__,"with examples split",
                             len(true_examples),":",len(false_examples))
                true_tree = self.learn_tree(rem_features, true_examples)
                false_tree = self.learn_tree(rem_features, false_examples)
                def fun(e):
                    if split(e):
                        return true_tree(e)
                    else:
                        return false_tree(e)
                #fun = lambda e: true_tree(e) if split(e) else false_tree(e)
                fun.__doc__ = ("if "+split.__doc__+" then ("+true_tree.__doc__+
                               ") else ("+false_tree.__doc__+")")
                return fun
    # don't expand the tree any further; return a point prediction at this leaf
    return point_prediction(self.target, data_subset, selection=self.leaf_selection)
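# The sketch below is an illustrative, self-contained usage example (not part
# of the learner above): tiny_learn_tree, the toy examples, and the condition
# functions are assumptions made for the demo.  It shows the key idea of
# learn_tree: the learned "tree" is just a Python function from an example to
# a predicted target value, built recursively from a split condition and the
# subtrees for the two partitions.

def tiny_learn_tree(features, examples, target):
    """simplified stand-in for learn_tree: split on the first feature that
    separates the examples; otherwise predict the majority target value"""
    vals = [target(e) for e in examples]
    majority = max(set(vals), key=vals.count)
    if len(set(vals)) == 1 or not features:
        return lambda e: majority
    split, *rest = features
    true_exs = [e for e in examples if split(e)]
    false_exs = [e for e in examples if not split(e)]
    if not true_exs or not false_exs:   # this split failed to separate anything
        return tiny_learn_tree(rest, examples, target)
    true_tree = tiny_learn_tree(rest, true_exs, target)
    false_tree = tiny_learn_tree(rest, false_exs, target)
    return lambda e: true_tree(e) if split(e) else false_tree(e)

if __name__ == "__main__":
    examples = [{"outlook": "sunny", "windy": False, "play": 1},
                {"outlook": "sunny", "windy": True,  "play": 1},
                {"outlook": "rain",  "windy": False, "play": 0},
                {"outlook": "rain",  "windy": True,  "play": 0}]
    is_sunny = lambda e: e["outlook"] == "sunny"
    is_windy = lambda e: e["windy"]
    tree = tiny_learn_tree([is_sunny, is_windy], examples,
                           target=lambda e: e["play"])
    print([tree(e) for e in examples])   # -> [1, 1, 0, 0]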
def training_error(dataset, data_subset, to_optimize):
    """returns the training error of data_subset on the to_optimize criterion.
    This assumes that we choose the point prediction that is best for the
    optimization criterion, as selected by point_prediction
    """
    select_dict = {"sum-of-squares": "mean",   # the mean minimizes squared error
                   "sum_absolute": "median",   # the median minimizes absolute error
                   "logloss": "Laplace"}       # Laplace smoothing keeps log loss finite
    selection = select_dict[to_optimize]
    predictor = point_prediction(dataset.target, data_subset,
                                 selection=selection)
    error = sum(error_example(predictor(example), dataset.target(example),
                              to_optimize)
                for example in data_subset)
    return error
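# training_error relies on two helpers defined elsewhere in the code:
# point_prediction (which returns a constant predictor for a data subset) and
# error_example (the per-example error under the chosen criterion).  The
# stand-ins below are a rough sketch of that contract for illustration only;
# the names sketch_point_prediction and sketch_error_example, and the exact
# Laplace/logloss formulas used, are assumptions, not the library's code.

import math
import statistics

def sketch_point_prediction(target, data_subset, selection="mean"):
    """returns a constant predictor chosen from the targets of data_subset"""
    vals = [target(e) for e in data_subset]
    if selection == "mean":
        value = statistics.mean(vals)
    elif selection == "median":
        value = statistics.median(vals)
    else:   # "Laplace": smoothed proportion of 1s, assuming a {0,1} target
        value = (sum(vals) + 1) / (len(vals) + 2)
    return lambda e: value

def sketch_error_example(predicted, actual, to_optimize):
    """error of one prediction under the given optimization criterion"""
    if to_optimize == "sum-of-squares":
        return (predicted - actual) ** 2
    elif to_optimize == "sum_absolute":
        return abs(predicted - actual)
    else:   # "logloss", assuming actual is 0 or 1 and 0 < predicted < 1
        return -math.log(predicted) if actual == 1 else -math.log(1 - predicted)

if __name__ == "__main__":
    targets = [1, 1, 0, 1]   # toy {0,1} target values for one data subset
    predictor = sketch_point_prediction(lambda v: v, targets, selection="Laplace")
    total = sum(sketch_error_example(predictor(v), v, "logloss") for v in targets)
    print(total)             # total log loss of the Laplace constant predictor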