def train(self, training_examples, train_on_subset=True, num_trees=100, features_considered_per_node=2, **kwds): print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % ( num_trees, len(training_examples), features_considered_per_node) self.trees = [] total_test_output_stats = SummaryStats() binary_classification = all(example["_OUTPUT"] in [0,1] for example in training_examples) #binary_classification = True #for example in training_examples: # output = example["_OUTPUT"] # if output not in [0,1]: # binary_classification = False # break for tree_i in xrange(1, num_trees+1): tree = DecisionTree() self.trees.append(tree) test_set_ids = set(xrange(len(training_examples))) for i in xrange(len(training_examples)): if train_on_subset: # N samples with replacement ("bootstrap") index = random.randint(0, len(training_examples)-1) else: index = i tree.add_example(training_examples[index]) test_set_ids.discard(index) print "Growing tree %d/%d ..." % (tree_i, num_trees), tree.grow_tree(features_considered_per_node=features_considered_per_node) # Report the in-sample training error if binary_classification: print "area-under-curve for %d training examples is %2.2f" % ( len(tree.examples), tree.test(tree.examples, print_level=0)) else: print "%2.2f avg err^2 on %d training examples" % ( tree.avg_squared_error(), len(tree.examples)), # Report the out-of-sample testing error, if we have any out-of-sample # examples to test on. if train_on_subset: print "; ", test_set = [training_examples[i] for i in test_set_ids] if binary_classification: # Do a true out-of-sample test just on this one tree # Temporarily make this a forest-of-one-tree... save_trees = self.trees self.trees = [tree] self.test(test_set) self.trees = save_trees else: avg_squared_error = tree.avg_squared_error(test_set) total_test_output_stats.add(avg_squared_error) print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. 
for all %d trees so far]" % (len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i), print
def test_DecisionForest(): """Train a decision forest against an arbitrary formula, to see if it can approximate it to an arbitrary low error, given enough examples.""" def formula(x,y,z): return (x ** 2) + (x * y * z) + (10 * z) + (y / z) + 25 def random_input_output(): x = random.random() + 0.1 y = random.random() + 0.1 z = random.random() + 0.1 output = formula(x,y,z) return ({'x':x, 'y':y, 'z':z}, output) te = TrainingExamples() for i in xrange(1, 5000): (input, output) = random_input_output() te.add_example(input, output) if i % 500: continue print "Testing after", i, "training examples" forest = DecisionForest() forest.train(te, train_on_subset=True, num_trees=10, features_considered_per_node=3) # Measure the true out-of-sample error rate for the entire forest. predict_err = SummaryStats() for j in xrange(10000): (input, output) = random_input_output() predicted_output = forest.predict(input) predict_err.add((output - predicted_output) ** 2) print "avg squared error = ", predict_err.avg()
def avg_squared_error(self, examples=None):
    """Return the mean of (prediction - actual output)^2 over `examples`.

    When `examples` is omitted the tree's own training examples are used.
    NOTE(review): any falsy value (including an empty list) also triggers
    that fallback, so an empty test set is scored on the training data.
    """
    if not examples:
        examples = self.examples  # by default, use training examples
    squared_errors = SummaryStats()
    for ex in examples:
        residual = self.predict(ex) - float(ex["_OUTPUT"])
        squared_errors.add(residual ** 2)
    return squared_errors.avg()
def avg_squared_error(self, examples):
    """Return the mean squared difference between prediction and "_OUTPUT".

    Running this on the training set shows whether the forest managed to
    learn it: a very high training error suggests the forest was not
    powerful enough (increasing features_considered_per_node may help).

    The original author's note: in theory a forest should not overfit the
    training data, since each tree is trained on its own random bootstrap
    sample of the training examples.
    """
    squared_errors = SummaryStats()
    for ex in examples:
        residual = self.predict(ex) - float(ex["_OUTPUT"])
        squared_errors.add(residual ** 2)
    return squared_errors.avg()
def test(self, examples, print_level=1):
    """Computes the "area under the ROC curve".

    This is a way to measure the precision/recall WITHOUT choosing a
    cutoff-threshold.  It is mathematically equivalent to:
      "the probability that a random positive example has a higher
       prob_output1 than a random negative case"
    (This equivalence is non-obvious).

    The algorithm is described as computing this average probability by
    effectively trying all combinations of positive-vs-negative examples,
    in O(NlgN) instead of O(N^2).

    Args:
      examples: a TrainingExamples object (its .examples attribute is
        used) or an iterable of examples; every example's "_OUTPUT" must
        be 0 or 1.
      print_level: verbosity; values >= 2 print the lowest output-1 scores.

    NOTE(review): the code visible here only collects and sorts the
    scores and prints debug output; no AUC value is computed or returned
    in this portion -- confirm against the rest of the method.
    """
    # Accept either the container object or a plain sequence of examples.
    if type(examples) is TrainingExamples:
        examples = examples.examples
    prob_stats = SummaryStats()
    prob_hist = Histogram()
    output1_scores = list()  # scores of examples whose real output is 1
    output0_scores = list()  # scores of examples whose real output is 0
    for example in examples:
        assert example["_OUTPUT"] in [0, 1]
        # presumably the predicted probability that the output is 1 --
        # prob_output1 is defined elsewhere
        prob = self.prob_output1(example)
        prob_stats.add(prob)
        # Histogram bucket label, one tenth wide, e.g. "0.3-0.4".
        prob_key = "%1.1f-%1.1f" % (int(prob * 10) / 10.0,
                                    (int(prob * 10) + 1) / 10.0)
        if prob == 1:
            prob_key = "0.9-1.0"  # don't create a 1.0-1.1 bucket
        prob_hist.add(prob_key)
        real_output = example["_OUTPUT"] == 1
        if real_output:
            output1_scores.append(prob)
        else:
            output0_scores.append(prob)
    # Ascending order, so the slice printed below is the five lowest scores.
    output1_scores.sort()
    output0_scores.sort()
    if print_level >= 2:
        print "%d output1 scores:" % len(output1_scores),
        print ["%2.2f" % i for i in output1_scores[0:5]],
        print " ... ",
def predict(self, example):
    """Return the mean of every tree's prediction for `example`."""
    per_tree_outputs = SummaryStats()
    for member in self.trees:
        tree_output = member.predict(example)
        per_tree_outputs.add(tree_output)
    return per_tree_outputs.avg()
def _find_best_split(self, feature):
    """Pick the threshold on `feature` that best splits this node.

    Returns (less_than_threshold, split_err).  The threshold is meant to
    be used as:

        if example[feature] < less_than_threshold:
            decide_left ...
        else:
            decide_right ...

    Both values are None when no threshold leaves examples on both sides.
    Nothing is actually split here; the method only evaluates candidate
    thresholds.  As a side effect the cached sorted list for `feature` is
    dropped before returning.
    """
    self._sort_by_features(feature)
    ordered = self.examples_sorted_by_feature[feature]
    assert len(self.examples) == len(ordered)

    # Start with every example on the right-hand side ...
    below = SummaryStats()
    at_or_above = SummaryStats()
    for ex in self.examples:
        at_or_above.add(float(ex["_OUTPUT"]))

    # ... then walk the examples in ascending feature order, shifting them
    # to the left one at a time.  Each distinct feature value is a
    # candidate threshold, scored *before* its own examples are shifted.
    best_threshold = None
    best_err = None
    previous_value = None
    for ex in ordered:
        value = ex[feature]
        output = float(ex["_OUTPUT"])

        # Repeated feature values cannot be separated by a threshold, so
        # only the first occurrence of each value is scored.
        if not (value == previous_value):
            previous_value = value  # remember for next iteration
            n_below = below.count()
            n_above = at_or_above.count()
            # A side with no examples is not a true split.
            if n_below != 0 and n_above != 0:
                # Goodness-of-split: each side contributes
                # (count - 1) * var(); a side with one example adds 0.
                err_below = (n_below - 1) * below.var() if n_below > 1 else 0
                err_above = (
                    (n_above - 1) * at_or_above.var() if n_above > 1 else 0)
                err = err_below + err_above
                if best_err is None or err < best_err:
                    best_threshold = value
                    best_err = err

        below.add(output)
        at_or_above.remove(output)

    # to save memory, delete this sorted array (we'll never use it again)
    del self.examples_sorted_by_feature[feature]
    return (best_threshold, best_err)
class DecisionTree: """http://www-users.cs.umn.edu/~kumar/dmbook/ch4.pdf A real-output-valued decision tree, whose nodes split on real-valued inputs. You can still use this for binary classification by having 0/1 outputs. """ def __init__(self): self.examples = [] self.example_output_stats = SummaryStats() # For each feature, store a list of examples sorted by that feature's value self.examples_sorted_by_feature = collections.defaultdict(list) self.decision_feature = None self.decision_threshold = None # the two subtrees induced by the above decision function self.subtrees = [None, None] def add_example(self, example): self.examples.append(example) self.example_output_stats.add(float(example["_OUTPUT"])) def _examples_features(self): """Return the set of useable features from the examples. (Internally assumes examples[0] has all features.)""" return set(f for f in self.examples[0].keys() if not f.startswith("_")) def _sort_by_features(self, feature=None): """Called after final example is added. Only needs to be called once, for the root node, since sorting is preserved when splitting.""" if feature is None: features = self._examples_features() else: assert type(feature) is str features = [feature] for feature in features: if self.examples_sorted_by_feature[feature]: continue #already done self.examples_sorted_by_feature[feature] = list(self.examples) self.examples_sorted_by_feature[feature].sort(key=lambda e: e[feature]) def avg_squared_error(self, examples=None): "returns avg squared error of output" examples = examples or self.examples # by default, use training examples output_stats = SummaryStats() for example in examples: prediction = self.predict(example) output_stats.add((prediction - float(example["_OUTPUT"])) ** 2) return output_stats.avg() def predict(self, example): "recursively goes down decision tree, and returns avg output value at leaf." 
subtree = self._decision_subtree(example) if subtree is None: return self.example_output_stats.avg() else: return subtree.predict(example) def decision_str(self): return "%s < %s" % (self.decision_feature, self.decision_threshold) def print_tree(self, prefix=""): if self.decision_feature is None: print "Leaf(", len(self.examples), "examples,", print "avg_output=", self.example_output_stats.avg(), print "var_output=", self.example_output_stats.var(), print ")" else: print "Node(", len(self.examples), "examples,", print "decision = [", self.decision_str(), print "] )" prefix += " " print prefix, "false =>", self.subtrees[0].print_tree(prefix) print prefix, "true =>", self.subtrees[1].print_tree(prefix) def _find_best_split(self, feature): """Find the best threshold value to split this node on, using @feature. Returns (less_than_threshold, split_err). The @less_than_threshold is what you use to "decide", i.e.: if example[feature] < less_than_threshold: decide_left ... else: decide_right ... Note that this method doesn't actually split anything: it just figures out which threshold value would be best to split at. """ self._sort_by_features(feature) left_output_stats = SummaryStats() right_output_stats = SummaryStats() assert len(self.examples) == len(self.examples_sorted_by_feature[feature]) # To begin, let's assume we push all examples into the right child node. for example in self.examples: right_output_stats.add(float(example["_OUTPUT"])) # Now, move the examples one by one to the left child node. # (Note the examples sorted by value -- it's as if we're adjusting the # less_than_threshold.) # After each example, calculate the goodness-of-split, and track the best. best_threshold = None best_err = None last_feature_value = None for example in self.examples_sorted_by_feature[feature]: feature_value = example[feature] output_value = float(example["_OUTPUT"]) # Speed optimization: skip over examples with same feature value. 
if feature_value == last_feature_value: left_output_stats.add(output_value) right_output_stats.remove(output_value) continue last_feature_value = feature_value # remember for next iteration left_count = left_output_stats.count() right_count = right_output_stats.count() # Edge-case: left or right child is empty if left_count == 0 or right_count == 0: left_output_stats.add(output_value) right_output_stats.remove(output_value) continue # not a true split # Compute goodness-of-split: weighted average of the 2 output variances. if left_count <= 1: left_err = 0 else: left_err = (left_count - 1) * left_output_stats.var() if right_count <= 1: right_err = 0 else: right_err = (right_count - 1) * right_output_stats.var() err = left_err + right_err if best_err is None or err < best_err: best_threshold = feature_value best_err = err left_output_stats.add(output_value) right_output_stats.remove(output_value) # to save memory, delete this sorted array (we'll never use it again anyway) del self.examples_sorted_by_feature[feature] return (best_threshold, best_err) def _split_subtrees(self, feature, threshold): """Reset the two subtrees based on the current decision function.""" assert feature is not None assert threshold is not None assert len(self.examples) >= 2 self.decision_feature = feature self.decision_threshold = threshold self.subtrees = [DecisionTree(), DecisionTree()] for example in self.examples: decision = int(example[self.decision_feature] < self.decision_threshold) if decision == 0: self.subtrees[0].add_example(example) else: self.subtrees[1].add_example(example) def _decision_subtree(self, example): """returns one of the two child subtrees, or None if we are a leaf.""" if self.decision_feature is None: return None decision = example[self.decision_feature] < self.decision_threshold return self.subtrees[int(decision)] def grow_tree(self, features_considered_per_node=2): # Stop growing based on termination criteria: if len(self.examples) <= 1: return if 
self.example_output_stats.var() < 0.001: return # assume all examples have all features feature_names = self._examples_features() feature_subset = random.sample(feature_names, features_considered_per_node) best_feature = None best_threshold = None best_avg_error = None for feature in feature_subset: (threshold, avg_error) = self._find_best_split(feature) if avg_error is None: continue # no split possible (all same value?) if best_avg_error is None or avg_error < best_avg_error: best_feature = feature best_threshold = threshold best_avg_error = avg_error # Did we fail to find a good decision? if best_feature is None: return # Accept the best decision self._split_subtrees(best_feature, best_threshold) # Grow recursively at each branch. self.subtrees[0].grow_tree(features_considered_per_node) self.subtrees[1].grow_tree(features_considered_per_node)