def train(self, training_examples, train_on_subset=True, num_trees=100, features_considered_per_node=2, **kwds): print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % ( num_trees, len(training_examples), features_considered_per_node) self.trees = [] total_test_output_stats = SummaryStats() binary_classification = all(example["_OUTPUT"] in [0,1] for example in training_examples) #binary_classification = True #for example in training_examples: # output = example["_OUTPUT"] # if output not in [0,1]: # binary_classification = False # break for tree_i in xrange(1, num_trees+1): tree = DecisionTree() self.trees.append(tree) test_set_ids = set(xrange(len(training_examples))) for i in xrange(len(training_examples)): if train_on_subset: # N samples with replacement ("bootstrap") index = random.randint(0, len(training_examples)-1) else: index = i tree.add_example(training_examples[index]) test_set_ids.discard(index) print "Growing tree %d/%d ..." % (tree_i, num_trees), tree.grow_tree(features_considered_per_node=features_considered_per_node) # Report the in-sample training error if binary_classification: print "area-under-curve for %d training examples is %2.2f" % ( len(tree.examples), tree.test(tree.examples, print_level=0)) else: print "%2.2f avg err^2 on %d training examples" % ( tree.avg_squared_error(), len(tree.examples)), # Report the out-of-sample testing error, if we have any out-of-sample # examples to test on. if train_on_subset: print "; ", test_set = [training_examples[i] for i in test_set_ids] if binary_classification: # Do a true out-of-sample test just on this one tree # Temporarily make this a forest-of-one-tree... save_trees = self.trees self.trees = [tree] self.test(test_set) self.trees = save_trees else: avg_squared_error = tree.avg_squared_error(test_set) total_test_output_stats.add(avg_squared_error) print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i), print