예제 #1
0
    def train(self, training_examples, train_on_subset=True, num_trees=100, features_considered_per_node=2, **kwds):
        print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % (
                num_trees, len(training_examples), features_considered_per_node)
        self.trees = []
        total_test_output_stats = SummaryStats()

        binary_classification = all(example["_OUTPUT"] in [0,1] for example in training_examples)
        #binary_classification = True
        #for example in training_examples:
        #    output = example["_OUTPUT"]
        #    if output not in [0,1]:
        #        binary_classification = False
        #        break

        for tree_i in xrange(1, num_trees+1):
            tree = DecisionTree()
            self.trees.append(tree)

            test_set_ids = set(xrange(len(training_examples)))
            for i in xrange(len(training_examples)):
                if train_on_subset:  # N samples with replacement ("bootstrap")
                    index = random.randint(0, len(training_examples)-1)
                else:
                    index = i

                tree.add_example(training_examples[index])
                test_set_ids.discard(index)

            print "Growing tree %d/%d ..." % (tree_i, num_trees),
            tree.grow_tree(features_considered_per_node=features_considered_per_node)

            # Report the in-sample training error
            if binary_classification:
                print "area-under-curve for %d training examples is %2.2f" % (
                        len(tree.examples), tree.test(tree.examples, print_level=0))
            else:
                print "%2.2f avg err^2 on %d training examples" % (
                        tree.avg_squared_error(), len(tree.examples)),


            # Report the out-of-sample testing error, if we have any out-of-sample
            # examples to test on.
            if train_on_subset:
                print "; ",
                test_set = [training_examples[i] for i in test_set_ids]

                if binary_classification:
                    # Do a true out-of-sample test just on this one tree
                    # Temporarily make this a forest-of-one-tree...
                    save_trees = self.trees
                    self.trees = [tree]
                    self.test(test_set)
                    self.trees = save_trees
                else:
                    avg_squared_error = tree.avg_squared_error(test_set)
                    total_test_output_stats.add(avg_squared_error)

                    print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i),

            print