Example #1
    def train(self, training_examples, train_on_subset=True, num_trees=100, features_considered_per_node=2, **kwds):
        print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % (
                num_trees, len(training_examples), features_considered_per_node)
        self.trees = []
        total_test_output_stats = SummaryStats()

        binary_classification = all(example["_OUTPUT"] in [0,1] for example in training_examples)

        for tree_i in xrange(1, num_trees+1):
            tree = DecisionTree()
            self.trees.append(tree)

            test_set_ids = set(xrange(len(training_examples)))
            for i in xrange(len(training_examples)):
                if train_on_subset:  # N samples with replacement ("bootstrap")
                    index = random.randint(0, len(training_examples)-1)
                else:
                    index = i

                tree.add_example(training_examples[index])
                test_set_ids.discard(index)

            print "Growing tree %d/%d ..." % (tree_i, num_trees),
            tree.grow_tree(features_considered_per_node=features_considered_per_node)

            # Report the in-sample training error
            if binary_classification:
                print "area-under-curve for %d training examples is %2.2f" % (
                        len(tree.examples), tree.test(tree.examples, print_level=0))
            else:
                print "%2.2f avg err^2 on %d training examples" % (
                        tree.avg_squared_error(), len(tree.examples)),


            # Report the out-of-sample testing error, if we have any out-of-sample
            # examples to test on.
            if train_on_subset:
                print "; ",
                test_set = [training_examples[i] for i in test_set_ids]

                if binary_classification:
                    # Do a true out-of-sample test just on this one tree
                    # Temporarily make this a forest-of-one-tree...
                    save_trees = self.trees
                    self.trees = [tree]
                    self.test(test_set)
                    self.trees = save_trees
                else:
                    avg_squared_error = tree.avg_squared_error(test_set)
                    total_test_output_stats.add(avg_squared_error)

                    print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i),

            print
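
All of the snippets in this section lean on a SummaryStats helper that is never
shown. The sketch below is an assumption about its minimal interface (add,
remove, count, avg, var), inferred from the call sites; the original class may
well differ (for instance, it might use Welford's algorithm for better
numerical stability).

class SummaryStats(object):
    """Hypothetical running-statistics helper -- not the original implementation."""
    def __init__(self):
        self.n = 0
        self.total = 0.0
        self.total_sq = 0.0

    def add(self, x):
        self.n += 1
        self.total += x
        self.total_sq += x * x

    def remove(self, x):
        # Needed by _find_best_split, which shuttles examples between children.
        self.n -= 1
        self.total -= x
        self.total_sq -= x * x

    def count(self):
        return self.n

    def avg(self):
        return self.total / self.n if self.n else 0.0

    def var(self):
        # Sample variance, so that (n - 1) * var() is the sum of squared
        # deviations -- the quantity _find_best_split uses as its split error.
        if self.n <= 1:
            return 0.0
        return (self.total_sq - self.total * self.total / self.n) / (self.n - 1)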
Example #2
def test_DecisionForest():
    """Train a decision forest against an arbitrary formula, to see if it can
    approximate it to an arbitrary low error, given enough examples."""
    def formula(x,y,z):
        return (x ** 2) + (x * y * z) + (10 * z) + (y / z) + 25

    def random_input_output():
        x = random.random() + 0.1
        y = random.random() + 0.1
        z = random.random() + 0.1
        output = formula(x,y,z)
        return ({'x':x, 'y':y, 'z':z}, output)


    te = TrainingExamples()
    for i in xrange(1, 5001):
        (input, output) = random_input_output()
        te.add_example(input, output)

        if i % 500: continue

        print "Testing after", i, "training examples"
        forest = DecisionForest()
        forest.train(te, train_on_subset=True, num_trees=10, features_considered_per_node=3)

        # Measure the true out-of-sample error rate for the entire forest.
        predict_err = SummaryStats()
        for j in xrange(10000):
            (input, output) = random_input_output()
            predicted_output = forest.predict(input)
            predict_err.add((output - predicted_output) ** 2)
        print "avg squared error = ", predict_err.avg()
Example #3
    def avg_squared_error(self, examples=None):
        "Returns the avg squared error of the predicted output."
        if examples is None:
            examples = self.examples  # by default, use the training examples
        output_stats = SummaryStats()
        for example in examples:
            prediction = self.predict(example)
            output_stats.add((prediction - float(example["_OUTPUT"])) ** 2)
        return output_stats.avg()
Example #4
    def avg_squared_error(self, examples):
        """Returns the average squared error of the predicted output.

        This is useful for checking whether the DecisionForest was able to
        learn the training set completely. If the training error is very high,
        the forest was not powerful enough to learn the training set (and
        perhaps features_considered_per_node should be increased).

        In theory, a DecisionForest should never overfit the training data,
        because each tree was trained on a random bootstrap sample of the
        training examples.
        """
        errors = SummaryStats()
        for example in examples:
            prediction = self.predict(example)
            squared_error = (prediction - float(example["_OUTPUT"])) ** 2
            errors.add(squared_error)
        return errors.avg()
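
For example, a quick training-error check might look like this (hypothetical
usage: forest and te are as in Example #2, and the 1.0 cutoff is an arbitrary
number chosen purely for illustration):

train_err = forest.avg_squared_error(te.examples)
if train_err > 1.0:
    print "training error %.2f is high; try raising features_considered_per_node" % train_err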
Example #5
    def test(self, examples, print_level=1):
        """Computes the "area under the ROC curve". This is a way to measure the
        precision/recall WITHOUT choosing a cutoff-threshold.  It is mathematically
        equivalent to:
           "the probability that a random positive example has a higher
            prob_output1 than a random negative case"
        (This equivalence is non-obvious).

        The algorithm below computes this average probability by effectively trying
        all combinations of positive-vs-negative examples, but does this in O(NlgN)
        instead of O(N^2)"""
        if type(examples) is TrainingExamples:
            examples = examples.examples

        prob_stats = SummaryStats()
        prob_hist = Histogram()
        output1_scores = list()
        output0_scores = list()
        for example in examples:
            assert example["_OUTPUT"] in [0, 1]
            prob = self.prob_output1(example)
            prob_stats.add(prob)
            prob_key = "%1.1f-%1.1f" % (int(prob * 10) / 10.0, (int(prob * 10) + 1) / 10.0)
            if prob == 1:
                prob_key = "0.9-1.0"  # don't create a 1.0-1.1 bucket
            prob_hist.add(prob_key)
            real_output = example["_OUTPUT"] == 1
            if real_output:
                output1_scores.append(prob)
            else:
                output0_scores.append(prob)

        output1_scores.sort()
        output0_scores.sort()

        if print_level >= 2:
            print "%d output1 scores:" % len(output1_scores),
            print ["%2.2f" % i for i in output1_scores[0:5]],
            print " ... ",
Example #6
    def predict(self, example):
        """Returns the average predicted output over all our trees."""
        output_stats = SummaryStats()
        for tree in self.trees:
            output_stats.add(tree.predict(example))
        return output_stats.avg()
Example #7
    def _find_best_split(self, feature):
        """Find the best threshold value to split this node on, using @feature.
        Returns (less_than_threshold, split_err).
        The @less_than_threshold is what you use to "decide", i.e.:
            if example[feature] < less_than_threshold:
                decide_left ...
            else:
                decide_right ...
        Note that this method doesn't actually split anything: it just figures out
        which threshold value would be best to split at.
        """
        self._sort_by_features(feature)
        left_output_stats = SummaryStats()
        right_output_stats = SummaryStats()
        assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

        # To begin, let's assume we push all examples into the right child node.
        for example in self.examples:
            right_output_stats.add(float(example["_OUTPUT"]))

        # Now, move the examples one by one to the left child node.
        # (Note the examples are sorted by value -- it's as if we're gradually
        # raising the less_than_threshold.)
        # After each example, calculate the goodness-of-split, and track the best.
        best_threshold = None
        best_err = None
        last_feature_value = None
        for example in self.examples_sorted_by_feature[feature]:
            feature_value = example[feature]
            output_value = float(example["_OUTPUT"])

            # Speed optimization: skip over examples with same feature value.
            if feature_value == last_feature_value:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue

            last_feature_value = feature_value  # remember for next iteration

            left_count = left_output_stats.count()
            right_count = right_output_stats.count()

            # Edge-case: left or right child is empty
            if left_count == 0 or right_count == 0:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue  # not a true split

            # Compute goodness-of-split: the total within-child squared error.
            # ((count - 1) * var() is each child's sum of squared deviations.)
            if left_count <= 1: left_err = 0
            else: left_err = (left_count - 1) * left_output_stats.var()

            if right_count <= 1: right_err = 0
            else: right_err = (right_count - 1) * right_output_stats.var()

            err = left_err + right_err
            if best_err is None or err < best_err:
                best_threshold = feature_value
                best_err = err

            left_output_stats.add(output_value)
            right_output_stats.remove(output_value)

        # to save memory, delete this sorted array (we'll never use it again anyway)
        del self.examples_sorted_by_feature[feature]
        return (best_threshold, best_err)
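
To see what the scan is minimizing, here is a hypothetical brute-force version
on toy data. It evaluates every candidate threshold directly; _find_best_split
reaches the same answer in a single pass by shuttling examples between two
running SummaryStats, whose (count - 1) * var() equals the sse() computed here.

def sse(values):
    """Sum of squared deviations from the mean."""
    if len(values) <= 1:
        return 0.0
    mean = sum(values) / float(len(values))
    return sum((v - mean) ** 2 for v in values)

pairs = [(1, 5.0), (2, 5.1), (3, 5.0), (4, 9.0), (5, 9.2)]  # (feature, output), pre-sorted
best_threshold, best_err = None, None
for i in xrange(1, len(pairs)):
    threshold = pairs[i][0]  # "feature < threshold" sends the earlier examples left
    left = [output for (feature, output) in pairs[:i]]
    right = [output for (feature, output) in pairs[i:]]
    err = sse(left) + sse(right)
    if best_err is None or err < best_err:
        best_threshold, best_err = threshold, err

print best_threshold  # 4 -- splitting at "feature < 4" separates the 5s from the 9s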
Example #8
class DecisionTree:
    """http://www-users.cs.umn.edu/~kumar/dmbook/ch4.pdf
    A real-output-valued decision tree, whose nodes split on real-valued
    inputs. You can still use this for binary classification by having 0/1 outputs.
    """
    def __init__(self):
        self.examples = []
        self.example_output_stats = SummaryStats()
        # For each feature, store a list of examples sorted by that feature's value
        self.examples_sorted_by_feature = collections.defaultdict(list)

        self.decision_feature = None
        self.decision_threshold = None

        # the two subtrees induced by the above decision function
        self.subtrees = [None, None]

    def add_example(self, example):
        self.examples.append(example)
        self.example_output_stats.add(float(example["_OUTPUT"]))

    def _examples_features(self):
        """Return the set of useable features from the examples.
        (Internally assumes examples[0] has all features.)"""
        return set(f for f in self.examples[0].keys() if not f.startswith("_"))

    def _sort_by_features(self, feature=None):
        """Called after final example is added. Only needs to be called once,
        for the root node, since sorting is preserved when splitting."""
        if feature is None:
            features = self._examples_features()
        else:
            assert type(feature) is str
            features = [feature]

        for feature in features:
            if self.examples_sorted_by_feature[feature]: continue  #already done
            self.examples_sorted_by_feature[feature] = list(self.examples)
            self.examples_sorted_by_feature[feature].sort(key=lambda e: e[feature])

    def avg_squared_error(self, examples=None):
        "returns avg squared error of output"
        examples = examples or self.examples  # by default, use training examples
        output_stats = SummaryStats()
        for example in examples:
            prediction = self.predict(example)
            output_stats.add((prediction - float(example["_OUTPUT"])) ** 2)
        return output_stats.avg()

    def predict(self, example):
        "recursively goes down decision tree, and returns avg output value at leaf."
        subtree = self._decision_subtree(example)
        if subtree is None:
            return self.example_output_stats.avg()
        else:
            return subtree.predict(example)

    def decision_str(self):
        return "%s < %s" % (self.decision_feature, self.decision_threshold)

    def print_tree(self, prefix=""):
        if self.decision_feature is None:
            print "Leaf(", len(self.examples), "examples,",
            print "avg_output=", self.example_output_stats.avg(),
            print "var_output=", self.example_output_stats.var(),
            print ")"
        else:
            print "Node(", len(self.examples), "examples,",
            print "decision = [", self.decision_str(),
            print "] )"

            prefix += "    "
            print prefix, "false =>",
            self.subtrees[0].print_tree(prefix)

            print prefix, "true  =>",
            self.subtrees[1].print_tree(prefix)

    def _find_best_split(self, feature):
        """Find the best threshold value to split this node on, using @feature.
        Returns (less_than_threshold, split_err).
        The @less_than_threshold is what you use to "decide", i.e.:
            if example[feature] < less_than_threshold:
                decide_left ...
            else:
                decide_right ...
        Note that this method doesn't actually split anything: it just figures out
        which threshold value would be best to split at.
        """
        self._sort_by_features(feature)
        left_output_stats = SummaryStats()
        right_output_stats = SummaryStats()
        assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

        # To begin, let's assume we push all examples into the right child node.
        for example in self.examples:
            right_output_stats.add(float(example["_OUTPUT"]))

        # Now, move the examples one by one to the left child node.
        # (Note the examples are sorted by value -- it's as if we're gradually
        # raising the less_than_threshold.)
        # After each example, calculate the goodness-of-split, and track the best.
        best_threshold = None
        best_err = None
        last_feature_value = None
        for example in self.examples_sorted_by_feature[feature]:
            feature_value = example[feature]
            output_value = float(example["_OUTPUT"])

            # Speed optimization: skip over examples with same feature value.
            if feature_value == last_feature_value:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue

            last_feature_value = feature_value  # remember for next iteration

            left_count = left_output_stats.count()
            right_count = right_output_stats.count()

            # Edge-case: left or right child is empty
            if left_count == 0 or right_count == 0:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue  # not a true split

            # Compute goodness-of-split: the total within-child squared error.
            # ((count - 1) * var() is each child's sum of squared deviations.)
            if left_count <= 1: left_err = 0
            else: left_err = (left_count - 1) * left_output_stats.var()

            if right_count <= 1: right_err = 0
            else: right_err = (right_count - 1) * right_output_stats.var()

            err = left_err + right_err
            if best_err is None or err < best_err:
                best_threshold = feature_value
                best_err = err

            left_output_stats.add(output_value)
            right_output_stats.remove(output_value)

        # to save memory, delete this sorted array (we'll never use it again anyway)
        del self.examples_sorted_by_feature[feature]
        return (best_threshold, best_err)

    def _split_subtrees(self, feature, threshold):
        """Reset the two subtrees based on the current decision function."""
        assert feature is not None
        assert threshold is not None
        assert len(self.examples) >= 2

        self.decision_feature = feature
        self.decision_threshold = threshold

        self.subtrees = [DecisionTree(), DecisionTree()]
        for example in self.examples:
            decision = int(example[self.decision_feature] < self.decision_threshold)
            if decision == 0:
                self.subtrees[0].add_example(example)
            else:
                self.subtrees[1].add_example(example)

    def _decision_subtree(self, example):
        """returns one of the two child subtrees, or None if we are a leaf."""
        if self.decision_feature is None: return None
        decision = example[self.decision_feature] < self.decision_threshold
        return self.subtrees[int(decision)]

    def grow_tree(self, features_considered_per_node=2):
        # Stop growing based on termination criteria:
        if len(self.examples) <= 1: return
        if self.example_output_stats.var() < 0.001: return

        # Assume all examples have all features.
        feature_names = self._examples_features()
        feature_subset = random.sample(feature_names,
                                       min(features_considered_per_node, len(feature_names)))

        best_feature = None
        best_threshold = None
        best_split_err = None
        for feature in feature_subset:
            (threshold, split_err) = self._find_best_split(feature)
            if split_err is None: continue  # no split possible (all same value?)

            if best_split_err is None or split_err < best_split_err:
                best_feature = feature
                best_threshold = threshold
                best_split_err = split_err

        # Did we fail to find a good decision?
        if best_feature is None: return

        # Accept the best decision
        self._split_subtrees(best_feature, best_threshold)

        # Grow recursively at each branch.
        self.subtrees[0].grow_tree(features_considered_per_node)
        self.subtrees[1].grow_tree(features_considered_per_node)
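
Finally, a minimal end-to-end use of DecisionTree, assuming (as everywhere in
this section) that an example is a dict whose label lives under the reserved
_OUTPUT key:

tree = DecisionTree()
for x in xrange(20):
    # A step function of a single feature: output is 0 below 10, 1 at or above.
    tree.add_example({"x": float(x), "_OUTPUT": int(x >= 10)})

tree.grow_tree(features_considered_per_node=1)
tree.print_tree()
print tree.predict({"x": 3.0})   # expect ~0
print tree.predict({"x": 15.0})  # expect ~1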