# Example #1
    def _partition(self, X, y, variables, n_samples, depth=0):
        """Recursively build a randomized ID3 (sub)tree over (X, y).

        Parameters
        ----------
        X : array of shape (n_node, n_features)
            Samples reaching this node.
        y : array of shape (n_node,)
            Integer class labels for those samples.
        variables : list
            Column indices still available for splitting; consumed as
            the tree grows (a copy is made before mutation).
        n_samples : int
            Total number of samples at the root, used to weight scores.
        depth : int
            Current depth, for the ``max_depth`` stopping rule.

        Returns
        -------
        Leaf: ``(class_probabilities, n_node)``.
        Internal node: ``(split_variable, best_score, children)`` where
        ``children`` are the recursively built subtrees.
        """
        rng = self.random_state_

        # Leaf: no variables left to split on, or maximum depth reached.
        if len(variables) == 0 or (self.max_depth is not None and depth >= self.max_depth):
            values = 1. * np.bincount(y, minlength=self.n_classes_) / len(y)
            return (values, len(y))

        # Internal node
        else:
            variables = copy.copy(variables)  # don't mutate the caller's list
            n_variables = len(variables)
            n_node = len(X)

            best = None
            best_score = -np.inf
            best_children = None

            # Draw at most k candidate variables, uniformly at random.
            features = (rng.permutation(n_variables))[:min(self.k,
                                                           n_variables)]

            # Hoist the loop-invariant parts of the score:
            # P(B=b) = n_node / n_samples and the parent entropy H(y).
            node_weight = 1. * n_node / n_samples
            node_entropy = entropy(y)

            for i in features:
                X_i = variables[i]

                children = []

                # Partition the node's samples by each known value of X_i.
                for xi in self.values_[X_i]:
                    mask_xi = X[:, X_i] == xi
                    # Count once with numpy instead of calling the Python
                    # builtin sum() twice over the boolean mask.
                    n_xi = int(mask_xi.sum())
                    if n_xi > 0:
                        children.append((X[mask_xi], y[mask_xi], n_xi))

                # Weighted information gain of splitting on X_i:
                # P(B=b) * (H(y) - sum_c P(c) * H(y_c)).
                score = node_weight * (node_entropy
                                       - sum(1. * entropy(c_y) * c_n / n_node
                                             for _, c_y, c_n in children))

                if score > best_score:
                    best = i
                    best_score = score
                    best_children = children

            # Consume the winning variable so descendants cannot reuse it.
            X_i = variables.pop(best)

            return (X_i,
                    best_score,
                    [self._partition(c_X,
                                     c_y,
                                     variables,
                                     n_samples,
                                     depth=depth + 1)
                     for c_X, c_y, _ in best_children])
# Example #2
# Models under comparison: the totally-randomized-tree ensemble (TRT)
# against Extra-Trees and Random Forests at K in {1, 3, 5} features.
models = [("TRT", partial(RandomizedID3Ensemble,
                          base_estimator=RandomizedID3Classifier(k=1)))]

for k in (1, 3, 5):
    models.append(("ETs K=%d" % k,
                   partial(ExtraTreesClassifier,
                           max_features=k,
                           criterion="entropy")))

for k in (1, 3, 5):
    models.append(("RF K=%d" % k,
                   partial(RandomForestClassifier,
                           max_features=k,
                           bootstrap=True,
                           criterion="entropy")))

# Average feature importances over n_repeat independent draws of the
# Strobl null dataset (no feature is informative), for every model above.
n_repeat = 5
r = {}  # model name -> accumulated importance vector across repetitions

for i in range(n_repeat):
    print "Iteration", i

    # Fresh dataset each repetition; under the null, y is independent of X.
    X, y = generate_strobl_null(n_samples=120)
    print entropy(y)

    for name, cls in models:
        # Importances from an ensemble of n_trees trees of this model type.
        f = feature_importances(X, y, cls=cls, n_trees=500)

        # Initialize on the first repetition, then accumulate.
        if i == 0:
            r[name] = np.array(f)
        else:
            r[name] += np.array(f)

        # Total importance should approximate the dataset's entropy.
        print name, np.sum(f)

# Turn the accumulated sums into per-repetition averages.
for name in r:
    r[name] /= n_repeat

# Convert to pandas and plot
# Example #3
             criterion="entropy")),
    ("RF K=5",
     partial(RandomForestClassifier,
             max_features=5,
             bootstrap=True,
             criterion="entropy")),
]

# Average feature importances over n_repeat independent draws of the
# Strobl null dataset (no feature is informative), for every model above.
n_repeat = 5
r = {}  # model name -> accumulated importance vector across repetitions

for i in range(n_repeat):
    print "Iteration", i

    # Fresh dataset each repetition; under the null, y is independent of X.
    X, y = generate_strobl_null(n_samples=120)
    print entropy(y)

    for name, cls in models:
        # Importances from an ensemble of n_trees trees of this model type.
        f = feature_importances(X, y, cls=cls, n_trees=500)

        # Initialize on the first repetition, then accumulate.
        if i == 0:
            r[name] = np.array(f)
        else:
            r[name] += np.array(f)

        # Total importance should approximate the dataset's entropy.
        print name, np.sum(f)

# Turn the accumulated sums into per-repetition averages.
for name in r:
    r[name] /= n_repeat

# Convert to pandas and plot