Example #1
def test_dimension_location():
    """
    Test dimension and location of split.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    X[:, 1] *= 100
    X[:, 2] *= 50
    y = np.round(rng.randn(100))

    for est in estimators:
        n = 1000
        features = []
        thresholds = []
        for random_state in np.arange(n):
            est.set_params(random_state=random_state).fit(X, y)
            features.append(est.tree_.feature[0])
            thresholds.append(est.tree_.threshold[0])

        # Check that the empirical frequencies of the chosen split dimension
        # converge to the actual probabilities, which are proportional to each
        # feature's range.
        diff = np.max(X, axis=0) - np.min(X, axis=0)
        p_act = diff / np.sum(diff)
        features = np.array(features)
        thresholds = np.array(thresholds)
        counts = np.bincount(features)
        p_sim = counts / np.sum(counts)
        assert_array_almost_equal(p_act, p_sim, 2)

        # Check that the split location converges to (u + l) / 2, where
        # u and l are the upper and lower bounds of the feature.
        u = np.max(X, axis=0)[1]
        l = np.min(X, axis=0)[1]
        thresh_sim = np.mean(thresholds[features == 1])
        thresh_act = (u + l) / 2.0
        assert_array_almost_equal(thresh_act, thresh_sim, 1)
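A note on what Example 1 checks: in a Mondrian tree the split dimension at the root is drawn with probability proportional to each feature's range, so the empirical frequencies over many random states should approach diff / sum(diff). A minimal standalone sketch of that sampling rule, using the same data generation as above but plain NumPy instead of an estimator:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
X[:, 1] *= 100
X[:, 2] *= 50

# Feature ranges; the Mondrian process picks the split dimension
# with probability proportional to these ranges.
diff = np.max(X, axis=0) - np.min(X, axis=0)
p_act = diff / np.sum(diff)

# Simulate 1000 independent root splits and compare the frequencies.
draws = rng.choice(3, size=1000, p=p_act)
p_sim = np.bincount(draws, minlength=3) / 1000.0
print(np.round(p_act, 2), np.round(p_sim, 2))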
Example #2
def check_and_return_children(tree, node, val, var=None):
    if var is not None:
        assert_almost_equal(tree.variance[node], var)
    assert_array_almost_equal(tree.value[node], val)
    l_id = tree.children_left[node]
    r_id = tree.children_right[node]
    return l_id, r_id
Example #3
def test_min_variance():
    rng = check_random_state(0)
    X = rng.normal(size=(1000, 1))
    y = np.ones(1000)
    rf = RandomForestRegressor(min_variance=0.1)
    rf.fit(X, y)
    mean, std = rf.predict(X, return_std=True)
    assert_array_almost_equal(mean, y)
    assert_array_almost_equal(std, np.sqrt(0.1 * np.ones(1000)))
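Where the expected std in Example 3 comes from: with constant targets every leaf has zero empirical variance, and the assertion implies that min_variance acts as a floor on the per-leaf variance before the standard deviation is taken. A tiny check of that arithmetic (the floor behavior is inferred from the test itself, not stated by the library):

import numpy as np

y = np.ones(1000)
leaf_var = np.var(y)           # 0.0, since the targets are constant
floored = max(leaf_var, 0.1)   # assumed floor at min_variance=0.1
print(np.sqrt(floored))        # ~0.31623 == sqrt(0.1), the expected std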
Example #4
def test_pure_set():
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y = [1, 1, 1, 1, 1, 1]
    for est in estimators:
        est.fit(X, y)
        assert_array_almost_equal(est.predict(X), y)

        new_est = clone(est)
        new_est.partial_fit(X, y)
        assert_array_almost_equal(new_est.predict(X), y)
Example #5
def check_variance_no_split(Regressor):
    rng = check_random_state(0)
    X = np.ones((1000, 1))
    y = rng.normal(size=(1000, ))

    reg = Regressor(random_state=0, max_depth=3)
    reg.fit(X, y)

    pred, std = reg.predict(X, return_std=True)
    assert_array_almost_equal([np.std(y)] * 1000, std)
    assert_array_almost_equal([np.mean(y)] * 1000, pred)
Example #6
def check_variance_toy_data(Regressor):
    # A single split separates y into [2, 3, 4] and [100, 103, 106].
    X = [[2.0, 1.], [3.0, 1.0], [4., 1.0], [109.0, 1.0], [110.0, 1.0],
         [111., 1.]]
    y = [2, 3, 4, 100, 103, 106]

    reg = Regressor(max_depth=1, random_state=1)
    reg.fit(X, y)

    pred, std = reg.predict(X, return_std=True)
    assert_array_equal(pred, [3, 3, 3, 103, 103, 103])
    assert_array_almost_equal(
        std, np.sqrt([0.666667, 0.666667, 0.666667, 6.0, 6.0, 6.0]))
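The expected predictions and standard deviations in Example 6 follow directly from the mean and variance of each leaf after the single split; a quick check with plain NumPy:

import numpy as np

left = np.array([2.0, 3.0, 4.0])
right = np.array([100.0, 103.0, 106.0])

print(left.mean(), right.mean())   # 3.0 and 103.0 -> the predicted values
print(left.var(), right.var())     # 0.666667 and 6.0 -> per-leaf variances
# The test compares the returned standard deviations against
# np.sqrt of these per-leaf variances.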
Example #7
def test_max_depth_None():
    # Since each leaf is pure and has just one unique value,
    # the mean equals any quantile.
    for est in estimators:
        est.set_params(max_depth=None)
        est.fit(X_train, y_train)

        for quantile in [20, 40, 50, 60, 80, 90]:

            for curr_X in [X_train, X_test]:
                assert_array_almost_equal(
                    est.predict(curr_X, quantile=None),
                    est.predict(curr_X, quantile=quantile), 1)
Example #8
def check_weighted_decision_path_classif(mtc, X_test):
    weights = mtc.weighted_decision_path(X_test)
    node_probas = (mtc.tree_.value[:, 0, :] /
                   np.expand_dims(mtc.tree_.n_node_samples, axis=1))
    probas1 = []

    for startptr, endptr in zip(weights.indptr[:-1], weights.indptr[1:]):
        curr_nodes = weights.indices[startptr:endptr]
        curr_weights = np.expand_dims(weights.data[startptr:endptr], axis=1)
        curr_probas = node_probas[curr_nodes]
        probas1.append(np.sum(curr_weights * curr_probas, axis=0))

    probas2 = mtc.predict_proba(X_test)
    assert_array_almost_equal(probas1, probas2, 5)
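Examples 8 and 14 both walk a scipy CSR matrix row by row using indptr, indices, and data, one test sample per row. A small self-contained illustration of that access pattern with made-up weights:

import numpy as np
from scipy.sparse import csr_matrix

# Toy matrix: each row holds the decision-path weights of one sample.
weights = csr_matrix(np.array([[0.2, 0.0, 0.8],
                               [0.0, 1.0, 0.0]]))

for startptr, endptr in zip(weights.indptr[:-1], weights.indptr[1:]):
    curr_nodes = weights.indices[startptr:endptr]   # node ids visited
    curr_weights = weights.data[startptr:endptr]    # corresponding weights
    print(curr_nodes, curr_weights)
# [0 2] [0.2 0.8]
# [1] [1.]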
Example #9
def test_max_depth_None_rfqr():
    # Since each leaf is pure and has just one unique value,
    # the mean equals any quantile.
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)
    y = np.linspace(0.0, 100.0, 10)

    rfqr = RandomForestQuantileRegressor(
        random_state=0, bootstrap=False, max_depth=None)
    rfqr.fit(X, y)

    for quantile in [20, 40, 50, 60, 80, 90]:
        assert_array_almost_equal(
            rfqr.predict(X, quantile=None),
            rfqr.predict(X, quantile=quantile), 5)
Example #10
def test_tree_forest_equivalence():
    """
    Test that a DecisionTree and RandomForest give equal quantile
    predictions when bootstrap is set to False.
    """
    rfqr = RandomForestQuantileRegressor(
        random_state=0, bootstrap=False, max_depth=2)
    rfqr.fit(X_train, y_train)

    dtqr = DecisionTreeQuantileRegressor(random_state=0, max_depth=2)
    dtqr.fit(X_train, y_train)

    assert_true(np.all(rfqr.y_train_leaves_ == dtqr.y_train_leaves_))
    assert_array_almost_equal(
        rfqr.predict(X_test, quantile=10),
        dtqr.predict(X_test, quantile=10), 5)
Example #11
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack(
        (30.0 * np.ones(X_train.shape[1]), -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
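The limit distribution used at the end of Example 11 is simply the empirical class frequencies computed from the label-encoded targets. A standalone illustration with toy labels (the labels below are made up):

import numpy as np
from sklearn.preprocessing import LabelEncoder

y_train = np.array(["a", "b", "b", "c"])            # toy labels
y_enc = LabelEncoder().fit_transform(y_train)       # -> [0, 1, 1, 2]
emp_proba = np.bincount(y_enc) / float(len(y_enc))
print(emp_proba)                                    # [0.25 0.5 0.25]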
Example #12
def test_percentile_equal_weights():
    rng = np.random.RandomState(0)
    x = rng.randn(10)
    weights = 0.1 * np.ones(10)

    # Since the weights are equal, the quantiles at 10, 20, ..., 90 lie at the
    # midpoints of adjacent sorted values.
    sorted_x = np.sort(x)
    expected = 0.5 * (sorted_x[1:] + sorted_x[:-1])
    actual = (
        [weighted_percentile(x, q, weights) for q in np.arange(10, 100, 10)]
    )
    assert_array_almost_equal(expected, actual)

    # Check the quantiles 5, 15, ..., 95, which coincide with the sorted values.
    actual = (
        [weighted_percentile(x, q, weights) for q in np.arange(5, 105, 10)]
    )
    assert_array_almost_equal(sorted_x, actual)
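For reference, a rough sketch of one interpolation rule that is consistent with both assertions in Example 12: place each sorted value at the plotting position (cumulative weight minus half of its own weight) and interpolate linearly. This is only an illustration and is not guaranteed to match the exact rule used by the library's weighted_percentile.

import numpy as np

def weighted_percentile_sketch(x, q, weights):
    # Sort values and assign midpoint-style plotting positions.
    order = np.argsort(x)
    x_sorted = np.asarray(x, dtype=float)[order]
    w_sorted = np.asarray(weights, dtype=float)[order]
    positions = (np.cumsum(w_sorted) - 0.5 * w_sorted) / np.sum(w_sorted)
    return np.interp(q / 100.0, positions, x_sorted)

rng = np.random.RandomState(0)
x = rng.randn(10)
w = 0.1 * np.ones(10)
# With equal weights, q=50 lands exactly between the two middle values,
# while q=5, 15, ..., 95 reproduce the sorted values themselves.
print(weighted_percentile_sketch(x, 50, w))
print(weighted_percentile_sketch(x, 5, w), np.sort(x)[0])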
Example #13
def test_tree_toy_data():
    rng = np.random.RandomState(0)
    x1 = rng.randn(1, 10)
    X1 = np.tile(x1, (10000, 1))
    x2 = 20.0 * rng.randn(1, 10)
    X2 = np.tile(x2, (10000, 1))
    X = np.vstack((X1, X2))

    y1 = rng.randn(10000)
    y2 = 5.0 + rng.randn(10000)
    y = np.concatenate((y1, y2))

    for est in estimators:
        est.set_params(max_depth=1)
        est.fit(X, y)
        for quantile in [20, 30, 40, 50, 60, 70, 80]:
            assert_array_almost_equal(est.predict(x1, quantile=quantile),
                                      [np.percentile(y1, quantile)], 3)
            assert_array_almost_equal(est.predict(x2, quantile=quantile),
                                      [np.percentile(y2, quantile)], 3)
Example #14
def check_weighted_decision_path_regression(mtr, X_test):
    weights = mtr.weighted_decision_path(X_test)
    node_means = mtr.tree_.mean
    node_variances = mtr.tree_.variance
    variances1 = []
    means1 = []

    for startptr, endptr in zip(weights.indptr[:-1], weights.indptr[1:]):
        curr_nodes = weights.indices[startptr:endptr]
        curr_weights = weights.data[startptr:endptr]
        curr_means = node_means[curr_nodes]
        curr_var = node_variances[curr_nodes]

        means1.append(np.sum(curr_weights * curr_means))
        variances1.append(np.sum(curr_weights * (curr_var + curr_means**2)))

    means1 = np.array(means1)
    variances1 = np.array(variances1)
    variances1 -= means1**2
    means2, std2 = mtr.predict(X_test, return_std=True)
    assert_array_almost_equal(means1, means2, 5)
    assert_array_almost_equal(variances1, std2**2, 3)
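The mean/variance combination in Example 14 is the law of total variance for a mixture over the nodes on a decision path: E[y] = sum_i w_i * m_i and Var[y] = sum_i w_i * (v_i + m_i^2) - E[y]^2. A tiny numeric check with made-up weights, means, and variances:

import numpy as np

w = np.array([0.3, 0.7])   # node weights along one path (sum to 1)
m = np.array([0.0, 1.0])   # per-node means
v = np.array([1.0, 0.0])   # per-node variances

mean = np.sum(w * m)                       # 0.7
second_moment = np.sum(w * (v + m ** 2))   # 1.0
var = second_moment - mean ** 2            # 0.51
print(mean, var)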
Example #15
def check_mean_std_reg_convergence(est, X_train, y_train):
    # For points contained in the training data, when the tree is grown
    # to full depth, the mean should converge to the actual target value
    # and the predicted standard deviation should converge to 0.0.
    mean, std = est.predict(X_train, return_std=True)
    assert_array_almost_equal(mean, y_train, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points completely far away from the training data, this
    # should converge to the empirical mean and variance.
    # X is scaled to between -1.0 and 1.0.
    X_inf = np.vstack(
        (20.0 * np.ones(X_train.shape[1]), -20.0 * np.ones(X_train.shape[1])))
    inf_mean, inf_std = est.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y_train.mean(), 1)
    assert_array_almost_equal(inf_std, y_train.std(), 2)
Example #16
def check_mean_std_forest_regressor(est):
    # For points contained in the training data, with max_depth set to None,
    # the mean should converge to the actual target value and the predicted
    # standard deviation should converge to 0.0.
    mean, std = est.predict(X, return_std=True)
    assert_array_almost_equal(mean, y, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points completely far away from the training data, this
    # should converge to the empirical mean and variance.
    # X is scaled to between -1.0 and 1.0.
    X_inf = np.vstack(
        (30.0 * np.ones(X.shape[1]), -30.0 * np.ones(X.shape[1])))
    inf_mean, inf_std = est.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y.mean(), 1)
    assert_array_almost_equal(inf_std, y.std(), 2)
Example #17
def test_quantiles():
    # Test with max depth 1.
    for est in estimators:
        est.set_params(max_depth=1)
        est.fit(X_train, y_train)
        tree = est.tree_

        for q in [20, 40, 50, 60, 80, 90]:
            left_ind = X_train[:, tree.feature[0]] <= tree.threshold[0]
            right_ind = X_train[:, tree.feature[0]] > tree.threshold[0]

            # fixme
            left_q = weighted_percentile(y_train[left_ind], q)
            right_q = weighted_percentile(y_train[right_ind], q)

            for curr_X, curr_y in [[X_train, y_train], [X_test, y_test]]:
                actual_q = np.zeros(curr_X.shape[0])
                left_ind = curr_X[:, tree.feature[0]] <= tree.threshold[0]
                actual_q[left_ind] = left_q
                right_ind = curr_X[:, tree.feature[0]] > tree.threshold[0]
                actual_q[right_ind] = right_q

                expected_q = est.predict(curr_X, quantile=q)
                assert_array_almost_equal(expected_q, actual_q)
Example #18
def check_weighted_decision_path(ensemble, X_train, X_test):
    # decision_path is implemented in sklearn, while weighted_decision_path
    # is implemented here, so check that the two agree on the column indices.
    paths, col_inds = ensemble.decision_path(X_train)
    weight_paths, weight_col_inds = ensemble.weighted_decision_path(X_train)
    assert_array_equal(col_inds, weight_col_inds)

    n_nodes = [est.tree_.node_count for est in ensemble.estimators_]
    assert_equal(weight_paths.shape[0], X_train.shape[0])
    assert_equal(weight_paths.shape[1], sum(n_nodes))

    # We are calculating the weighted decision path on train data, so
    # the weights should be concentrated at the leaves.
    leaf_indices = ensemble.apply(X_train)
    for est_ind, curr_leaf_indices in enumerate(leaf_indices.T):
        curr_path = weight_paths[:, col_inds[est_ind]:col_inds[est_ind +
                                                               1]].toarray()
        assert_array_equal(np.where(curr_path)[1], curr_leaf_indices)

        # For each sample, the weights across all nodes within a single
        # estimator sum to 1.0, so the total over the ensemble is n_estimators.
        assert_array_almost_equal(
            np.ravel(ensemble.weighted_decision_path(X_test)[0].sum(axis=1)),
            ensemble.n_estimators * np.ones(X_test.shape[0]), 5)
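The shape and normalization assertions in Example 18 rely on the per-estimator weight blocks being stacked side by side, with col_inds giving the column offsets; because each estimator's path weights for a sample sum to 1.0, the row sums of the stacked matrix equal n_estimators. A toy illustration of that layout (the values below are made up):

import numpy as np
from scipy.sparse import csr_matrix, hstack

# Two "estimators" with 3 and 2 nodes; each row of each block sums to 1.0.
p1 = csr_matrix(np.array([[0.4, 0.0, 0.6],
                          [0.0, 1.0, 0.0]]))
p2 = csr_matrix(np.array([[0.0, 1.0],
                          [0.5, 0.5]]))
stacked = hstack([p1, p2]).tocsr()
col_inds = np.array([0, 3, 5])   # column offsets, one block per estimator

print(np.ravel(stacked.sum(axis=1)))                   # [2. 2.] == n_estimators
print(stacked[:, col_inds[0]:col_inds[1]].toarray())   # block of estimator 0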
Example #19
def test_partial_fit_two_samples():
    rng = np.random.RandomState(10)
    X = rng.randn(2, 5)
    y = rng.randn(2)
    for r in range(10):
        mtr = MondrianTreeRegressor(random_state=r)
        mtr.partial_fit(X, y)
        tree = mtr.tree_
        assert_array_almost_equal(tree.value[:, 0, 0], [y[0], y[1], np.mean(y)])
        assert_array_almost_equal(tree.variance, [0, 0, np.var(y)])
        check_partial_fit_two_samples(tree, X)

    y = [0, 1]
    for r in range(10):
        mtc = MondrianTreeClassifier(random_state=r)
        mtc.partial_fit(X, y)
        tree = mtc.tree_
        assert_array_almost_equal(tree.value[:, 0, :], [[1, 0], [0, 1], [1, 1]])
        check_partial_fit_two_samples(tree, X)
Example #20
def test_memory_layout():
    for est in ensembles:
        for dtype in [np.float32, np.float64]:
            X_curr = np.asarray(X, dtype=dtype)
            assert_array_almost_equal(est.fit(X_curr, y).predict(X_curr), y, 3)
            assert_array_almost_equal(
                est.partial_fit(X_curr, y).predict(X_curr), y, 3)

            # C-order
            X_curr = np.asarray(X, order="C", dtype=dtype)
            assert_array_almost_equal(est.fit(X_curr, y).predict(X_curr), y, 3)
            assert_array_almost_equal(
                est.partial_fit(X_curr, y).predict(X_curr), y, 3)

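            # F-order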
            X_curr = np.asarray(X, order="F", dtype=dtype)
            assert_array_almost_equal(est.fit(X_curr, y).predict(X_curr), y, 3)
            assert_array_almost_equal(
                est.partial_fit(X_curr, y).predict(X_curr), y, 3)

            # Contiguous
            X_curr = np.ascontiguousarray(X_curr, dtype=dtype)
            assert_array_almost_equal(est.fit(X_curr, y).predict(X_curr), y, 3)
            assert_array_almost_equal(
                est.partial_fit(X_curr, y).predict(X_curr), y, 3)

            X_curr = np.array(X[::2], dtype=dtype)
            y_curr = np.asarray(y[::2])
            assert_array_almost_equal(
                est.fit(X_curr, y_curr).predict(X_curr), y_curr, 3)
            assert_array_almost_equal(
                est.partial_fit(X_curr, y_curr).predict(X_curr), y_curr, 3)
Example #21
def test_tree_predict():
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y = [-1, -1, -1, 1, 1, 1]
    T = [[-1, -1], [2, 2], [3, 2]]

    # This test is dependent on the random state, since the feature and the
    # threshold selected at every split are independent of the labels.
    for est_true in estimators:
        est = clone(est_true)
        est.set_params(random_state=0, max_depth=1)
        est.fit(X, y)

        # mtr_tree = est.tree_
        cand_feature = est.tree_.feature[0]
        cand_thresh = est.tree_.threshold[0]
        assert_almost_equal(cand_thresh, -0.38669141)
        assert_almost_equal(cand_feature, 0.0)

        # Close to (1.0 / np.sum(np.max(X, axis=0) - np.min(X, axis=0)))
        assert_almost_equal(est.tree_.tau[0], 0.07112633)

        # For [-1, -1]:
        # P_not_separated = 1.0
        # Root:
        # eta_root = 0.0 (inside the bounding box of the root)
        # P_root = 1 - exp(-0.0) = 0.0
        # weight_root = P_root
        # mean_root = 0.0
        # Leaf:
        # P_not_separated = 1.0 * (1 - 0.0) = 1.0
        # weight_leaf = P_not_separated = 1.0
        # mean_leaf = -1.0

        # For regression:
        # prediction = weight_leaf * mean_leaf = -1.0

        # For classifier:
        # proba = weight_leaf * P_leaf = [1.0, 0.0]

        # variance = (weight_root * (var_root + mean_root**2) +
        #             weight_leaf * (var_leaf + mean_leaf**2)) - mean**2
        # This reduces to weight_leaf * mean_leaf**2 - mean**2 = 1.0 * 1.0 - 1.0
        # = 0.0

        # Similarly for [2, 2]:

        # For regression:
        # prediction = 0.0 + weight_leaf * mean_leaf = 1.0
        # Variance reduces to zero

        # For classification
        # proba = weight_leaf * P_leaf = [0.0, 1.0]

        # For [3, 2]
        # P_not_separated = 1.0
        # Root:
        # Delta_root = 0.07112633
        # eta_root = 1.0
        # weight_root = 1 - exp(-0.07112633) = 0.0686
        # Leaf:
        # weight_leaf = P_not_separated = (1 - 0.0686) = 0.93134421

        # For regression:
        # prediction = mean_root * weight_root + mean_leaf * weight_leaf
        # prediction = 0.0 * 0.0686 + 0.93134421 * 1.0 = 0.93134421
        # For classification
        # proba = weight_root * P_root + weight_leaf * P_leaf
        # proba = 0.0686 * [0.5, 0.5] + 0.93134421 * [0.0, 1.0]

        # variance = (weight_root * (var_root + mean_root**2) +
        #             weight_leaf * (var_leaf + mean_leaf**2)) - mean**2
        # = 0.0686 * (1 + 0) + 0.93134 * (0 + 1) - 0.93134421**2 = 0.132597

        if isinstance(est, RegressorMixin):
            T_predict, T_std = est.predict(T, return_std=True)
            assert_array_almost_equal(T_predict, [-1.0, 1.0, 0.93134421])
            assert_array_almost_equal(T_std, np.sqrt([0.0, 0.0, 0.132597]))
        else:
            last = (0.0686 * np.array([0.5, 0.5]) +
                    0.93134421 * np.array([0.0, 1.0]))
            T_proba = est.predict_proba(T)
            assert_array_almost_equal(T_proba, [[1.0, 0.0], [0.0, 1.0], last],
                                      4)
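The numbers in the comments of Example 21 can be reproduced directly from the Mondrian weighting rule, where the probability of being separated at a node is 1 - exp(-tau * eta). A short numeric check using the tau, eta, and leaf statistics quoted in the comments above for the query point [3, 2] (eta = 1.0 and the leaf mean/variance values are taken from those comments):

import numpy as np

tau = 0.07112633   # root split time asserted in the test
eta = 1.0          # distance of [3, 2] outside the root bounding box (feature 0)

weight_root = 1.0 - np.exp(-tau * eta)   # ~0.0686
weight_leaf = 1.0 - weight_root          # ~0.93134421

pred = 0.0 * weight_root + 1.0 * weight_leaf
var = weight_root * (1.0 + 0.0 ** 2) + weight_leaf * (0.0 + 1.0 ** 2) - pred ** 2
print(weight_root, pred, var)            # ~0.0686, ~0.931344, ~0.132597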