Exemplo n.º 1
0
def check_decision_path(name):
    """Validate ``decision_path`` of the estimator stored as ``ALL_TREES[name]``.

    A depth-2 estimator is fitted on ``iris.data`` / ``iris.target`` and the
    dense node-indicator matrix is checked for its shape, for agreement with
    ``apply`` and for flagging exactly one leaf per sample.
    """
    X, y = iris.data, iris.target
    n_samples = X.shape[0]

    est = ALL_TREES[name](random_state=0, max_depth=2)
    est.fit(X, y)

    node_indicator = est.decision_path(X).toarray()
    assert_equal(node_indicator.shape, (n_samples, est.tree_.node_count))

    # The leaf returned by ``apply`` must be flagged on each sample's path.
    leaves = est.apply(X)
    leaf_flags = node_indicator[np.arange(n_samples), leaves]
    assert_array_almost_equal(leaf_flags, np.ones(n_samples))

    # Every path must contain exactly one leaf node.
    is_leaf = est.tree_.children_left == TREE_LEAF
    leaves_per_sample = np.dot(node_indicator, is_leaf)
    assert_array_almost_equal(leaves_per_sample, np.ones(n_samples))

    # The longest path must cover at least ``tree_.max_depth`` nodes.
    path_lengths = node_indicator.sum(axis=1)
    assert_less_equal(est.tree_.max_depth, path_lengths.max())
Exemplo n.º 2
0
def test_min_max_scaler_1d():
    """Check MinMaxScaler on one-dimensional inputs."""
    # Random 1D array: scaled output spans [0, 1] and can be inverted.
    data = np.random.RandomState(0).randn(5)
    data_reference = data.copy()

    scaler = MinMaxScaler()
    scaled = scaler.fit(data).transform(data)
    assert_array_almost_equal(scaled.min(axis=0), 0.0)
    assert_array_almost_equal(scaled.max(axis=0), 1.0)

    restored = scaler.inverse_transform(scaled)
    assert_array_almost_equal(restored, data_reference)

    # Plain Python list mixing floats and an int.
    values = [0., 1., 2, 0.4, 1.]
    scaled_values = MinMaxScaler().fit(values).transform(values)
    assert_array_almost_equal(scaled_values.min(axis=0), 0.0)
    assert_array_almost_equal(scaled_values.max(axis=0), 1.0)

    # Constant feature: the output only has to stay inside [0, 1].
    constant = np.zeros(5)
    scaled_constant = MinMaxScaler().fit(constant).transform(constant)
    assert_greater_equal(scaled_constant.min(), 0.)
    assert_less_equal(scaled_constant.max(), 1.)
Exemplo n.º 3
0
    def test_pearsonr_mat(self):
        """Check the shape and value range of ``pearsonr_mat`` results."""
        expected_shape = (10, 10)

        # Call with the matrix only: the output shape is checked.
        without_w_mat = pearsonr_mat(self.mat)
        assert_equal(without_w_mat.shape, expected_shape)

        # Call with ``self.w_mat`` as second argument: shape and [-1, 1] range.
        with_w_mat = pearsonr_mat(self.mat, self.w_mat)
        assert_equal(with_w_mat.shape, expected_shape)

        lowest = np.min(with_w_mat)
        assert_greater_equal(lowest, -1)
        highest = np.max(with_w_mat)
        assert_less_equal(highest, 1)
Exemplo n.º 4
0
def test_optimization_minimizes_kl_divergence():
    """t-SNE should give a lower KL divergence with more iterations."""
    rng = check_random_state(0)
    X, _ = make_blobs(n_features=3, random_state=rng)

    kl_divergences = []
    for n_iter in (200, 250, 300):
        model = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                     n_iter=n_iter, random_state=0)
        model.fit_transform(X)
        kl_divergences.append(model.kl_divergence_)

    # Each run with more iterations must not end above the run before it.
    for fewer_iters, more_iters in zip(kl_divergences, kl_divergences[1:]):
        assert_less_equal(more_iters, fewer_iters)
Exemplo n.º 5
0
def test_optimization_minimizes_kl_divergence():
    """t-SNE should give a lower KL divergence with more iterations."""
    seed_state = check_random_state(0)
    X, _ = make_blobs(n_features=3, random_state=seed_state)

    iteration_budgets = (200, 250, 300)
    kl_divergences = []
    for budget in iteration_budgets:
        tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                    n_iter=budget, random_state=0)
        tsne.fit_transform(X)
        kl_divergences.append(tsne.kl_divergence_)

    # Compare consecutive budgets: the larger one must do at least as well.
    for previous, current in zip(kl_divergences[:-1], kl_divergences[1:]):
        assert_less_equal(current, previous)
Exemplo n.º 6
0
    def test_prediction_proba(self):
        """Check range, ROC AUC, shape and row sums of ``predict_proba``."""
        proba = self.clf.predict_proba(self.X_test)

        # every entry must lie in [0, 1]
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

        # check performance, scoring with the second column
        auc = roc_auc_score(self.y_test, proba[:, 1])
        assert_greater(auc, self.roc_floor)

        # one row per test sample, one column per distinct training label
        n_test = self.X_test.shape[0]
        n_classes = len(np.unique(self.y_train))
        assert_equal(proba.shape, (n_test, n_classes))

        # each row must sum to 1
        row_sums = np.sum(proba, axis=1)
        assert_allclose(np.ones(n_test), row_sums)
def test_precision():
    """Check how many decimals ``export_graphviz`` writes for a precision."""
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)

    # Each generator produces its features before its targets.
    X_reg = rng_reg.random_sample((5, 2))
    X_clf = rng_clf.random_sample((1000, 4))
    y_reg = rng_reg.random_sample((5, ))
    y_clf = rng_clf.randint(2, size=(1000, ))
    regressor = DecisionTreeRegressor(criterion="friedman_mse",
                                      random_state=0, max_depth=1)
    classifier = DecisionTreeClassifier(max_depth=1, random_state=0)

    def fraction_width(finding):
        # Length of the ".digits" part of the matched text, dot included.
        return len(search(r"\.\d+", finding.group()).group())

    for X, y, clf in ((X_reg, y_reg, regressor), (X_clf, y_clf, classifier)):
        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None,
                                       precision=precision, proportion=True)
            expected_width = precision + 1

            # With the current random state, the impurity and the threshold
            # are written with exactly the requested precision, so they are
            # checked with strict equality. The reported value may use fewer
            # digits, so it is only checked with a less-or-equal comparison.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(fraction_width(finding), expected_width)

            # check impurity
            pattern = (r"gini = \d+\.\d+" if is_classifier(clf)
                       else r"friedman_mse = \d+\.\d+")
            for finding in finditer(pattern, dot_data):
                assert_equal(fraction_width(finding), expected_width)

            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(fraction_width(finding), expected_width)
Exemplo n.º 8
0
    def test_data_generate(self):
        """Check sizes, feature count and label ratio of ``generate_data``."""
        X_train, y_train, X_test, y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        # labels and samples must have matching lengths
        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        # train split: at most one sample fewer than requested, two columns
        train_shortfall = self.n_train - X_train.shape[0]
        assert_less_equal(train_shortfall, 1)
        assert_equal(X_train.shape[1], 2)

        # test split: same two checks
        test_shortfall = self.n_test - X_test.shape[0]
        assert_less_equal(test_shortfall, 1)
        assert_equal(X_test.shape[1], 2)

        # label sum over requested size must be close to the contamination
        train_ratio = np.sum(y_train) / self.n_train
        assert_allclose(self.contamination, train_ratio, atol=0.01)

        test_ratio = np.sum(y_test) / self.n_test
        assert_allclose(self.contamination, test_ratio, atol=0.01)
Exemplo n.º 9
0
def test_min_grad_norm():
    # Make sure that the parameter min_grad_norm is used correctly
    rng = check_random_state(0)
    X = rng.randn(100, 2)
    min_grad_norm = 0.002
    tsne = TSNE(min_grad_norm=min_grad_norm, verbose=2,
                random_state=0, method='exact')

    # Swap stdout for an in-memory buffer while fitting so the verbose
    # output can be inspected; always restore the real stdout afterwards.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = saved_stdout

    # extract the gradient norm from the verbose output
    gradient_norms = []
    for text in out.split('\n'):
        # Once 'Finished' shows up, only an already seen gradient norm
        # value is repeated, so parsing stops there.
        if 'Finished' in text:
            break

        position = text.find('gradient norm')
        if position < 0:
            continue
        tail = text[position:]
        number = tail.replace('gradient norm = ', '').split(' ')[0]
        gradient_norms.append(float(number))

    # Count how often the gradient norm is at or below min_grad_norm
    gradient_norms = np.array(gradient_norms)
    n_smaller_gradient_norms = np.count_nonzero(
        gradient_norms <= min_grad_norm)

    # The gradient norm can be smaller than min_grad_norm at most once,
    # because in the moment it becomes smaller the optimization stops
    assert_less_equal(n_smaller_gradient_norms, 1)
Exemplo n.º 10
0
def test_precision():
    """Check how many decimals ``export_graphviz`` writes for a precision."""
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)

    # Features are drawn before targets for both generators.
    feature_sets = (rng_reg.random_sample((5, 2)),
                    rng_clf.random_sample((1000, 4)))
    target_sets = (rng_reg.random_sample((5, )),
                   rng_clf.randint(2, size=(1000, )))
    estimators = (DecisionTreeRegressor(criterion="friedman_mse",
                                        random_state=0, max_depth=1),
                  DecisionTreeClassifier(max_depth=1, random_state=0))

    for X, y, clf in zip(feature_sets, target_sets, estimators):
        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None,
                                       precision=precision, proportion=True)
            # ".digits" is one character longer than the number of digits.
            n_chars = precision + 1

            # With the current random state, the impurity and the threshold
            # are written with exactly the requested precision, so they are
            # checked with strict equality. The reported value may use fewer
            # digits, so it is only checked with a less-or-equal comparison.

            # check value
            for match in finditer(r"value = \d+\.\d+", dot_data):
                decimals = search(r"\.\d+", match.group()).group()
                assert_less_equal(len(decimals), n_chars)

            # check impurity
            if is_classifier(clf):
                impurity_pattern = r"gini = \d+\.\d+"
            else:
                impurity_pattern = r"friedman_mse = \d+\.\d+"
            for match in finditer(impurity_pattern, dot_data):
                decimals = search(r"\.\d+", match.group()).group()
                assert_equal(len(decimals), n_chars)

            # check threshold
            for match in finditer(r"<= \d+\.\d+", dot_data):
                decimals = search(r"\.\d+", match.group()).group()
                assert_equal(len(decimals), n_chars)
Exemplo n.º 11
0
    def test_data_generate(self):
        """Check sizes, feature count and label ratio of ``generate_data``."""
        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination)
        X_train, y_train, X_test, y_test = generated

        # one label per sample in both splits
        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        # requested size minus actual size may be at most 1; two columns
        missing_train = self.n_train - X_train.shape[0]
        assert_less_equal(missing_train, 1)
        assert_equal(X_train.shape[1], 2)

        missing_test = self.n_test - X_test.shape[0]
        assert_less_equal(missing_test, 1)
        assert_equal(X_test.shape[1], 2)

        # label sum over requested size must be close to the contamination
        ratio_train = np.sum(y_train) / self.n_train
        assert_allclose(self.contamination, ratio_train, atol=0.01)

        ratio_test = np.sum(y_test) / self.n_test
        assert_allclose(self.contamination, ratio_test, atol=0.01)
    def test_data_generate_cluster(self):
        """Check sizes, feature count and overall label ratio of
        ``generate_data_clusters``."""
        generated = generate_data_clusters(n_train=self.n_train,
                                           n_test=self.n_test,
                                           n_features=2,
                                           contamination=self.contamination,
                                           random_state=self.random_state)
        # note the return order: both feature arrays come first
        X_train, X_test, y_train, y_test = generated

        # one label per sample in both splits
        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        # requested size minus actual size may be at most 1; two columns
        missing_train = self.n_train - X_train.shape[0]
        assert_less_equal(missing_train, 1)
        assert_equal(X_train.shape[1], 2)

        missing_test = self.n_test - X_test.shape[0]
        assert_less_equal(missing_test, 1)
        assert_equal(X_test.shape[1], 2)

        # the ratio is checked over train and test together
        label_total = np.sum(y_train) + np.sum(y_test)
        requested_total = self.n_train + self.n_test
        out_perc = label_total / requested_total
        assert_allclose(self.contamination, out_perc, atol=0.01)
Exemplo n.º 13
0
def test_min_grad_norm():
    # Make sure that the parameter min_grad_norm is used correctly
    X = check_random_state(0).randn(100, 2)
    min_grad_norm = 0.002
    tsne = TSNE(min_grad_norm=min_grad_norm,
                verbose=2,
                random_state=0,
                method='exact')

    # Redirect stdout to a buffer for the duration of the fit so that the
    # verbose log can be parsed; the real stdout is restored in any case.
    original_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = original_stdout

    # extract the gradient norm from the verbose output
    parsed_norms = []
    for log_line in out.split('\n'):
        # After 'Finished' only an old gradient norm value is repeated,
        # which does not need to be stored
        if 'Finished' in log_line:
            break

        start = log_line.find('gradient norm')
        if start >= 0:
            remainder = log_line[start:].replace('gradient norm = ', '')
            parsed_norms.append(float(remainder.split(' ')[0]))

    # Count how often the gradient norm is at or below min_grad_norm
    parsed_norms = np.array(parsed_norms)
    below_threshold = parsed_norms <= min_grad_norm
    n_smaller_gradient_norms = len(parsed_norms[below_threshold])

    # The gradient norm can be smaller than min_grad_norm at most once,
    # because in the moment it becomes smaller the optimization stops
    assert_less_equal(n_smaller_gradient_norms, 1)
Exemplo n.º 14
0
def test_rbf_sampler():
    # test that RBFSampler approximates kernel on random data
    gamma = 10.

    # exact kernel between the module-level X and Y
    exact = rbf_kernel(X, Y, gamma=gamma)

    # approximate kernel through the sampled feature map
    sampler = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
    X_mapped = sampler.fit_transform(X)
    Y_mapped = sampler.transform(Y)
    approximate = np.dot(X_mapped, Y_mapped.T)

    error = exact - approximate
    bias = np.abs(np.mean(error))
    assert_less_equal(bias, 0.01)  # close to unbiased

    abs_error = np.abs(error)
    assert_less_equal(np.max(abs_error), 0.1)  # nothing too far off
    assert_less_equal(np.mean(abs_error), 0.05)  # mean is fairly close
Exemplo n.º 15
0
def test_rbf_sampler():
    # test that RBFSampler approximates kernel on random data
    gamma = 10.

    # reference: the exact kernel for the module-level X and Y
    kernel = rbf_kernel(X, Y, gamma=gamma)

    # approximation: inner products of the transformed X and Y
    rbf_transform = RBFSampler(gamma=gamma, n_components=1000,
                               random_state=42)
    kernel_approx = np.dot(rbf_transform.fit_transform(X),
                           rbf_transform.transform(Y).T)

    deviation = kernel - kernel_approx
    mean_deviation = np.mean(deviation)
    assert_less_equal(np.abs(mean_deviation), 0.01)  # close to unbiased

    # from here on only magnitudes matter
    np.abs(deviation, out=deviation)
    worst = np.max(deviation)
    assert_less_equal(worst, 0.1)  # nothing too far off
    average = np.mean(deviation)
    assert_less_equal(average, 0.05)  # mean is fairly close
Exemplo n.º 16
0
 def test_prediction_proba_unify(self):
     """``predict_proba(..., method='unify')`` must stay within [0, 1]."""
     probs = self.clf.predict_proba(self.X_test, method='unify')
     lowest = probs.min()
     assert_greater_equal(lowest, 0)
     highest = probs.max()
     assert_less_equal(highest, 1)
Exemplo n.º 17
0
 def test_prediction_proba(self):
     """``predict_proba`` on the test data must stay within [0, 1]."""
     probs = self.clf.predict_proba(self.X_test)
     lowest = probs.min()
     assert_greater_equal(lowest, 0)
     highest = probs.max()
     assert_less_equal(highest, 1)
Exemplo n.º 18
0
def check_limits(value, lower_bound, upper_bound):
    """Assert that ``lower_bound <= value`` and ``upper_bound > value``."""
    # The lower bound is inclusive.
    assert_less_equal(lower_bound, value)
    # The upper bound is exclusive.
    assert_greater(upper_bound, value)
Exemplo n.º 19
0
 def test_prediction_proba_unify(self):
     """Probabilities obtained with ``method='unify'`` must lie in [0, 1]."""
     unified = self.clf.predict_proba(self.X_test, method='unify')
     # lower bound
     assert_greater_equal(unified.min(), 0)
     # upper bound
     assert_less_equal(unified.max(), 1)
Exemplo n.º 20
0
 def test_prediction_proba(self):
     """Probabilities returned by ``predict_proba`` must lie in [0, 1]."""
     predicted = self.clf.predict_proba(self.X_test)
     # lower bound
     assert_greater_equal(predicted.min(), 0)
     # upper bound
     assert_less_equal(predicted.max(), 1)
Exemplo n.º 21
0
 def test_prediction_proba(self):
     """Check the [0, 1] range and the ROC AUC of ``predict_proba``."""
     probs = self.clf.predict_proba(self.X_test)
     lowest = probs.min()
     assert_greater_equal(lowest, 0)
     highest = probs.max()
     assert_less_equal(highest, 1)
     # check performance: the score must exceed ``self.roc_floor``
     auc = roc_auc_score(self.y_test, probs)
     assert_greater(auc, self.roc_floor)
Exemplo n.º 22
0
def test_assert_less_equal():
    """``assert_less_equal`` accepts a <= b and raises otherwise."""
    # Strictly smaller and equal pairs must both pass.
    for smaller, larger in ((0, 1), (1, 1)):
        assert_less_equal(smaller, larger)
    # A larger first argument must raise AssertionError.
    assert_raises(AssertionError, assert_less_equal, 1, 0)
Exemplo n.º 23
0
 def test_prediction_proba(self):
     """Check the [0, 1] range and the ROC AUC of ``predict_proba``."""
     scores = self.clf.predict_proba(self.X_test)
     # range check
     assert_greater_equal(scores.min(), 0)
     assert_less_equal(scores.max(), 1)
     # check performance: the score must exceed ``self.roc_floor``
     roc = roc_auc_score(self.y_test, scores)
     assert_greater(roc, self.roc_floor)
Exemplo n.º 24
0
 def test_prediction_proba_linear(self):
     """``predict_proba(..., proba_method='linear')`` must stay in [0, 1]."""
     probs = self.clf.predict_proba(self.X_test, proba_method='linear')
     lowest = probs.min()
     assert_greater_equal(lowest, 0)
     highest = probs.max()
     assert_less_equal(highest, 1)
Exemplo n.º 25
0
def check_limits(value, lower_bound, upper_bound):
    """Check ``value`` against an inclusive lower and exclusive upper bound.

    The lower bound is asserted first, so it is the one reported when both
    checks would fail.
    """
    assert_less_equal(lower_bound, value)
    assert_greater(upper_bound, value)
Exemplo n.º 26
0
def test_assert_less_equal():
    """Exercise ``assert_less_equal`` on passing and failing inputs."""
    passing_pairs = [(0, 1), (1, 1)]
    for first, second in passing_pairs:
        assert_less_equal(first, second)

    # 1 <= 0 is false, so the helper has to raise.
    assert_raises(AssertionError, assert_less_equal, 1, 0)
Exemplo n.º 27
0
def check_limits(value, low, high):
    """Assert that ``low <= value <= high`` (both bounds inclusive)."""
    # lower bound first, so it is reported first when both checks would fail
    assert_less_equal(low, value)
    # then the upper bound
    assert_greater_equal(high, value)