def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
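Every example on this page exercises assert_array_less, which asserts the strict elementwise inequality x < y with the usual NumPy broadcasting rules. A minimal standalone sketch of those semantics:

import numpy as np
from numpy.testing import assert_array_less

# Passes: every entry of the first argument is strictly below the second.
assert_array_less(np.array([0.1, 0.5]), 1.0)
# A scalar lower bound broadcasts too: every |entry| must be strictly positive.
assert_array_less(0, np.abs(np.array([-2.0, 3.0])))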
Example #2
def test_graph_lasso(random_state=0):
    # Sample data from a sparse multivariate normal
    dim = 20
    n_samples = 100
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.95,
                                  random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (.1, .01):
        covs = dict()
        for method in ('cd', 'lars'):
            cov_, _, costs = graph_lasso(emp_cov, return_costs=True,
                                         alpha=alpha, mode=method)
            covs[method] = cov_
            costs, dual_gap = np.array(costs).T
            # Check that the costs always decrease
            assert_array_less(np.diff(costs), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'])

    # Smoke test the estimator; alpha matches the last value used in the
    # loop above so the result is comparable with covs['cd'].
    model = GraphLasso(alpha=alpha).fit(X)
    assert_array_almost_equal(model.covariance_, covs['cd'])
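The assert_array_less(np.diff(costs), 0) idiom above is the standard way to check that an optimizer's cost sequence strictly decreases: np.diff yields consecutive differences, which must all be negative. In isolation:

import numpy as np
from numpy.testing import assert_array_less

costs = np.array([10.0, 7.5, 6.2, 6.19])
# All consecutive differences negative <=> the sequence strictly decreases.
assert_array_less(np.diff(costs), 0)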
Example #3
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred has at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)

    # To make sure at least one empty label is present
    y_true += [0]*n_classes
    y_pred += [0]*n_classes

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        assert_array_less(-1.0 * measure, 0,
                          err_msg="Failed to correctly test the normalize "
                                  "option")
        assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                        measure, err_msg="Failed with %s" % name)
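The final assertion relies on the metric contract that normalize=False returns a raw count while normalize=True returns that count divided by n_samples. A minimal sketch of the identity with accuracy_score, one metric that supports the normalize option:

import numpy as np
from sklearn.metrics import accuracy_score

y_true = np.array([0, 1, 2, 2])
y_pred = np.array([0, 2, 2, 2])
# The unnormalized count equals the normalized fraction times n_samples.
assert accuracy_score(y_true, y_pred, normalize=False) == \
    accuracy_score(y_true, y_pred, normalize=True) * len(y_true)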
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when `radius` is set to the
    # mean distance from the query point to the other points in the
    # database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points, so sort before
    # comparing. Distances to exact neighbors are less than or equal to
    # their approximate counterparts, as the approximate radius query may
    # have missed some closer neighbors.
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)
    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)
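Both methods above verify that _predict_rank preserves the ordering of the decision scores by comparing rank vectors. The rank-comparison idea in isolation, using only scipy.stats.rankdata and made-up values:

import numpy as np
from scipy.stats import rankdata

scores = np.array([0.2, 0.9, 0.4])
ranks = np.array([1.0, 3.0, 2.0])   # hypothetical rank predictions
# Equal rank vectors mean both arrays order their elements identically.
np.testing.assert_allclose(rankdata(ranks), rankdata(scores))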
Example #7
    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)
Example #8
    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)
Example #9
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query,
                                          radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries, return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances to exact neighbors are less than or equal to the
    # approximate ones
    assert_true(
        np.all(
            np.less_equal(np.sort(distances_exact[0]),
                          np.sort(distances_approx[0]))))
Example #10
def test_api():
    res = dummy_minimize(
        branin, [(-5.0, 10.0), (0.0, 15.0)], random_state=0, maxiter=100)
    assert_array_equal(res.x.shape, (2,))
    assert_array_equal(res.x_iters.shape, (100, 2))
    assert_array_equal(res.func_vals.shape, (100,))
    assert_array_less(res.x_iters, np.tile([10, 15], (100, 1)))
    assert_array_less(np.tile([-5, 0], (100, 1)), res.x_iters)
    assert_raises(ValueError, dummy_minimize, lambda x: x, [[-5, 10]])
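The np.tile calls above replicate the per-dimension bounds across all iterations, but plain broadcasting performs the same check. A sketch with made-up sample points:

import numpy as np
from numpy.testing import assert_array_less

x_iters = np.array([[0.0, 7.0], [-4.9, 14.9]])   # hypothetical samples
lower = np.array([-5.0, 0.0])
upper = np.array([10.0, 15.0])
# Each row of x_iters is compared against the 1-D bound vectors.
assert_array_less(x_iters, upper)
assert_array_less(lower, x_iters)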
def test_api():
    res = gp_minimize(branin, [[-5, 10], [0, 15]], random_state=0, maxiter=20)
    assert_array_equal(res.x.shape, (2, ))
    assert_array_equal(res.x_iters.shape, (20, 2))
    assert_array_equal(res.func_vals.shape, (20, ))
    assert_array_less(res.x_iters, np.tile([10, 15], (20, 1)))
    assert_array_less(np.tile([-5, 0], (20, 1)), res.x_iters)

    assert_raises(ValueError, gp_minimize, lambda x: x, [[-5, 10]])
Example #13
def test_solution_inside_bounds():
    gpr = VGP(kernel=kernel).fit(X, y)

    bounds = gpr.kernel.bounds
    max_ = np.finfo(gpr.kernel.theta.dtype).max
    tiny = 1e-10
    bounds[~np.isfinite(bounds[:, 1]), 1] = max_

    assert_array_less(bounds[:, 0], gpr.kernel.theta + tiny)
    assert_array_less(gpr.kernel.theta, bounds[:, 1] + tiny)
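The pattern above first clamps infinite upper bounds to the largest representable float, then pads each comparison by tiny so a solution sitting exactly on a bound still passes the strict check. A self-contained sketch with made-up values:

import numpy as np
from numpy.testing import assert_array_less

theta = np.array([0.5, 2.0])                    # hypothetical solution
bounds = np.array([[0.0, 1.0], [1.0, np.inf]])  # [lower, upper] rows
bounds[~np.isfinite(bounds[:, 1]), 1] = np.finfo(theta.dtype).max
tiny = 1e-10
assert_array_less(bounds[:, 0], theta + tiny)   # theta above lower bounds
assert_array_less(theta, bounds[:, 1] + tiny)   # theta below upper bounds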
def test_solution_inside_bounds(kernel):
    # Test that hyperparameter optimization remains in bounds
    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)

    bounds = gpr.kernel_.bounds
    max_ = np.finfo(gpr.kernel_.theta.dtype).max
    tiny = 1e-10
    bounds[~np.isfinite(bounds[:, 1]), 1] = max_

    assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny)
    assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny)
Example #16
def test_normalize_option_multiclass_classification(name):
    # Test in the multiclass case
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 4, size=(20, ))
    y_pred = random_state.randint(0, 4, size=(20, ))
    n_samples = y_true.shape[0]

    metrics = ALL_METRICS[name]
    measure = metrics(y_true, y_pred, normalize=True)
    assert_array_less(-1.0 * measure, 0,
                      err_msg="Failed to correctly test the normalize "
                              "option")
    assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                    measure)
Example #18
def test_std_bayesian_ridge_ard_with_constant_input():
    # Test BayesianRidge and ARDRegression standard dev. for edge case of
    # constant target vector
    # The standard dev. should be relatively small (< 0.01 is tested here)
    n_samples = 4
    n_features = 5
    random_state = check_random_state(42)
    constant_value = random_state.rand()
    X = random_state.random_sample((n_samples, n_features))
    y = np.full(n_samples, constant_value)
    expected_upper_boundary = 0.01

    for clf in [BayesianRidge(), ARDRegression()]:
        _, y_std = clf.fit(X, y).predict(X, return_std=True)
        assert_array_less(y_std, expected_upper_boundary)
Example #19
def test_class_weights():
    # check that the class weights are updated
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)
    for Model in [DPGMM, VBGMM]:
        dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50)
        dpgmm.fit(X)
        # get indices of components that are used:
        indices = np.unique(dpgmm.predict(X))
        active = np.zeros(10, dtype=bool)
        active[indices] = True
        # used components are important
        assert_array_less(.1, dpgmm.weights_[active])
        # others are not
        assert_array_less(dpgmm.weights_[~active], .05)
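The active-component bookkeeping above is a generic NumPy idiom: build a boolean mask from the labels that actually occur, then assert different bounds on the two groups. A sketch with made-up weights:

import numpy as np
from numpy.testing import assert_array_less

weights = np.array([0.3, 0.02, 0.4, 0.01, 0.27])
indices = np.array([0, 2, 4])             # hypothetical components in use
active = np.zeros(5, dtype=bool)
active[indices] = True
assert_array_less(.1, weights[active])    # used components carry real mass
assert_array_less(weights[~active], .05)  # unused ones stay negligible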
Example #20
def test_mem_vec_diff_clusters():
    """
    Verify membership vector produces as many clusters as requested
    """
    # Ignore user warnings in this function
    warnings.filterwarnings("ignore", category=UserWarning)

    # Given a flat clustering trained for n_clusters picked by HDBSCAN,
    n_clusters_fit = None
    clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)
    n_clusters_fitted = n_clusters_from_labels(clusterer.labels_)

    # When membership_vector_flat is called with new data for some n_clusters,
    n_clusters_predict = n_clusters_fitted + 3
    memberships = membership_vector_flat(clusterer,
                                         X_test,
                                         n_clusters=n_clusters_predict)

    # Then the number of clusters in memberships should be as requested,
    assert_equal(memberships.shape[1], n_clusters_predict)
    # and the number of points should equal those in the test set
    assert_equal(len(memberships), len(X_test))
    # and all probabilities are <= 1.
    assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)

    # ========================================
    # Given a flat clustering for a specified n_clusters,
    n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2
    clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)

    # When membership_vector_flat is called with new data for some n_clusters,
    n_clusters_predict = n_clusters_fit + 3
    memberships = membership_vector_flat(clusterer,
                                         X_test,
                                         n_clusters=n_clusters_predict)

    # Then the number of clusters in memberships should be as requested,
    assert_equal(memberships.shape[1], n_clusters_predict)
    # and the number of points should equal those in the test set
    assert_equal(len(memberships), len(X_test))
    # and all probabilities are <= 1.
    assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)
    return
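assert_array_less is strict, so a probability equal to exactly 1.0 would fail against a plain upper bound of 1; the + 1.e-14 above adds just enough slack to turn the check into an effective less-than-or-equal. The idiom in isolation:

import numpy as np
from numpy.testing import assert_array_less

proba = np.array([[0.7, 0.3], [1.0, 0.0]])  # second row hits the bound
assert_array_less(proba, np.ones(proba.shape) + 1e-14)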
Example #21
def test_explained_variance(X_sparse, kind, n_components, solver):
    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
    svd = TruncatedSVD(n_components, algorithm=solver)
    X_tr = svd.fit_transform(X)
    # Assert that all the values are greater than 0
    assert_array_less(0.0, svd.explained_variance_ratio_)

    # Assert that total explained variance is less than 1
    assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)

    # Test that explained_variance is correct
    total_variance = np.var(X_sparse.toarray(), axis=0).sum()
    variances = np.var(X_tr, axis=0)
    true_explained_variance_ratio = variances / total_variance

    assert_allclose(
        svd.explained_variance_ratio_,
        true_explained_variance_ratio,
    )
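A compact standalone variant of the same two range checks on random dense data; the exact ratios depend on the input, but with 5 of 10 possible components they should be strictly positive and sum to less than 1:

import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
X_dense = rng.rand(30, 10)
svd = TruncatedSVD(n_components=5, random_state=0).fit(X_dense)
ratio = svd.explained_variance_ratio_
assert np.all(0.0 < ratio) and ratio.sum() < 1.0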
def test_graphical_lasso(random_state=0):
    # Sample data from a sparse multivariate normal
    dim = 20
    n_samples = 100
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.95, random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (0., .1, .25):
        covs = dict()
        icovs = dict()
        for method in ('cd', 'lars'):
            cov_, icov_, costs = graphical_lasso(emp_cov,
                                                 return_costs=True,
                                                 alpha=alpha,
                                                 mode=method)
            covs[method] = cov_
            icovs[method] = icov_
            costs, dual_gap = np.array(costs).T
            # Check that the costs always decrease (doesn't hold if alpha == 0)
            if not alpha == 0:
                assert_array_less(np.diff(costs), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'], decimal=4)
        assert_array_almost_equal(icovs['cd'], icovs['lars'], decimal=4)

    # Smoke test the estimator
    model = GraphicalLasso(alpha=.25).fit(X)
    model.score(X)
    assert_array_almost_equal(model.covariance_, covs['cd'], decimal=4)
    assert_array_almost_equal(model.covariance_, covs['lars'], decimal=4)

    # For a centered matrix, assume_centered could be chosen True or False
    # Check that this returns indeed the same result for centered data
    Z = X - X.mean(0)
    precs = list()
    for assume_centered in (False, True):
        prec_ = GraphicalLasso(
            assume_centered=assume_centered).fit(Z).precision_
        precs.append(prec_)
    assert_array_almost_equal(precs[0], precs[1])
Example #24
def test_approx_predict_same_clusters():
    """
    Verify that approximate_predict_flat produces as many clusters as clusterer
    """
    # Given a flat clustering trained for some n_clusters,
    n_clusters = 5
    clusterer = HDBSCAN_flat(X,
                             cluster_selection_method='eom',
                             n_clusters=n_clusters)

    # When using approximate_predict_flat without specifying n_clusters,
    labels_flat, proba_flat = approximate_predict_flat(clusterer,
                                                       X_test,
                                                       n_clusters=None)

    # Then, the number of clusters produced must match the original n_clusters
    n_clusters_out = n_clusters_from_labels(labels_flat)
    assert_equal(n_clusters_out, n_clusters)
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)
    return
Example #25
def test_approx_predict_diff_clusters():
    """
    Verify that approximate_predict_flat produces as many clusters as asked
    """
    # Given a flat clustering trained for some n_clusters,
    n_clusters_fit = 5
    clusterer = HDBSCAN_flat(X,
                             cluster_selection_method='eom',
                             n_clusters=n_clusters_fit,
                             prediction_data=True)

    # When using approximate_predict_flat with specified n_clusters,
    n_clusters_predict = 3
    labels_flat, proba_flat = approximate_predict_flat(
        clusterer, X_test, n_clusters=n_clusters_predict)

    # Then, the requested number of clusters must be produced
    n_clusters_out = n_clusters_from_labels(labels_flat)
    assert_equal(n_clusters_out, n_clusters_predict)
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)

    # When using approximate_predict_flat with more clusters
    #   than 'eom' can handle,
    n_clusters_predict = 12
    with warnings.catch_warnings(record=True) as w:
        labels_flat, proba_flat = approximate_predict_flat(
            clusterer, X_test, n_clusters=n_clusters_predict)
        # Then, a warning is raised saying 'eom' can't get this clustering,
        assert len(w) > 0
        assert issubclass(w[-1].category, UserWarning)
        assert "Cannot predict" in str(w[-1].message)
    # But the requested number of clusters must still be produced using 'leaf'
    n_clusters_out = n_clusters_from_labels(labels_flat)
    assert_equal(n_clusters_out, n_clusters_predict)
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)
    return
Example #26
def test_all_points_mem_vec_same_clusters():
    """
    Verify membership vector for training set produces same n_clusters
        as clusterer
    """
    # Given a flat clustering trained for n_clusters picked by HDBSCAN,
    n_clusters_fit = None
    clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)

    # When all_points_membership_vectors_flat is called,
    memberships = all_points_membership_vectors_flat(clusterer)

    # Then the number of clusters in memberships matches those of clusterer,
    assert_equal(memberships.shape[1],
                 n_clusters_from_labels(clusterer.labels_))
    # and the number of points should equal those in the training set
    assert_equal(len(memberships), len(X))
    # and all probabilities are <= 1.
    assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)

    # ========================================
    # Given a flat clustering for a specified n_clusters,
    n_clusters_fit = n_clusters_from_labels(clusterer.labels_) - 2
    clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)

    # When all_points_membership_vectors_flat is called,
    memberships = all_points_membership_vectors_flat(clusterer)

    # Then the number of clusters in memberships matches those of clusterer,
    assert_equal(memberships.shape[1],
                 n_clusters_from_labels(clusterer.labels_))
    # and the number of points should equal those in the training set
    assert_equal(len(memberships), len(X))
    # and all probabilities are <= 1.
    assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)
    return
Example #28
def test_forest_interface():
    forest, tree1, tree2, i1, i2, x, y, y_ = generate_dummy_forest()

    i = _tree_apply(tree1, x)
    assert_array_equal(i, i1)

    i = _tree_apply(tree2, x)
    assert_array_equal(i, i2)

    i, ti, ni = _compute_coverage_matrix(forest, x, max_nodecount=np.inf, max_interaction=np.inf)
    i_ref, _ = _unique_rows(np.hstack([i1, i2]).T)
    i_ref = i_ref[::-1].T
    assert_array_equal(i, i_ref)

    assert_array_equal(np.unique(ti), [0, 1])  # two trees
    assert_array_less(-1, ni)
    assert_array_less(ni[ti == 0], tree1.node_count)
    assert_array_less(ni[ti == 1], tree2.node_count)

    for i, f in enumerate(_tree_featurecount(tree1)):
        assert f <= min(2, i)

    for i, f in enumerate(_tree_featurecount(tree2)):
        assert f <= min(2, i)
def test_explained_variance():
    # Test sparse data
    svd_a_10_sp = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_sp = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_sp = svd_a_10_sp.fit_transform(X)
    X_trans_r_10_sp = svd_r_10_sp.fit_transform(X)
    X_trans_a_20_sp = svd_a_20_sp.fit_transform(X)
    X_trans_r_20_sp = svd_r_20_sp.fit_transform(X)

    # Test dense data
    svd_a_10_de = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_de = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_de = svd_a_10_de.fit_transform(X.toarray())
    X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray())
    X_trans_a_20_de = svd_a_20_de.fit_transform(X.toarray())
    X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray())

    # helper arrays for tests below
    svds = (svd_a_10_sp, svd_r_10_sp, svd_a_20_sp, svd_r_20_sp, svd_a_10_de,
            svd_r_10_de, svd_a_20_de, svd_r_20_de)
    svds_trans = (
        (svd_a_10_sp, X_trans_a_10_sp),
        (svd_r_10_sp, X_trans_r_10_sp),
        (svd_a_20_sp, X_trans_a_20_sp),
        (svd_r_20_sp, X_trans_r_20_sp),
        (svd_a_10_de, X_trans_a_10_de),
        (svd_r_10_de, X_trans_r_10_de),
        (svd_a_20_de, X_trans_a_20_de),
        (svd_r_20_de, X_trans_r_20_de),
    )
    svds_10_v_20 = (
        (svd_a_10_sp, svd_a_20_sp),
        (svd_r_10_sp, svd_r_20_sp),
        (svd_a_10_de, svd_a_20_de),
        (svd_r_10_de, svd_r_20_de),
    )
    svds_sparse_v_dense = (
        (svd_a_10_sp, svd_a_10_de),
        (svd_a_20_sp, svd_a_20_de),
        (svd_r_10_sp, svd_r_10_de),
        (svd_r_20_sp, svd_r_20_de),
    )

    # Assert that the first 10 components' explained variance ratios agree
    for svd_10, svd_20 in svds_10_v_20:
        assert_array_almost_equal(
            svd_10.explained_variance_ratio_,
            svd_20.explained_variance_ratio_[:10],
            decimal=5,
        )

    # Assert that 20 components have higher explained variance than 10
    for svd_10, svd_20 in svds_10_v_20:
        assert_greater(
            svd_20.explained_variance_ratio_.sum(),
            svd_10.explained_variance_ratio_.sum(),
        )

    # Assert that all the values are greater than 0
    for svd in svds:
        assert_array_less(0.0, svd.explained_variance_ratio_)

    # Assert that total explained variance is less than 1
    for svd in svds:
        assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)

    # Compare sparse vs. dense
    for svd_sparse, svd_dense in svds_sparse_v_dense:
        assert_array_almost_equal(svd_sparse.explained_variance_ratio_,
                                  svd_dense.explained_variance_ratio_)

    # Test that explained_variance is correct
    for svd, transformed in svds_trans:
        total_variance = np.var(X.toarray(), axis=0).sum()
        variances = np.var(transformed, axis=0)
        true_explained_variance_ratio = variances / total_variance

        assert_array_almost_equal(
            svd.explained_variance_ratio_,
            true_explained_variance_ratio,
        )
Example #32
def check_minimizer_bounds(result, n_calls):
    # no values should be below or above the bounds
    eps = 1e-8  # pad the bounds so the strict assert_array_less acts like <=
    assert_array_less(result.x_iters, np.tile([10+eps, 15+eps], (n_calls, 1)))
    assert_array_less(np.tile([-5-eps, 0-eps], (n_calls, 1)), result.x_iters)
def check_minimizer_bounds(result):
    # no values should be below or above the bounds
    eps = 1e-8  # pad the bounds so the strict assert_array_less acts like <=
    assert_array_less(result.x_iters, np.tile([10 + eps, 15 + eps], (7, 1)))
    assert_array_less(np.tile([-5 - eps, 0 - eps], (7, 1)), result.x_iters)