def test_iris():
    """Consistency checks for AdaBoostClassifier on the iris dataset."""
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for algorithm in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=algorithm)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)

        proba = clf.predict_proba(iris.data)
        if algorithm == "SAMME":
            # Keep the SAMME fit around for the regression check below.
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (algorithm, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def test_graph_lasso(random_state=0):
    """Check graph_lasso: decreasing costs and cd/lars agreement.

    BUG FIX: the inner call hard-coded ``alpha=.1`` and never passed
    ``method``, so the loop over ``alpha`` and the cd-vs-lars comparison
    were vacuous (both entries held the identical default-mode result).
    """
    # Sample data from a sparse multivariate normal
    dim = 20
    n_samples = 100
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.95, random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (.1, .01):
        covs = dict()
        for method in ('cd', 'lars'):
            # Pass the loop's alpha and solver through (was alpha=.1, no mode)
            cov_, _, costs = graph_lasso(emp_cov, alpha=alpha, mode=method,
                                         return_costs=True)
            covs[method] = cov_
            costs, dual_gap = np.array(costs).T
            # Check that the costs always decrease
            assert_array_less(np.diff(costs), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'])

    # Smoke test the estimator; ``covs`` now holds the alpha=.01 results,
    # so fit with that same alpha for a meaningful comparison.
    model = GraphLasso(alpha=.01).fit(X)
    assert_array_almost_equal(model.covariance_, covs['cd'])
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred has at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)

    # To make sure at least one empty label is present
    # NOTE(review): if y_true/y_pred are indicator ndarrays this "+=" is an
    # element-wise addition of zeros (a no-op), not an append of an empty
    # row — confirm the label format returned here.
    y_true += [0]*n_classes
    y_pred += [0]*n_classes

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        # normalize=True yields the per-sample averaged measure ...
        measure = metrics(y_true, y_pred, normalize=True)
        assert_array_less(-1.0 * measure, 0,
                          err_msg="We failed to test correctly the normalize "
                                  "option")
        # ... and normalize=False must equal it once divided by n_samples.
        assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                        measure, err_msg="Failed with %s" % name)
def test_radius_neighbors():
    """Check LSHForest.radius_neighbors against brute-force search.

    FIX: the exact and approximate distance arrays were computed at the end
    of the test but never compared — the final assertion was missing.
    """
    # Checks whether Returned distances are less than `radius`
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius queries do not sort their results, so sort before comparing.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # FIX (missing assertion): distances to exact neighbors must be <= the
    # approximate ones, as the approximate query may miss closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
def test_predict_rank_normalized(self):
    """Normalized ranks must follow the decision scores and lie in [0, 1]."""
    # FIX: renamed misspelled local ``pred_socres`` -> ``pred_scores``.
    pred_scores = self.clf.decision_function(self.X_test)
    pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

    # assert the order is preserved (typo fixed: was "reserved")
    assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
    # normalized ranks should lie in [0, 1], with a small float tolerance
    assert_array_less(pred_ranks, 1.01)
    assert_array_less(-0.1, pred_ranks)
def test_predict_rank(self):
    """Raw ranks must follow the decision scores and be valid indices."""
    # FIX: renamed misspelled local ``pred_socres`` -> ``pred_scores``.
    pred_scores = self.clf.decision_function(self.X_test)
    pred_ranks = self.clf._predict_rank(self.X_test)

    # assert the order is preserved (typo fixed: was "reserved")
    assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
    # ranks are bounded by the number of training samples
    assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
    assert_array_less(-0.1, pred_ranks)
def test_predict_rank_normalized(self):
    """Normalized ranks must follow the decision scores and lie in [0, 1]."""
    # FIX: renamed misspelled local ``pred_socres`` -> ``pred_scores``.
    pred_scores = self.clf.decision_function(self.X_test)
    pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

    # assert the order is preserved (typo fixed: was "reserved")
    assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
    # normalized ranks should lie in [0, 1], with a small float tolerance
    assert_array_less(pred_ranks, 1.01)
    assert_array_less(-0.1, pred_ranks)
def test_predict_rank(self):
    """Raw ranks must follow the decision scores and be valid indices."""
    # FIX: renamed misspelled local ``pred_socres`` -> ``pred_scores``.
    pred_scores = self.clf.decision_function(self.X_test)
    pred_ranks = self.clf._predict_rank(self.X_test)

    # assert the order is preserved (typo fixed: was "reserved")
    assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
    # ranks are bounded by the number of training samples
    assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
    assert_array_less(-0.1, pred_ranks)
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        # Query with a random point from the training set.
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)

    # Distances of exact neighbors is less than or equal to approximate
    # (results are sorted first, since radius queries return unsorted points)
    assert_true(
        np.all(
            np.less_equal(np.sort(distances_exact[0]),
                          np.sort(distances_approx[0]))))
def test_api():
    """Smoke-test the dummy_minimize public API: shapes, bounds, errors."""
    n_calls = 100
    space = [(-5.0, 10.0), (0.0, 15.0)]
    res = dummy_minimize(branin, space, random_state=0, maxiter=n_calls)

    # Result shapes: best point, all iterates, all function values.
    assert_array_equal(res.x.shape, (2,))
    assert_array_equal(res.x_iters.shape, (n_calls, 2))
    assert_array_equal(res.func_vals.shape, (n_calls,))

    # Every iterate stays strictly inside the search space.
    upper = np.tile([10, 15], (n_calls, 1))
    lower = np.tile([-5, 0], (n_calls, 1))
    assert_array_less(res.x_iters, upper)
    assert_array_less(lower, res.x_iters)

    # A malformed search space must be rejected.
    assert_raises(ValueError, dummy_minimize, lambda x: x, [[-5, 10]])
def test_api():
    """Smoke-test the gp_minimize public API: shapes, bounds, errors."""
    n_calls = 20
    space = [[-5, 10], [0, 15]]
    res = gp_minimize(branin, space, random_state=0, maxiter=n_calls)

    # Result shapes: best point, all iterates, all function values.
    assert_array_equal(res.x.shape, (2,))
    assert_array_equal(res.x_iters.shape, (n_calls, 2))
    assert_array_equal(res.func_vals.shape, (n_calls,))

    # Every iterate stays strictly inside the search space.
    upper = np.tile([10, 15], (n_calls, 1))
    lower = np.tile([-5, 0], (n_calls, 1))
    assert_array_less(res.x_iters, upper)
    assert_array_less(lower, res.x_iters)

    # A malformed search space must be rejected.
    assert_raises(ValueError, gp_minimize, lambda x: x, [[-5, 10]])
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        # Query with a random point from the training set.
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)

    # Distances of exact neighbors is less than or equal to approximate
    # (results are sorted first, since radius queries return unsorted points)
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
def test_solution_inside_bounds():
    """The optimized kernel hyperparameters must respect their bounds."""
    gpr = VGP(kernel=kernel).fit(X, y)

    theta = gpr.kernel.theta
    bounds = gpr.kernel.bounds
    # Substitute the dtype's max for +inf so the upper check is well-defined.
    bounds[~np.isfinite(bounds[:, 1]), 1] = np.finfo(theta.dtype).max

    tiny = 1e-10  # tolerance for hyperparameters sitting exactly on a bound
    assert_array_less(bounds[:, 0], theta + tiny)
    assert_array_less(theta, bounds[:, 1] + tiny)
def test_solution_inside_bounds(kernel):
    """Hyperparameter optimization must keep theta within the bounds."""
    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)

    theta = gpr.kernel_.theta
    bounds = gpr.kernel_.bounds
    # Substitute the dtype's max for +inf so the upper check is well-defined.
    bounds[~np.isfinite(bounds[:, 1]), 1] = np.finfo(theta.dtype).max

    tiny = 1e-10  # tolerance for hyperparameters sitting exactly on a bound
    assert_array_less(bounds[:, 0], theta + tiny)
    assert_array_less(theta, bounds[:, 1] + tiny)
def test_normalize_option_multiclass_classification(name):
    """normalize=False must equal normalize=True scaled by n_samples."""
    # Test in the multiclass case
    rng = check_random_state(0)
    y_true = rng.randint(0, 4, size=(20, ))
    y_pred = rng.randint(0, 4, size=(20, ))
    n_samples = y_true.shape[0]

    metrics = ALL_METRICS[name]

    # The averaged measure must be strictly positive ...
    measure = metrics(y_true, y_pred, normalize=True)
    assert_array_less(-1.0 * measure, 0,
                      err_msg="We failed to test correctly the normalize "
                              "option")
    # ... and consistent with the unnormalized variant.
    assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                    measure)
def test_std_bayesian_ridge_ard_with_constant_input():
    """Predictive std. dev. should be small (< 0.01) for a constant target.

    Edge case for BayesianRidge and ARDRegression: when y is constant the
    models should be nearly certain about their predictions.
    """
    n_samples, n_features = 4, 5
    rng = check_random_state(42)
    constant_value = rng.rand()
    X = rng.random_sample((n_samples, n_features))
    y = np.full(n_samples, constant_value)
    expected_upper_boundary = 0.01

    for estimator in (BayesianRidge(), ARDRegression()):
        _, y_std = estimator.fit(X, y).predict(X, return_std=True)
        assert_array_less(y_std, expected_upper_boundary)
def test_class_weights():
    """Check that DPGMM/VBGMM concentrate weight on used components.

    FIX: ``dtype=np.bool`` — the ``np.bool`` alias was deprecated in
    NumPy 1.20 and removed in 1.24; use the builtin ``bool`` instead.
    """
    # check that the class weights are updated
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)
    for Model in [DPGMM, VBGMM]:
        dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50)
        dpgmm.fit(X)
        # get indices of components that are used:
        indices = np.unique(dpgmm.predict(X))
        active = np.zeros(10, dtype=bool)
        active[indices] = True
        # used components are important
        assert_array_less(.1, dpgmm.weights_[active])
        # others are not
        assert_array_less(dpgmm.weights_[~active], .05)
def test_mem_vec_diff_clusters(): """ Verify membership vector produces as many clusters as requested """ # Ignore user warnings in this function warnings.filterwarnings("ignore", category=UserWarning) # Given a flat clustering trained for n_clusters picked by HDBSCAN, n_clusters_fit = None clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) n_clusters_fitted = n_clusters_from_labels(clusterer.labels_) # When membership_vector_flat is called with new data for some n_clusters, n_clusters_predict = n_clusters_fitted + 3 memberships = membership_vector_flat(clusterer, X_test, n_clusters=n_clusters_predict) # Then the number of clusters in memberships should be as requested, assert_equal(memberships.shape[1], n_clusters_predict) # and the number of points should equal those in the test set assert_equal(len(memberships), len(X_test)) # and all probabilities are <= 1. assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14) # ======================================== # Given a flat clustering for a specified n_clusters, n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2 clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) # When membership_vector_flat is called with new data for some n_clusters, n_clusters_predict = n_clusters_fit + 3 memberships = membership_vector_flat(clusterer, X_test, n_clusters=n_clusters_predict) # Then the number of clusters in memberships should be as requested, assert_equal(memberships.shape[1], n_clusters_predict) # and the number of points should equal those in the test set assert_equal(len(memberships), len(X_test)) # and all probabilities are <= 1. assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14) return
def test_explained_variance(X_sparse, kind, n_components, solver):
    """explained_variance_ratio_ is positive, sums below 1, and matches
    the per-component variances of the transformed data."""
    X = X_sparse.toarray() if kind != 'sparse' else X_sparse
    svd = TruncatedSVD(n_components, algorithm=solver)
    X_tr = svd.fit_transform(X)

    ratios = svd.explained_variance_ratio_
    # Every ratio is strictly positive ...
    assert_array_less(0.0, ratios)
    # ... and the total explained variance stays below 1.
    assert_array_less(ratios.sum(), 1.0)

    # Ratios must agree with the empirical variances of the transform.
    total_variance = np.var(X_sparse.toarray(), axis=0).sum()
    variances = np.var(X_tr, axis=0)
    assert_allclose(ratios, variances / total_variance)
def test_graphical_lasso(random_state=0):
    """Consistency checks for graphical_lasso ('cd' vs 'lars' solvers)."""
    # Sample data from a sparse multivariate normal
    dim = 20
    n_samples = 100
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.95, random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (0., .1, .25):
        covs = dict()
        icovs = dict()
        for method in ('cd', 'lars'):
            cov_, icov_, costs = graphical_lasso(emp_cov, return_costs=True,
                                                 alpha=alpha, mode=method)
            covs[method] = cov_
            icovs[method] = icov_
            costs, dual_gap = np.array(costs).T
            # Check that the costs always decrease (doesn't hold if alpha == 0)
            if not alpha == 0:
                assert_array_less(np.diff(costs), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'], decimal=4)
        assert_array_almost_equal(icovs['cd'], icovs['lars'], decimal=4)

    # Smoke test the estimator (covs still holds the alpha=.25 results here)
    model = GraphicalLasso(alpha=.25).fit(X)
    model.score(X)
    assert_array_almost_equal(model.covariance_, covs['cd'], decimal=4)
    assert_array_almost_equal(model.covariance_, covs['lars'], decimal=4)

    # For a centered matrix, assume_centered could be chosen True or False
    # Check that this returns indeed the same result for centered data
    Z = X - X.mean(0)
    precs = list()
    for assume_centered in (False, True):
        prec_ = GraphicalLasso(
            assume_centered=assume_centered).fit(Z).precision_
        precs.append(prec_)
    assert_array_almost_equal(precs[0], precs[1])
def test_approx_predict_same_clusters():
    """
    Verify that approximate_predict_flat produces as many clusters
    as clusterer
    """
    # Train a flat clustering with a fixed number of clusters.
    n_clusters = 5
    clusterer = HDBSCAN_flat(X, cluster_selection_method='eom',
                             n_clusters=n_clusters)

    # Predict on new data without overriding n_clusters.
    labels_flat, proba_flat = approximate_predict_flat(clusterer, X_test,
                                                       n_clusters=None)

    # The prediction must reproduce the clusterer's number of clusters ...
    n_clusters_out = n_clusters_from_labels(labels_flat)
    assert_equal(n_clusters_out, n_clusters)
    # ... and yield valid probabilities (<= 1, modulo float tolerance).
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)
def test_approx_predict_diff_clusters(): """ Verify that approximate_predict_flat produces as many clusters as asked """ # Given a flat clustering trained for some n_clusters, n_clusters_fit = 5 clusterer = HDBSCAN_flat(X, cluster_selection_method='eom', n_clusters=n_clusters_fit, prediction_data=True) # When using approximate_predict_flat with specified n_clusters, n_clusters_predict = 3 labels_flat, proba_flat = approximate_predict_flat( clusterer, X_test, n_clusters=n_clusters_predict) # Then, the requested number of clusters must be produced n_clusters_out = n_clusters_from_labels(labels_flat) assert_equal(n_clusters_out, n_clusters_predict) # and all probabilities are <= 1. assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14) # When using approximate_predict_flat with more clusters # than 'eom' can handle, n_clusters_predict = 12 with warnings.catch_warnings(record=True) as w: labels_flat, proba_flat = approximate_predict_flat( clusterer, X_test, n_clusters=n_clusters_predict) # Then, a warning is raised saying 'eom' can't get this clustering, assert len(w) > 0 assert issubclass(w[-1].category, UserWarning) assert "Cannot predict" in str(w[-1].message) # But the requested number of clusters must still be produced using 'leaf' n_clusters_out = n_clusters_from_labels(labels_flat) assert_equal(n_clusters_out, n_clusters_predict) # and all probabilities are <= 1. assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14) return
def test_all_points_mem_vec_same_clusters(): """ Verify membership vector for training set produces same n_clusters as clusterer """ # Given a flat clustering trained for n_clusters picked by HDBSCAN, n_clusters_fit = None clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) # When all_points_membership_vectors_flat is called, memberships = all_points_membership_vectors_flat(clusterer) # Then the number of clusters in memberships matches those of clusterer, assert_equal(memberships.shape[1], n_clusters_from_labels(clusterer.labels_)) # and the number of points should equal those in the training set assert_equal(len(memberships), len(X)) # and all probabilities are <= 1. assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14) # ======================================== # Given a flat clustering for a specified n_clusters, n_clusters_fit = n_clusters_from_labels(clusterer.labels_) - 2 clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) # When all_points_membership_vectors_flat is called, memberships = all_points_membership_vectors_flat(clusterer) # Then the number of clusters in memberships matches those of clusterer, assert_equal(memberships.shape[1], n_clusters_from_labels(clusterer.labels_)) # and the number of points should equal those in the training set assert_equal(len(memberships), len(X)) # and all probabilities are <= 1. assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14) return
def test_forest_interface():
    """Exercise the low-level tree/forest helpers on a dummy forest."""
    forest, tree1, tree2, i1, i2, x, y, y_ = generate_dummy_forest()

    # Per-tree apply must reproduce the reference leaf indices.
    assert_array_equal(_tree_apply(tree1, x), i1)
    assert_array_equal(_tree_apply(tree2, x), i2)

    # The unrestricted coverage matrix equals the unique rows of the
    # stacked per-tree indices (in reversed order).
    cov, ti, ni = _compute_coverage_matrix(forest, x,
                                           max_nodecount=np.inf,
                                           max_interaction=np.inf)
    i_ref, _ = _unique_rows(np.hstack([i1, i2]).T)
    i_ref = i_ref[::-1].T
    assert_array_equal(cov, i_ref)

    assert_array_equal(np.unique(ti), [0, 1])  # two trees
    # Node indices are non-negative and bounded by each tree's node count.
    assert_array_less(-1, ni)
    assert_array_less(ni[ti == 0], tree1.node_count)
    assert_array_less(ni[ti == 1], tree2.node_count)

    # Feature counts are capped at min(2, position) per entry
    # (presumably position == depth — TODO confirm _tree_featurecount).
    for idx, f in enumerate(_tree_featurecount(tree1)):
        assert f <= min(2, idx)
    for idx, f in enumerate(_tree_featurecount(tree2)):
        assert f <= min(2, idx)
def test_radius_neighbors():
    # Checks whether Returned distances are less than `radius`
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
def test_explained_variance():
    # Check explained_variance_ratio_ consistency across algorithms
    # (arpack vs randomized), n_components (10 vs 20) and input layout
    # (sparse vs dense).
    # Test sparse data
    svd_a_10_sp = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_sp = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_sp = svd_a_10_sp.fit_transform(X)
    X_trans_r_10_sp = svd_r_10_sp.fit_transform(X)
    X_trans_a_20_sp = svd_a_20_sp.fit_transform(X)
    X_trans_r_20_sp = svd_r_20_sp.fit_transform(X)

    # Test dense data
    svd_a_10_de = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_de = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_de = svd_a_10_de.fit_transform(X.toarray())
    X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray())
    X_trans_a_20_de = svd_a_20_de.fit_transform(X.toarray())
    X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray())

    # helper arrays for tests below
    svds = (svd_a_10_sp, svd_r_10_sp, svd_a_20_sp, svd_r_20_sp, svd_a_10_de,
            svd_r_10_de, svd_a_20_de, svd_r_20_de)
    svds_trans = (
        (svd_a_10_sp, X_trans_a_10_sp),
        (svd_r_10_sp, X_trans_r_10_sp),
        (svd_a_20_sp, X_trans_a_20_sp),
        (svd_r_20_sp, X_trans_r_20_sp),
        (svd_a_10_de, X_trans_a_10_de),
        (svd_r_10_de, X_trans_r_10_de),
        (svd_a_20_de, X_trans_a_20_de),
        (svd_r_20_de, X_trans_r_20_de),
    )
    svds_10_v_20 = (
        (svd_a_10_sp, svd_a_20_sp),
        (svd_r_10_sp, svd_r_20_sp),
        (svd_a_10_de, svd_a_20_de),
        (svd_r_10_de, svd_r_20_de),
    )
    svds_sparse_v_dense = (
        (svd_a_10_sp, svd_a_10_de),
        (svd_a_20_sp, svd_a_20_de),
        (svd_r_10_sp, svd_r_10_de),
        (svd_r_20_sp, svd_r_20_de),
    )

    # Assert the 1st component is equal
    # (the 20-component fit's leading 10 ratios must match the 10-component fit)
    for svd_10, svd_20 in svds_10_v_20:
        assert_array_almost_equal(
            svd_10.explained_variance_ratio_,
            svd_20.explained_variance_ratio_[:10],
            decimal=5,
        )

    # Assert that 20 components has higher explained variance than 10
    for svd_10, svd_20 in svds_10_v_20:
        assert_greater(
            svd_20.explained_variance_ratio_.sum(),
            svd_10.explained_variance_ratio_.sum(),
        )

    # Assert that all the values are greater than 0
    for svd in svds:
        assert_array_less(0.0, svd.explained_variance_ratio_)

    # Assert that total explained variance is less than 1
    for svd in svds:
        assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)

    # Compare sparse vs. dense
    for svd_sparse, svd_dense in svds_sparse_v_dense:
        assert_array_almost_equal(svd_sparse.explained_variance_ratio_,
                                  svd_dense.explained_variance_ratio_)

    # Test that explained_variance is correct
    for svd, transformed in svds_trans:
        total_variance = np.var(X.toarray(), axis=0).sum()
        variances = np.var(transformed, axis=0)
        true_explained_variance_ratio = variances / total_variance
        assert_array_almost_equal(
            svd.explained_variance_ratio_,
            true_explained_variance_ratio,
        )
def test_radius_neighbors():
    # Checks whether Returned distances are less than `radius`
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
def check_minimizer_bounds(result, n_calls):
    """Assert every iterate lies within the (inclusive) search bounds."""
    # Widen the bounds by eps so assert_array_less behaves like <=.
    eps = 10e-9
    upper = np.tile([10 + eps, 15 + eps], (n_calls, 1))
    lower = np.tile([-5 - eps, 0 - eps], (n_calls, 1))
    assert_array_less(result.x_iters, upper)
    assert_array_less(lower, result.x_iters)
def check_minimizer_bounds(result):
    """Assert all 7 iterates lie within the (inclusive) search bounds."""
    # Widen the bounds by eps so assert_array_less behaves like <=.
    eps = 10e-9
    upper = np.tile([10 + eps, 15 + eps], (7, 1))
    lower = np.tile([-5 - eps, 0 - eps], (7, 1))
    assert_array_less(result.x_iters, upper)
    assert_array_less(lower, result.x_iters)