def test_all_points_mem_vec_same_clusters():
    """
    Check that all-points membership vectors agree with the fitted clusterer.

    Two flat clusterings are exercised: one where the number of clusters is
    left to HDBSCAN (``n_clusters=None``) and one where a specific number,
    derived from the first fit, is requested.
    """
    def check_memberships(fitted):
        # When all_points_membership_vectors_flat is called,
        vectors = all_points_membership_vectors_flat(fitted)
        # then there is one column per cluster found in the labels,
        assert vectors.shape[1] == n_clusters_from_labels(fitted.labels_)
        # one row per training point,
        assert len(vectors) == len(X)
        # and every entry stays below 1 (plus a 1e-14 tolerance).
        assert_array_less(vectors, np.ones(vectors.shape) + 1.e-14)

    # Given a flat clustering trained for n_clusters picked by HDBSCAN,
    auto_clusterer = HDBSCAN_flat(X, n_clusters=None)
    check_memberships(auto_clusterer)

    # ========================================
    # Given a flat clustering for a specified n_clusters
    # (two fewer than what the automatic fit produced),
    requested = n_clusters_from_labels(auto_clusterer.labels_) - 2
    check_memberships(HDBSCAN_flat(X, n_clusters=requested))
def test_iris():
    """Check AdaBoostClassifier consistency on iris for SAMME and SAMME.R."""
    expected_classes = np.unique(iris.target)
    n_classes = len(expected_classes)
    samme_clf = samme_proba = None

    for alg in ("SAMME", "SAMME.R"):
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(expected_classes, clf.classes_)

        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            # Kept for the regression check after the loop.
            samme_clf, samme_proba = clf, proba
        assert proba.shape[1] == n_classes
        assert clf.decision_function(iris.data).shape[1] == n_classes

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, (
            "Failed with algorithm %s and score = %f" % (alg, score)
        )

        # More than one estimator must have been fitted ...
        estimators = clf.estimators_
        assert len(estimators) > 1
        # ... and each must carry its own random state (see issue #7408).
        distinct_states = {est.random_state for est in estimators}
        assert len(distinct_states) == len(estimators)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    # Switching the algorithm on the SAMME-fitted model must therefore
    # change every predicted probability.
    samme_clf.algorithm = "SAMME.R"
    shift = np.abs(samme_clf.predict_proba(iris.data) - samme_proba)
    assert_array_less(0, shift)
def test_solution_inside_bounds(kernel):
    """Check that the optimised hyperparameters stay inside the kernel bounds."""
    fitted_kernel = GaussianProcessRegressor(kernel=kernel).fit(X, y).kernel_
    theta = fitted_kernel.theta
    bounds = fitted_kernel.bounds

    # Non-finite upper bounds cannot be compared directly; substitute the
    # largest value representable in theta's dtype.
    unbounded_above = ~np.isfinite(bounds[:, 1])
    bounds[unbounded_above, 1] = np.finfo(theta.dtype).max

    # Small slack so that a theta sitting exactly on a bound still passes
    # the strict comparison.
    tiny = 1e-10
    lower, upper = bounds[:, 0], bounds[:, 1]
    assert_array_less(lower, theta + tiny)
    assert_array_less(theta, upper + tiny)
def test_std_bayesian_ridge_ard_with_constant_input():
    """Check the predictive std of BayesianRidge / ARDRegression on a constant target.

    With a constant target vector the returned standard deviation is
    expected to be small; the threshold tested here is 0.01.
    """
    n_samples, n_features = 4, 5
    rng = check_random_state(42)

    # Draw order matters for reproducibility: the constant first, then X.
    constant_value = rng.rand()
    X = rng.random_sample((n_samples, n_features))
    target_dtype = np.array(constant_value).dtype
    y = np.full(n_samples, constant_value, dtype=target_dtype)

    expected_upper_boundary = 0.01
    for regressor in (BayesianRidge(), ARDRegression()):
        y_std = regressor.fit(X, y).predict(X, return_std=True)[1]
        assert_array_less(y_std, expected_upper_boundary)
def test_graphical_lasso(random_state=0):
    """Compare the 'cd' and 'lars' graphical lasso solvers and smoke-test the estimator.

    Parameters
    ----------
    random_state : int, default=0
        Seed passed to ``check_random_state`` for data generation.
    """
    # Sample data from a sparse multivariate normal
    n_dim, n_samples = 20, 100
    rng = check_random_state(random_state)
    true_precision = make_sparse_spd_matrix(n_dim, alpha=.95, random_state=rng)
    true_cov = linalg.inv(true_precision)
    X = rng.multivariate_normal(np.zeros(n_dim), true_cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (0., .1, .25):
        covs = {}
        icovs = {}
        for method in ('cd', 'lars'):
            covs[method], icovs[method], cost_pairs = graphical_lasso(
                emp_cov, return_costs=True, alpha=alpha, mode=method)
            # Each cost entry is a pair; only the first column is checked.
            objective, _ = np.array(cost_pairs).T
            # Check that the costs always decrease (doesn't hold if alpha == 0)
            if alpha != 0:
                assert_array_less(np.diff(objective), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'], decimal=4)
        assert_array_almost_equal(icovs['cd'], icovs['lars'], decimal=4)

    # Smoke test the estimator. `covs` still holds the results of the last
    # loop iteration (alpha=.25), which matches the alpha used here.
    model = GraphicalLasso(alpha=.25).fit(X)
    model.score(X)
    for method in ('cd', 'lars'):
        assert_array_almost_equal(model.covariance_, covs[method], decimal=4)

    # For a centered matrix, assume_centered could be chosen True or False
    # Check that this returns indeed the same result for centered data
    Z = X - X.mean(0)
    precision_not_assumed, precision_assumed = [
        GraphicalLasso(assume_centered=assume_centered).fit(Z).precision_
        for assume_centered in (False, True)
    ]
    assert_array_almost_equal(precision_not_assumed, precision_assumed)
def test_explained_variance(X_sparse, kind, n_components, solver):
    """Check the explained variance ratio reported by TruncatedSVD.

    The ratio must be strictly positive, sum to less than 1, and match the
    variance of the transformed data divided by the total variance of the
    dense input.
    """
    X_dense = X_sparse.toarray()
    X = X_sparse if kind == "sparse" else X_dense

    svd = TruncatedSVD(n_components, algorithm=solver)
    X_tr = svd.fit_transform(X)
    ratio = svd.explained_variance_ratio_

    # Assert that all the values are greater than 0
    assert_array_less(0.0, ratio)

    # Assert that total explained variance is less than 1
    assert_array_less(ratio.sum(), 1.0)

    # Test that explained_variance is correct
    total_variance = np.var(X_dense, axis=0).sum()
    expected_ratio = np.var(X_tr, axis=0) / total_variance
    assert_allclose(ratio, expected_ratio)
def test_mem_vec_diff_clusters():
    """
    Verify membership vector produces as many clusters as requested

    Two flat clusterings are exercised (n_clusters chosen by HDBSCAN, and
    n_clusters specified explicitly); in both cases membership vectors are
    requested for new data with a different number of clusters than fitted.
    """
    def check_memberships(clusterer, n_clusters_predict):
        # When membership_vector_flat is called with new data for some
        # n_clusters,
        memberships = membership_vector_flat(clusterer, X_test,
                                             n_clusters=n_clusters_predict)
        # Then the number of clusters in memberships should be as requested,
        assert memberships.shape[1] == n_clusters_predict
        # and the number of points should equal those in the test set
        assert len(memberships) == len(X_test)
        # and all probabilities are <= 1.
        assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)

    # Ignore user warnings in this function only: catch_warnings() restores
    # the process-wide filter list on exit, so the "ignore" rule cannot leak
    # into tests that need to observe UserWarnings.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        # Given a flat clustering trained for n_clusters picked by HDBSCAN,
        clusterer = HDBSCAN_flat(X, n_clusters=None)
        n_clusters_fitted = n_clusters_from_labels(clusterer.labels_)
        check_memberships(clusterer, n_clusters_fitted + 3)

        # ========================================
        # Given a flat clustering for a specified n_clusters,
        n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2
        clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)
        check_memberships(clusterer, n_clusters_fit + 3)
def test_approx_predict_same_clusters():
    """
    Check that approximate_predict_flat, called without an explicit
    n_clusters, yields as many clusters as the clusterer was fitted for.
    """
    # Given a flat clustering trained for some n_clusters,
    requested = 5
    clusterer = HDBSCAN_flat(
        X, cluster_selection_method='eom', n_clusters=requested)

    # When using approximate_predict_flat without specifying n_clusters,
    labels_flat, proba_flat = approximate_predict_flat(
        clusterer, X_test, n_clusters=None)

    # Then, the number of clusters produced must match the original one
    assert n_clusters_from_labels(labels_flat) == requested

    # and all probabilities are <= 1 (up to a 1e-14 tolerance).
    upper_limit = np.ones(len(proba_flat)) + 1.e-14
    assert_array_less(proba_flat, upper_limit)
def test_approx_predict_diff_clusters():
    """
    Verify that approximate_predict_flat produces as many clusters as asked

    Covers a request the fitted 'eom' clustering can satisfy (3 clusters)
    and a request it cannot (12 clusters), where a UserWarning containing
    "Cannot predict" is expected while the requested count is still
    produced.
    """
    # Given a flat clustering trained for some n_clusters,
    n_clusters_fit = 5
    clusterer = HDBSCAN_flat(X, cluster_selection_method='eom',
                             n_clusters=n_clusters_fit,
                             prediction_data=True)

    # When using approximate_predict_flat with specified n_clusters,
    n_clusters_predict = 3
    labels_flat, proba_flat = approximate_predict_flat(
        clusterer, X_test, n_clusters=n_clusters_predict)
    # Then, the requested number of clusters must be produced
    assert n_clusters_from_labels(labels_flat) == n_clusters_predict
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)

    # When using approximate_predict_flat with more clusters
    # than 'eom' can handle,
    n_clusters_predict = 12
    with warnings.catch_warnings(record=True) as w:
        # Force UserWarnings to be recorded even if an identical warning was
        # already emitted earlier in the process or an "ignore" filter is
        # active; otherwise nothing would be captured and the check below
        # would fail spuriously.
        warnings.simplefilter("always", category=UserWarning)
        labels_flat, proba_flat = approximate_predict_flat(
            clusterer, X_test, n_clusters=n_clusters_predict)

    # Then, a warning is raised saying 'eom' can't get this clustering.
    # All recorded warnings are searched rather than only the last one, so
    # an unrelated warning emitted afterwards cannot mask the expected one.
    assert any(
        issubclass(rec.category, UserWarning)
        and "Cannot predict" in str(rec.message)
        for rec in w
    ), "expected a UserWarning containing 'Cannot predict'"

    # But the requested number of clusters must still be produced using 'leaf'
    assert n_clusters_from_labels(labels_flat) == n_clusters_predict
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)
def test_explained_variance(setup):
    """Check explained_variance_ratio_ of randomized TruncatedSVD.

    Fits 10- and 20-component models on the module-level ``X`` and on
    ``X.toarray()`` and checks that the ratios are consistent between the
    two component counts, strictly positive, sum to less than 1, agree
    between both inputs, and match the ratio recomputed from the
    transformed data.

    ``setup`` is accepted but not used in the body (presumably a fixture
    that prepares the execution environment -- TODO confirm).
    """
    def fit_randomized(n_components, data):
        # Same algorithm and seed for every model so results are comparable.
        svd = TruncatedSVD(n_components, algorithm="randomized",
                           random_state=42)
        return svd, svd.fit_transform(data)

    X_dense = X.toarray()

    # Test sparse data
    svd_r_10_sp, X_trans_r_10_sp = fit_randomized(10, X)
    svd_r_20_sp, X_trans_r_20_sp = fit_randomized(20, X)

    # Test dense data
    svd_r_10_de, X_trans_r_10_de = fit_randomized(10, X_dense)
    svd_r_20_de, X_trans_r_20_de = fit_randomized(20, X_dense)

    # helper arrays for tests below
    svds = (svd_r_10_sp, svd_r_20_sp, svd_r_10_de, svd_r_20_de)
    svds_trans = (
        (svd_r_10_sp, X_trans_r_10_sp),
        (svd_r_20_sp, X_trans_r_20_sp),
        (svd_r_10_de, X_trans_r_10_de),
        (svd_r_20_de, X_trans_r_20_de),
    )
    svds_10_v_20 = (
        (svd_r_10_sp, svd_r_20_sp),
        (svd_r_10_de, svd_r_20_de),
    )
    svds_sparse_v_dense = (
        (svd_r_10_sp, svd_r_10_de),
        (svd_r_20_sp, svd_r_20_de),
    )

    # Assert the first 10 components agree between the 10- and
    # 20-component models
    for svd_10, svd_20 in svds_10_v_20:
        assert_array_almost_equal(
            svd_10.explained_variance_ratio_.to_numpy(),
            svd_20.explained_variance_ratio_[:10].to_numpy(),
            decimal=4,
        )

    # Assert that 20 components has higher explained variance than 10
    for svd_10, svd_20 in svds_10_v_20:
        total_20 = svd_20.explained_variance_ratio_.sum().to_numpy()
        total_10 = svd_10.explained_variance_ratio_.sum().to_numpy()
        assert total_20 > total_10

    # Assert that all the values are greater than 0
    for svd in svds:
        assert_array_less(0.0, svd.explained_variance_ratio_.to_numpy())

    # Assert that total explained variance is less than 1
    for svd in svds:
        assert_array_less(svd.explained_variance_ratio_.sum().to_numpy(), 1.0)

    # Compare sparse vs. dense
    for svd_sparse, svd_dense in svds_sparse_v_dense:
        assert_array_almost_equal(
            svd_sparse.explained_variance_ratio_.to_numpy(),
            svd_dense.explained_variance_ratio_.to_numpy())

    # Test that explained_variance is correct.
    # The total variance of the input does not depend on the model, so it
    # is evaluated once instead of once per (svd, transformed) pair.
    total_variance = mt.var(X_dense, axis=0).sum().to_numpy()
    for svd, transformed in svds_trans:
        variances = mt.var(transformed, axis=0)
        true_explained_variance_ratio = variances / total_variance

        assert_array_almost_equal(
            svd.explained_variance_ratio_.to_numpy(),
            true_explained_variance_ratio.to_numpy(),
        )