def test_kernel_diag(kernel): # Test that diag method of kernel returns consistent results. K_call_diag = np.diag(kernel(X)) K_diag = kernel.diag(X) assert_array_almost_equal(K_call_diag, K_diag, 5)
def test_lasso_gives_lstsq_solution(): # Test that Lars Lasso gives least square solution at the end # of the path _, _, coef_path_ = linear_model.lars_path(X, y, method='lasso') coef_lstsq = np.linalg.lstsq(X, y)[0] assert_array_almost_equal(coef_lstsq, coef_path_[:, -1])
def test_lasso_lars_vs_R_implementation(): # Test that sklearn LassoLars implementation agrees with the LassoLars # implementation available in R (lars library) under the following # scenarios: # 1) fit_intercept=False and normalize=False # 2) fit_intercept=True and normalize=True # Let's generate the data used in the bug report 7778 y = np.array( [-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366]) x = np.array( [[0.47299829, 0, 0, 0, 0], [0.08239882, 0.85784863, 0, 0, 0], [0.30114139, -0.07501577, 0.80895216, 0, 0], [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0], [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291]]) X = x.T ########################################################################### # Scenario 1: Let's compare R vs sklearn when fit_intercept=False and # normalize=False ########################################################################### # # The R result was obtained using the following code: # # library(lars) # model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE, # trace=TRUE, normalize=FALSE) # r = t(model_lasso_lars$beta) # r = np.array([[ 0, 0, 0, 0, 0, -79.810362809499026, -83.528788732782829, -83.777653739190711, -83.784156932888934, -84.033390591756657 ], [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936], [ 0, -3.577397088285891, -4.702795355871871, -7.016748621359461, -7.614898471899412, -0.336938391359179, 0, 0, 0.001213370600853, 0.048162321585148 ], [ 0, 0, 0, 2.231558436628169, 2.723267514525966, 2.811549786389614, 2.813766976061531, 2.817462468949557, 2.817368178703816, 2.816221090636795 ], [ 0, 0, -1.218422599914637, -3.457726183014808, -4.021304522060710, -45.827461592423745, -47.776608869312305, -47.911561610746404, -47.914845922736234, -48.039562334265717 ]]) model_lasso_lars = linear_model.LassoLars(alpha=0, fit_intercept=False, normalize=False) model_lasso_lars.fit(X, y) skl_betas = model_lasso_lars.coef_path_ assert_array_almost_equal(r, skl_betas, decimal=12) ########################################################################### ########################################################################### # Scenario 2: Let's compare R vs sklearn when fit_intercept=True and # normalize=True # # Note: When normalize is equal to True, R returns the coefficients in # their original units, that is, they are rescaled back, whereas sklearn # does not do that, therefore, we need to do this step before comparing # their results. ########################################################################### # # The R result was obtained using the following code: # # library(lars) # model_lasso_lars2 = lars(X, t(y), type="lasso", intercept=TRUE, # trace=TRUE, normalize=TRUE) # r2 = t(model_lasso_lars2$beta) r2 = np.array( [[0, 0, 0, 0, 0], [0, 0, 0, 8.371887668009453, 19.463768371044026], [0, 0, 0, 0, 9.901611055290553], [ 0, 7.495923132833733, 9.245133544334507, 17.389369207545062, 26.971656815643499 ], [0, 0, -1.569380717440311, -5.924804108067312, -7.996385265061972]]) model_lasso_lars2 = linear_model.LassoLars(alpha=0, normalize=True) model_lasso_lars2.fit(X, y) skl_betas2 = model_lasso_lars2.coef_path_ # Let's rescale back the coefficients returned by sklearn before comparing # against the R result (read the note above) temp = X - np.mean(X, axis=0) normx = np.sqrt(np.sum(temp**2, axis=0)) skl_betas2 /= normx[:, np.newaxis] assert_array_almost_equal(r2, skl_betas2, decimal=12)
def test_fastica_simple(add_noise, seed): # Test the FastICA algorithm on very simple data. rng = np.random.RandomState(seed) # scipy.stats uses the global RNG: n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x**3, (3 * x**2).mean(axis=-1) algos = ['parallel', 'deflation'] nls = ['logcosh', 'exp', 'cube', g_test] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo, random_state=rng) with pytest.raises(ValueError): fastica(m.T, fun=np.tanh, algorithm=algo) else: pca = PCA(n_components=2, whiten=True, random_state=rng) X = pca.fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False, random_state=rng) with pytest.raises(ValueError): fastica(X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed) ica = FastICA(fun=nl, algorithm=algo, random_state=seed) sources = ica.fit_transform(m.T) assert ica.components_.shape == (2, 2) assert sources.shape == (1000, 2) assert_array_almost_equal(sources_fun, sources) assert_array_almost_equal(sources, ica.transform(m.T)) assert ica.mixing_.shape == (2, 2) for fn in [np.tanh, "exp(-.5(x^2))"]: ica = FastICA(fun=fn, algorithm=algo) with pytest.raises(ValueError): ica.fit(m.T) with pytest.raises(TypeError): FastICA(fun=range(10)).fit(m.T)
def test_factor_analysis(): # Test FactorAnalysis ability to recover the data covariance structure rng = np.random.RandomState(0) n_samples, n_features, n_components = 20, 5, 3 # Some random settings for the generative model W = rng.randn(n_components, n_features) # latent variable of dim 3, 20 of it h = rng.randn(n_samples, n_components) # using gamma to model different noise variance # per component noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) # generate observations # wlog, mean is 0 X = np.dot(h, W) + noise with pytest.raises(ValueError): FactorAnalysis(svd_method='foo') fa_fail = FactorAnalysis() fa_fail.svd_method = 'foo' with pytest.raises(ValueError): fa_fail.fit(X) fas = [] for method in ['randomized', 'lapack']: fa = FactorAnalysis(n_components=n_components, svd_method=method) fa.fit(X) fas.append(fa) X_t = fa.transform(X) assert X_t.shape == (n_samples, n_components) assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum()) assert_almost_equal(fa.score_samples(X).mean(), fa.score(X)) diff = np.all(np.diff(fa.loglike_)) assert diff > 0., 'Log likelihood dif not increase' # Sample Covariance scov = np.cov(X, rowvar=0., bias=1.) # Model Covariance mcov = fa.get_covariance() diff = np.sum(np.abs(scov - mcov)) / W.size assert diff < 0.1, "Mean absolute difference is %f" % diff fa = FactorAnalysis(n_components=n_components, noise_variance_init=np.ones(n_features)) with pytest.raises(ValueError): fa.fit(X[:, :2]) def f(x, y): return np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) fa1.max_iter = 1 fa1.verbose = True with pytest.warns(ConvergenceWarning): fa1.fit(X) # Test get_covariance and get_precision with n_components == n_features # with n_components < n_features and with n_components == 0 for n_components in [0, 2, X.shape[1]]: fa.n_components = n_components fa.fit(X) cov = fa.get_covariance() precision = fa.get_precision() assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12) # test rotation n_components = 2 results, projections = {}, {} for method in (None, "varimax", 'quartimax'): fa_var = FactorAnalysis(n_components=n_components, rotation=method) results[method] = fa_var.fit_transform(X) projections[method] = fa_var.get_covariance() for rot1, rot2 in combinations([None, 'varimax', 'quartimax'], 2): assert not np.allclose(results[rot1], results[rot2]) assert np.allclose(projections[rot1], projections[rot2], atol=3) with pytest.raises(ValueError): FactorAnalysis(rotation="not_implemented").fit_transform(X) # test against R's psych::principal with rotate="varimax" # (i.e., the values below stem from rotating the components in R) # R's factor analysis returns quite different values; therefore, we only # test the rotation itself factors = np.array([[0.89421016, -0.35854928, -0.27770122, 0.03773647], [-0.45081822, -0.89132754, 0.0932195, -0.01787973], [0.99500666, -0.02031465, 0.05426497, -0.11539407], [0.96822861, -0.06299656, 0.24411001, 0.07540887]]) r_solution = np.array([[0.962, 0.052], [-0.141, 0.989], [0.949, -0.300], [0.937, -0.251]]) rotated = _ortho_rotation(factors[:, :n_components], method='varimax').T assert_array_almost_equal(np.abs(rotated), np.abs(r_solution), decimal=3)
def test_lasso_toy(): # Test Lasso on a toy example for various values of alpha. # When validating this against glmnet notice that glmnet divides it # against nobs. X = [[-1], [0], [1]] Y = [-1, 0, 1] # just a straight line T = [[2], [3], [4]] # test sample clf = Lasso(alpha=1e-8) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [1]) assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) clf = Lasso(alpha=0.1) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [.85]) assert_array_almost_equal(pred, [1.7, 2.55, 3.4]) assert_almost_equal(clf.dual_gap_, 0) clf = Lasso(alpha=0.5) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [.25]) assert_array_almost_equal(pred, [0.5, 0.75, 1.]) assert_almost_equal(clf.dual_gap_, 0) clf = Lasso(alpha=1) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [.0]) assert_array_almost_equal(pred, [0, 0, 0]) assert_almost_equal(clf.dual_gap_, 0)
def test_singular_values(svd_solver): # Check that the IncrementalPCA output has the correct singular values rng = np.random.RandomState(0) n_samples = 1000 n_features = 100 X = datasets.make_low_rank_matrix( n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng ) X = da.from_array(X, chunks=[200, -1]) pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X) ipca = IncrementalPCA(n_components=10, batch_size=100, svd_solver=svd_solver).fit(X) assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2) # Compare to the Frobenius norm X_pca = pca.transform(X) X_ipca = ipca.transform(X) assert_array_almost_equal( np.sum(pca.singular_values_ ** 2.0), np.linalg.norm(X_pca, "fro") ** 2.0, 12 ) assert_array_almost_equal( np.sum(ipca.singular_values_ ** 2.0), np.linalg.norm(X_ipca, "fro") ** 2.0, 2 ) # Compare to the 2-norms of the score vectors assert_array_almost_equal( pca.singular_values_, np.sqrt(np.sum(X_pca ** 2.0, axis=0)), 12 ) assert_array_almost_equal( ipca.singular_values_, np.sqrt(np.sum(X_ipca ** 2.0, axis=0)), 2 ) # Set the singular values and see what we get back rng = np.random.RandomState(0) n_samples = 100 n_features = 110 X = datasets.make_low_rank_matrix( n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng ) X = da.from_array(X, chunks=[4, -1]) pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng) ipca = IncrementalPCA(n_components=3, batch_size=100, svd_solver=svd_solver) X_pca = pca.fit_transform(X) X_pca /= np.sqrt(np.sum(X_pca ** 2.0, axis=0)) X_pca[:, 0] *= 3.142 X_pca[:, 1] *= 2.718 X_hat = np.dot(X_pca, pca.components_) pca.fit(X_hat) X_hat = da.from_array(X_hat, chunks=(4, -1)) ipca.fit(X_hat) assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14) assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
def test_compute_sample_weight_more_than_32(): # Non-regression smoke test for #12146 y = np.arange(50) # more than 32 distinct classes indices = np.arange(50) # use subsampling weight = compute_sample_weight("balanced", y, indices=indices) assert_array_almost_equal(weight, np.ones(y.shape[0]))
def test_sag_multiclass_computed_correctly(): """tests if the multiclass classifier is computed correctly""" alpha = 0.1 n_samples = 20 tol = 0.00001 max_iter = 40 fit_intercept = True X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1) step_size = get_step_size(X, alpha, fit_intercept, classification=True) classes = np.unique(y) clf1 = LogisticRegression( solver="sag", C=1.0 / alpha / n_samples, max_iter=max_iter, tol=tol, random_state=77, fit_intercept=fit_intercept, multi_class="ovr", ) clf2 = clone(clf1) clf1.fit(X, y) clf2.fit(sp.csr_matrix(X), y) coef1 = [] intercept1 = [] coef2 = [] intercept2 = [] for cl in classes: y_encoded = np.ones(n_samples) y_encoded[y != cl] = -1 spweights1, spintercept1 = sag_sparse( X, y_encoded, step_size, alpha, dloss=log_dloss, n_iter=max_iter, fit_intercept=fit_intercept, ) spweights2, spintercept2 = sag_sparse( X, y_encoded, step_size, alpha, dloss=log_dloss, n_iter=max_iter, sparse=True, fit_intercept=fit_intercept, ) coef1.append(spweights1) intercept1.append(spintercept1) coef2.append(spweights2) intercept2.append(spintercept2) coef1 = np.vstack(coef1) intercept1 = np.array(intercept1) coef2 = np.vstack(coef2) intercept2 = np.array(intercept2) for i, cl in enumerate(classes): assert_array_almost_equal(clf1.coef_[i].ravel(), coef1[i].ravel(), decimal=2) assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1) assert_array_almost_equal(clf2.coef_[i].ravel(), coef2[i].ravel(), decimal=2) assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def test_compute_sample_weight(): # Test (and demo) compute_sample_weight. # Test with balanced classes y = np.asarray([1, 1, 1, 2, 2, 2]) sample_weight = compute_sample_weight("balanced", y) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) # Test with user-defined weights sample_weight = compute_sample_weight({1: 2, 2: 1}, y) assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0]) # Test with column vector of balanced classes y = np.asarray([[1], [1], [1], [2], [2], [2]]) sample_weight = compute_sample_weight("balanced", y) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) # Test with unbalanced classes y = np.asarray([1, 1, 1, 2, 2, 2, 3]) sample_weight = compute_sample_weight("balanced", y) expected_balanced = np.array( [0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333] ) assert_array_almost_equal(sample_weight, expected_balanced, decimal=4) # Test with `None` weights sample_weight = compute_sample_weight(None, y) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) # Test with multi-output of balanced classes y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) sample_weight = compute_sample_weight("balanced", y) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) # Test with multi-output with user-defined weights y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y) assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0]) # Test with multi-output of unbalanced classes y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]]) sample_weight = compute_sample_weight("balanced", y) assert_array_almost_equal(sample_weight, expected_balanced**2, decimal=3)
def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. # Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) sample_weight = compute_sample_weight("balanced", y, indices=range(4)) assert_array_almost_equal(sample_weight, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced**2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
def test_additive_chi2_sampler(): # test that AdditiveChi2Sampler approximates kernel on random data # compute exact kernel # abbreviations for easier formula X_ = X[:, np.newaxis, :] Y_ = Y[np.newaxis, :, :] large_kernel = 2 * X_ * Y_ / (X_ + Y_) # reduce to n_samples_x x n_samples_y by summing over features kernel = large_kernel.sum(axis=2) # approximate kernel mapping transform = AdditiveChi2Sampler(sample_steps=3) X_trans = transform.fit_transform(X) Y_trans = transform.transform(Y) kernel_approx = np.dot(X_trans, Y_trans.T) assert_array_almost_equal(kernel, kernel_approx, 1) X_sp_trans = transform.fit_transform(csr_matrix(X)) Y_sp_trans = transform.transform(csr_matrix(Y)) assert_array_equal(X_trans, X_sp_trans.A) assert_array_equal(Y_trans, Y_sp_trans.A) # test error is raised on negative input Y_neg = Y.copy() Y_neg[0, 0] = -1 msg = "Negative values in data passed to" with pytest.raises(ValueError, match=msg): transform.transform(Y_neg) # test error on invalid sample_steps transform = AdditiveChi2Sampler(sample_steps=4) msg = re.escape( "If sample_steps is not in [1, 2, 3], you need to provide sample_interval" ) with pytest.raises(ValueError, match=msg): transform.fit(X) # test that the sample interval is set correctly sample_steps_available = [1, 2, 3] for sample_steps in sample_steps_available: # test that the sample_interval is initialized correctly transform = AdditiveChi2Sampler(sample_steps=sample_steps) assert transform.sample_interval is None # test that the sample_interval is changed in the fit method transform.fit(X) assert transform.sample_interval_ is not None # test that the sample_interval is set correctly sample_interval = 0.3 transform = AdditiveChi2Sampler(sample_steps=4, sample_interval=sample_interval) assert transform.sample_interval == sample_interval transform.fit(X) assert transform.sample_interval_ == sample_interval
def test_sparse_classification(): # Check classification for various parameter settings on sparse input. class CustomSVC(SVC): """SVC variant that records the nature of the training set""" def fit(self, X, y): super().fit(X, y) self.data_type_ = type(X) return self rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng) parameter_sets = [ { "max_samples": 0.5, "max_features": 2, "bootstrap": True, "bootstrap_features": True }, { "max_samples": 1.0, "max_features": 4, "bootstrap": True, "bootstrap_features": True }, { "max_features": 2, "bootstrap": False, "bootstrap_features": True }, { "max_samples": 0.5, "bootstrap": True, "bootstrap_features": False }, ] for sparse_format in [csc_matrix, csr_matrix]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in parameter_sets: for f in [ 'predict', 'predict_proba', 'predict_log_proba', 'decision_function' ]: # Trained on sparse format sparse_classifier = BaggingClassifier( base_estimator=CustomSVC(decision_function_shape='ovr'), random_state=1, **params).fit(X_train_sparse, y_train) sparse_results = getattr(sparse_classifier, f)(X_test_sparse) # Trained on dense format dense_classifier = BaggingClassifier( base_estimator=CustomSVC(decision_function_shape='ovr'), random_state=1, **params).fit(X_train, y_train) dense_results = getattr(dense_classifier, f)(X_test) assert_array_almost_equal(sparse_results, dense_results) sparse_type = type(X_train_sparse) types = [i.data_type_ for i in sparse_classifier.estimators_] assert all([t == sparse_type for t in types])
def test_nmf_multiplicative_update_sparse(): # Compare sparse and dense input in multiplicative update NMF # Also test continuity of the results with respect to beta_loss parameter n_samples = 20 n_features = 10 n_components = 5 alpha = 0.1 l1_ratio = 0.5 n_iter = 20 # initialization rng = np.random.mtrand.RandomState(1337) X = rng.randn(n_samples, n_features) X = np.abs(X) X_csr = sp.csr_matrix(X) W0, H0 = nmf._initialize_nmf(X, n_components, init='random', random_state=42) for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5): # Reference with dense array X W, H = W0.copy(), H0.copy() W1, H1, _ = non_negative_factorization(X, W, H, n_components, init='custom', update_H=True, solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha, l1_ratio=l1_ratio, regularization='both', random_state=42) # Compare with sparse X W, H = W0.copy(), H0.copy() W2, H2, _ = non_negative_factorization(X_csr, W, H, n_components, init='custom', update_H=True, solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha, l1_ratio=l1_ratio, regularization='both', random_state=42) assert_array_almost_equal(W1, W2, decimal=7) assert_array_almost_equal(H1, H2, decimal=7) # Compare with almost same beta_loss, since some values have a specific # behavior, but the results should be continuous w.r.t beta_loss beta_loss -= 1.e-5 W, H = W0.copy(), H0.copy() W3, H3, _ = non_negative_factorization(X_csr, W, H, n_components, init='custom', update_H=True, solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha, l1_ratio=l1_ratio, regularization='both', random_state=42) assert_array_almost_equal(W1, W3, decimal=4) assert_array_almost_equal(H1, H3, decimal=4)
def test_incremental_variance_numerical_stability(): # Test Youngs and Cramer incremental variance formulas. def np_var(A): return A.var(axis=0) # Naive one pass variance computation - not numerically stable # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance def one_pass_var(X): n = X.shape[0] exp_x2 = (X ** 2).sum(axis=0) / n expx_2 = (X.sum(axis=0) / n) ** 2 return exp_x2 - expx_2 # Two-pass algorithm, stable. # We use it as a benchmark. It is not an online algorithm # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm def two_pass_var(X): mean = X.mean(axis=0) Y = X.copy() return np.mean((Y - mean)**2, axis=0) # Naive online implementation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm # This works only for chunks for size 1 def naive_mean_variance_update(x, last_mean, last_variance, last_sample_count): updated_sample_count = (last_sample_count + 1) samples_ratio = last_sample_count / float(updated_sample_count) updated_mean = x / updated_sample_count + last_mean * samples_ratio updated_variance = last_variance * samples_ratio + \ (x - last_mean) * (x - updated_mean) / updated_sample_count return updated_mean, updated_variance, updated_sample_count # We want to show a case when one_pass_var has error > 1e-3 while # _batch_mean_variance_update has less. tol = 200 n_features = 2 n_samples = 10000 x1 = np.array(1e8, dtype=np.float64) x2 = np.log(1e-5, dtype=np.float64) A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64) A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64) A = np.vstack((A0, A1)) # Naive one pass var: >tol (=1063) assert np.abs(np_var(A) - one_pass_var(A)).max() > tol # Starting point for online algorithms: after A0 # Naive implementation: >tol (436) mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2 for i in range(A1.shape[0]): mean, var, n = \ naive_mean_variance_update(A1[i, :], mean, var, n) assert n == A.shape[0] # the mean is also slightly unstable assert np.abs(A.mean(axis=0) - mean).max() > 1e-6 assert np.abs(np_var(A) - var).max() > tol # Robust implementation: <tol (177) mean, var = A0[0, :], np.zeros(n_features) n = np.full(n_features, n_samples // 2, dtype=np.int32) for i in range(A1.shape[0]): mean, var, n = \ _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])), mean, var, n) assert_array_equal(n, A.shape[0]) assert_array_almost_equal(A.mean(axis=0), mean) assert tol > np.abs(np_var(A) - var).max()
def test_binary_classifier_class_weight(): """tests binary classifier with classweights for each class""" alpha = 0.1 n_samples = 50 n_iter = 20 tol = 0.00001 fit_intercept = True X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10, cluster_std=0.1) step_size = get_step_size(X, alpha, fit_intercept, classification=True) classes = np.unique(y) y_tmp = np.ones(n_samples) y_tmp[y != classes[1]] = -1 y = y_tmp class_weight = {1: 0.45, -1: 0.55} clf1 = LogisticRegression( solver="sag", C=1.0 / alpha / n_samples, max_iter=n_iter, tol=tol, random_state=77, fit_intercept=fit_intercept, multi_class="ovr", class_weight=class_weight, ) clf2 = clone(clf1) clf1.fit(X, y) clf2.fit(sp.csr_matrix(X), y) le = LabelEncoder() class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y) sample_weight = class_weight_[le.fit_transform(y)] spweights, spintercept = sag_sparse( X, y, step_size, alpha, n_iter=n_iter, dloss=log_dloss, sample_weight=sample_weight, fit_intercept=fit_intercept, ) spweights2, spintercept2 = sag_sparse( X, y, step_size, alpha, n_iter=n_iter, dloss=log_dloss, sparse=True, sample_weight=sample_weight, fit_intercept=fit_intercept, ) assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(), decimal=2) assert_almost_equal(clf1.intercept_, spintercept, decimal=1) assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(), decimal=2) assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
def test_softmax(): rng = np.random.RandomState(0) X = rng.randn(3, 5) exp_X = np.exp(X) sum_exp_X = np.sum(exp_X, axis=1).reshape((-1, 1)) assert_array_almost_equal(softmax(X), exp_X / sum_exp_X)
def test_multiclass_classifier_class_weight(): """tests multiclass with classweights for each class""" alpha = 0.1 n_samples = 20 tol = 0.00001 max_iter = 50 class_weight = {0: 0.45, 1: 0.55, 2: 0.75} fit_intercept = True X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1) step_size = get_step_size(X, alpha, fit_intercept, classification=True) classes = np.unique(y) clf1 = LogisticRegression( solver="sag", C=1.0 / alpha / n_samples, max_iter=max_iter, tol=tol, random_state=77, fit_intercept=fit_intercept, multi_class="ovr", class_weight=class_weight, ) clf2 = clone(clf1) clf1.fit(X, y) clf2.fit(sp.csr_matrix(X), y) le = LabelEncoder() class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y) sample_weight = class_weight_[le.fit_transform(y)] coef1 = [] intercept1 = [] coef2 = [] intercept2 = [] for cl in classes: y_encoded = np.ones(n_samples) y_encoded[y != cl] = -1 spweights1, spintercept1 = sag_sparse( X, y_encoded, step_size, alpha, n_iter=max_iter, dloss=log_dloss, sample_weight=sample_weight, ) spweights2, spintercept2 = sag_sparse( X, y_encoded, step_size, alpha, n_iter=max_iter, dloss=log_dloss, sample_weight=sample_weight, sparse=True, ) coef1.append(spweights1) intercept1.append(spintercept1) coef2.append(spweights2) intercept2.append(spintercept2) coef1 = np.vstack(coef1) intercept1 = np.array(intercept1) coef2 = np.vstack(coef2) intercept2 = np.array(intercept2) for i, cl in enumerate(classes): assert_array_almost_equal(clf1.coef_[i].ravel(), coef1[i].ravel(), decimal=2) assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1) assert_array_almost_equal(clf2.coef_[i].ravel(), coef2[i].ravel(), decimal=2) assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def test_enet_toy(): # Test ElasticNet for various parameters of alpha and l1_ratio. # Actually, the parameters alpha = 0 should not be allowed. However, # we test it as a border case. # ElasticNet is tested with and without precomputed Gram matrix X = np.array([[-1.], [0.], [1.]]) Y = [-1, 0, 1] # just a straight line T = [[2.], [3.], [4.]] # test sample # this should be the same as lasso clf = ElasticNet(alpha=1e-8, l1_ratio=1.0) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [1]) assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=100, precompute=False) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) clf.set_params(max_iter=100, precompute=True) clf.fit(X, Y) # with Gram pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) clf.set_params(max_iter=100, precompute=np.dot(X.T, X)) clf.fit(X, Y) # with Gram pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) clf = ElasticNet(alpha=0.5, l1_ratio=0.5) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.45454], 3) assert_array_almost_equal(pred, [0.9090, 1.3636, 1.8181], 3) assert_almost_equal(clf.dual_gap_, 0)
def test_explained_variance(setup): # Check that PCA output has unit-variance rng = np.random.RandomState(0) n_samples = 100 n_features = 80 X = mt.tensor(rng.randn(n_samples, n_features)) pca = PCA(n_components=2, svd_solver='full').fit(X) rpca = PCA(n_components=2, svd_solver='randomized', random_state=42).fit(X) assert_array_almost_equal(pca.explained_variance_.to_numpy(), rpca.explained_variance_.to_numpy(), 1) assert_array_almost_equal(pca.explained_variance_ratio_.to_numpy(), rpca.explained_variance_ratio_.to_numpy(), 1) # compare to empirical variances expected_result = np.linalg.eig(np.cov(X.to_numpy(), rowvar=False))[0] expected_result = sorted(expected_result, reverse=True)[:2] X_pca = pca.transform(X) assert_array_almost_equal(pca.explained_variance_.to_numpy(), mt.var(X_pca, ddof=1, axis=0).to_numpy()) assert_array_almost_equal(pca.explained_variance_.to_numpy(), expected_result) X_rpca = rpca.transform(X) assert_array_almost_equal(rpca.explained_variance_.to_numpy(), mt.var(X_rpca, ddof=1, axis=0).to_numpy(), decimal=1) assert_array_almost_equal(rpca.explained_variance_.to_numpy(), expected_result, decimal=1) # Same with correlated data X = datasets.make_classification(n_samples, n_features, n_informative=n_features-2, random_state=rng)[0] X = mt.tensor(X) pca = PCA(n_components=2).fit(X) rpca = PCA(n_components=2, svd_solver='randomized', random_state=rng).fit(X) assert_array_almost_equal(pca.explained_variance_ratio_.to_numpy(), rpca.explained_variance_ratio_.to_numpy(), 5)
def test_sparse_regression(): # Check regression for various parameter settings on sparse input. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], diabetes.target[:50], random_state=rng) class CustomSVR(SVR): """SVC variant that records the nature of the training set""" def fit(self, X, y): super().fit(X, y) self.data_type_ = type(X) return self parameter_sets = [ { "max_samples": 0.5, "max_features": 2, "bootstrap": True, "bootstrap_features": True }, { "max_samples": 1.0, "max_features": 4, "bootstrap": True, "bootstrap_features": True }, { "max_features": 2, "bootstrap": False, "bootstrap_features": True }, { "max_samples": 0.5, "bootstrap": True, "bootstrap_features": False }, ] for sparse_format in [csc_matrix, csr_matrix]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in parameter_sets: # Trained on sparse format sparse_classifier = BaggingRegressor(base_estimator=CustomSVR(), random_state=1, **params).fit( X_train_sparse, y_train) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_results = BaggingRegressor(base_estimator=CustomSVR(), random_state=1, **params).fit( X_train, y_train).predict(X_test) sparse_type = type(X_train_sparse) types = [i.data_type_ for i in sparse_classifier.estimators_] assert_array_almost_equal(sparse_results, dense_results) assert all([t == sparse_type for t in types]) assert_array_almost_equal(sparse_results, dense_results)
def test_singular_values(setup): # Check that the PCA output has the correct singular values rng = np.random.RandomState(0) n_samples = 100 n_features = 80 X = mt.tensor(rng.randn(n_samples, n_features)) pca = PCA(n_components=2, svd_solver='full', random_state=rng).fit(X) rpca = PCA(n_components=2, svd_solver='randomized', random_state=rng).fit(X) assert_array_almost_equal(pca.singular_values_.fetch(), rpca.singular_values_.fetch(), 1) # Compare to the Frobenius norm X_pca = pca.transform(X) X_rpca = rpca.transform(X) assert_array_almost_equal(mt.sum(pca.singular_values_**2.0).to_numpy(), (mt.linalg.norm(X_pca, "fro")**2.0).to_numpy(), 12) assert_array_almost_equal(mt.sum(rpca.singular_values_**2.0).to_numpy(), (mt.linalg.norm(X_rpca, "fro")**2.0).to_numpy(), 0) # Compare to the 2-norms of the score vectors assert_array_almost_equal(pca.singular_values_.fetch(), mt.sqrt(mt.sum(X_pca**2.0, axis=0)).to_numpy(), 12) assert_array_almost_equal(rpca.singular_values_.fetch(), mt.sqrt(mt.sum(X_rpca**2.0, axis=0)).to_numpy(), 2) # Set the singular values and see what we get back rng = np.random.RandomState(0) n_samples = 100 n_features = 110 X = mt.tensor(rng.randn(n_samples, n_features)) pca = PCA(n_components=3, svd_solver='full', random_state=rng) rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng) X_pca = pca.fit_transform(X) X_pca /= mt.sqrt(mt.sum(X_pca**2.0, axis=0)) X_pca[:, 0] *= 3.142 X_pca[:, 1] *= 2.718 X_hat = mt.dot(X_pca, pca.components_) pca.fit(X_hat) rpca.fit(X_hat) assert_array_almost_equal(pca.singular_values_.fetch(), [3.142, 2.718, 1.0], 14) assert_array_almost_equal(rpca.singular_values_.fetch(), [3.142, 2.718, 1.0], 14)
def test_ward_linkage_tree_return_distance(): # Test return_distance option on linkage and ward trees # test that return_distance when set true, gives same # output on both structured and unstructured clustering. n, p = 10, 5 rng = np.random.RandomState(0) connectivity = np.ones((n, n)) for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out_unstructured = ward_tree(X, return_distance=True) out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) # get children children_unstructured = out_unstructured[0] children_structured = out_structured[0] # check if we got the same clusters assert_array_equal(children_unstructured, children_structured) # check if the distances are the same dist_unstructured = out_unstructured[-1] dist_structured = out_structured[-1] assert_array_almost_equal(dist_unstructured, dist_structured) for linkage in ['average', 'complete', 'single']: structured_items = linkage_tree(X, connectivity=connectivity, linkage=linkage, return_distance=True)[-1] unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[-1] structured_dist = structured_items[-1] unstructured_dist = unstructured_items[-1] structured_children = structured_items[0] unstructured_children = unstructured_items[0] assert_array_almost_equal(structured_dist, unstructured_dist) assert_array_almost_equal(structured_children, unstructured_children) # test on the following dataset where we know the truth # taken from scipy/cluster/tests/hierarchy_test_data.py X = np.array([[1.43054825, -7.5693489], [6.95887839, 6.82293382], [2.87137846, -9.68248579], [7.87974764, -6.05485803], [8.24018364, -6.09495602], [7.39020262, 8.54004355]]) # truth linkage_X_ward = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 9.10208346, 4.], [7., 9., 24.7784379, 6.]]) linkage_X_complete = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.96742194, 4.], [7., 9., 18.77445997, 6.]]) linkage_X_average = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.55832839, 4.], [7., 9., 15.44089605, 6.]]) n_samples, n_features = np.shape(X) connectivity_X = np.ones((n_samples, n_samples)) out_X_unstructured = ward_tree(X, return_distance=True) out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) # check that the labels are the same assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) linkage_options = ['complete', 'average', 'single'] X_linkage_truth = [linkage_X_complete, linkage_X_average] for (linkage, X_truth) in zip(linkage_options, X_linkage_truth): out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage) out_X_structured = linkage_tree(X, connectivity=connectivity_X, linkage=linkage, return_distance=True) # check that the labels are the same assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) assert_array_equal(X_truth[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
def test_dump(): X_sparse, y_dense = load_svmlight_file(datafile) X_dense = X_sparse.toarray() y_sparse = sp.csr_matrix(y_dense) # slicing a csr_matrix can unsort its .indices, so test that we sort # those correctly X_sliced = X_sparse[np.arange(X_sparse.shape[0])] y_sliced = y_sparse[np.arange(y_sparse.shape[0])] for X in (X_sparse, X_dense, X_sliced): for y in (y_sparse, y_dense, y_sliced): for zero_based in (True, False): for dtype in [np.float32, np.float64, np.int32, np.int64]: f = BytesIO() # we need to pass a comment to get the version info in; # LibSVM doesn't grok comments so they're not put in by # default anymore. if (sp.issparse(y) and y.shape[0] == 1): # make sure y's shape is: (n_samples, n_labels) # when it is sparse y = y.T # Note: with dtype=np.int32 we are performing unsafe casts, # where X.astype(dtype) overflows. The result is # then platform dependent and X_dense.astype(dtype) may be # different from X_sparse.astype(dtype).asarray(). X_input = X.astype(dtype) dump_svmlight_file(X_input, y, f, comment="test", zero_based=zero_based) f.seek(0) comment = f.readline() comment = str(comment, "utf-8") assert "scikit-learn %s" % sklearn.__version__ in comment comment = f.readline() comment = str(comment, "utf-8") assert ["one", "zero"][zero_based] + "-based" in comment X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert X2.dtype == dtype assert_array_equal(X2.sorted_indices().indices, X2.indices) X2_dense = X2.toarray() if sp.issparse(X_input): X_input_dense = X_input.toarray() else: X_input_dense = X_input if dtype == np.float32: # allow a rounding error at the last decimal place assert_array_almost_equal( X_input_dense, X2_dense, 4) assert_array_almost_equal( y_dense.astype(dtype, copy=False), y2, 4) else: # allow a rounding error at the last decimal place assert_array_almost_equal( X_input_dense, X2_dense, 15) assert_array_almost_equal( y_dense.astype(dtype, copy=False), y2, 15)
def test_set_estimator_none(drop): """VotingClassifier set_params should be able to set estimators as None or drop""" # Test predict clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(n_estimators=10, random_state=123) clf3 = GaussianNB() eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 0, 0.5]).fit(X, y) eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 1, 0.5]) with pytest.warns(None) as record: eclf2.set_params(rf=drop).fit(X, y) assert record if drop is None else not record assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert dict(eclf2.estimators)["rf"] is drop assert len(eclf2.estimators_) == 2 assert all( isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_) assert eclf2.get_params()["rf"] is drop eclf1.set_params(voting='soft').fit(X, y) with pytest.warns(None) as record: eclf2.set_params(voting='soft').fit(X, y) assert record if drop is None else not record assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) msg = 'All estimators are dropped. At least one is required' with pytest.warns(None) as record: with pytest.raises(ValueError, match=msg): eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y) assert record if drop is None else not record # Test soft voting transform X1 = np.array([[1], [2]]) y1 = np.array([1, 2]) eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[0, 0.5], flatten_transform=False).fit(X1, y1) eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[1, 0.5], flatten_transform=False) with pytest.warns(None) as record: eclf2.set_params(rf=drop).fit(X1, y1) assert record if drop is None else not record assert_array_almost_equal( eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]])) assert_array_almost_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]])) eclf1.set_params(voting='hard') eclf2.set_params(voting='hard') assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]])) assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) old_centers = centers + rng.normal(size=centers.shape) new_centers = old_centers.copy() new_centers_csr = old_centers.copy() weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) x_squared_norms = (X ** 2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) buffer = np.zeros(centers.shape[1], dtype=np.double) buffer_csr = np.zeros(centers.shape[1], dtype=np.double) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) # step 1: compute the dense minibatch update old_inertia, incremental_diff = _mini_batch_step( X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, buffer, 1, None, random_reassign=False) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia( X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) assert new_inertia > 0.0 assert new_inertia < old_inertia # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers - old_centers) ** 2) assert_almost_equal(incremental_diff, effective_diff) # step 2: compute the sparse minibatch update old_inertia_csr, incremental_diff_csr = _mini_batch_step( X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, weight_sums_csr, buffer_csr, 1, None, random_reassign=False) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia( X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) assert new_inertia_csr > 0.0 assert new_inertia_csr < old_inertia_csr # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers_csr - old_centers) ** 2) assert_almost_equal(incremental_diff_csr, effective_diff) # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) assert_array_almost_equal(new_centers, new_centers_csr) assert_almost_equal(incremental_diff, incremental_diff_csr) assert_almost_equal(old_inertia, old_inertia_csr) assert_almost_equal(new_inertia, new_inertia_csr)
def test_singular_matrix(): # Test when input is a singular matrix X1 = np.array([[1, 1.], [1., 1.]]) y1 = np.array([1, 1]) _, _, coef_path = linear_model.lars_path(X1, y1) assert_array_almost_equal(coef_path.T, [[0, 0], [1, 0]])
def test_fit_transform(): X1 = KMeans(n_clusters=3, random_state=51).fit(X).transform(X) X2 = KMeans(n_clusters=3, random_state=51).fit_transform(X) assert_array_almost_equal(X1, X2)
def test_regression_multioutput_array(): y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) # mean_absolute_error and mean_squared_error are equal because # it is a binary problem. y_true = [[0, 0]] * 4 y_pred = [[1, 1]] * 4 mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [1., 1.], decimal=2) assert_array_almost_equal(mae, [1., 1.], decimal=2) assert_array_almost_equal(r, [0., 0.], decimal=2) r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') assert_array_almost_equal(r, [0, -3.5], decimal=2) assert np.mean(r) == r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='uniform_average') evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') assert_array_almost_equal(evs, [0, -1.25], decimal=2) # Checking for the condition in which both numerator and denominator is # zero. y_true = [[1, 3], [-1, 2]] y_pred = [[1, 4], [-1, 1]] r2 = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(r2, [1., -3.], decimal=2) assert np.mean(r2) == r2_score(y_true, y_pred, multioutput='uniform_average') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(evs, [1., -3.], decimal=2) assert np.mean(evs) == explained_variance_score(y_true, y_pred) # Handling msle separately as it does not accept negative inputs. y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values') msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred), multioutput='raw_values') assert_array_almost_equal(msle, msle2, decimal=2)
def test_auto_vs_cross(kernel): # Auto-correlation and cross-correlation should be consistent. K_auto = kernel(X) K_cross = kernel(X, X) print(K_auto, K_cross) assert_array_almost_equal(K_auto, K_cross, 5)