def test_max_features():
    """Exercise the ``max_features`` parameter of ``SelectFromModel``."""
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    forest = RandomForestClassifier(n_estimators=50, random_state=0)

    # Capping the selection at the total number of features must give the
    # same output as not capping at all.
    uncapped = SelectFromModel(estimator=forest, threshold=-np.inf)
    capped = SelectFromModel(
        estimator=forest, max_features=X.shape[1], threshold=-np.inf
    )
    X_uncapped = uncapped.fit_transform(X, y)
    X_capped = capped.fit_transform(X, y)
    assert_allclose(X_uncapped, X_capped)

    # Compare max_features against the ranking produced by an actual model.
    reference = SelectFromModel(estimator=Lasso(alpha=0.025, random_state=42))
    X_reference = reference.fit_transform(X, y)
    reference_order = np.argsort(
        -np.abs(reference.estimator_.coef_), kind='mergesort'
    )
    for n_features in range(1, X_reference.shape[1] + 1):
        capped = SelectFromModel(
            estimator=Lasso(alpha=0.025, random_state=42),
            max_features=n_features,
            threshold=-np.inf,
        )
        capped.fit_transform(X, y)
        capped_order = np.argsort(
            -np.abs(capped.estimator_.coef_), kind='mergesort'
        )
        # The top-ranked columns must agree for every cap value.
        assert_allclose(
            X[:, reference_order[:n_features]],
            X[:, capped_order[:n_features]],
        )
    assert_allclose(reference.estimator_.coef_, capped.estimator_.coef_)
def test_inertia(dtype):
    """Compare the dense and sparse inertia helpers with a NumPy reference."""
    n_samples, n_features, n_clusters = 100, 10, 5
    rng = np.random.RandomState(0)

    X_sparse = sp.random(
        n_samples,
        n_features,
        density=0.5,
        format="csr",
        random_state=rng,
        dtype=dtype,
    )
    X_dense = X_sparse.toarray()
    sample_weight = rng.randn(n_samples).astype(dtype, copy=False)
    centers = rng.randn(n_clusters, n_features).astype(dtype, copy=False)
    labels = rng.randint(n_clusters, size=n_samples, dtype=np.int32)

    # Reference value: weighted sum of squared distances to assigned centers.
    squared_distances = ((X_dense - centers[labels]) ** 2).sum(axis=1)
    expected = np.sum(squared_distances * sample_weight)

    inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels)
    inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels)

    tolerance = 1e-6
    assert_allclose(inertia_dense, inertia_sparse, rtol=tolerance)
    assert_allclose(inertia_dense, expected, rtol=tolerance)
    assert_allclose(inertia_sparse, expected, rtol=tolerance)
def test_euclidean_distance(dtype, squared):
    """Check the dense/dense and sparse/dense euclidean distance helpers."""
    n_features = 100
    rng = np.random.RandomState(0)

    a_sparse = sp.random(
        1, n_features, density=0.5, format="csr", random_state=rng, dtype=dtype
    )
    a_dense = a_sparse.toarray().reshape(-1)
    b = rng.randn(n_features).astype(dtype, copy=False)
    b_squared_norm = (b**2).sum()

    # Reference distance computed directly with NumPy.
    expected = ((a_dense - b) ** 2).sum()
    if not squared:
        expected = np.sqrt(expected)

    from_dense = _euclidean_dense_dense_wrapper(a_dense, b, squared)
    from_sparse = _euclidean_sparse_dense_wrapper(
        a_sparse.data, a_sparse.indices, b, b_squared_norm, squared
    )

    assert_allclose(from_dense, from_sparse, rtol=1e-6)
    assert_allclose(from_dense, expected, rtol=1e-6)
    assert_allclose(from_sparse, expected, rtol=1e-6)
def test_incremental_mean_and_variance_ignore_nan():
    """Check that NaN entries do not change the incremental statistics."""
    n_features = 4
    old_means = np.array([535.] * n_features)
    old_variances = np.array([4225.] * n_features)
    old_sample_count = np.array([2] * n_features, dtype=np.int32)

    X = np.array([
        [170, 170, 170, 170],
        [430, 430, 430, 430],
        [300, 300, 300, 300],
    ])
    # Same non-NaN values per column as X, with one NaN added to each column.
    X_nan = np.array([
        [170, np.nan, 170, 170],
        [np.nan, 170, 430, 430],
        [430, 430, np.nan, 300],
        [300, 300, 300, np.nan],
    ])

    stats_without_nan = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count
    )
    stats_with_nan = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count
    )
    X_means, X_variances, X_count = stats_without_nan
    X_nan_means, X_nan_variances, X_nan_count = stats_with_nan

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
def test_iterative_imputer_clip_truncnorm():
    """Check that posterior-sampled imputations respect min/max bounds."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    lower_bound, upper_bound = 0.1, 0.2

    X = _sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(
        missing_values=0,
        max_iter=2,
        n_nearest_features=5,
        sample_posterior=True,
        min_value=lower_bound,
        max_value=upper_bound,
        verbose=1,
        imputation_order="random",
        random_state=rng,
    )
    Xt = imputer.fit_transform(X)

    # Zeros are the "missing" entries; everything else must be untouched.
    was_missing = X == 0
    was_observed = X != 0
    assert_allclose(np.min(Xt[was_missing]), lower_bound)
    assert_allclose(np.max(Xt[was_missing]), upper_bound)
    assert_allclose(Xt[was_observed], X[was_observed])
def test_knn_imputer_weight_uniform(na):
    """Check that every uniform weighting spelling imputes the same values."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])
    X_imputed_uniform = np.array(
        [[0, 0], [5, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    def no_weight(dist):
        # Callable that opts out of weighting altogether.
        return None

    def uniform_weight(dist):
        # Callable that gives every neighbor the same weight.
        return np.ones_like(dist)

    # "uniform" string, callable returning None, callable returning ones.
    for weights in ("uniform", no_weight, uniform_weight):
        imputer = KNNImputer(weights=weights, missing_values=na)
        assert_allclose(imputer.fit_transform(X), X_imputed_uniform)
def check_samplers_multiclass_ova(name, Sampler):
    """Check that a multiclass target and its OVA encoding resample alike.

    Parameters
    ----------
    name : str
        Name of the sampler under test (unused in the body).
    Sampler : class
        Sampler class; it is instantiated with its default parameters.
    """
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    # ``classes`` is passed by keyword: recent scikit-learn releases no longer
    # accept it positionally.
    y_ova = label_binarize(y, classes=np.unique(y))
    sampler = Sampler()
    set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
    assert_allclose(X_res, X_res_ova)
    if issubclass(Sampler, BaseEnsembleSampler):
        # Ensemble samplers return one target per batch: compare batch-wise.
        for batch_y, batch_y_ova in zip(y_res, y_res_ova):
            assert type_of_target(batch_y_ova) == type_of_target(y_ova)
            assert_allclose(batch_y, batch_y_ova.argmax(axis=1))
    else:
        assert type_of_target(y_res_ova) == type_of_target(y_ova)
        assert_allclose(y_res, y_res_ova.argmax(axis=1))
def check_samplers_pandas(name, Sampler):
    """Check that samplers accept and return pandas DataFrame and Series.

    Parameters
    ----------
    name : str
        Name of the sampler under test (unused in the body).
    Sampler : class
        Sampler class; ``NearMiss`` subclasses are exercised once per version.
    """
    pd = pytest.importorskip("pandas")
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y_df = pd.DataFrame(y)
    y_s = pd.Series(y, name="class")

    # Decide on the class itself instead of building a throwaway instance.
    if issubclass(Sampler, NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
        X_res_df, y_res_df = sampler.fit_resample(X_df, y_df)
        X_res, y_res = sampler.fit_resample(X, y)

        # check that we return the same type for dataframes or series types
        assert isinstance(X_res_df, pd.DataFrame)
        assert isinstance(y_res_df, pd.DataFrame)
        assert isinstance(y_res_s, pd.Series)

        # column names and series name must survive the resampling
        assert X_df.columns.to_list() == X_res_df.columns.to_list()
        assert y_df.columns.to_list() == y_res_df.columns.to_list()
        assert y_s.name == y_res_s.name

        # values must match the plain NumPy resampling
        assert_allclose(X_res_df.to_numpy(), X_res)
        assert_allclose(y_res_df.to_numpy().ravel(), y_res)
        assert_allclose(y_res_s.to_numpy(), y_res)
def test_transform_target_regressor_2d_transformer_multioutput():
    """Check consistency with a 2D-only transformer and a 2D ``y`` array."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1]**2 + 1)).T
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    # The prediction shape is already asserted above; here the shape of the
    # transformed target is the quantity worth checking.
    assert y.shape == y_tran.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor: fitting on the transformed target by hand
    # must give the same predictions and coefficients
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    """Check partial dependence on a dataframe fed through a pipeline.

    The pipeline may include a column transformer as ``preprocessor``.
    """
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    pipe = make_pipeline(preprocessor, estimator)
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(
        pipe, df, features=features, grid_resolution=10
    )

    # The column transformer reorders the columns when transforming, so the
    # feature indices are shuffled on purpose to make sure the partial
    # dependence is computed on the right columns.
    has_preprocessor = preprocessor is not None
    if has_preprocessor:
        X_proc = clone(preprocessor).fit_transform(df)
        features_clf = [0, 1]
    else:
        X_proc = df
        features_clf = [0, 2]

    clf = clone(estimator).fit(X_proc, iris.target)
    pdp_clf, values_clf = partial_dependence(
        clf, X_proc, features=features_clf, method='brute', grid_resolution=10
    )
    assert_allclose(pdp_pipe, pdp_clf)

    expected_values = values_clf[1]
    if has_preprocessor:
        # Map the grid of the bare estimator back to the original scale.
        scaler = preprocessor.named_transformers_['standardscaler']
        expected_values = expected_values * scaler.scale_[1] + scaler.mean_[1]
    assert_allclose(values_pipe[1], expected_values)
def test_model_pipeline_same_dense_and_sparse(LinearModel, params):
    """Check dense and sparse inputs give the same fitted linear model.

    The model is preceded by a ``StandardScaler`` in a pipeline and built
    with ``normalize=False``; ``coef_``, ``intercept_`` and the predictions
    must agree whether ``X`` is dense or sparse.
    """
    def build_pipeline():
        return make_pipeline(
            StandardScaler(with_mean=False),
            LinearModel(normalize=False, **params),
        )

    model_dense = build_pipeline()
    model_sparse = build_pipeline()

    # prepare the data
    rng = np.random.RandomState(0)
    n_samples, n_features = 200, 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.
    X_sparse = sparse.csr_matrix(X)
    y = rng.rand(n_samples)
    if is_classifier(model_dense):
        y = np.sign(y)

    model_dense.fit(X, y)
    model_sparse.fit(X_sparse, y)
    dense_estimator = model_dense[1]
    sparse_estimator = model_sparse[1]

    assert_allclose(sparse_estimator.coef_, dense_estimator.coef_)
    assert_allclose(model_dense.predict(X), model_sparse.predict(X_sparse))
    assert_allclose(dense_estimator.intercept_, sparse_estimator.intercept_)
def test_transform_target_regressor_1d_transformer(X, y):
    """Check shape consistency with a transformer applied to 1D or 2D ``y``.

    The transformer is a ``FunctionTransformer`` that shifts the target by
    one and shifts it back in ``inverse_func``.
    """
    # All transformer in scikit-learn expect 2D data. FunctionTransformer with
    # validate=False lift this constraint without checking that the input is a
    # 2D vector. We check the consistency of the data shape using a 1D and 2D y
    # array.
    transformer = FunctionTransformer(func=lambda x: x + 1,
                                      inverse_func=lambda x: x - 1)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    # The prediction shape is already asserted above; here the shape of the
    # transformed target is the quantity worth checking.
    assert y.shape == y_tran.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor: fitting on the transformed target by hand
    # must give the same predictions and coefficients
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_lars_numeric_consistency(LARS, has_coef_path, args):
    """Check that float32 and float64 fits give numerically close results."""
    tolerances = {"rtol": 1e-5, "atol": 1e-5}

    rng = np.random.RandomState(0)
    X_64 = rng.rand(6, 6)
    y_64 = rng.rand(6)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    model_64 = LARS(**args).fit(X_64, y_64)
    model_32 = LARS(**args).fit(X_32, y_32)

    # The coefficient path is only compared for models that expose it.
    fitted_attributes = ["coef_"]
    if has_coef_path:
        fitted_attributes.append("coef_path_")
    fitted_attributes.append("intercept_")

    for attribute in fitted_attributes:
        assert_allclose(
            getattr(model_64, attribute),
            getattr(model_32, attribute),
            **tolerances,
        )
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
    """Check that NaN entries are ignored by ``incr_mean_variance_axis``."""
    n_features = 4
    old_means = np.array([535.] * n_features)
    old_variances = np.array([4225.] * n_features)
    old_sample_count = np.array([2] * n_features, dtype=np.int64)

    X = sparse_constructor(np.array([
        [170, 170, 170, 170],
        [430, 430, 430, 430],
        [300, 300, 300, 300],
    ]))
    X_nan = sparse_constructor(np.array([
        [170, np.nan, 170, 170],
        [np.nan, 170, 430, 430],
        [430, 430, np.nan, 300],
        [300, 300, 300, np.nan],
    ]))

    # No dedicated data for axis 0 and axis 1: transposing is enough.
    if axis:
        X, X_nan = X.T, X_nan.T

    def run(data):
        # Pass copies of the old statistics since they are modified in place.
        return incr_mean_variance_axis(
            data,
            axis=axis,
            last_mean=old_means.copy(),
            last_var=old_variances.copy(),
            last_n=old_sample_count.copy(),
        )

    X_means, X_vars, X_sample_count = run(X)
    X_nan_means, X_nan_vars, X_nan_sample_count = run(X_nan)

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_vars, X_vars)
    assert_allclose(X_nan_sample_count, X_sample_count)
def test_knn_imputer_verify(na):
    """Check ``KNNImputer`` results on three hand-computed scenarios."""
    # Scenario 1: an imputable matrix.
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, na],
        [3, 2, 3, na],
        [na, 4, 5, 5],
        [6, na, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    assert_allclose(KNNImputer(missing_values=na).fit_transform(X), expected)

    # Scenario 2: not enough neighbors, so the column mean from training
    # is used instead.
    X = np.array([
        [1, 0, 0, na],
        [2, 1, 2, na],
        [3, 2, 3, na],
        [4, 4, 5, na],
        [6, 7, 6, na],
        [8, 8, 8, na],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])
    column_mean = (20 + 22) / 2
    expected = np.array([
        [1, 0, 0, column_mean],
        [2, 1, 2, column_mean],
        [3, 2, 3, column_mean],
        [4, 4, 5, column_mean],
        [6, 7, 6, column_mean],
        [8, 8, 8, column_mean],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])
    assert_allclose(KNNImputer(missing_values=na).fit_transform(X), expected)

    # Scenario 3: data passed to fit() and transform() are different.
    X_fit = np.array(
        [[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 16]]
    )
    X_new = np.array([[1, 0], [3, 2], [4, na]])
    filled_value = (0 + 3 + 6 + 7 + 8) / 5
    expected_new = np.array([[1, 0], [3, 2], [4, filled_value]])
    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit(X_fit).transform(X_new), expected_new)
def _assert_same_lars_path_result(output1, output2):
    """Assert that two result sequences have equal length and close items."""
    n_outputs = len(output1)
    assert n_outputs == len(output2)
    for pair in zip(output1, output2):
        assert_allclose(*pair)
def test_lda_predict():
    """Test LDA classification and its parameter validation.

    Checks that LDA implements fit and predict and returns correct values
    for simple toy data, then that invalid parameter combinations raise.
    """
    for solver, shrinkage in solver_shrinkage:
        msg = 'solver %s' % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, msg)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, msg)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, msg)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1,
                        rtol=1e-6, atol=1e-6, err_msg=msg)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert np.any(y_pred3 != y3), msg

    # Test invalid shrinkages (pytest.raises is used throughout so the whole
    # test relies on a single exception-checking idiom)
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    with pytest.raises(ValueError):
        clf.fit(X, y)

    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    with pytest.raises(ValueError):
        clf.fit(X, y)

    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    with pytest.raises(NotImplementedError):
        clf.fit(X, y)

    clf = LinearDiscriminantAnalysis(solver="lsqr",
                                     shrinkage=np.array([1, 2]))
    with pytest.raises(TypeError,
                       match="shrinkage must be a float or a string"):
        clf.fit(X, y)

    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=0.1,
                                     covariance_estimator=ShrunkCovariance())
    with pytest.raises(ValueError,
                       match=("covariance_estimator and shrinkage "
                              "parameters are not None. "
                              "Only one of the two can be set.")):
        clf.fit(X, y)

    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    with pytest.raises(ValueError):
        clf.fit(X, y)

    # test bad solver with covariance_estimator
    clf = LinearDiscriminantAnalysis(solver="svd",
                                     covariance_estimator=LedoitWolf())
    with pytest.raises(ValueError,
                       match="covariance estimator is not supported with svd"):
        clf.fit(X, y)

    # test bad covariance estimator
    clf = LinearDiscriminantAnalysis(solver="lsqr",
                                     covariance_estimator=KMeans(n_clusters=2))
    with pytest.raises(ValueError,
                       match="KMeans does not have a covariance_ attribute"):
        clf.fit(X, y)
def test_lda_predict_proba(solver, n_classes):
    """Compare ``predict_proba`` of LDA with the closed-form posterior.

    Gaussian blobs sharing one covariance are generated, LDA is fitted, and
    the predicted probabilities of one sample are compared with the values
    obtained from the generating means and covariance.
    """
    def generate_dataset(n_samples, centers, covariances, random_state=None):
        """Generate multivariate normal data given centers and covariances."""
        rng = check_random_state(random_state)
        n_per_class = n_samples // len(centers)
        X = np.vstack([rng.multivariate_normal(mean, cov, size=n_per_class)
                       for mean, cov in zip(centers, covariances)])
        y = np.hstack([[clazz] * n_per_class
                       for clazz in range(len(centers))])
        return X, y

    blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
    blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
    X, y = generate_dataset(
        n_samples=90000, centers=blob_centers, covariances=blob_stds,
        random_state=42
    )
    lda = LinearDiscriminantAnalysis(solver=solver, store_covariance=True,
                                     shrinkage=None).fit(X, y)
    # check that the empirical means and covariances are close enough to the
    # one used to generate the data
    assert_allclose(lda.means_, blob_centers, atol=1e-1)
    assert_allclose(lda.covariance_, blob_stds[0], atol=1)

    # implement the method to compute the probability given in The Elements
    # of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
    # or LDA?"), using the last class as the reference class
    precision = linalg.inv(blob_stds[0])
    alpha_k = []
    alpha_k_0 = []
    for clazz in range(len(blob_centers) - 1):
        alpha_k.append(
            np.dot(precision,
                   (blob_centers[clazz] - blob_centers[-1])[:, np.newaxis]))
        alpha_k_0.append(
            np.dot(- 0.5 * (blob_centers[clazz] +
                            blob_centers[-1])[np.newaxis, :], alpha_k[-1]))

    sample = np.array([[-22, 22]])

    def discriminant_func(sample, coef, intercept, clazz):
        # The expression evaluates to a single-element 2D array; ``.item()``
        # extracts the scalar explicitly because calling ``float`` on an
        # array with ndim > 0 is deprecated since NumPy 1.25.
        return np.exp(intercept[clazz] + np.dot(sample, coef[clazz])).item()

    scores = [discriminant_func(sample, alpha_k, alpha_k_0, clazz)
              for clazz in range(n_classes - 1)]
    normalization = 1 + sum(scores)
    prob = np.array([score / normalization for score in scores])
    prob_ref = 1 - np.sum(prob)

    # check the consistency of the computed probability
    # all probabilities should sum to one
    prob_ref_2 = 1 / normalization

    assert prob_ref == pytest.approx(prob_ref_2)
    # check that the probabilities of LDA are close to the theoretical
    # probabilities
    assert_allclose(lda.predict_proba(sample),
                    np.hstack([prob, prob_ref])[np.newaxis],
                    atol=1e-2)
def test_compare_to_ELKI():
    # Compare the OPTICS ordering, predecessors and reachability distances
    # with reference values, first with the default max_eps and then with
    # max_eps=0.5.
    #
    # Expected values, computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    #   -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    # r1: reachability distances, o1: ordering, p1: predecessors, all listed
    # in cluster order.
    r1 = [np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
          0.7290174038973836, 0.7290174038973836, 0.6861627576116127,
          0.7587934993548423, 0.9280118450166668, 1.1748022534146194,
          3.3355455741292257, 0.49618389254482587, 0.2552805046961355,
          0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
          0.24944622248445714, 0.2552805046961355, 0.2552805046961355,
          0.3086779122185853, 4.163024452756142, 1.623152630340929,
          0.45315840475822655, 0.25468325192031926, 0.2254004358159971,
          0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
          0.18765711877083036, 0.18765711877083036, 0.2240202988740153,
          1.154337614548715, 1.342604473837069, 1.323308536402633,
          0.8607514948648837, 0.27219111215810565, 0.13260875220533205,
          0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
          0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
          0.17575920159442388, 0.27219111215810565, 0.6101447895405373,
          1.3189208094864302, 1.323308536402633, 2.2509184159764577,
          2.4517810628594527, 3.675977064404973, 3.8264795626020365,
          2.9130735341510614, 2.9130735341510614, 2.9130735341510614,
          2.9130735341510614, 2.8459300127258036, 2.8459300127258036,
          2.8459300127258036, 3.0321982337972537]
    o1 = [0, 3, 6, 4, 7, 8, 2, 9, 5, 1, 31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
          44, 21, 23, 24, 22, 25, 27, 29, 26, 28, 20, 40, 45, 46, 10, 15, 11,
          13, 17, 19, 18, 12, 16, 14, 47, 49, 43, 48, 42, 41, 53, 57, 51, 52,
          56, 59, 54, 55, 58, 50]
    p1 = [-1, 0, 3, 6, 6, 6, 8, 3, 7, 5, 1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
          36, 44, 21, 23, 24, 22, 25, 25, 22, 22, 22, 21, 40, 45, 46, 10, 15,
          15, 13, 13, 15, 11, 19, 15, 10, 47, 12, 45, 14, 43, 42, 53, 57, 57,
          57, 57, 59, 59, 59, 58]

    # Tests against known extraction array
    # Does NOT work with metric='euclidean', because sklearn euclidean has
    # worse numeric precision. 'minkowski' is slower but more accurate.
    clust1 = OPTICS(min_samples=5).fit(X)

    assert_array_equal(clust1.ordering_, np.array(o1))
    assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
    assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1))
    # ELKI currently does not print the core distances (which are not used
    # much in literature), but we can at least ensure this consistency: the
    # reachability of a point is never smaller than the core distance of its
    # predecessor. The first ordered point is skipped (predecessor -1 in p1).
    for i in clust1.ordering_[1:]:
        assert (clust1.reachability_[i] >=
                clust1.core_distances_[clust1.predecessor_[i]])

    # Expected values, computed with (future) ELKI 0.7.5 using
    # (the exact command is not recorded here; the fit below uses max_eps=0.5)
    r2 = [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
          np.inf, np.inf, np.inf, 0.27219111215810565, 0.13260875220533205,
          0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
          0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
          0.17575920159442388, 0.27219111215810565, 0.4928068613197889,
          np.inf, 0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
          0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
          0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
          np.inf, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
          0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
          0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
          np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
          np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
          np.inf, np.inf]
    o2 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
          47, 46, 20, 22, 25, 23, 27, 29, 24, 26, 28, 21, 30, 32, 34, 33, 38,
          39, 35, 37, 36, 31, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53,
          54, 55, 56, 57, 58, 59]
    p2 = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 15, 15, 13, 13, 15,
          11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
          30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
          -1, -1, -1, -1, -1, -1, -1, -1, -1]
    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)

    assert_array_equal(clust2.ordering_, np.array(o2))
    assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))
    assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2))

    # Core distances no larger than max_eps must agree between the two fits.
    index = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(clust1.core_distances_[index],
                    clust2.core_distances_[index])
def test_iforest_average_path_length():
    """Non-regression test for the average path length formula.

    Covers #8549 (wrong formula, strictly for the integer case) and the
    inputs <= 2 from issue #11839.
    """
    def reference_length(n_samples):
        # Closed form used as reference for inputs larger than 2.
        return (
            2.0 * (np.log(n_samples - 1.0) + np.euler_gamma)
            - 2.0 * (n_samples - 1.0) / n_samples
        )

    result_one = reference_length(5.0)
    result_two = reference_length(999.0)

    scalar_cases = [
        (0, 0.0),
        (1, 0.0),
        (2, 1.0),
        (5, result_one),
        (999, result_two),
    ]
    for n_samples, expected in scalar_cases:
        assert_allclose(_average_path_length([n_samples]), [expected])

    assert_allclose(
        _average_path_length(np.array([1, 2, 5, 999])),
        [0.0, 1.0, result_one, result_two],
    )

    # _average_path_length is increasing
    avg_path_length = _average_path_length(np.arange(5))
    assert_array_equal(avg_path_length, np.sort(avg_path_length))
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch updates give the same results."""
    rng = np.random.RandomState(42)
    batch = slice(10)
    n_clusters = centers.shape[0]

    # Separate buffers for the dense and the sparse (CSR) code paths.
    centers_old = centers + rng.normal(size=centers.shape)
    centers_old_csr = centers_old.copy()
    centers_new = np.zeros_like(centers_old)
    centers_new_csr = np.zeros_like(centers_old_csr)
    weight_sums = np.zeros(n_clusters, dtype=X.dtype)
    weight_sums_csr = np.zeros(n_clusters, dtype=X.dtype)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)
    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # extract a small minibatch
    X_mb = X[batch]
    X_mb_csr = X_csr[batch]
    x_mb_squared_norms = x_squared_norms[batch]
    x_mb_squared_norms_csr = x_squared_norms_csr[batch]
    sample_weight_mb = sample_weight[batch]

    # step 1: compute the dense minibatch update
    old_inertia = _mini_batch_step(
        X_mb,
        x_mb_squared_norms,
        sample_weight_mb,
        centers_old,
        centers_new,
        weight_sums,
        np.random.RandomState(0),
        random_reassign=False,
    )
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, centers_new
    )
    assert 0.0 < new_inertia
    assert new_inertia < old_inertia

    # step 2: compute the sparse minibatch update
    old_inertia_csr = _mini_batch_step(
        X_mb_csr,
        x_mb_squared_norms_csr,
        sample_weight_mb,
        centers_old_csr,
        centers_new_csr,
        weight_sums_csr,
        np.random.RandomState(0),
        random_reassign=False,
    )
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr
    )
    assert 0.0 < new_inertia_csr
    assert new_inertia_csr < old_inertia_csr

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    for dense_value, sparse_value in (
        (centers_new, centers_new_csr),
        (old_inertia, old_inertia_csr),
        (new_inertia, new_inertia_csr),
    ):
        assert_allclose(dense_value, sparse_value)
def _check_standard_scaled(y, y_pred):
    """Assert that ``y_pred`` equals ``y`` standardized along axis 0."""
    centered = y - np.mean(y, axis=0)
    assert_allclose(centered / np.std(y, axis=0), y_pred)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """Check that a preprocessing estimator lets NaN values pass through.

    Parameters
    ----------
    est : estimator
        Preprocessing estimator under test; it is fitted several times.
    func : callable
        Function counterpart of ``est``; called with ``est.get_params()``
        minus ``omit_kwargs``.
    support_sparse : bool
        Whether to additionally compare dense and sparse inputs.
    strictly_positive : bool
        Whether the data is shifted upwards before the split.
    omit_kwargs : iterable of str
        Parameters of ``est`` that ``func`` does not define.
    """
    # ``pytest.warns(None)`` is deprecated since pytest 7 and an error in
    # pytest 8, so warnings are recorded with the standard library instead:
    # ``catch_warnings(record=True)`` plus an "always" filter collects every
    # warning, which is how ``pytest.warns(None)`` recorded them.
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing),
        rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    with warnings.catch_warnings(record=True) as records:
        warnings.simplefilter("always")
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    assert len(records) == 0
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with warnings.catch_warnings(record=True) as records:
        warnings.simplefilter("always")
        Xt_class = est.transform(X_train)
    assert len(records) == 0
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the sister function of the preprocessing class
    for kwarg in omit_kwargs:
        kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings(record=True) as records:
            warnings.simplefilter("always")
            Xt_col = est.transform(X_test[:, [i]])
        assert len(records) == 0
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with warnings.catch_warnings(record=True) as records:
            warnings.simplefilter("always")
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        assert len(records) == 0

        for sparse_constructor in (
            sparse.csr_matrix,
            sparse.csc_matrix,
            sparse.bsr_matrix,
            sparse.coo_matrix,
            sparse.dia_matrix,
            sparse.dok_matrix,
            sparse.lil_matrix,
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_constructor(X_train)
            X_test_sp = sparse_constructor(X_test)
            with warnings.catch_warnings(record=True) as records:
                warnings.simplefilter("always")
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
            assert len(records) == 0

            # ``toarray()`` is used rather than the ``.A`` shorthand, which
            # is not available on every SciPy sparse container.
            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings(record=True) as records:
                warnings.simplefilter("always")
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
            assert len(records) == 0

            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
def test_fastica_simple(add_noise, global_random_seed, global_dtype):
    """Test the FastICA algorithm on very simple data.

    Two non-Gaussian sources are mixed by a fixed 2x2 matrix and the
    function and class interfaces are checked for every combination of
    algorithm, non-linearity and whitening option.

    Parameters
    ----------
    add_noise : bool
        Whether Gaussian noise is added to the mixed signals.
    global_random_seed : int
        Seed used for the random state and for the FastICA class check.
    global_dtype : dtype
        Floating point dtype the sources and the mixing matrix are cast to.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples, random_state=global_random_seed)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s = s.astype(global_dtype)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]])
    mixing = mixing.astype(global_dtype)
    m = np.dot(mixing, s)

    if add_noise:
        # Noise shape follows n_samples instead of repeating the literal.
        m += 0.1 * rng.randn(2, n_samples)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x**3, (3 * x**2).mean(axis=-1)

    algos = ["parallel", "deflation"]
    nls = ["logcosh", "exp", "cube", g_test]
    whitening = ["arbitrary-variance", "unit-variance", False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(
                m.T, fun=nl, whiten=whiten, algorithm=algo, random_state=rng
            )
            with pytest.raises(ValueError):
                fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo)
        else:
            # Without internal whitening the data is whitened with PCA first.
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(
                X, fun=nl, algorithm=algo, whiten=False, random_state=rng
            )
            with pytest.raises(ValueError):
                fastica(X, fun=np.tanh, algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            # XXX: exact reconstruction to standard relative tolerance is not
            # possible. This is probably expected when add_noise is True but we
            # also need a non-trivial atol in float32 when add_noise is False.
            #
            # Note that the 2 sources are non-Gaussian in this test.
            atol = 1e-5 if global_dtype == np.float32 else 0
            assert_allclose(np.dot(np.dot(mixing_, k_), m), s_, atol=atol)

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-2)
            assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-2)
        else:
            assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-1)
            assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-1)

    # Test FastICA class (uses the last ``nl`` and ``algo`` of the loop above)
    _, _, sources_fun = fastica(
        m.T, fun=nl, algorithm=algo, random_state=global_random_seed
    )
    ica = FastICA(fun=nl, algorithm=algo, random_state=global_random_seed)
    sources = ica.fit_transform(m.T)
    assert ica.components_.shape == (2, 2)
    assert sources.shape == (n_samples, 2)

    assert_allclose(sources_fun, sources)
    assert_allclose(sources, ica.transform(m.T))

    assert ica.mixing_.shape == (2, 2)

    # Unsupported ``fun`` values must be rejected at fit time.
    for fn in [np.tanh, "exp(-.5(x^2))"]:
        ica = FastICA(fun=fn, algorithm=algo)
        with pytest.raises(ValueError):
            ica.fit(m.T)

    with pytest.raises(TypeError):
        FastICA(fun=range(10)).fit(m.T)
def _check_shifted_by_one(y, y_pred):
    """Assert that ``y_pred`` is close to ``y`` shifted up by one.

    The comparison is delegated to ``assert_allclose``; nothing is returned.
    """
    expected = y + 1
    assert_allclose(expected, y_pred)
def test_knn_imputer_weight_distance(na):
    """Check ``KNNImputer(weights="distance")`` against reference values.

    Four scenarios are covered: a single missing entry compared with both
    a manual inverse-distance average and a ``KNeighborsRegressor``
    prediction, an ``n_neighbors=2`` case with hand-written distances, and
    two matrices with several missing entries whose expected values are
    built from pairwise nan-euclidean distances.

    Parameters
    ----------
    na : scalar
        Marker used for missing entries; forwarded as ``missing_values``.
    """
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Test with "distance" weight
    # Reference from a regressor fitted on the rows with column 0 observed.
    train_rows = [0, 2, 3, 4, 5, 6]
    regressor = KNeighborsRegressor(metric="euclidean", weights="distance")
    regressor.fit(X[train_rows, 1:], X[train_rows, 0])
    regressor_value = regressor.predict(X[1:2, 1:])[0]

    # Manual calculation
    donor_rows = [0, 2, 3, 4, 5]
    row1_dists = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    inverse_dists = 1 / row1_dists[:, donor_rows].ravel()
    manual_value = np.average(X[donor_rows, 0], weights=inverse_dists)

    expected_manual = np.array(
        [[0, 0], [manual_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    # NearestNeighbor calculation
    expected_regressor = np.array(
        [[0, 0], [regressor_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    for expected in (expected_manual, expected_regressor):
        assert_allclose(imputer.fit_transform(X), expected)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array(
        [
            [na, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2))
    neighbor_weights = [1 / dist_0_1, 1 / dist_0_2]
    imputed_value = np.average([2, 3], weights=neighbor_weights)

    expected = np.array(
        [
            [imputed_value, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected)

    # Test with varying missingness patterns
    X = np.array(
        [
            [1, 0, 0, 1],
            [0, na, 1, na],
            [1, 1, 1, na],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    # Get weights of donor neighbors
    pairwise = nan_euclidean_distances(X, missing_values=na)
    col1_donor_rows = [0, 2, 3, 4, 5]
    col3_donor_rows = [0, 3, 4, 5, 6]
    r1c1_weights = 1 / pairwise[1, col1_donor_rows]
    r1c3_weights = 1 / pairwise[1, col3_donor_rows]
    r2c3_weights = 1 / pairwise[2, col3_donor_rows]

    # Collect donor values
    col1_donors = np.ma.masked_invalid(X[col1_donor_rows, 1]).copy()
    col3_donors = np.ma.masked_invalid(X[col3_donor_rows, 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donors, weights=r1c1_weights)
    r1c3_imp = np.ma.average(col3_donors, weights=r1c3_weights)
    r2c3_imp = np.ma.average(col3_donors, weights=r2c3_weights)

    expected = np.array(
        [
            [1, 0, 0, 1],
            [0, r1c1_imp, 1, r1c3_imp],
            [1, 1, 1, r2c3_imp],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected)

    X = np.array(
        [
            [0, 0, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    pairwise = pairwise_distances(
        X, metric="nan_euclidean", squared=False, missing_values=na
    )

    # Row selections shared by the weights and the donor values below.
    col3_rows = slice(2, -1)
    col2_rows = (0, 1, 3, 4, 5)
    col0_rows = slice(2, 7)

    # Calculate weights
    r0c3_w = 1.0 / pairwise[0, col3_rows]
    r1c3_w = 1.0 / pairwise[1, col3_rows]
    r2c2_w = 1.0 / pairwise[2, col2_rows]
    r7c0_w = 1.0 / pairwise[7, col0_rows]

    # Calculate weighted averages
    r0c3 = np.average(X[col3_rows, -1], weights=r0c3_w)
    r1c3 = np.average(X[col3_rows, -1], weights=r1c3_w)
    r2c2 = np.average(X[col2_rows, 2], weights=r2c2_w)
    r7c0 = np.average(X[col0_rows, 0], weights=r7c0_w)

    expected = np.array(
        [
            [0, 0, 0, r0c3],
            [1, 1, 1, r1c3],
            [2, 2, r2c2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [r7c0, 7, 7, 7],
        ]
    )

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), expected)
def test_kmeans_plusplus_norms(x_squared_norms):
    """Check that the returned centers are the rows of X at the returned indices.

    ``x_squared_norms`` is forwarded unchanged to ``kmeans_plusplus``; ``X``
    and ``n_clusters`` are module-level names defined outside this function.
    """
    # Check that defining x_squared_norms returns the same as default=None.
    seeding = kmeans_plusplus(X, n_clusters, x_squared_norms=x_squared_norms)
    centers, indices = seeding
    selected_rows = X[indices]
    assert_allclose(selected_rows, centers)
def assert_argkmin_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    rtol=1e-4,
):
    """Assert that argkmin results are valid up to:

      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    All four arrays must share the same 2-D shape, one row per query, and
    each row of distances must be sorted in non-decreasing order.
    """

    def _non_decreasing(values):
        return np.all(values[:-1] <= values[1:])

    # Number of significant digits implied by rtol (e.g. 1e-4 -> 3).
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    same_shapes = ref_dist.shape == dist.shape == ref_indices.shape == indices.shape
    assert same_shapes, "Arrays of results have various shapes."

    n_queries, _ = ref_dist.shape

    # Asserting equality results one row at a time
    for query_idx in range(n_queries):
        expected_dists = ref_dist[query_idx]
        actual_dists = dist[query_idx]

        assert _non_decreasing(
            expected_dists
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert _non_decreasing(
            actual_dists
        ), f"Distances aren't sorted on row {query_idx}"

        assert_allclose(expected_dists, actual_dists, rtol=rtol)

        expected_ids = ref_indices[query_idx]
        actual_ids = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of decimals of significant digits derived from rtol.
        # Both groupings are keyed on the rounded *reference* distance.
        expected_groups = defaultdict(set)
        actual_groups = defaultdict(set)

        for ref_distance, expected_id, actual_id in zip(
            expected_dists, expected_ids, actual_ids
        ):
            bucket = relative_rounding(
                ref_distance,
                n_significant_digits=n_significant_digits,
            )
            expected_groups[bucket].add(expected_id)
            actual_groups[bucket].add(actual_id)

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for bucket, expected_members in expected_groups.items():
            assert expected_members == actual_groups[bucket], msg
def test_fit_transform(Estimator):
    """Check equivalence between fit.transform and fit_transform.

    Two separately constructed estimators with identical arguments are
    compared; ``X`` is a module-level name defined outside this function.
    """
    fitted = Estimator(random_state=0, n_init=1).fit(X)
    two_step_output = fitted.transform(X)

    one_step_output = Estimator(random_state=0, n_init=1).fit_transform(X)

    assert_allclose(two_step_output, one_step_output)
def assert_radius_neighborhood_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    radius,
    rtol=1e-4,
):
    """Assert that radius neighborhood results are valid up to:

      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level
      - missing or extra last elements if their distance is
        close to the radius

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    Input arrays must be sorted w.r.t distances.

    Parameters
    ----------
    ref_dist, dist : sequence of 1-D arrays
        Reference and tested distances, one array per query. Rows may
        differ in length between the two inputs.
    ref_indices, indices : sequence of 1-D arrays
        Reference and tested neighbor indices, one array per query.
    radius : float
        Radius used for the query; surplus trailing distances must lie in
        ``[radius - rtol, radius + rtol]``.
    rtol : float, default=1e-4
        Tolerance for the distance comparison, also used to derive the
        number of significant digits for grouping neighbors.

    Raises
    ------
    AssertionError
        If any of the checks described above fails.
    """

    def is_sorted(a):
        return np.all(a[:-1] <= a[1:])

    # Number of significant digits implied by rtol (e.g. 1e-4 -> 3).
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
    ), "Arrays of results have various lengths."

    n_queries = len(ref_dist)

    # Asserting equality of results one vector at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"

        # Vectors' lengths might be different due to small
        # numerical differences of distance w.r.t the `radius` threshold.
        largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row

        # For the longest distances vector, we check that last extra elements
        # that aren't present in the other vector are all in: [radius ± rtol]
        min_length = min(len(ref_dist_row), len(dist_row))
        last_extra_elements = largest_row[min_length:]
        if last_extra_elements.size > 0:
            # A chained comparison ``lo <= arr <= hi`` evaluates ``bool(arr)``
            # and raises ValueError for arrays with more than one element, so
            # the two bounds are combined element-wise instead.
            within_radius_band = (radius - rtol <= last_extra_elements) & (
                last_extra_elements <= radius + rtol
            )
            assert np.all(within_radius_band), (
                f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
                f" rtol]=[{radius} ± {rtol}]"
            )

        # We truncate the neighbors results list on the smallest length to
        # be able to compare them, ignoring the elements checked above.
        ref_dist_row = ref_dist_row[:min_length]
        dist_row = dist_row[:min_length]

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of significant digits derived from rtol.
        # Both groupings are keyed on the rounded *reference* distance.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(min_length):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(
                ref_indices_row[neighbor_rank]
            )
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance, reference_group in reference_neighbors_groups.items():
            assert reference_group == effective_neighbors_groups[rounded_distance], msg