def test_enet_copy_X_True(check_input): X, y, _, _ = build_dataset() X = X.copy(order='F') original_X = X.copy() enet = ElasticNet(copy_X=True) enet.fit(X, y, check_input=check_input) assert_array_equal(original_X, X)
def test_additive_chi2_sampler(): # test that AdditiveChi2Sampler approximates kernel on random data # compute exact kernel # abbreviations for easier formula X_ = X[:, np.newaxis, :] Y_ = Y[np.newaxis, :, :] large_kernel = 2 * X_ * Y_ / (X_ + Y_) # reduce to n_samples_x x n_samples_y by summing over features kernel = (large_kernel.sum(axis=2)) # approximate kernel mapping transform = AdditiveChi2Sampler(sample_steps=3) X_trans = transform.fit_transform(X) Y_trans = transform.transform(Y) kernel_approx = np.dot(X_trans, Y_trans.T) assert_array_almost_equal(kernel, kernel_approx, 1) X_sp_trans = transform.fit_transform(csr_matrix(X)) Y_sp_trans = transform.transform(csr_matrix(Y)) assert_array_equal(X_trans, X_sp_trans.A) assert_array_equal(Y_trans, Y_sp_trans.A) # test error is raised on negative input Y_neg = Y.copy() Y_neg[0, 0] = -1 assert_raises(ValueError, transform.transform, Y_neg) # test error on invalid sample_steps transform = AdditiveChi2Sampler(sample_steps=4) assert_raises(ValueError, transform.fit, X) # test that the sample interval is set correctly sample_steps_available = [1, 2, 3] for sample_steps in sample_steps_available: # test that the sample_interval is initialized correctly transform = AdditiveChi2Sampler(sample_steps=sample_steps) assert transform.sample_interval is None # test that the sample_interval is changed in the fit method transform.fit(X) assert transform.sample_interval_ is not None # test that the sample_interval is set correctly sample_interval = 0.3 transform = AdditiveChi2Sampler(sample_steps=4, sample_interval=sample_interval) assert transform.sample_interval == sample_interval transform.fit(X) assert transform.sample_interval_ == sample_interval
def test_check_array_series(): # regression test that check_array works on pandas Series pd = importorskip("pandas") res = check_array(pd.Series([1, 2, 3]), ensure_2d=False) assert_array_equal(res, np.array([1, 2, 3])) # with categorical dtype (not a numpy dtype) (GH12699) s = pd.Series(['a', 'b', 'c']).astype('category') res = check_array(s, dtype=None, ensure_2d=False) assert_array_equal(res, np.array(['a', 'b', 'c'], dtype=object))
def test_huber_sparse(): X, y = make_regression_with_outliers() huber = HuberRegressor(alpha=0.1) huber.fit(X, y) X_csr = sparse.csr_matrix(X) huber_sparse = HuberRegressor(alpha=0.1) huber_sparse.fit(X_csr, y) assert_array_almost_equal(huber_sparse.coef_, huber.coef_) assert_array_equal(huber.outliers_, huber_sparse.outliers_)
def test_make_spd_matrix(): X = make_spd_matrix(n_dim=5, random_state=0) assert X.shape == (5, 5), "X shape mismatch" assert_array_almost_equal(X, X.T) from numpy.linalg import eig eigenvalues, _ = eig(X) assert_array_equal(eigenvalues > 0, np.array([True] * 5), "X is not positive-definite")
def test_check_tuple_input(): # Ensures that checks return valid tuples. rng = np.random.RandomState(0) XA = rng.random_sample((5, 4)) XA_tuples = tuplify(XA) XB = rng.random_sample((5, 4)) XB_tuples = tuplify(XB) XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples) assert_array_equal(XA_tuples, XA_checked) assert_array_equal(XB_tuples, XB_checked)
def test_connectivity_ignores_diagonal(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) connectivity_include_self = kneighbors_graph(X, 3, include_self=True) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_cluster_intensity_tie(): X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]]) c1 = MeanShift(bandwidth=2).fit(X) X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]]) c2 = MeanShift(bandwidth=2).fit(X) assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1])
def test_column_transformer_no_remaining_remainder_transformer(): X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])], remainder=DoubleTrans()) assert_array_equal(ct.fit_transform(X_array), X_array) assert_array_equal(ct.fit(X_array).transform(X_array), X_array) assert len(ct.transformers_) == 1 assert ct.transformers_[-1][0] != 'remainder'
def test_dbscan_no_core_samples(): rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 for X_ in [X, sparse.csr_matrix(X)]: db = DBSCAN(min_samples=6).fit(X_) assert_array_equal(db.components_, np.empty((0, X_.shape[1]))) assert_array_equal(db.labels_, -1) assert db.core_sample_indices_.shape == (0, )
def test_select_kbest_all(): # Test whether k="all" correctly returns all features. X, y = make_classification(n_samples=20, n_features=10, shuffle=False, random_state=0) univariate_filter = SelectKBest(f_classif, k='all') X_r = univariate_filter.fit(X, y).transform(X) assert_array_equal(X, X_r)
def test_precomputed_dists(): redX = X[::2] dists = pairwise_distances(redX, metric='euclidean') clust1 = OPTICS(min_samples=10, algorithm='brute', metric='precomputed').fit(dists) clust2 = OPTICS(min_samples=10, algorithm='brute', metric='euclidean').fit(redX) assert_allclose(clust1.reachability_, clust2.reachability_) assert_array_equal(clust1.labels_, clust2.labels_)
def test_load_large_qid(): """ load large libsvm / svmlight file with qid attribute. Tests 64-bit query ID """ data = b"\n".join( ("3 qid:{0} 1:0.53 2:0.12\n2 qid:{0} 1:0.13 2:0.1".format(i).encode() for i in range(1, 40 * 1000 * 1000))) X, y, qid = load_svmlight_file(BytesIO(data), query_id=True) assert_array_equal(y[-4:], [3, 2, 3, 2]) assert_array_equal(np.unique(qid), np.arange(1, 40 * 1000 * 1000))
def test_multi_class(kernel): # Test GPC for multi-class classification problems. gpc = GaussianProcessClassifier(kernel=kernel) gpc.fit(X, y_mc) y_prob = gpc.predict_proba(X2) assert_almost_equal(y_prob.sum(1), 1) y_pred = gpc.predict(X2) assert_array_equal(np.argmax(y_prob, 1), y_pred)
def test_ovr_pipeline(): # Test with pipeline of length one # This test is needed because the multiclass estimators may fail to detect # the presence of predict_proba or decision_function. clf = Pipeline([("tree", DecisionTreeClassifier())]) ovr_pipe = OneVsRestClassifier(clf) ovr_pipe.fit(iris.data, iris.target) ovr = OneVsRestClassifier(DecisionTreeClassifier()) ovr.fit(iris.data, iris.target) assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
def test_lasso_non_float_y(model): X = [[0, 0], [1, 1], [-1, -1]] y = [0, 1, 2] y_float = [0.0, 1.0, 2.0] clf = model(fit_intercept=False) clf.fit(X, y) clf_float = model(fit_intercept=False) clf_float.fit(X, y_float) assert_array_equal(clf.coef_, clf_float.coef_)
def test_connectivity_callable(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=partial( kneighbors_graph, n_neighbors=3, include_self=False)) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_median_strategy_regressor(): random_state = np.random.RandomState(seed=1) X = [[0]] * 5 # ignored y = random_state.randn(5) reg = DummyRegressor(strategy="median") reg.fit(X, y) assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
def test_column_transformer_negative_column_indexes(): X = np.random.randn(2, 2) X_categories = np.array([[1], [2]]) X = np.concatenate([X, X_categories], axis=1) ohe = OneHotEncoder() tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough') tf_2 = ColumnTransformer([('ohe', ohe, [2])], remainder='passthrough') assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
def test_check_precisions(): rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components, n_features = rand_data.n_components, rand_data.n_features # Define the bad precisions for each covariance_type precisions_bad_shape = { 'full': np.ones((n_components + 1, n_features, n_features)), 'tied': np.ones((n_features + 1, n_features + 1)), 'diag': np.ones((n_components + 1, n_features)), 'spherical': np.ones((n_components + 1)) } # Define not positive-definite precisions precisions_not_pos = np.ones((n_components, n_features, n_features)) precisions_not_pos[0] = np.eye(n_features) precisions_not_pos[0, 0, 0] = -1. precisions_not_positive = { 'full': precisions_not_pos, 'tied': precisions_not_pos[0], 'diag': np.full((n_components, n_features), -1.), 'spherical': np.full(n_components, -1.) } not_positive_errors = { 'full': 'symmetric, positive-definite', 'tied': 'symmetric, positive-definite', 'diag': 'positive', 'spherical': 'positive' } for covar_type in COVARIANCE_TYPE: X = RandomData(rng).X[covar_type] g = GaussianMixture(n_components=n_components, covariance_type=covar_type, random_state=rng) # Check precisions with bad shapes g.precisions_init = precisions_bad_shape[covar_type] assert_raise_message( ValueError, "The parameter '%s precision' should have " "the shape of" % covar_type, g.fit, X) # Check not positive precisions g.precisions_init = precisions_not_positive[covar_type] assert_raise_message( ValueError, "'%s precision' should be %s" % (covar_type, not_positive_errors[covar_type]), g.fit, X) # Check the correct init of precisions_init g.precisions_init = rand_data.precisions[covar_type] g.fit(X) assert_array_equal(rand_data.precisions[covar_type], g.precisions_init)
def test_isotonic_regression_ties_max(): # Setup examples with ties on maximum x = [1, 2, 3, 4, 5, 5] y = [1, 2, 3, 4, 5, 6] y_true = [1, 2, 3, 4, 5.5, 5.5] # Check that we get identical results for fit/transform and fit_transform ir = IsotonicRegression() ir.fit(x, y) assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y)) assert_array_equal(y_true, ir.fit_transform(x, y))
def test_min_cluster_size(min_cluster_size): redX = X[::2] # reduce for speed clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX) cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) if cluster_sizes.size: assert min(cluster_sizes) >= min_cluster_size # check behaviour is the same when min_cluster_size is a fraction clust_frac = OPTICS(min_samples=9, min_cluster_size=min_cluster_size / redX.shape[0]) clust_frac.fit(redX) assert_array_equal(clust.labels_, clust_frac.labels_)
def test_one_hot_encoder_specified_categories_mixed_columns(): # multiple columns X = np.array([['a', 'b'], [0, 2]], dtype=object).T enc = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]]) exp = np.array([[1., 0., 0., 1., 0., 0.], [0., 1., 0., 0., 0., 1.]]) assert_array_equal(enc.fit_transform(X).toarray(), exp) assert enc.categories_[0].tolist() == ['a', 'b', 'c'] assert np.issubdtype(enc.categories_[0].dtype, np.object_) assert enc.categories_[1].tolist() == [0, 1, 2] # integer categories but from object dtype data assert np.issubdtype(enc.categories_[1].dtype, np.object_)
def test_zero_variance(): # Test VarianceThreshold with default setting, zero variance. for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: sel = VarianceThreshold().fit(X) assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True)) with pytest.raises(ValueError): VarianceThreshold().fit([[0, 1, 2, 3]]) with pytest.raises(ValueError): VarianceThreshold().fit([[0, 1], [0, 1]])
def test_dummy_classifier_on_3D_array(): X = np.array([[['foo']], [['bar']], [['baz']]]) y = [2, 2, 2] y_expected = [2, 2, 2] y_proba_expected = [[1], [1], [1]] cls = DummyClassifier() cls.fit(X, y) y_pred = cls.predict(X) y_pred_proba = cls.predict_proba(X) assert_array_equal(y_pred, y_expected) assert_array_equal(y_pred_proba, y_proba_expected)
def test_nans(): # Assert that SelectKBest and SelectPercentile can handle NaNs. # First feature has zero variance to confuse f_classif (ANOVA) and # make it return a NaN. X = [[0, 1, 0], [0, -1, -1], [0, .5, .5]] y = [1, 0, 1] for select in (SelectKBest(f_classif, 2), SelectPercentile(f_classif, percentile=67)): ignore_warnings(select.fit)(X, y) assert_array_equal(select.get_support(indices=True), np.array([1, 2]))
def test_sgd_optimizer_no_momentum(): params = [np.zeros(shape) for shape in shapes] for lr in [10**i for i in range(-3, 4)]: optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False) grads = [np.random.random(shape) for shape in shapes] expected = [param - lr * grad for param, grad in zip(params, grads)] optimizer.update_params(grads) for exp, param in zip(expected, optimizer.params): assert_array_equal(exp, param)
def test_clone_sparse_matrices(): sparse_matrix_classes = [ getattr(sp, name) for name in dir(sp) if name.endswith('_matrix') ] for cls in sparse_matrix_classes: sparse_matrix = cls(np.eye(5)) clf = MyEstimator(empty=sparse_matrix) clf_cloned = clone(clf) assert clf.empty.__class__ is clf_cloned.empty.__class__ assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray())
def test_dummy_regressor_return_std(): X = [[0]] * 3 # ignored y = np.array([2, 2, 2]) y_std_expected = np.array([0, 0, 0]) cls = DummyRegressor() cls.fit(X, y) y_pred_list = cls.predict(X, return_std=True) # there should be two elements when return_std is True assert len(y_pred_list) == 2 # the second element should be all zeros assert_array_equal(y_pred_list[1], y_std_expected)
def test_dbscan_input_not_modified(use_sparse, metric): # test that the input is not modified by dbscan X = np.random.RandomState(0).rand(10, 10) X = sparse.csr_matrix(X) if use_sparse else X X_copy = X.copy() dbscan(X, metric=metric) if use_sparse: assert_array_equal(X.toarray(), X_copy.toarray()) else: assert_array_equal(X, X_copy)