def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    """Index the columns (``axis=1``) of a 3x3 container with ``_safe_indexing``.

    When the first index is a string and the container is not a
    ``"dataframe"``, a ``ValueError`` is expected.  Otherwise the selected
    subset must equal ``[[2, 3], [5, 6], [8, 9]]``.
    """
    # validation of the indices
    # work on a copy because ``indices`` is mutable and shared between tests
    selector = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        # presumably compensates for the exclusive stop of a slice -- confirm
        # against ``_convert_container``
        selector[1] += 1

    data = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        array_type,
        ["col_0", "col_1", "col_2"],
    )
    selector = _convert_container(selector, indices_type)

    if not (isinstance(indices[0], str) and array_type != "dataframe"):
        result = _safe_indexing(data, selector, axis=1)
        expected = _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
        assert_allclose_dense_sparse(result, expected)
        return

    message = (
        "Specifying the columns using strings is only supported "
        "for pandas DataFrames"
    )
    with pytest.raises(ValueError, match=message):
        _safe_indexing(data, selector, axis=1)
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    """Select items of a 1D container through a boolean mask.

    Only positions 1 and 2 of the nine-element mask are ``True``, so the
    subset must equal ``[2, 3]``.
    """
    mask = [False, True, True, False, False, False, False, False, False]
    container = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    mask = _convert_container(mask, indices_type)

    selected = _safe_indexing(container, mask, axis=0)

    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_isomap_simple_grid(n_neighbors, radius):
    """Isomap should preserve distances when all neighbors are used.

    The neighbor distance graph of the input is compared with the graph of
    the fitted embedding, for every combination of the module-level
    ``eigen_solvers`` and ``path_methods``.
    """

    def _distance_graph(points):
        # k-neighbors graph when ``n_neighbors`` is given, radius graph otherwise
        if n_neighbors is None:
            return neighbors.radius_neighbors_graph(points, radius, mode="distance")
        return neighbors.kneighbors_graph(points, n_neighbors, mode="distance")

    X = create_sample_data(n_pts=25, add_noise=False)

    # distances from each point to all others
    reference_graph = _distance_graph(X)

    for solver in eigen_solvers:
        for method in path_methods:
            isomap = manifold.Isomap(
                n_neighbors=n_neighbors,
                radius=radius,
                n_components=2,
                eigen_solver=solver,
                path_method=method,
            )
            isomap.fit(X)

            embedded_graph = _distance_graph(isomap.embedding_)
            assert_allclose_dense_sparse(reference_graph, embedded_graph)
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    """Check that ``check_array`` accepts ``value`` and leaves the data unchanged.

    Parameters
    ----------
    value : scalar
        Value written into position ``[0, 0]`` of the 2x2 float input.
    force_all_finite : object
        Forwarded unchanged to ``check_array``.
    retype : callable
        Container constructor applied to the dense 2x2 float array.
    """
    # ``np.float`` was only an alias of the builtin ``float``; the alias was
    # deprecated and later removed from NumPy, so use the builtin directly.
    X = retype(np.arange(4).reshape(2, 2).astype(float))
    X[0, 0] = value
    X_checked = check_array(X, force_all_finite=force_all_finite, accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
def test_check_fit_params(indices):
    """Compare ``_check_fit_params`` output with ``_safe_indexing`` per entry.

    The ``'sparse-row'``, scalar and ``None`` entries must come back as the
    very same objects.  The list, array and ``'sparse-col'`` entries must
    equal what ``_safe_indexing`` returns for the same indices.
    """
    X = np.random.randn(4, 2)
    fit_params = {
        'list': [1, 2, 3, 4],
        'array': np.array([1, 2, 3, 4]),
        'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T,
        'sparse-row': sp.csc_matrix([1, 2, 3, 4]),
        'scalar-int': 1,
        'scalar-str': 'xxx',
        'None': None,
    }
    result = _check_fit_params(X, fit_params, indices)

    # with no indices given, compare against selecting every row of X
    if indices is None:
        indices_ = list(range(X.shape[0]))
    else:
        indices_ = indices

    for passthrough_key in ('sparse-row', 'scalar-int', 'scalar-str', 'None'):
        assert result[passthrough_key] is fit_params[passthrough_key]

    assert result['list'] == _safe_indexing(fit_params['list'], indices_)

    expected_array = _safe_indexing(fit_params['array'], indices_)
    assert_array_equal(result['array'], expected_array)

    expected_sparse = _safe_indexing(fit_params['sparse-col'], indices_)
    assert_allclose_dense_sparse(result['sparse-col'], expected_sparse)
def check_pipeline_consistency(name, estimator_orig):
    """Check that ``make_pipeline(est)`` gives the same results as ``est``.

    Raises ``SkipTest`` when the estimator tags report it as non
    deterministic.  Otherwise ``score`` and ``fit_transform`` (whichever the
    estimator exposes) are compared between the estimator and a pipeline
    wrapping it.
    """
    if estimator_orig._get_tags()['non_deterministic']:
        raise SkipTest(name + ' is non deterministic')

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(
        n_samples=30,
        centers=[[0, 0, 0], [1, 1, 1]],
        random_state=0,
        n_features=2,
        cluster_std=0.1,
    )
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)

    estimator.fit(X, y)
    pipeline.fit(X, y)

    for method_name in ("score", "fit_transform"):
        estimator_method = getattr(estimator, method_name, None)
        if estimator_method is None:
            # the estimator does not expose this method: nothing to compare
            continue
        pipeline_method = getattr(pipeline, method_name)
        result = estimator_method(X, y)
        result_pipe = pipeline_method(X, y)
        assert_allclose_dense_sparse(result, result_pipe)
def test_imputers_add_indicator_sparse(imputer, marker):
    """Check the indicator columns appended for sparse input.

    With ``add_indicator=True`` the last four output columns must equal the
    expected missing-value mask.  The remaining columns must equal the
    output obtained with ``add_indicator=False``.
    """
    data = sparse.csr_matrix(
        [
            [marker, 1, 5, marker, 1],
            [2, marker, 1, marker, 2],
            [6, 3, marker, marker, 3],
            [1, 2, 9, marker, 4],
        ]
    )
    expected_mask = sparse.csr_matrix(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )

    imputer.set_params(missing_values=marker, add_indicator=True)
    with_indicator = imputer.fit_transform(data)

    assert_allclose_dense_sparse(with_indicator[:, -4:], expected_mask)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    imputer.set_params(add_indicator=False)
    without_indicator = imputer.fit_transform(data)
    assert_allclose_dense_sparse(with_indicator[:, :-4], without_indicator)
def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
    """Compare the ``normalize=True`` data with manually normalized raw data.

    Only the first 100 rows are checked.  Each row of the normalized data is
    also expected to have a norm of 1.
    """
    raw_bunch = fetch_20newsgroups_vectorized_fxt(normalize=False)
    normalized_bunch = fetch_20newsgroups_vectorized_fxt(normalize=True)

    X_norm = normalized_bunch['data'][:100]
    X_raw = raw_bunch['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X_raw))

    row_norms = np.linalg.norm(X_norm.todense(), axis=1)
    assert np.allclose(row_norms, 1)
def test_safe_indexing_1d_container(array_type, indices_type):
    """Select positions 1 and 2 of a 1D container and expect ``[2, 3]``."""
    # for the "slice" indices type the second bound is bumped by one before
    # the conversion
    positions = [1, 3] if indices_type == "slice" else [1, 2]

    container = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    positions = _convert_container(positions, indices_type)

    selected = _safe_indexing(container, positions, axis=0)

    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_function_sampler_func(X, y):
    """Check that ``FunctionSampler`` returns what the wrapped function returns.

    The wrapped function keeps the first ten entries of ``X`` and ``y``.
    """

    def _keep_first_ten(X, y):
        return X[:10], y[:10]

    resampler = FunctionSampler(func=_keep_first_ten)
    X_res, y_res = resampler.fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X[:10])
    assert_array_equal(y_res, y[:10])
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    """Select rows 1 and 2 of a 3x3 container along ``axis=0``.

    The subset must equal ``[[4, 5, 6], [7, 8, 9]]``.
    """
    rows = [1, 2]
    needs_bump = indices_type == 'slice' and isinstance(rows[1], int)
    if needs_bump:
        rows[1] += 1

    container = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type
    )
    rows = _convert_container(rows, indices_type)

    selected = _safe_indexing(container, rows, axis=0)

    expected = _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_tfidf_transformer_sparse():
    """Check that ``TfidfTransformer`` gives the same result for CSC and CSR.

    Both the values and the sparse format of the two outputs must match.
    """
    X = sparse.rand(10, 20000, dtype=np.float64, random_state=42)

    # a fresh transformer is fitted on each representation of the same data
    out_from_csc, out_from_csr = (
        TfidfTransformer().fit_transform(to_format(X))
        for to_format in (sparse.csc_matrix, sparse.csr_matrix)
    )

    assert_allclose_dense_sparse(out_from_csc, out_from_csr)
    assert out_from_csc.format == out_from_csr.format
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
    """Apply the mask ``[False, True, True]`` to a 3x3 container along ``axis``.

    The result must equal ``expected_subset`` converted to ``array_type``.
    """
    container = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        array_type,
        ['col_0', 'col_1', 'col_2'],
    )
    mask = _convert_container([False, True, True], indices_type)

    selected = _safe_indexing(container, mask, axis=axis)

    expected = _convert_container(expected_subset, array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_check_inverse():
    """Exercise ``check_inverse=True`` of ``FunctionTransformer``.

    For dense, CSR and CSC input:

    * ``np.sqrt`` / ``np.around`` must emit the "not strictly inverse"
      ``UserWarning`` on ``fit``;
    * ``np.expm1`` / ``np.log1p`` must fit without any ``UserWarning`` and
      round-trip the data.

    No ``UserWarning`` may be raised when only one of ``func`` or
    ``inverse_func`` is provided.
    """
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    inputs = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    warning_message = (
        "The provided functions are not strictly"
        " inverse of each other. If you are sure you"
        " want to proceed regardless, set"
        " 'check_inverse=False'."
    )

    for X in inputs:
        accept_sparse = sparse.issparse(X)

        mismatched = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with pytest.warns(UserWarning, match=warning_message):
            mismatched.fit(X)

        matched = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            # any UserWarning is turned into an error here
            warnings.simplefilter("error", UserWarning)
            Xt = matched.fit_transform(X)

        assert_allclose_dense_sparse(X, matched.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    only_forward = FunctionTransformer(
        func=np.expm1, inverse_func=None, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        only_forward.fit(X_dense)

    only_inverse = FunctionTransformer(
        func=None, inverse_func=np.expm1, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        only_inverse.fit(X_dense)
def test_safe_indexing_2d_read_only_axis_1(
    array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
    """Run ``_safe_indexing`` with optionally read-only array and indices.

    The array and/or the indices ``[1, 2]`` are flagged as non-writeable
    before conversion, depending on the two boolean arguments.  The subset
    must equal ``expected_array`` converted to ``array_type``.
    """
    data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        data.setflags(write=False)
    data = _convert_container(data, array_type)

    positions = np.array([1, 2])
    if indices_read_only:
        positions.setflags(write=False)
    positions = _convert_container(positions, indices_type)

    selected = _safe_indexing(data, positions, axis=axis)

    expected = _convert_container(expected_array, array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_column_transformer_sparse_array():
    """Check ``ColumnTransformer`` on a sparse CSR input.

    Several column specifications are tried.  In each case the output of
    ``fit_transform`` must be sparse, and both ``fit_transform`` and
    ``fit(...).transform(...)`` must match the expected result.
    """
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    first_column = X_sparse[:, 0]
    all_columns = X_sparse

    def _assert_sparse_output(transformer, expected):
        assert sparse.issparse(transformer.fit_transform(X_sparse))
        assert_allclose_dense_sparse(transformer.fit_transform(X_sparse), expected)
        assert_allclose_dense_sparse(
            transformer.fit(X_sparse).transform(X_sparse), expected
        )

    remainder_cases = [('drop', first_column), ('passthrough', all_columns)]
    for col in [0, [0], slice(0, 1)]:
        for remainder, expected in remainder_cases:
            ct = ColumnTransformer(
                [('trans', Trans(), col)],
                remainder=remainder,
                sparse_threshold=0.8,
            )
            _assert_sparse_output(ct, expected)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)], sparse_threshold=0.8)
        _assert_sparse_output(ct, all_columns)
def test_20news_normalization():
    """Compare the ``normalize=True`` data with manually normalized raw data.

    The datasets are fetched with ``download_if_missing=False``, and the
    test is skipped when fetching raises ``IOError``.  Only the first 100
    rows are checked.
    """
    try:
        raw_bunch, normalized_bunch = [
            datasets.fetch_20newsgroups_vectorized(
                normalize=flag, download_if_missing=False
            )
            for flag in (False, True)
        ]
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    X_norm = normalized_bunch['data'][:100]
    X_raw = raw_bunch['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X_raw))

    row_norms = np.linalg.norm(X_norm.todense(), axis=1)
    assert np.allclose(row_norms, 1)
def test_incremental_pca_batch_rank():
    """Check that the fitted components do not depend on the batch size.

    Every batch size tried is at least ``n_components`` (20).  The components
    obtained for consecutive batch sizes are compared pairwise.
    """
    # Test sample size in each batch is always larger or equal to n_components
    rng = np.random.RandomState(1999)
    n_samples, n_features = 100, 20
    X = rng.randn(n_samples, n_features)

    components_per_batch_size = [
        IncrementalPCA(n_components=20, batch_size=batch_size).fit(X).components_
        for batch_size in np.arange(20, 90, 3)
    ]

    consecutive_pairs = zip(
        components_per_batch_size[:-1], components_per_batch_size[1:]
    )
    for previous, current in consecutive_pairs:
        assert_allclose_dense_sparse(previous, current)
def test_random_projection_numerical_consistency(random_projection_cls):
    """Verify numerical consistency among np.float32 and np.float64.

    Two projectors built with the same ``random_state`` are fitted on
    float32 and float64 copies of the same data.  Their projections must
    agree within ``atol=1e-5`` and their ``components_`` must match.
    """
    tolerance = 1e-5
    rng = np.random.RandomState(42)
    X = rng.rand(25, 3000)

    projector_32 = random_projection_cls(random_state=0)
    projector_64 = random_projection_cls(random_state=0)

    out_32 = projector_32.fit_transform(X.astype(np.float32))
    out_64 = projector_64.fit_transform(X.astype(np.float64))

    assert_allclose(out_64, out_32, atol=tolerance)
    assert_allclose_dense_sparse(projector_32.components_, projector_64.components_)
def test_function_sampler_func_kwargs(X, y):
    """Check that ``FunctionSampler`` forwards ``kw_args`` to the function.

    The wrapped function delegates to ``RandomUnderSampler``.  The result
    must match a ``RandomUnderSampler(random_state=0)`` used directly.
    """

    def _undersample(X, y, sampling_strategy, random_state):
        under_sampler = RandomUnderSampler(
            sampling_strategy=sampling_strategy, random_state=random_state
        )
        return under_sampler.fit_resample(X, y)

    keyword_arguments = {"sampling_strategy": "auto", "random_state": 0}
    sampler = FunctionSampler(func=_undersample, kw_args=keyword_arguments)

    X_res, y_res = sampler.fit_resample(X, y)
    X_ref, y_ref = RandomUnderSampler(random_state=0).fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X_ref)
    assert_array_equal(y_res, y_ref)
def test_safe_sparse_dot_dense_output(dense_output):
    """Check the ``dense_output`` option of ``safe_sparse_dot``.

    The result must be sparse exactly when ``dense_output`` is falsy, and
    must equal ``A.dot(B)`` (densified when ``dense_output`` is truthy).
    """
    rng = np.random.RandomState(0)
    # both operands draw from the same generator, so creation order matters
    left = sparse.random(30, 10, density=0.1, random_state=rng)
    right = sparse.random(10, 20, density=0.1, random_state=rng)

    reference = left.dot(right)
    result = safe_sparse_dot(left, right, dense_output=dense_output)

    assert sparse.issparse(result) == (not dense_output)

    if dense_output:
        reference = reference.toarray()
    assert_allclose_dense_sparse(result, reference)
def test_32_equal_64(input_dtype, encode):
    """Check that ``KBinsDiscretizer`` output is the same for both dtypes.

    The float32 and float64 ``dtype`` settings are fitted on the same input
    and their transformed outputs compared.
    """
    # TODO this check is redundant with common checks and can be removed
    #  once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)

    def _discretize(output_dtype):
        # fit and transform with a fresh discretizer for the requested dtype
        discretizer = KBinsDiscretizer(n_bins=3, encode=encode, dtype=output_dtype)
        discretizer.fit(X_input)
        return discretizer.transform(X_input)

    # 32 bit output
    Xt_32 = _discretize(np.float32)

    # 64 bit output
    Xt_64 = _discretize(np.float64)

    assert_allclose_dense_sparse(Xt_32, Xt_64)
def test_stacking_classifier_sparse_passthrough(fmt):
    """Check passthrough behavior on a sparse X matrix.

    The last four columns of the transformed test data must equal the test
    data itself.  The output must be sparse and keep the input format.
    """
    X_sparse = sparse.coo_matrix(scale(X_iris)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_iris, random_state=42
    )

    base_estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    final_estimator = RandomForestClassifier(n_estimators=10, random_state=42)
    stacker = StackingClassifier(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=5,
        passthrough=True,
    )
    stacker.fit(X_train, y_train)

    X_trans = stacker.transform(X_test)

    assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
def test_imputation_constant_float(array_constructor):
    """Test imputation using the constant strategy on floats.

    Every ``nan`` entry of the input must be replaced by the fill value
    ``-1``.
    """
    nan = np.nan
    X = array_constructor(
        np.array(
            [
                [nan, 1.1, 0, nan],
                [1.2, nan, 1.3, nan],
                [0, 0, nan, nan],
                [1.4, 1.5, 0, nan],
            ]
        )
    )
    X_true = array_constructor(
        np.array(
            [
                [-1, 1.1, 0, -1],
                [1.2, -1, 1.3, -1],
                [0, 0, -1, -1],
                [1.4, 1.5, 0, -1],
            ]
        )
    )

    constant_imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = constant_imputer.fit_transform(X)

    assert_allclose_dense_sparse(X_trans, X_true)
def test_check_sparse_pandas_sp_format(sp_format):
    """Check ``check_array`` on a pandas dataframe made only of sparse arrays.

    The result must be a sparse matrix in the requested format (``'coo'``
    when ``sp_format`` is ``True``) holding the original values.
    """
    # check_array converts pandas dataframe with only sparse arrays into
    # sparse matrix
    pd = pytest.importorskip("pandas")
    sp_mat = _sparse_random_matrix(10, 3)
    sparse_frame = pd.DataFrame.sparse.from_spmatrix(sp_mat)

    result = check_array(sparse_frame, accept_sparse=sp_format)

    # by default pandas converts to coo when accept_sparse is True
    expected_format = 'coo' if sp_format is True else sp_format

    assert sp.issparse(result)
    assert result.format == expected_format
    assert_allclose_dense_sparse(sp_mat, result)
def test_stacking_regressor_sparse_passthrough(fmt):
    """Check passthrough behavior on a sparse X matrix.

    The last ten columns of the transformed test data must equal the test
    data itself.  The output must be sparse and keep the input format.
    """
    X_sparse = sparse.coo_matrix(scale(X_diabetes)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_diabetes, random_state=42
    )

    base_estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
    final_estimator = RandomForestRegressor(n_estimators=10, random_state=42)
    stacker = StackingRegressor(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=5,
        passthrough=True,
    )
    stacker.fit(X_train, y_train)

    X_trans = stacker.transform(X_test)

    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
def test_assert_allclose_dense_sparse():
    """Exercise ``assert_allclose_dense_sparse`` on matching and failing inputs.

    Covers equal inputs, inputs differing by a factor of two, a dense/sparse
    mix and two sparse matrices of different shapes.
    """
    dense = np.arange(9).reshape(3, 3)
    tolerance_message = "Not equal to tolerance "
    sparse_csc = sparse.csc_matrix(dense)

    for candidate in (dense, sparse_csc):
        # basic compare
        assert_raise_message(
            AssertionError,
            tolerance_message,
            assert_allclose_dense_sparse,
            candidate,
            candidate * 2,
        )
        assert_allclose_dense_sparse(candidate, candidate)

    # mixing a dense array with a sparse matrix is expected to be rejected
    assert_raise_message(
        ValueError,
        "Can only compare two sparse",
        assert_allclose_dense_sparse,
        dense,
        sparse_csc,
    )

    identity_5x5 = sparse.diags(np.ones(5), offsets=0).tocsr()
    ones_1x5 = sparse.csr_matrix(np.ones((1, 5)))

    assert_raise_message(
        AssertionError,
        "Arrays are not equal",
        assert_allclose_dense_sparse,
        ones_1x5,
        identity_5x5,
    )
def test_check_dataframe_mixed_float_dtypes():
    """Check ``check_array`` on a dataframe mixing int, float and bool columns.

    With a tuple of float dtypes requested, the result must equal the
    expected float array, where the boolean column reads as ``1.0``/``0.0``.
    """
    # pandas dataframe will coerce a boolean into an object, this is a mismatch
    # with np.result_type which will return a float
    # check_array needs to explicitly check for bool dtype in a dataframe for
    # this situation
    # https://github.com/scikit-learn/scikit-learn/issues/15787
    pd = importorskip("pandas")

    column_order = ["int", "float", "bool"]
    frame = pd.DataFrame(
        {"int": [1, 2, 3], "float": [0, 0.1, 2.1], "bool": [True, False, True]},
        columns=column_order,
    )

    accepted_dtypes = (np.float64, np.float32, np.float16)
    converted = check_array(frame, dtype=accepted_dtypes)

    expected = np.array(
        [[1.0, 0.0, 1.0], [2.0, 0.1, 0.0], [3.0, 2.1, 1.0]], dtype=float
    )
    assert_allclose_dense_sparse(converted, expected)
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
    """Index a single column of a 3x3 container with a scalar along ``axis=1``.

    A string index on anything but a ``'dataframe'`` container is expected
    to raise a ``ValueError``.  Otherwise the subset must equal the values
    ``3, 6, 9``.
    """
    container = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        array_type,
        ['col_0', 'col_1', 'col_2'],
    )

    if not (isinstance(indices, str) and array_type != 'dataframe'):
        selected = _safe_indexing(container, indices, axis=1)
        if expected_output_type == 'sparse':
            # sparse matrix are keeping the 2D shape
            expected_values = [[3], [6], [9]]
        else:
            expected_values = [3, 6, 9]
        expected = _convert_container(expected_values, expected_output_type)
        assert_allclose_dense_sparse(selected, expected)
        return

    message = ("Specifying the columns using strings is only supported "
               "for pandas DataFrames")
    with pytest.raises(ValueError, match=message):
        _safe_indexing(container, indices, axis=1)
def test_check_fit_params(indices):
    """Compare ``_check_fit_params`` output with ``_safe_indexing`` per entry.

    The ``"sparse-row"``, scalar and ``None`` entries must be returned as
    the identical objects.  The list, array and ``"sparse-col"`` entries
    must equal the result of ``_safe_indexing`` with the same indices.
    """
    X = np.random.randn(4, 2)
    values = [1, 2, 3, 4]
    fit_params = {
        "list": values,
        "array": np.array([1, 2, 3, 4]),
        "sparse-col": sp.csc_matrix([1, 2, 3, 4]).T,
        "sparse-row": sp.csc_matrix([1, 2, 3, 4]),
        "scalar-int": 1,
        "scalar-str": "xxx",
        "None": None,
    }

    result = _check_fit_params(X, fit_params, indices)

    # fall back to every row of X when no indices are supplied
    indices_ = list(range(X.shape[0])) if indices is None else indices

    identity_keys = ("sparse-row", "scalar-int", "scalar-str", "None")
    for key in identity_keys:
        assert result[key] is fit_params[key]

    assert result["list"] == _safe_indexing(fit_params["list"], indices_)

    array_reference = _safe_indexing(fit_params["array"], indices_)
    assert_array_equal(result["array"], array_reference)

    sparse_reference = _safe_indexing(fit_params["sparse-col"], indices_)
    assert_allclose_dense_sparse(result["sparse-col"], sparse_reference)