def test_exceptions():
    """Calling transform/inverse_transform before fit must raise NotFittedError."""
    with pytest.raises(NotFittedError):
        data = cp.random.random((10, 10))
        cuPCA().transform(data)

    with pytest.raises(NotFittedError):
        data = cp.random.random((10, 10))
        cuPCA().inverse_transform(data)
def __init__(self, random_seed=0, n_components=4, **kwargs):
    """Build the component around a cuPCA estimator.

    `n_components` and any extra kwargs are merged into one parameter dict
    that is both recorded on the component and forwarded to cuPCA.
    """
    params = {'n_components': n_components}
    params.update(kwargs)
    super().__init__(parameters=params,
                     component_obj=cuPCA(random_state=random_seed, **params),
                     random_seed=random_seed)
def test_pca_defaults(n_samples, n_features, sparse):
    """Default-parameter cuPCA should agree with sklearn PCA on solver,
    component count, and transform output (dense and sparse inputs)."""
    # FIXME: Disable the case True-300-200 due to flaky test
    if sparse and n_features == 300 and n_samples == 200:
        pytest.xfail('Skipping the case True-300-200 due to flaky test')

    if sparse:
        data = cupyx.scipy.sparse.random(n_samples, n_features, density=0.03,
                                         dtype=cp.float32, random_state=10)
    else:
        data, _ = make_multilabel_classification(n_samples=n_samples,
                                                 n_features=n_features,
                                                 n_classes=2,
                                                 n_labels=1,
                                                 random_state=1)

    cupca = cuPCA()
    cupca.fit(data)
    curesult = cupca.transform(data)
    cupca.handle.sync()

    if sparse:
        # sklearn needs a dense host array
        data = data.toarray().get()

    skpca = skPCA()
    skpca.fit(data)
    skresult = skpca.transform(data)

    assert skpca.svd_solver == cupca.svd_solver
    assert cupca.components_.shape[0] == skpca.components_.shape[0]
    assert curesult.shape == skresult.shape
    assert array_equal(curesult, skresult, 1e-3, with_sign=False)
def test_pca_defaults(n_samples, n_features, sparse):
    """Default-parameter cuPCA should agree with sklearn PCA on solver,
    component count, and transform output (dense and sparse inputs)."""
    if sparse:
        data = cupyx.scipy.sparse.random(n_samples, n_features, density=0.03,
                                         dtype=cp.float32, random_state=10)
    else:
        data, _ = make_multilabel_classification(n_samples=n_samples,
                                                 n_features=n_features,
                                                 n_classes=2,
                                                 n_labels=1,
                                                 random_state=1)

    cupca = cuPCA()
    cupca.fit(data)
    curesult = cupca.transform(data)
    cupca.handle.sync()

    if sparse:
        # sklearn needs a dense host array
        data = data.toarray().get()

    skpca = skPCA()
    skpca.fit(data)
    skresult = skpca.transform(data)

    assert skpca.svd_solver == cupca.svd_solver
    assert cupca.components_.shape[0] == skpca.components_.shape[0]
    assert curesult.shape == skresult.shape
    assert array_equal(curesult, skresult, 1e-3, with_sign=False)
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
    """Round-trip sparse/dense cupy input through fit, transform, and
    inverse_transform and check the reconstruction."""
    if return_sparse:
        pytest.skip("Loss of information in converting to cupy sparse csr")

    data = cupyx.scipy.sparse.random(nrows, ncols, density=0.07,
                                     dtype=cp.float32, random_state=10)
    if not cupy_input:
        # move to host (scipy sparse) to exercise the non-cupy path
        data = data.get()

    model = cuPCA(n_components=ncols, whiten=whiten)
    model.fit(data)
    transformed = model.transform(data)
    recovered = model.inverse_transform(transformed,
                                        return_sparse=return_sparse)

    if return_sparse:
        assert isinstance(recovered, cupyx.scipy.sparse.csr_matrix)
        assert array_equal(recovered.todense(), data.todense(),
                           1e-1, with_sign=True)
    else:
        if cupy_input:
            assert isinstance(recovered, cp.ndarray)

        assert array_equal(recovered, data.todense(), 1e-1, with_sign=True)
def test_pca_fit(datatype, input_type, name, use_handle):
    """Fitted cuPCA attributes should match sklearn PCA on several datasets."""
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    for attr in ['singular_values_', 'components_',
                 'explained_variance_', 'explained_variance_ratio_']:
        # components_ may differ by sign between implementations
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_pca_fit(datatype, input_type):
    """cuPCA attributes after fit should match sklearn PCA for both
    numpy-array and cudf-DataFrame inputs."""
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cupca.fit(gdf)
    else:
        cupca.fit(X)

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_']:
        # components_ may differ by sign between implementations
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """fit_transform of cuPCA should match sklearn on non-blobs datasets;
    blobs is only run as a smoke test (no sklearn comparison)."""
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
def test_pca_inverse_transform(datatype, input_type, name, use_handle):
    """inverse_transform should approximately reconstruct the fitted input."""
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)

    input_gdf = cupca.inverse_transform(X_cupca)
    cupca.handle.sync()
    assert array_equal(input_gdf, X, 1e-0, with_sign=True)
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """fit_transform of cuPCA should match sklearn on non-blobs datasets,
    for both ndarray and cudf-DataFrame input paths."""
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)

    cupca.handle.sync()
    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
def test_pca_inverse_transform(datatype):
    """Round-trip a small cudf DataFrame through fit_transform and
    inverse_transform, expecting the original values back."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    cupca = cuPCA(n_components=2)
    Xcupca = cupca.fit_transform(gdf)

    print("Calling inverse_transform")
    input_gdf = cupca.inverse_transform(Xcupca)
    assert array_equal(input_gdf, gdf, 1e-3, with_sign=True)
def test_pca_fit_transform(datatype):
    """fit_transform on a cudf DataFrame should match sklearn's fit_transform
    on the equivalent numpy array (up to component sign)."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit_transform")
    cupca = cuPCA(n_components=2)
    Xcupca = cupca.fit_transform(gdf)

    skpca = skPCA(n_components=2)
    Xskpca = skpca.fit_transform(X)

    assert array_equal(Xcupca, Xskpca, 1e-3, with_sign=False)
def test_pca_defaults(n_samples, n_features):
    """Default cuPCA should pick the same solver and component count as
    default sklearn PCA."""
    X, Y = make_multilabel_classification(n_samples=n_samples,
                                          n_features=n_features,
                                          n_classes=2,
                                          n_labels=1,
                                          random_state=1)

    skpca = skPCA()
    skpca.fit(X)

    cupca = cuPCA()
    cupca.fit(X)
    cupca.handle.sync()

    assert skpca.svd_solver == cupca.svd_solver
    assert cupca.components_.shape[0] == skpca.components_.shape[0]
def test_pca_inverse_transform(datatype, input_type):
    """inverse_transform should recover the original data whether the model
    was fit on a cudf DataFrame or on the equivalent numpy array."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        Xcupca = cupca.fit_transform(gdf)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)
        Xcupca = cupca.fit_transform(X)

    input_gdf = cupca.inverse_transform(Xcupca)
    # gdf holds the same values as X, so it serves as reference either way
    assert array_equal(input_gdf, gdf, 1e-3, with_sign=True)
def test_pca_fit(datatype, input_type, name, use_handle):
    """Fitted cuPCA attributes should match sklearn PCA, exercising both
    ndarray and cudf-DataFrame input paths."""
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)
    else:
        cupca.fit(X)

    cupca.handle.sync()

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_']:
        # components_ may differ by sign between implementations
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_pca_inverse_transform(datatype, input_type, name, use_handle, nrows):
    """inverse_transform on synthetic gaussian data with a small middle
    component and a large mean should reconstruct the input tightly."""
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    else:
        rng = np.random.RandomState(0)
        n, p = nrows, 3
        X = rng.randn(n, p)  # spherical data
        X[:, 1] *= .00001    # make middle component relatively small
        X += [3, 4, 2]       # make a large mean

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    X_cupca = cupca.fit_transform(X)
    input_gdf = cupca.inverse_transform(X_cupca)
    cupca.handle.sync()

    assert array_equal(input_gdf, X, 5e-5, with_sign=True)
def test_pca_fit_then_transform(datatype, input_type, name, use_handle):
    """Separate fit then transform should match sklearn on non-blobs data;
    the blobs case shrinks or skips based on available GPU memory."""
    blobs_n_samples = 500000
    if name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # scale the dataset down to fit the available memory
            blobs_n_samples = int(blobs_n_samples *
                                  pytest.max_gpu_memory / 32)
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if name == 'blobs':
        X, y = make_blobs(n_samples=blobs_n_samples,
                          n_features=1000, random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        skpca.fit(X)
        Xskpca = skpca.transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    X_cupca = cupca.transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
    """Round-trip sparse/dense cupy input through fit, transform, and
    inverse_transform; the widest case adapts to available GPU memory."""
    if ncols == 20000 and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            # scale the column count down to fit the available memory
            ncols = int(ncols * pytest.max_gpu_memory / 48)
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if return_sparse:
        pytest.skip("Loss of information in converting to cupy sparse csr")

    data = cupyx.scipy.sparse.random(nrows, ncols, density=0.07,
                                     dtype=cp.float32, random_state=10)
    if not cupy_input:
        # move to host (scipy sparse) to exercise the non-cupy path
        data = data.get()

    model = cuPCA(n_components=ncols, whiten=whiten)
    model.fit(data)
    transformed = model.transform(data)
    recovered = model.inverse_transform(transformed,
                                        return_sparse=return_sparse)

    if return_sparse:
        assert isinstance(recovered, cupyx.scipy.sparse.csr_matrix)
        assert array_equal(recovered.todense(), data.todense(),
                           1e-1, with_sign=True)
    else:
        if cupy_input:
            assert isinstance(recovered, cp.ndarray)

        assert array_equal(recovered, data.todense(), 1e-1, with_sign=True)
def test_pca_fit(datatype):
    """cuPCA fitted on a cudf DataFrame should expose the same attribute
    values as sklearn PCA fitted on the equivalent numpy array."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit")
    cupca = cuPCA(n_components=2)
    cupca.fit(gdf)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_']:
        # components_ may differ by sign between implementations
        with_sign = attr not in ['components_']
        assert array_equal(getattr(cupca, attr), getattr(skpca, attr),
                           1e-3, with_sign=with_sign)