Exemplo n.º 1
0
def test_exceptions():
    """Check that an unfitted cuPCA rejects transform and inverse_transform.

    Both calls are made on a freshly constructed estimator and are expected
    to raise ``NotFittedError``.
    """
    # Build the input outside the ``pytest.raises`` blocks so that only the
    # estimator call itself can satisfy the expected exception.
    X = cp.random.random((10, 10))

    with pytest.raises(NotFittedError):
        cuPCA().transform(X)

    with pytest.raises(NotFittedError):
        cuPCA().inverse_transform(X)
Exemplo n.º 2
0
 def __init__(self, random_seed=0, n_components=4, **kwargs):
     """Set up the component around a cuPCA estimator.

     ``n_components`` and any extra keyword arguments are gathered into a
     ``parameters`` dict. That dict is handed to the parent constructor and
     is also unpacked into ``cuPCA``, which additionally receives
     ``random_seed`` as its ``random_state``.
     """
     parameters = {'n_components': n_components, **kwargs}
     estimator = cuPCA(random_state=random_seed, **parameters)
     super().__init__(parameters=parameters,
                      component_obj=estimator,
                      random_seed=random_seed)
Exemplo n.º 3
0
def test_pca_defaults(n_samples, n_features, sparse):
    """Compare default-constructed cuPCA and sklearn PCA on the same data.

    The input is either a random cupyx sparse matrix or a dense dataset from
    ``make_multilabel_classification``. The test checks that both estimators
    report the same ``svd_solver``, the same number of components, and
    matching transformed output (sign ignored).
    """
    # FIXME: Disable the case True-300-200 due to flaky test
    if sparse and n_features == 300 and n_samples == 200:
        pytest.xfail('Skipping the case True-300-200 due to flaky test')

    if not sparse:
        # Only the feature matrix is needed; the labels are discarded.
        X, _ = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2,
                                              n_labels=1,
                                              random_state=1)
    else:
        X = cupyx.scipy.sparse.random(n_samples,
                                      n_features,
                                      density=0.03,
                                      dtype=cp.float32,
                                      random_state=10)

    gpu_model = cuPCA()
    gpu_model.fit(X)
    gpu_out = gpu_model.transform(X)
    gpu_model.handle.sync()

    # The sklearn estimator is fed a dense host copy of the sparse input.
    host_X = X.toarray().get() if sparse else X
    cpu_model = skPCA()
    cpu_model.fit(host_X)
    cpu_out = cpu_model.transform(host_X)

    assert cpu_model.svd_solver == gpu_model.svd_solver
    assert gpu_model.components_.shape[0] == cpu_model.components_.shape[0]
    assert gpu_out.shape == cpu_out.shape
    assert array_equal(gpu_out, cpu_out, 1e-3, with_sign=False)
Exemplo n.º 4
0
def test_pca_defaults(n_samples, n_features, sparse):
    """Fit cuPCA and sklearn PCA with default arguments and compare them.

    Depending on ``sparse`` the data is a random cupyx sparse matrix or a
    dense ``make_multilabel_classification`` dataset. The solver name, the
    component count, the output shape and the transformed values (sign
    ignored) must agree between the two estimators.
    """
    if sparse:
        data = cupyx.scipy.sparse.random(n_samples,
                                         n_features,
                                         density=0.03,
                                         dtype=cp.float32,
                                         random_state=10)
    else:
        # The generated labels are not used by this test.
        data, _ = make_multilabel_classification(n_samples=n_samples,
                                                 n_features=n_features,
                                                 n_classes=2,
                                                 n_labels=1,
                                                 random_state=1)

    cu_estimator = cuPCA()
    cu_estimator.fit(data)
    cu_projection = cu_estimator.transform(data)
    cu_estimator.handle.sync()

    if sparse:
        # Densify and copy to host before handing the data to sklearn.
        data = data.toarray().get()

    sk_estimator = skPCA()
    sk_estimator.fit(data)
    sk_projection = sk_estimator.transform(data)

    assert sk_estimator.svd_solver == cu_estimator.svd_solver
    assert (cu_estimator.components_.shape[0]
            == sk_estimator.components_.shape[0])
    assert cu_projection.shape == sk_projection.shape
    assert array_equal(cu_projection, sk_projection, 1e-3, with_sign=False)
Exemplo n.º 5
0
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
    """Round-trip a random sparse matrix through cuPCA.

    The matrix is fitted, transformed and inverse-transformed with
    ``n_components=ncols``; the reconstruction is then compared with the
    dense form of the input. Cases with ``return_sparse`` set are skipped
    up front.
    """
    if return_sparse:
        pytest.skip("Loss of information in converting to cupy sparse csr")

    X = cupyx.scipy.sparse.random(nrows,
                                  ncols,
                                  density=0.07,
                                  dtype=cp.float32,
                                  random_state=10)
    if not cupy_input:
        # Exercise the host-side sparse input path.
        X = X.get()

    model = cuPCA(n_components=ncols, whiten=whiten)
    model.fit(X)
    projected = model.transform(X)
    restored = model.inverse_transform(projected,
                                       return_sparse=return_sparse)

    if not return_sparse:
        if cupy_input:
            assert isinstance(restored, cp.ndarray)

        assert array_equal(restored, X.todense(), 1e-1, with_sign=True)
    else:
        # Not reached while the skip above is in place.
        assert isinstance(restored, cupyx.scipy.sparse.csr_matrix)

        assert array_equal(restored.todense(), X.todense(), 1e-1,
                           with_sign=True)
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Compare fitted attributes of cuPCA and sklearn PCA (2 components).

    ``name`` selects the dataset: 'blobs' is skipped, 'digits' loads the
    sklearn digits data, anything else uses a generated multilabel dataset.
    Each compared attribute is printed for both estimators before the
    assertion.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Unreachable while the skip above is in place.
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, _ = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    sk_model = skPCA(n_components=2)
    sk_model.fit(X)

    handle, _ = get_handle(use_handle)
    cu_model = cuPCA(n_components=2, handle=handle)
    cu_model.fit(X)
    cu_model.handle.sync()

    compared_attrs = ('singular_values_', 'components_',
                      'explained_variance_', 'explained_variance_ratio_')
    for attr in compared_attrs:
        print(attr)
        cu_value = getattr(cu_model, attr)
        print(cu_value)
        sk_value = getattr(sk_model, attr)
        print(sk_value)

        # Components are compared without regard to sign.
        check_sign = attr != 'components_'
        assert array_equal(cu_value, sk_value, 1e-3, with_sign=check_sign)
Exemplo n.º 7
0
def test_pca_fit(datatype, input_type):
    """Fit cuPCA and sklearn PCA on six 2-D points and compare attributes.

    cuPCA is fitted either on a cudf DataFrame (``input_type ==
    'dataframe'``) or on the NumPy array; sklearn always uses the NumPy
    array. Each fitted attribute is converted to a host array and compared.
    """
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)
    sk_model = skPCA(n_components=2)
    sk_model.fit(X)

    cu_model = cuPCA(n_components=2)

    if input_type != 'dataframe':
        cu_model.fit(X)
    else:
        # Same points as X, laid out column by column.
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cu_model.fit(frame)

    compared_attrs = ('singular_values_', 'components_',
                      'explained_variance_', 'explained_variance_ratio_',
                      'noise_variance_')
    for attr in compared_attrs:
        print(attr)
        cu_value = getattr(cu_model, attr)
        print(cu_value)
        sk_value = getattr(sk_model, attr)
        print(sk_value)

        # NOTE(review): to_array()/as_matrix() appear to be older cudf
        # conversion methods - confirm against the cudf version in use.
        cu_host = (cu_value.to_array()
                   if isinstance(cu_value, cudf.Series)
                   else cu_value.as_matrix())

        # Components are compared without regard to sign.
        check_sign = attr != 'components_'
        assert array_equal(cu_host, sk_value, 1e-3, with_sign=check_sign)
Exemplo n.º 8
0
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """Check cuPCA.fit_transform against sklearn PCA.fit_transform.

    ``name`` selects the dataset ('blobs', 'iris', or a generated multilabel
    dataset otherwise). For 'blobs' only the cuPCA path is run; no sklearn
    reference is computed and nothing is asserted.
    """
    has_reference = name != 'blobs'

    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X, _ = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if has_reference:
        sk_model = skPCA(n_components=2)
        sk_projection = sk_model.fit_transform(X)

    handle, _ = get_handle(use_handle)
    cu_model = cuPCA(n_components=2, handle=handle)

    cu_projection = cu_model.fit_transform(X)
    cu_model.handle.sync()

    if has_reference:
        assert array_equal(cu_projection, sk_projection, 1e-3,
                           with_sign=True)
        assert sk_projection.shape[0] == cu_projection.shape[0]
        assert sk_projection.shape[1] == cu_projection.shape[1]
Exemplo n.º 9
0
def test_pca_inverse_transform(datatype, input_type, name, use_handle):
    """Check that cuPCA.inverse_transform reconstructs the fitted input.

    The data is projected with ``fit_transform`` (from a cudf DataFrame or
    from the array, per ``input_type``), mapped back with
    ``inverse_transform`` and compared to the original array.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Unreachable while the skip above is in place.
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    # One named column per feature; built for every input_type.
    host_frame = pd.DataFrame(
        {'fea%d' % col: X[:, col] for col in range(X.shape[1])})
    device_frame = cudf.DataFrame.from_pandas(host_frame)

    handle, _ = get_handle(use_handle)
    cu_model = cuPCA(n_components=2, handle=handle)

    fit_input = device_frame if input_type == 'dataframe' else X
    projected = cu_model.fit_transform(fit_input)

    reconstructed = cu_model.inverse_transform(projected)
    cu_model.handle.sync()

    assert array_equal(reconstructed, X, 1e-0, with_sign=True)
Exemplo n.º 10
0
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """Compare cuPCA.fit_transform with sklearn PCA.fit_transform.

    The sklearn reference is computed on the array form of the data. cuPCA
    receives either that array or a cudf DataFrame built from it. For the
    'blobs' dataset only the cuPCA path runs and nothing is asserted.
    """
    has_reference = name != 'blobs'

    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    if has_reference:
        sk_model = skPCA(n_components=2)
        sk_projection = sk_model.fit_transform(X)

    handle, _ = get_handle(use_handle)
    cu_model = cuPCA(n_components=2, handle=handle)

    if input_type != 'dataframe':
        cu_projection = cu_model.fit_transform(X)
    else:
        # One named column per feature, then moved into a cudf DataFrame.
        host_frame = pd.DataFrame(
            {'fea%d' % col: X[:, col] for col in range(X.shape[1])})
        device_frame = cudf.DataFrame.from_pandas(host_frame)
        cu_projection = cu_model.fit_transform(device_frame)
    cu_model.handle.sync()

    if has_reference:
        assert array_equal(cu_projection, sk_projection, 1e-3,
                           with_sign=True)
Exemplo n.º 11
0
def test_pca_inverse_transform(datatype):
    """Round-trip a small cudf DataFrame through cuPCA.

    The frame is projected with ``fit_transform`` and mapped back with
    ``inverse_transform``; the result is compared with the original frame.
    """
    column_values = {
        '0': [-1, -2, -3, 1, 2, 3],
        '1': [-1, -1, -2, 1, 1, 2],
    }
    gdf = cudf.DataFrame()
    for label, values in column_values.items():
        gdf[label] = np.asarray(values, dtype=datatype)

    model = cuPCA(n_components=2)
    projected = model.fit_transform(gdf)

    print("Calling inverse_transform")
    reconstructed = model.inverse_transform(projected)

    assert array_equal(reconstructed, gdf, 1e-3, with_sign=True)
Exemplo n.º 12
0
def test_pca_fit_transform(datatype):
    """Compare cuPCA.fit_transform on a cudf DataFrame with sklearn PCA.

    The DataFrame columns and the NumPy array hold the same six 2-D points.
    The two projections are compared with the sign ignored.
    """
    column_values = {
        '0': [-1, -2, -3, 1, 2, 3],
        '1': [-1, -1, -2, 1, 1, 2],
    }
    gdf = cudf.DataFrame()
    for label, values in column_values.items():
        gdf[label] = np.asarray(values, dtype=datatype)

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit_transform")
    cu_model = cuPCA(n_components=2)
    cu_projection = cu_model.fit_transform(gdf)
    sk_model = skPCA(n_components=2)
    sk_projection = sk_model.fit_transform(X)

    assert array_equal(cu_projection, sk_projection, 1e-3, with_sign=False)
Exemplo n.º 13
0
def test_pca_defaults(n_samples, n_features):
    """Check that default cuPCA and sklearn PCA agree on basic settings.

    Both estimators are fitted on the same generated dataset; the test
    compares their ``svd_solver`` and the number of rows in ``components_``.
    """
    # Only the feature matrix is needed; the labels are discarded.
    features, _ = make_multilabel_classification(n_samples=n_samples,
                                                 n_features=n_features,
                                                 n_classes=2,
                                                 n_labels=1,
                                                 random_state=1)
    sk_model = skPCA()
    sk_model.fit(features)

    cu_model = cuPCA()
    cu_model.fit(features)
    cu_model.handle.sync()

    assert sk_model.svd_solver == cu_model.svd_solver
    assert cu_model.components_.shape[0] == sk_model.components_.shape[0]
Exemplo n.º 14
0
def test_pca_inverse_transform(datatype, input_type):
    """Check that inverse_transform recovers the data cuPCA was fitted on.

    cuPCA is fitted on a cudf DataFrame or on the equivalent NumPy array,
    depending on ``input_type``. The reconstruction is always compared with
    the DataFrame.
    """
    column_values = {
        '0': [-1, -2, -3, 1, 2, 3],
        '1': [-1, -1, -2, 1, 1, 2],
    }
    gdf = cudf.DataFrame()
    for label, values in column_values.items():
        gdf[label] = np.asarray(values, dtype=datatype)
    model = cuPCA(n_components=2)

    if input_type != 'dataframe':
        # Same points as the DataFrame, as a row-per-sample array.
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)
        projected = model.fit_transform(X)
    else:
        projected = model.fit_transform(gdf)

    reconstructed = model.inverse_transform(projected)

    assert array_equal(reconstructed, gdf, 1e-3, with_sign=True)
Exemplo n.º 15
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Compare fitted attributes of cuPCA and sklearn PCA (2 components).

    ``name`` selects the dataset ('blobs' is skipped, 'iris' loads the iris
    data, anything else uses six hand-written 2-D points). cuPCA is fitted
    on a cudf DataFrame or on the array, depending on ``input_type``.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Unreachable while the skip above is in place.
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    sk_model = skPCA(n_components=2)
    sk_model.fit(X)

    handle, _ = get_handle(use_handle)
    cu_model = cuPCA(n_components=2, handle=handle)

    if input_type != 'dataframe':
        cu_model.fit(X)
    else:
        # One named column per feature, then moved into a cudf DataFrame.
        host_frame = pd.DataFrame(
            {'fea%d' % col: X[:, col] for col in range(X.shape[1])})
        device_frame = cudf.DataFrame.from_pandas(host_frame)
        cu_model.fit(device_frame)

    cu_model.handle.sync()

    compared_attrs = ('singular_values_', 'components_',
                      'explained_variance_', 'explained_variance_ratio_',
                      'noise_variance_')
    for attr in compared_attrs:
        print(attr)
        cu_value = getattr(cu_model, attr)
        print(cu_value)
        sk_value = getattr(sk_model, attr)
        print(sk_value)

        # NOTE(review): to_array()/as_matrix() appear to be older cudf
        # conversion methods - confirm against the cudf version in use.
        cu_host = (cu_value.to_array()
                   if isinstance(cu_value, cudf.Series)
                   else cu_value.as_matrix())

        # Components are compared without regard to sign.
        check_sign = attr != 'components_'
        assert array_equal(cu_host, sk_value, 1e-1, with_sign=check_sign)
Exemplo n.º 16
0
def test_pca_inverse_transform(datatype, input_type, name, use_handle, nrows):
    """Round-trip synthetic 3-feature data through a 2-component cuPCA.

    The data is standard-normal with the middle column scaled down to a
    very small spread and a constant offset added to every row. The
    reconstruction from ``inverse_transform`` is compared with the input.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Unreachable while the skip above is in place.
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    else:
        generator = np.random.RandomState(0)
        X = generator.randn(nrows, 3)  # spherical data
        X[:, 1] *= 0.00001  # make middle component relatively small
        X += [3, 4, 2]  # make a large mean

    handle, _ = get_handle(use_handle)
    model = cuPCA(n_components=2, handle=handle)

    projected = model.fit_transform(X)

    reconstructed = model.inverse_transform(projected)
    model.handle.sync()
    assert array_equal(reconstructed, X, 5e-5, with_sign=True)
Exemplo n.º 17
0
def test_pca_fit_then_transform(datatype, input_type, name, use_handle):
    """Check separate fit() and transform() calls on cuPCA against sklearn.

    ``name`` selects the dataset ('blobs', 'iris', or a generated multilabel
    dataset otherwise). For 'blobs' only the cuPCA path runs and nothing is
    asserted. When ``pytest.max_gpu_memory`` is below 32, the blobs case is
    either scaled down (if ``pytest.adapt_stress_test`` is truthy) or
    skipped.

    NOTE(review): ``pytest.max_gpu_memory`` and ``pytest.adapt_stress_test``
    are not standard pytest attributes; presumably they are set by the
    project's test configuration - confirm.
    """
    blobs_n_samples = 500000
    if name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # Shrink the sample count in proportion to the available memory.
            blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory / 32)
        else:
            # The trailing space keeps the two sentences apart once the
            # adjacent string literals are concatenated.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if name == 'blobs':
        X, _ = make_blobs(n_samples=blobs_n_samples,
                          n_features=1000,
                          random_state=0)

    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data

    else:
        X, _ = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        skpca.fit(X)
        Xskpca = skpca.transform(X)

    handle, _ = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    cupca.fit(X)
    X_cupca = cupca.transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
Exemplo n.º 18
0
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
    """Round-trip a random sparse matrix through cuPCA.

    The matrix is fitted, transformed and inverse-transformed with
    ``n_components=ncols``; the reconstruction is compared with the dense
    form of the input. Cases with ``return_sparse`` set are skipped. When
    ``ncols == 20000`` and ``pytest.max_gpu_memory`` is below 48, the column
    count is either scaled down (if ``pytest.adapt_stress_test`` is truthy)
    or the test is skipped.

    NOTE(review): ``pytest.max_gpu_memory`` and ``pytest.adapt_stress_test``
    are not standard pytest attributes; presumably they are set by the
    project's test configuration - confirm.
    """
    if ncols == 20000 and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            # Shrink the column count in proportion to the available memory.
            ncols = int(ncols * pytest.max_gpu_memory / 48)
        else:
            # The trailing space keeps the two sentences apart once the
            # adjacent string literals are concatenated.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if return_sparse:
        pytest.skip("Loss of information in converting to cupy sparse csr")

    X = cupyx.scipy.sparse.random(nrows,
                                  ncols,
                                  density=0.07,
                                  dtype=cp.float32,
                                  random_state=10)
    if not cupy_input:
        # Exercise the host-side sparse input path.
        X = X.get()

    p_sparse = cuPCA(n_components=ncols, whiten=whiten)

    p_sparse.fit(X)
    t_sparse = p_sparse.transform(X)
    i_sparse = p_sparse.inverse_transform(t_sparse,
                                          return_sparse=return_sparse)

    if return_sparse:
        # Not reached while the return_sparse skip above is in place.
        assert isinstance(i_sparse, cupyx.scipy.sparse.csr_matrix)

        assert array_equal(i_sparse.todense(),
                           X.todense(),
                           1e-1,
                           with_sign=True)
    else:
        if cupy_input:
            assert isinstance(i_sparse, cp.ndarray)

        assert array_equal(i_sparse, X.todense(), 1e-1, with_sign=True)
Exemplo n.º 19
0
def test_pca_fit(datatype):
    """Fit cuPCA on a cudf DataFrame and compare attributes with sklearn.

    The DataFrame columns and the NumPy array hold the same six 2-D points.
    Each listed fitted attribute must match between the two estimators;
    ``components_`` is compared with the sign ignored.
    """
    column_values = {
        '0': [-1, -2, -3, 1, 2, 3],
        '1': [-1, -1, -2, 1, 1, 2],
    }
    gdf = cudf.DataFrame()
    for label, values in column_values.items():
        gdf[label] = np.asarray(values, dtype=datatype)

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit")
    cu_model = cuPCA(n_components=2)
    cu_model.fit(gdf)
    sk_model = skPCA(n_components=2)
    sk_model.fit(X)

    compared_attrs = ('singular_values_', 'components_',
                      'explained_variance_', 'explained_variance_ratio_',
                      'noise_variance_')
    for attr in compared_attrs:
        check_sign = attr != 'components_'
        assert array_equal(getattr(cu_model, attr),
                           getattr(sk_model, attr),
                           1e-3,
                           with_sign=check_sign)