Example #1
def test_targetencoder_multi_column():
    """
    Test jointly encoding multiple columns
    """
    train = cudf.DataFrame({
        'cat_1': ['a', 'b', 'b', 'a', 'a', 'b'],
        'cat_2': [1, 1, 2, 2, 1, 2],
        'label': [1, 0, 1, 1, 0, 1]
    })
    test = cudf.DataFrame({
        'cat_1': ['b', 'b', 'a', 'b'],
        'cat_2': [1, 2, 1, 2]
    })
    encoder = TargetEncoder()
    train_encoded = encoder.fit_transform(train[['cat_1', 'cat_2']],
                                          train.label)
    test_encoded = encoder.transform(test[['cat_1', 'cat_2']])
    train_answer = np.array([2. / 3, 2. / 3, 1., 2. / 3, 2. / 3, 1.])
    test_answer = np.array([0., 1., 0.5, 1.])
    assert array_equal(train_encoded, train_answer)
    assert array_equal(test_encoded, test_answer)

    encoder = TargetEncoder()
    encoder.fit(train[['cat_1', 'cat_2']], train.label)
    train_encoded = encoder.transform(train[['cat_1', 'cat_2']])
    test_encoded = encoder.transform(test[['cat_1', 'cat_2']])
    assert array_equal(train_encoded, train_answer)
    assert array_equal(test_encoded, test_answer)
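
For reference, the test-time expectations above can be re-derived with a small sketch. It assumes, as the asserts suggest, that transform on held-out data uses the per-(cat_1, cat_2) target mean computed over the full training set, falling back to the global target mean for unseen combinations; that same global mean (2/3) is also why most training rows encode to 2/3, since their category combinations have no out-of-fold occurrences. A hypothetical re-derivation, not cuml's implementation:

import numpy as np
import pandas as pd

train = pd.DataFrame({'cat_1': ['a', 'b', 'b', 'a', 'a', 'b'],
                      'cat_2': [1, 1, 2, 2, 1, 2],
                      'label': [1, 0, 1, 1, 0, 1]})
test = pd.DataFrame({'cat_1': ['b', 'b', 'a', 'b'],
                     'cat_2': [1, 2, 1, 2]})

# Per-(cat_1, cat_2) target means over the full training set.
means = train.groupby(['cat_1', 'cat_2'])['label'].mean().to_dict()
global_mean = train['label'].mean()  # 2/3, the fallback for unseen keys

encoded = np.array([means.get(key, global_mean)
                    for key in zip(test['cat_1'], test['cat_2'])])
print(encoded)  # [0.  1.  0.5 1. ] == test_answer
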
Example #2
def test_predict_proba(nrows, ncols, n_neighbors, n_clusters, datatype):

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      cluster_std=0.01,
                      random_state=0)

    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)

    predictions = knn_cu.predict_proba(X_test)

    if datatype == "dataframe":
        assert isinstance(predictions, cudf.DataFrame)
        predictions = predictions.to_numpy()
        y_test = y_test.to_numpy().reshape(y_test.shape[0])
    else:
        assert isinstance(predictions, np.ndarray)

    y_hat = np.argmax(predictions, axis=1)

    assert array_equal(y_hat.astype(np.int32), y_test.astype(np.int32))
    assert array_equal(predictions.sum(axis=1), np.ones(y_test.shape[0]))
Example #3
def test_predict_proba_multioutput(input_type, output_type):

    X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        y = cudf.DataFrame(y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        y = cp.asarray(y)

    expected = (np.array([[0., 1.], [1., 0.]]).astype(np.float32),
                np.array([[1., 0.], [0., 1.]]).astype(np.float32))

    knn_cu = cuKNN(n_neighbors=1, output_type=output_type)
    knn_cu.fit(X, y)

    p = knn_cu.predict_proba(X)

    assert isinstance(p, tuple)

    for i in p:
        if output_type == "cudf":
            assert isinstance(i, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(i, np.ndarray)
        elif output_type == "cupy":
            assert isinstance(i, cp.ndarray)

    assert array_equal(p[0].astype(np.float32), expected[0])
    assert array_equal(p[1].astype(np.float32), expected[1])
Example #4
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      cluster_std=0.01,
                      random_state=0)

    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)

    predictions = knn_cu.predict(X_test)

    if datatype == "dataframe":
        assert isinstance(predictions, cudf.Series)
        assert array_equal(predictions.to_frame().astype(np.int32),
                           y_test.astype(np.int32))
    else:
        assert isinstance(predictions, np.ndarray)

        assert array_equal(predictions.astype(np.int32),
                           y_test.astype(np.int32))
Example #5
def test_basic_functions(labels, multipart, client):

    fit_labels, xform_labels = labels

    s = cp.asarray(fit_labels, dtype=np.int32)
    df = dask.array.from_array(s)

    s2 = cp.asarray(xform_labels, dtype=np.int32)
    df2 = dask.array.from_array(s2)

    if multipart:
        df = df.rechunk((1, ))
        df2 = df2.rechunk((1, ))

    binarizer = LabelBinarizer(client=client, sparse_output=False)
    binarizer.fit(df)

    assert array_equal(cp.asnumpy(binarizer.classes_),
                       np.unique(cp.asnumpy(s)))

    xformed = binarizer.transform(df2)

    xformed = xformed.map_blocks(lambda x: x.get(), dtype=cp.float32)
    xformed.compute_chunk_sizes()

    assert xformed.compute().shape[1] == binarizer.classes_.shape[0]

    original = binarizer.inverse_transform(xformed)
    test = original.compute()

    assert array_equal(cp.asnumpy(test), xform_labels)
Example #6
def test_svd_flip():
    x = cp.array(range(-10, 80)).reshape((9, 10))
    u, s, v = cp.linalg.svd(x, full_matrices=False)
    u_true, v_true = _svd_flip(u, v, u_based_decision=True)
    reco_true = cp.dot(u_true * s, v_true)
    u_false, v_false = _svd_flip(u, v, u_based_decision=False)
    reco_false = cp.dot(u_false * s, v_false)

    assert array_equal(reco_true, x)
    assert array_equal(reco_false, x)
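
Both assertions pass because _svd_flip only standardizes signs: flipping the sign of a column of u together with the matching row of v leaves the product u @ diag(s) @ v unchanged. A numpy sketch of the convention, modeled on sklearn.utils.extmath.svd_flip, which the cupy-based helper here is assumed to mirror:

import numpy as np

def svd_flip(u, v, u_based_decision=True):
    # Pick signs so the largest-magnitude entry of each column of u
    # (or each row of v) becomes positive.
    if u_based_decision:
        max_abs_cols = np.argmax(np.abs(u), axis=0)
        signs = np.sign(u[max_abs_cols, range(u.shape[1])])
    else:
        max_abs_rows = np.argmax(np.abs(v), axis=1)
        signs = np.sign(v[range(v.shape[0]), max_abs_rows])
    # Applying the same sign to a column of u and the matching row of v
    # leaves u @ diag(s) @ v unchanged, hence the reconstructions above.
    return u * signs, v * signs[:, np.newaxis]
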
Example #7
def test_pca_fit(data_info, input_type, client):

    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10,
                      dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        with_sign = attr not in ['components_']
        cuml_res = getattr(cutsvd, attr)
        if isinstance(cuml_res, (cudf.Series, cudf.DataFrame)):
            cuml_res = cuml_res.to_numpy()
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
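
Throughout these examples, array_equal is a test helper that compares arrays up to a tolerance rather than exactly. Below is a minimal, hypothetical stand-in inferred from its call sites (unit_tol bounds the per-element error, total_tol bounds the fraction of elements allowed to exceed it, and with_sign=False compares magnitudes so sign-ambiguous results such as SVD components still match); the real helper also accepts cudf/cupy inputs:

import numpy as np

def array_equal(a, b, unit_tol=1e-4, total_tol=1e-4, with_sign=True):
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)  # ignore arbitrary sign flips
    # Pass when only a negligible fraction of elements exceeds unit_tol.
    return np.sum(np.abs(a - b) > unit_tol) / a.size < total_tol
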
Example #8
def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors,
                     streams_per_handle, reverse_worker_order, client):

    from cuml.dask.neighbors import NearestNeighbors as daskNN

    from sklearn.datasets import make_blobs

    nrows = _scale_rows(client, nrows)

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      random_state=0)
    X = X.astype(np.float32)

    X_cudf = _prep_training_data(client, X, n_parts, reverse_worker_order)

    from dask.distributed import wait

    wait(X_cudf)

    dist = np.array([len(v) for v in client.has_what().values()])

    assert np.all(dist == dist[0])

    cumlModel = daskNN(n_neighbors=n_neighbors,
                       streams_per_handle=streams_per_handle)
    cumlModel.fit(X_cudf)

    out_d, out_i = cumlModel.kneighbors(X_cudf)

    local_i = np.array(out_i.compute().to_numpy(), dtype="int64")

    sklModel = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    skl_y_hat = sklModel.predict(X)
    y_hat, _ = predict(local_i, y, n_neighbors)

    sk_d, sk_i = sklModel.kneighbors(X)

    sk_i = sk_i.astype("int64")

    assert array_equal(local_i[:, 0], np.arange(nrows))

    diff = sk_i - local_i
    n_diff = np.count_nonzero(diff)

    perc_diff = n_diff / (nrows * n_neighbors)

    assert perc_diff <= 3e-3

    assert array_equal(y_hat, skl_y_hat)
Example #9
def test_targetencoder_fit_transform():
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    encoder = TargetEncoder()
    train_encoded = encoder.fit_transform(train.category, train.label)
    answer = np.array([1., 1., 0., 1.])
    assert array_equal(train_encoded, answer)

    encoder = TargetEncoder()
    encoder.fit(train.category, train.label)
    train_encoded = encoder.transform(train.category)

    assert array_equal(train_encoded, answer)
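
The expected values are leave-one-out category means: with four rows and what appears to be the encoder's default of interleaved folds, the out-of-fold mean for each row reduces to the mean label of the other rows in its category. A quick numpy check of the answer above, under that assumption:

import numpy as np

category = np.array(['a', 'b', 'b', 'a'])
label = np.array([1, 0, 1, 1], dtype=float)

encoded = np.empty_like(label)
for i in range(len(label)):
    others = category == category[i]
    others[i] = False                  # leave the current row out
    encoded[i] = label[others].mean()  # mean label of the rest of the category
print(encoded)  # [1. 1. 0. 1.] == answer
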
Example #10
def test_one_category():
    train = cudf.DataFrame({
        'category': ['a', 'a', 'a', 'a'],
        'label': [3, 0, 0, 3]
    })
    test = cudf.DataFrame({'category': ['c', 'b', 'a', 'd']})

    encoder = TargetEncoder()
    train_encoded = encoder.fit_transform(train.category, train.label)
    # Consistent with leave-one-out means within the only category 'a':
    # row 0 -> mean([0, 0, 3]) = 1, rows 1-2 -> 2, row 3 -> 1
    answer = np.array([1., 2., 2., 1.])
    assert array_equal(train_encoded, answer)

    test_encoded = encoder.transform(test.category)
    # All four test rows encode to 1.5: the mean of the only training
    # category 'a' and the global target mean coincide here (6 / 4)
    answer = np.array([1.5, 1.5, 1.5, 1.5])
    assert array_equal(test_encoded, answer)
Example #11
def test_targetencoder_var():
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'b'],
        'label': [1, 0, 1, 1]
    })
    encoder = TargetEncoder(stat='var')
    train_encoded = encoder.fit_transform(train.category, train.label)
    answer = np.array([.25, 0., .5, .5])
    assert array_equal(train_encoded, answer)

    encoder = TargetEncoder(stat='var')
    encoder.fit(train.category, train.label)
    train_encoded = encoder.transform(train.category)

    assert array_equal(train_encoded, answer)
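
The stat='var' expectations appear to follow the same leave-one-out pattern, using the sample variance (ddof=1) of the other rows' labels and falling back to the variance of the whole label column for a category with no other rows. A quick check under that assumption:

import numpy as np

label = np.array([1, 0, 1, 1], dtype=float)
print(label.var(ddof=1))         # 0.25: row 0 is the only 'a', global fallback
print(np.var([1., 1.], ddof=1))  # 0.0:  row 1 sees the other 'b' labels [1, 1]
print(np.var([0., 1.], ddof=1))  # 0.5:  rows 2 and 3 each see [0, 1]
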
Example #12
def test_logistic_regression_weighting(regression_dataset,
                                       option, test_status):
    regression_type, data, coef, output = regression_dataset[test_status]

    class_weight = None
    sample_weight = None
    if option == 'sample_weight':
        n_samples = data.shape[0]
        sample_weight = np.abs(np.random.rand(n_samples))
    elif option == 'class_weight':
        class_weight = np.random.rand(2)
        class_weight = {0: class_weight[0], 1: class_weight[1]}
    elif option == 'balanced':
        class_weight = 'balanced'

    culog = cuLog(fit_intercept=False, class_weight=class_weight)
    culog.fit(data, output, sample_weight=sample_weight)

    sklog = skLog(fit_intercept=False, class_weight=class_weight)
    sklog.fit(data, output, sample_weight=sample_weight)

    skcoef = np.squeeze(sklog.coef_)
    cucoef = np.squeeze(culog.coef_)
    if regression_type == 'binary':
        skcoef /= np.linalg.norm(skcoef)
        cucoef /= np.linalg.norm(cucoef)
        unit_tol = 0.04
        total_tol = 0.08
    elif regression_type.startswith('multiclass'):
        skcoef = skcoef.T
        skcoef /= np.linalg.norm(skcoef, axis=1)[:, None]
        cucoef /= np.linalg.norm(cucoef, axis=1)[:, None]
        unit_tol = 0.2
        total_tol = 0.3

    equality = array_equal(skcoef, cucoef, unit_tol=unit_tol,
                           total_tol=total_tol)
    if not equality:
        print('\ncoef.shape: ', coef.shape)
        print('coef:\n', coef)
        print('cucoef.shape: ', cucoef.shape)
        print('cucoef:\n', cucoef)
    assert equality

    cuOut = culog.predict(data)
    skOut = sklog.predict(data)
    assert array_equal(skOut, cuOut, unit_tol=unit_tol,
                       total_tol=total_tol)
Example #13
def test_dbscan_default(name, client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5
    default_base = {
        'quantile': .3,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = 500
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
Example #14
def test_predict_multioutput(input_type, output_type):

    X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        y = cudf.DataFrame(y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        y = cp.asarray(y)

    knn_cu = cuKNN(n_neighbors=1, output_type=output_type)
    knn_cu.fit(X, y)

    p = knn_cu.predict(X)

    if output_type == "cudf":
        assert isinstance(p, cudf.DataFrame)
    elif output_type == "numpy":
        assert isinstance(p, np.ndarray)
    elif output_type == "cupy":
        assert isinstance(p, cp.ndarray)

    assert array_equal(p.astype(np.int32), y)
Example #15
def test_nonmonotonic_labels(n_classes, n_rows, n_cols, datatype, n_neighbors):

    X, y = make_blobs(n_samples=n_rows,
                      centers=n_classes,
                      n_features=n_cols,
                      cluster_std=0.01,
                      random_state=0)

    X = X.astype(np.float32)

    # Draw labels from non-monotonically increasing set
    classes = np.arange(0, n_classes * 5, 5)
    for i in range(n_classes):
        y[y == i] = classes[i]

    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)

    p = knn_cu.predict(X_test)

    if datatype == "dataframe":
        assert isinstance(p, cudf.Series)
        p = p.to_frame().to_numpy().reshape(p.shape[0])
        y_test = y_test.to_numpy().reshape(y_test.shape[0])

    assert array_equal(p.astype(np.int32), y_test.astype(np.int32))
Example #16
def test_umap_fit_transform_score(nrows, n_feats):

    n_samples = nrows
    n_features = n_feats

    data, labels = make_blobs(n_samples=n_samples, n_features=n_features,
                              centers=10, random_state=42)

    model = umap.UMAP(n_neighbors=10, min_dist=0.1)
    cuml_model = cuUMAP(n_neighbors=10, min_dist=0.01)

    embedding = model.fit_transform(data)
    cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)

    assert not np.isnan(embedding).any()
    assert not np.isnan(cuml_embedding).any()

    if nrows < 500000:
        cuml_score = adjusted_rand_score(labels,
                                         KMeans(10).fit_predict(
                                             cuml_embedding))
        score = adjusted_rand_score(labels,
                                    KMeans(10).fit_predict(embedding))

        assert array_equal(score, cuml_score, 1e-2, with_sign=True)
Example #17
def test_batch_size(nrows, ncols, n_parts, batch_size, client):

    n_neighbors = 10
    n_clusters = 5
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    from sklearn.datasets import make_blobs

    nrows = _scale_rows(client, nrows)

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=n_clusters,
                      random_state=0)

    X = X.astype(np.float32)

    X_cudf = _prep_training_data(client, X, n_parts)

    cumlModel = daskNN(n_neighbors=n_neighbors,
                       batch_size=batch_size,
                       streams_per_handle=5)

    cumlModel.fit(X_cudf)

    out_d, out_i = cumlModel.kneighbors(X_cudf)

    local_i = out_i.compute().to_numpy()

    y_hat, _ = predict(local_i, y, n_neighbors)

    assert array_equal(y_hat, y)
Example #18
def test_core_point_prop3():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a two-barred (orthodox) cross, or
    # two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # There are two core points, but they are not reachable from each other,
    # so there should be two clusters. However, the link shared between the
    # stars actually has an ambiguous label (to the best of my knowledge),
    # as it depends on the order in which the core points are processed.
    # So we exclude that point from the comparison with sklearn.

    # TODO: the above text does not correspond to the actual test!

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [3, 0], [4, 0], [4, 1],
                  [4, -1], [5, 0], [2, 0]],
                 dtype=np.float32)
    cuml_dbscan = cuDBSCAN(**params)
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(**params)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
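
A quick numpy check of the geometry described in the comment, using the standard DBSCAN rule that a point is a core point when at least min_samples points (itself included) lie within eps of it, confirms there are exactly two core points, the star centers at (1, 0) and (4, 0):

import numpy as np

X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [3, 0], [4, 0], [4, 1],
              [4, -1], [5, 0], [2, 0]], dtype=np.float32)
# Pairwise distances; neighbour counts include the point itself.
d = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
core = (d <= 1.1).sum(axis=1) >= 4
print(np.where(core)[0])  # [1 5] -> points (1, 0) and (4, 0)
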
Example #19
def test_pca_defaults(n_samples, n_features, sparse):
    # FIXME: Disable the case True-300-200 due to flaky test
    if sparse and n_features == 300 and n_samples == 200:
        pytest.xfail('Skipping the case True-300-200 due to flaky test')

    if sparse:
        X = cupyx.scipy.sparse.random(n_samples,
                                      n_features,
                                      density=0.03,
                                      dtype=cp.float32,
                                      random_state=10)
    else:
        X, Y = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2,
                                              n_labels=1,
                                              random_state=1)
    cupca = cuPCA()
    cupca.fit(X)
    curesult = cupca.transform(X)
    cupca.handle.sync()

    if sparse:
        X = X.toarray().get()
    skpca = skPCA()
    skpca.fit(X)
    skresult = skpca.transform(X)

    assert skpca.svd_solver == cupca.svd_solver
    assert cupca.components_.shape[0] == skpca.components_.shape[0]
    assert curesult.shape == skresult.shape
    assert array_equal(curesult, skresult, 1e-3, with_sign=False)
Example #20
def test_ridge_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for data that has more "
                    "than 46340 rows or columns if you are using CUDA "
                    "version 10.x")

    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # Initialization of cuML's ridge regression model
    curidge = cuRidge(fit_intercept=False, normalize=False, solver=algorithm)

    # fit and predict cuml ridge regression model
    curidge.fit(X_train, y_train)
    curidge_predict = curidge.predict(X_test)

    if nrows < 500000:
        # sklearn ridge regression model initialization, fit and predict
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)

        skridge_predict = skridge.predict(X_test)

        assert array_equal(skridge_predict,
                           curidge_predict,
                           1e-1,
                           with_sign=True)
Example #21
def test_partial_fit(nrows, ncols, n_components, density, batch_size_divider,
                     whiten):

    X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=10)

    cu_ipca = cuIPCA(n_components=n_components, whiten=whiten)

    sample_size = int(nrows / batch_size_divider)
    for i in range(0, nrows, sample_size):
        cu_ipca.partial_fit(X[i:i + sample_size].copy())

    cu_t = cu_ipca.transform(X)
    cu_inv = cu_ipca.inverse_transform(cu_t)

    sk_ipca = skIPCA(n_components=n_components, whiten=whiten)

    X = cp.asnumpy(X)

    for i in range(0, nrows, sample_size):
        sk_ipca.partial_fit(X[i:i + sample_size].copy())

    sk_t = sk_ipca.transform(X)
    sk_inv = sk_ipca.inverse_transform(sk_t)

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
Example #22
def test_fit(nrows, ncols, n_components, sparse_input, density, sparse_format,
             batch_size_divider, whiten):

    if sparse_format == 'csc':
        pytest.skip("cupyx.scipy.sparse.csc.csc_matrix does not support"
                    " indexing as of cupy 7.6.0")

    if sparse_input:
        X = cupyx.scipy.sparse.random(nrows,
                                      ncols,
                                      density=density,
                                      random_state=10,
                                      format=sparse_format)
    else:
        X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=10)

    cu_ipca = cuIPCA(n_components=n_components,
                     whiten=whiten,
                     batch_size=int(nrows / batch_size_divider))
    cu_ipca.fit(X)
    cu_t = cu_ipca.transform(X)
    cu_inv = cu_ipca.inverse_transform(cu_t)

    sk_ipca = skIPCA(n_components=n_components,
                     whiten=whiten,
                     batch_size=int(nrows / batch_size_divider))
    if sparse_input:
        X = X.get()
    else:
        X = cp.asnumpy(X)
    sk_ipca.fit(X)
    sk_t = sk_ipca.transform(X)
    sk_inv = sk_ipca.inverse_transform(sk_t)

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
Example #23
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    X, y = make_blobs(n_samples,
                      centers=1,
                      cluster_std=8.0,
                      center_box=(-100.0, 100.0),
                      random_state=8)
    X = X.astype(datatype)

    handle, stream = get_handle(use_handle)
    eps = 0.5
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
Example #24
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples,
                      cluster_std=0.01,
                      n_features=n_feats,
                      random_state=0)

    eps = 1
    cuml_dbscan = cuDBSCAN(eps=eps,
                           min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
Example #25
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype,
                            client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    # 2-dimensional dataset for easy distance matrix computation
    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=2,
                      random_state=0)

    # Precompute distances
    X_dist = pairwise_distances(X).astype(datatype)

    eps = 1
    cuml_dbscan = cuDBSCAN(eps=eps,
                           min_samples=2,
                           metric='precomputed',
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X_dist, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps,
                         min_samples=2,
                         metric='precomputed',
                         algorithm="brute")
    sk_labels = sk_dbscan.fit_predict(X_dist)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
Example #26
def test_predict_large_n_classes(datatype):

    nrows = 10000
    ncols = 100
    n_neighbors = 2
    n_clusters = 1000

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      cluster_std=0.01,
                      random_state=0)

    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)

    y_hat = knn_cu.predict(X_test)

    if datatype == "dataframe":
        y_hat = y_hat.to_numpy()
        y_test = y_test.to_numpy().ravel()

    assert array_equal(y_hat.astype(np.int32), y_test.astype(np.int32))
Example #27
def test_weighted_ridge(datatype, algorithm, fit_intercept,
                        normalize, distribution):
    nrows, ncols, n_info = 1000, 20, 10
    max_weight = 10
    noise = 20
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info, noise=noise
    )

    # set per-sample weights, drawn from the requested distribution
    if distribution == "uniform":
        wt = np.random.randint(1, high=max_weight, size=len(X_train))
    elif distribution == "exponential":
        wt = np.random.exponential(size=len(X_train))
    else:
        wt = np.random.lognormal(size=len(X_train))

    # Initialization of cuML's ridge regression model
    curidge = cuRidge(fit_intercept=fit_intercept,
                      normalize=normalize,
                      solver=algorithm)

    # fit and predict cuml ridge regression model
    curidge.fit(X_train, y_train, sample_weight=wt)
    curidge_predict = curidge.predict(X_test)

    # sklearn ridge regression model initialization, fit and predict
    skridge = skRidge(fit_intercept=fit_intercept,
                      normalize=normalize)
    skridge.fit(X_train, y_train, sample_weight=wt)

    skridge_predict = skridge.predict(X_test)

    assert array_equal(skridge_predict, curidge_predict, 1e-1, with_sign=True)
Example #28
def test_umap_fit_transform_trust(name, target_metric):

    if name == 'iris':
        iris = datasets.load_iris()
        data = iris.data
        labels = iris.target

    elif name == 'digits':
        digits = datasets.load_digits(n_class=5)
        data = digits.data
        labels = digits.target

    elif name == 'wine':
        wine = datasets.load_wine()
        data = wine.data
        labels = wine.target
    else:
        data, labels = make_blobs(n_samples=500, n_features=10,
                                  centers=10, random_state=42)

    model = umap.UMAP(n_neighbors=10, min_dist=0.01,
                      target_metric=target_metric)
    cuml_model = cuUMAP(n_neighbors=10, min_dist=0.01,
                        target_metric=target_metric)
    embedding = model.fit_transform(data)
    cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)

    trust = trustworthiness(data, embedding, n_neighbors=10)
    cuml_trust = trustworthiness(data, cuml_embedding, n_neighbors=10)

    assert array_equal(trust, cuml_trust, 1e-1, with_sign=True)
Example #29
def test_tsvd_fit_transform(datatype, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if name != 'blobs':
        skpca = skTSVD(n_components=1)
        Xsktsvd = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
Example #30
def test_logistic_regression_decision_function(
    dtype, nrows, column_info, num_classes, fit_intercept, sparse_input
):
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols,
        n_info=n_info, num_classes=num_classes
    )
    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test

    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_dec_func = culog.decision_function(X_test)
    if num_classes > 2:
        cu_dec_func = cu_dec_func.T
    sk_dec_func = sklog.decision_function(X_test)

    assert array_equal(cu_dec_func, sk_dec_func)