import cudf
import dask.array as da
import pytest

from sklearn.datasets import make_multilabel_classification

from cuml.testing.utils import unit_param, quality_param, stress_param


def generate_dask_array(np_array, n_parts):
    # Split the rows of np_array into n_parts equal-sized chunks, folding
    # any leftover rows into the last chunk so no samples are dropped.
    n_samples = np_array.shape[0]
    n_samples_per_part = int(n_samples / n_parts)
    chunks = [n_samples_per_part] * n_parts
    chunks[-1] += n_samples % n_samples_per_part
    chunks = tuple(chunks)
    # chunks=(chunks, -1) keeps all columns together in a single chunk.
    return da.from_array(np_array, chunks=(chunks, -1))
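

# A minimal sanity check of the chunking above (illustrative, not part of
# the original suite): 10 rows split into 3 parts gives row chunks of
# (3, 3, 4), so every sample lands in exactly one partition.
def _demo_generate_dask_array_chunks():
    import numpy as np
    arr = np.zeros((10, 4))
    darr = generate_dask_array(arr, n_parts=3)
    assert darr.chunks[0] == (3, 3, 4)
    assert sum(darr.chunks[0]) == arr.shape[0]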


@pytest.fixture(
    scope="module",
    params=[
        unit_param({'n_samples': 3000, 'n_features': 30,
                    'n_classes': 5, 'n_targets': 2}),
        quality_param({'n_samples': 8000, 'n_features': 35,
                       'n_classes': 12, 'n_targets': 3}),
        stress_param({'n_samples': 20000, 'n_features': 40,
                      'n_classes': 12, 'n_targets': 4})
    ])
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
        length=request.param['n_targets'])
    new_x = []
    new_y = []
    for i in range(y.shape[0]):
Example #2
                               max_iter=skit,
                               dual=skdual)
            skm.fit(X_train, y_train)
            return skm.score(X_test, y_test)

        sks = with_timeout(timeout=t, target=run_sklearn)
        good_enough(cus, sks, nrows)
    except TimeoutError:
        pytest.skip(f"sklearn did not finish within {t} seconds.")
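

# `with_timeout` and `good_enough` above come from elsewhere in the original
# module. A rough, thread-based sketch of what such a timeout helper could
# look like (illustrative only, not the upstream implementation):
def _demo_with_timeout(timeout, target):
    from concurrent.futures import ThreadPoolExecutor, TimeoutError
    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(target)
    try:
        # result() raises TimeoutError if target overruns the budget
        # (concurrent.futures.TimeoutError aliases the builtin on 3.11+).
        return future.result(timeout=timeout)
    finally:
        # Don't block the caller on a still-running worker thread.
        pool.shutdown(wait=False)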


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"])
@pytest.mark.parametrize("dims", [
    unit_param((3, 1)),
    unit_param((100, 1)),
    unit_param((1000, 10)),
    unit_param((100, 100)),
    unit_param((100, 300)),
    quality_param((10000, 10)),
    quality_param((10000, 50)),
    stress_param((100000, 1000))
])
def test_regression_basic(datatype, loss, dims):
    run_regression(datatype, loss, 0, dims)
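

# unit_param, quality_param and stress_param from cuml.testing.utils tag a
# parametrize value with a test-tier mark, so CI can run a single tier via
# `pytest -m unit` (or quality/stress). A minimal sketch of how such a
# helper can be written (illustrative; see cuml.testing.utils for the real
# definitions):
def _demo_unit_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.unit)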


@pytest.mark.parametrize(
    "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"])
@pytest.mark.parametrize("epsilon", [0, 0.001, 0.1])
Example #3
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))

    return X_train_df


def _scale_rows(client, nrows):
    workers = list(client.scheduler_info()['workers'].keys())
    n_workers = len(workers)
    return n_workers * nrows
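
# Note: _scale_rows grows the requested row count with the number of Dask
# workers, keeping the per-worker load roughly constant (e.g. nrows=300 on
# a 4-worker cluster becomes an effective 1200-row dataset).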


@pytest.mark.parametrize(
    "nrows",
    [unit_param(300), quality_param(1e6),
     stress_param(5e8)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(15)])
@pytest.mark.parametrize(
    "n_neighbors",
    [unit_param(10), quality_param(4),
     stress_param(100)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(1),
     unit_param(5),
     quality_param(7),
Example #4
import numpy as np
import pytest

from cuml import Lasso as cuLasso
from cuml.linear_model import ElasticNet as cuElasticNet
from cuml.metrics import r2_score
from cuml.testing.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import Lasso, ElasticNet
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('X_type', ['ndarray'])
@pytest.mark.parametrize('alpha', [0.1, 0.001])
@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                         stress_param(500000)])
@pytest.mark.parametrize('column_info', [unit_param([20, 10]),
                         quality_param([100, 50]),
                         stress_param([1000, 500])])
@pytest.mark.filterwarnings("ignore:Objective did not converge::sklearn[.*]")
def test_lasso(datatype, X_type, alpha, algorithm,
               nrows, column_info):
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)
    cu_lasso = cuLasso(alpha=np.array([alpha]), fit_intercept=True,
Example #5
import numpy as np
import pytest
import cupy as cp

from cuml.linear_model import MBSGDClassifier as cumlMBSGClassifier
from cuml.testing.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDClassifier
from cuml.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="module",
                params=[
                    unit_param([500, 20, 10, np.float32]),
                    unit_param([500, 20, 10, np.float64]),
                    quality_param([5000, 100, 50, np.float32]),
                    quality_param([5000, 100, 50, np.float64]),
                    stress_param([500000, 1000, 500, np.float32]),
                    stress_param([500000, 1000, 500, np.float64]),
                ],
                ids=[
                    '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32',
                    '5000-100-50-f64', '500000-1000-500-f32',
                    '500000-1000-500-f64'
                ])
def make_dataset(request):
    nrows, ncols, n_info, datatype = request.param
    X, y = make_classification(n_samples=nrows,
                               n_informative=n_info,
Example #6
import pytest

import dask.array as da
import numpy as np
import cupy as cp

from cuml.dask.datasets.blobs import make_blobs
from cuml.dask.common.input_utils import DistributedDataHandler

from cuml.testing.utils import unit_param, quality_param, stress_param

from cuml.dask.common.part_utils import _extract_partitions


@pytest.mark.parametrize('nrows', [unit_param(1e3), quality_param(1e5),
                                   stress_param(1e6)])
@pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100),
                                   stress_param(1000)])
@pytest.mark.parametrize('centers', [10])
@pytest.mark.parametrize("cluster_std", [0.1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("nparts", [unit_param(1), unit_param(7),
                                    quality_param(100),
                                    stress_param(1000)])
@pytest.mark.parametrize("order", ['F', 'C'])
def test_make_blobs(nrows,
                    ncols,
                    centers,
                    cluster_std,
                    dtype,
Example #7

import numpy as np
import pytest

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from cuml.testing.utils import unit_param


def make_dataset(datatype, nrows, ncols, n_info):
    X, y = make_regression(n_samples=nrows,
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
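    # y_test is intentionally dropped: the pickling tests below only need
    # training data and a prediction input.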
    return X_train, y_train, X_test


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('key', rf_models.keys())
@pytest.mark.parametrize('nrows', [unit_param(500)])
@pytest.mark.parametrize('ncols', [unit_param(16)])
@pytest.mark.parametrize('n_info', [unit_param(7)])
@pytest.mark.parametrize('n_classes', [unit_param(2), unit_param(5)])
def test_rf_regression_pickle(tmpdir, datatype, nrows, ncols, n_info,
                              n_classes, key):

    result = {}
    if datatype == np.float64:
        pytest.xfail("Pickling is not supported for datasets with"
                     " dtype float64")

    def create_mod():
        if key == 'RandomForestRegressor':
            X_train, y_train, X_test = make_dataset(datatype, nrows, ncols,
                                                    n_info)
Example #8
import numpy as np
import pytest

from cuml.testing.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.mark.mg
@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [
    unit_param("int32"),
    unit_param(np.int32),
    unit_param("int64"),
    unit_param(np.int64),
    quality_param("int32"),
    stress_param("int32")
])
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
Example #9
import pytest

from cuml.testing.utils import unit_param
from cuml.testing.utils import quality_param
from cuml.testing.utils import stress_param

import dask.array as da

from cuml.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "nrows",
    [unit_param(1e3), quality_param(1e5),
     stress_param(5e6)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(50)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(None), quality_param(7),
     stress_param(50)])
@pytest.mark.parametrize("delayed_predict", [True, False])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
Example #10
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_train, y_train)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True)


@pytest.mark.parametrize('params', [
    {'kernel': 'linear', 'C': 1},
    {'kernel': 'rbf', 'C': 1, 'gamma': 1},
    {'kernel': 'poly', 'C': 1, 'gamma': 1},
])
@pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs'])
@pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000),
                                    stress_param(5000)])
@pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000),
                         stress_param(1000)])
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    if (params['kernel'] == 'linear' and
            dataset in ['gaussian', 'classification2'] and
            n_rows > 1000 and n_cols >= 1000):
        # The linear kernel cannot fit the gaussian dataset and takes very
        # long to converge, so skip this combination.
        return
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):

        cuSVC = cu_svm.SVC(**params)
Example #11
        n_classes=num_classes,
        random_state=0,
    )
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=10
    )

    return X_train, X_test, y_train, y_test


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("algorithm", ["eig", "svd"])
@pytest.mark.parametrize(
    "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)]
)
@pytest.mark.parametrize(
    "column_info",
    [
        unit_param([20, 10]),
        quality_param([100, 50]),
        stress_param([1000, 500])
    ],
)
def test_linear_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for data that has more "
                    "than 46340 rows or columns if you are using CUDA "
                    "version 10.x")
Example #12
import pytest

from cuml.manifold import UMAP as cuUMAP
from cuml.testing.utils import unit_param, quality_param, stress_param

from sklearn.neighbors import NearestNeighbors

import joblib

from cuml.common import logger

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.manifold import trustworthiness
from sklearn.metrics import adjusted_rand_score

dataset_names = ['iris', 'digits', 'wine', 'blobs']


@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                         stress_param(500000)])
@pytest.mark.parametrize('n_feats', [unit_param(20), quality_param(100),
                         stress_param(1000)])
def test_blobs_cluster(nrows, n_feats):

    data, labels = datasets.make_blobs(
        n_samples=nrows, n_features=n_feats, centers=5, random_state=0)
    embedding = cuUMAP().fit_transform(data, convert_dtype=True)

    if nrows < 500000:
        score = adjusted_rand_score(labels,
                                    KMeans(5).fit_predict(embedding))
        assert score == 1.0

Example #13
import pytest

from cuml.testing.utils import unit_param, quality_param, stress_param

from sklearn.datasets import \
    make_classification, make_regression, load_iris, load_breast_cancer, \
    load_boston
from sklearn.model_selection import train_test_split

import treelite

pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)"
                                        "::cuml[.*]")


@pytest.fixture(
    scope="session",
    params=[
        unit_param({
            "n_samples": 350,
            "n_features": 20,
            "n_informative": 10
        }),
        quality_param({
            "n_samples": 5000,
            "n_features": 200,
            "n_informative": 80
        }),
        stress_param({
            "n_samples": 500000,
            "n_features": 400,
            "n_informative": 180
        }),
    ],
)
def small_clf(request):
Example #14
import numpy as np
import pytest

from cuml.testing.utils import get_handle
from cuml import DBSCAN as cuDBSCAN
from cuml.testing.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [
    unit_param("int32"),
    unit_param(np.int32),
    unit_param("int64"),
    unit_param(np.int64),
    quality_param("int32"),
    stress_param("int32")
])
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
Example #15
import numpy as np
import pytest

from cuml import PCA as cuPCA
from cuml.testing.utils import get_handle, array_equal, unit_param, \
    quality_param, stress_param

from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets import make_blobs
from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('digits'),
             stress_param('blobs')])
def test_pca_fit(datatype, input_type, name, use_handle):

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
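        # NOTE: the make_blobs call below is unreachable while the skip
        # above is active.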
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
Example #16
import numpy as np
import pytest

from cuml.testing.utils import unit_param, quality_param, stress_param

from test_linear_model import make_regression_dataset  # noqa: E402


def normalize_data(X, y):
    y_mean = np.mean(y)
    y = y - y_mean
    x_mean = np.mean(X, axis=0)
    x_scale = np.sqrt(np.var(X, axis=0) * X.shape[0])
    x_scale[x_scale == 0] = 1
    X = (X - x_mean) / x_scale
    return X, y, x_mean, x_scale, y_mean
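

# A small worked example of normalize_data (illustrative, not part of the
# original suite): each column of X is centered and divided by the L2 norm
# of the centered column; the x_scale[x_scale == 0] = 1 guard protects
# constant columns from division by zero.
def _demo_normalize_data():
    X = np.array([[1.0, 2.0], [3.0, 2.0]])
    y = np.array([1.0, 3.0])
    Xn, yn, x_mean, x_scale, y_mean = normalize_data(X, y)
    assert np.allclose(Xn.mean(axis=0), 0.0)
    assert np.allclose(yn, [-1.0, 1.0])
    assert x_scale[1] == 1  # the constant second column was left unscaled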


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "nrows", [unit_param(500),
              quality_param(5000),
              stress_param(90000)])
@pytest.mark.parametrize(
    "column_info",
    [
        unit_param([1, 1]),
        unit_param([20, 10]),
        quality_param([100, 50]),
        stress_param([1000, 500])
    ],
)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("precompute", [True, False, 'precompute'])
def test_lars_model(datatype, nrows, column_info, precompute, normalize):
    ncols, n_info = column_info
Example #17
import numpy as np
import pytest

from cuml import TruncatedSVD as cuTSVD
from cuml.testing.utils import get_handle
from cuml.testing.utils import array_equal, unit_param, \
    quality_param, stress_param

from sklearn.datasets import make_blobs
from sklearn.decomposition import TruncatedSVD as skTSVD
from sklearn.utils import check_random_state


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('random'),
             stress_param('blobs')])
def test_tsvd_fit(datatype, name, use_handle):

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
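        # NOTE: the lines below are unreachable while the skip above is
        # active.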
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
Example #18
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')

    preds = cuml_kmeans.fit_predict(X)

    assert adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y)) >= 0.99


@pytest.mark.parametrize('name', dataset_names)
@pytest.mark.parametrize('nrows', [unit_param(1000), quality_param(5000)])
def test_kmeans_sklearn_comparison(name, nrows, random_state):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])
Example #19
import pytest

from cuml.dask.datasets import make_regression
from cuml.dask.linear_model import ElasticNet
from cuml.dask.linear_model import Lasso
from cuml.metrics import r2_score
from cuml.testing.utils import unit_param, quality_param, stress_param

import numpy as np


@pytest.mark.mg
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize('alpha', [0.001])
@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
@pytest.mark.parametrize(
    'nrows',
    [unit_param(50), quality_param(5000),
     stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([100, 50]),
    stress_param([1000, 500])
])
@pytest.mark.parametrize(
    'n_parts',
    [unit_param(4), quality_param(32),
     stress_param(64)])
@pytest.mark.parametrize("delayed", [True, False])
def test_lasso(dtype, alpha, algorithm, nrows, column_info, n_parts, delayed,
               client):
    ncols, n_info = column_info
Example #20

import pytest

import numpy as np
from cuml.testing.utils import array_equal, \
    unit_param, stress_param
import cupy as cp

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "data_info",
    [unit_param([1000, 20, 30]),
     stress_param([int(9e6), 5000, 30])])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_pca_fit(data_info, input_type, client):

    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD