Example #1
import pytest

import numpy as np

from cuml.linear_model import MBSGDClassifier as cumlMBSGClassifier
from cuml.testing.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDClassifier
from cuml.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="module",
                params=[
                    unit_param([500, 20, 10, np.float32]),
                    unit_param([500, 20, 10, np.float64]),
                    quality_param([5000, 100, 50, np.float32]),
                    quality_param([5000, 100, 50, np.float64]),
                    stress_param([500000, 1000, 500, np.float32]),
                    stress_param([500000, 1000, 500, np.float64]),
                ],
                ids=[
                    '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32',
                    '5000-100-50-f64', '500000-1000-500-f32',
                    '500000-1000-500-f64'
                ])
def make_dataset(request):
    nrows, ncols, n_info, datatype = request.param
    X, y = make_classification(n_samples=nrows,
                               n_informative=n_info,
                               n_features=ncols,
                               random_state=10)
    X = X.astype(datatype)
    y = y.astype(datatype)
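
These examples rely on the unit_param, quality_param and stress_param helpers from cuml.testing.utils. Their implementation is not shown on this page; a minimal sketch, assuming each helper simply wraps pytest.param with a correspondingly named mark, could look like the following.

import pytest


def unit_param(*args):
    # small inputs intended for fast, always-on unit runs
    return pytest.param(*args, marks=pytest.mark.unit)


def quality_param(*args):
    # medium-sized inputs used for accuracy/quality verification runs
    return pytest.param(*args, marks=pytest.mark.quality)


def stress_param(*args):
    # very large inputs exercised only in dedicated stress runs
    return pytest.param(*args, marks=pytest.mark.stress)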
Example #2
    except TimeoutError:
        pytest.skip(f"sklearn did not finish within {t} seconds.")


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"])
@pytest.mark.parametrize("dims", [
    unit_param((3, 1)),
    unit_param((100, 1)),
    unit_param((1000, 10)),
    unit_param((100, 100)),
    unit_param((100, 300)),
    quality_param((10000, 10)),
    quality_param((10000, 50)),
    stress_param((100000, 1000))
])
def test_regression_basic(datatype, loss, dims):
    run_regression(datatype, loss, 0, dims)


@pytest.mark.parametrize(
    "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"])
@pytest.mark.parametrize("epsilon", [0, 0.001, 0.1])
@pytest.mark.parametrize("dims", [
    quality_param((10000, 10)),
    quality_param((10000, 50)),
    quality_param((10000, 500))
])
def test_regression_eps(loss, epsilon, dims):
    run_regression(np.float32, loss, epsilon, dims)
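
Assuming the tier marks sketched after Example #1 (unit, quality, stress), the very large stress-tier parametrizations such as stress_param((100000, 1000)) above can be deselected by mark. A hypothetical programmatic invocation (the "stress" mark name is inferred from the helper name and "tests/" is a placeholder path):

import pytest

if __name__ == "__main__":
    # deselect the stress-tier cases, keeping the unit and quality tiers
    raise SystemExit(pytest.main(["-m", "not stress", "tests/"]))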
Example #3
    ref = cp.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]])
    cp.testing.assert_array_equal(cm, ref)


@pytest.mark.mg
@pytest.mark.parametrize('chunks', ['auto', 2, 1])
def test_confusion_matrix_binary(client, chunks):
    y_true = da.from_array(cp.array([0, 1, 0, 1]), chunks=chunks)
    y_pred = da.from_array(cp.array([1, 1, 1, 0]), chunks=chunks)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    ref = cp.array([0, 2, 1, 1])
    cp.testing.assert_array_equal(ref, cp.array([tn, fp, fn, tp]))


@pytest.mark.mg
@pytest.mark.parametrize('n_samples', [50, 3000, stress_param(500000)])
@pytest.mark.parametrize('dtype', [np.int32, np.int64])
@pytest.mark.parametrize('problem_type', ['binary', 'multiclass'])
def test_confusion_matrix_random(n_samples, dtype, problem_type, client):
    upper_range = 2 if problem_type == 'binary' else 1000

    y_true, y_pred, np_y_true, np_y_pred = generate_random_labels(
        lambda rng: rng.randint(0, upper_range, n_samples).astype(dtype),
        as_cupy=True)
    y_true, y_pred = da.from_array(y_true), da.from_array(y_pred)

    cm = confusion_matrix(y_true, y_pred)
    ref = sk_confusion_matrix(np_y_true, np_y_pred)
    cp.testing.assert_array_almost_equal(ref, cm, decimal=4)

Example #4
    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)
    kmeans = cluster.KMeans(random_state=random_state,
                            n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2


@pytest.mark.parametrize('name', dataset_names)
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
def test_kmeans_sklearn_comparison_default(name, nrows, random_state):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])
Example #5
import pytest

import dask.array as da
import numpy as np
import cupy as cp

from cuml.dask.datasets.blobs import make_blobs
from cuml.dask.common.input_utils import DistributedDataHandler

from cuml.testing.utils import unit_param, quality_param, stress_param

from cuml.dask.common.part_utils import _extract_partitions


@pytest.mark.parametrize('nrows', [unit_param(1e3), quality_param(1e5),
                                   stress_param(1e6)])
@pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100),
                                   stress_param(1000)])
@pytest.mark.parametrize('centers', [10])
@pytest.mark.parametrize("cluster_std", [0.1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("nparts", [unit_param(1), unit_param(7),
                                    quality_param(100),
                                    stress_param(1000)])
@pytest.mark.parametrize("order", ['F', 'C'])
def test_make_blobs(nrows,
                    ncols,
                    centers,
                    cluster_std,
                    dtype,
                    nparts,
Example #6
        assert array_equal(result["rf_res"], pickled_model.predict(X_test))
        # Confirm no crash from score
        pickled_model.score(X_test,
                            np.zeros(X_test.shape[0]),
                            predict_model="GPU")

        pickle_save_load(tmpdir, create_mod, assert_model)


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('keys', regression_models.keys())
@pytest.mark.parametrize(
    'data_size',
    [unit_param([500, 20, 10]),
     stress_param([500000, 1000, 500])])
@pytest.mark.parametrize('fit_intercept', [True, False])
def test_regressor_pickle(tmpdir, datatype, keys, data_size, fit_intercept):
    if data_size[0] == 500000 and datatype == np.float64 and \
            ("LogisticRegression" in keys or "Ridge" in keys) and \
            pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            data_size[0] = data_size[0] * pytest.max_gpu_memory // 640
            data_size[1] = data_size[1] * pytest.max_gpu_memory // 640
            data_size[2] = data_size[2] * pytest.max_gpu_memory // 640
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
    result = {}

    def create_mod():
Example #7
        n_classes=num_classes,
        random_state=0,
    )
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=10
    )

    return X_train, X_test, y_train, y_test


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("algorithm", ["eig", "svd"])
@pytest.mark.parametrize(
    "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)]
)
@pytest.mark.parametrize(
    "column_info",
    [
        unit_param([20, 10]),
        quality_param([100, 50]),
        stress_param([1000, 500])
    ],
)
def test_linear_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for the data that has more"
                    "than 46340 rows or columns if you are using CUDA version"
                    "10.x")
Example #8
    quality_param, stress_param

from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets import make_blobs
from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('digits'),
             stress_param('blobs')])
def test_pca_fit(datatype, input_type, name, use_handle):

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)
Example #9
    cuSVC.fit(X_train, y_train)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True)


@pytest.mark.parametrize('params', [
    {'kernel': 'linear', 'C': 1},
    {'kernel': 'rbf', 'C': 1, 'gamma': 1},
    {'kernel': 'poly', 'C': 1, 'gamma': 1},
])
@pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs'])
@pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000),
                                    stress_param(5000)])
@pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000),
                         stress_param(1000)])
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    if (params['kernel'] == 'linear' and
            dataset in ['gaussian', 'classification2'] and
            n_rows > 1000 and n_cols >= 1000):
        # the linear kernel cannot fit the gaussian dataset and takes very
        # long, so skip this combination
        return
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):

        cuSVC = cu_svm.SVC(**params)
        cuSVC.fit(X_train, y_train)
Example #10
    X = dask_cudf.from_cudf(X, npartitions=2)
    Y_ohe = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    Y_ohe = da.from_array(Y_ohe)

    enc = OneHotEncoder(handle_unknown='ignore')
    enc = enc.fit(X)
    df = enc.inverse_transform(Y_ohe)
    ref = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(df.compute().to_pandas(), ref.to_pandas())


@pytest.mark.mg
@pytest.mark.parametrize('drop', [None, 'first'])
@pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf'])
@pytest.mark.parametrize('sparse', [True, False], ids=['sparse', 'dense'])
@pytest.mark.parametrize("n_samples", [10, 1000, stress_param(50000)])
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    X, ary = generate_inputs_from_categories(n_samples=n_samples,
                                             as_array=as_array)
    if as_array:
        dX = da.from_array(X)
    else:
        dX = dask_cudf.from_cudf(X, npartitions=1)

    enc = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    ohe = enc.fit_transform(dX)
    ref = sk_enc.fit_transform(ary)
    if sparse:
        cp.testing.assert_array_equal(ohe.compute().toarray(), ref.toarray())
    else:
Example #11
    """
    X = dataset.data

    tsne = TSNE(n_components=2,
                random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    Y = tsne.fit_transform(X)
    validate_embedding(X, Y)


@pytest.mark.parametrize('nrows', [stress_param(2400000)])
@pytest.mark.parametrize('ncols', [stress_param(225)])
@pytest.mark.parametrize('method', ['fft', 'barnes_hut'])
def test_tsne_large(nrows, ncols, method):
    """
    This tests how TSNE handles large input
    """
    X, y = make_blobs(n_samples=nrows,
                      centers=8,
                      n_features=ncols,
                      random_state=1)
    # make_blobs returns (X, y); cast the feature matrix rather than the tuple
    X = X.astype(np.float32)

    tsne = TSNE(random_state=1,
                exaggeration_iter=1,
                n_iter=2,
                method=method,
Example #12
@pytest.fixture(
    scope="session",
    params=[
        unit_param({
            "n_samples": 350,
            "n_features": 20,
            "n_informative": 10
        }),
        quality_param({
            "n_samples": 5000,
            "n_features": 200,
            "n_informative": 80
        }),
        stress_param({
            "n_samples": 500000,
            "n_features": 400,
            "n_informative": 180
        }),
    ],
)
def small_clf(request):
    X, y = make_classification(
        n_samples=request.param["n_samples"],
        n_features=request.param["n_features"],
        n_clusters_per_class=1,
        n_informative=request.param["n_informative"],
        random_state=123,
        n_classes=2,
    )
    return X, y
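
A minimal sketch (hypothetical, not taken from cuML) of a test consuming the small_clf fixture defined above:

def test_small_clf_shapes(small_clf):
    # the fixture builds a binary classification problem, so X and y must
    # agree on the number of samples and y may contain at most two classes
    X, y = small_clf
    assert X.shape[0] == y.shape[0]
    assert len(set(y.tolist())) <= 2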
Example #13
import pytest

import numpy as np

from cuml.testing.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [
    unit_param("int32"),
    unit_param(np.int32),
    unit_param("int64"),
    unit_param(np.int64),
    quality_param("int32"),
    stress_param("int32")
])
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    if nrows == 500000 and pytest.max_gpu_memory < 32:
Example #14
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))

    return X_train_df


def _scale_rows(client, nrows):
    workers = list(client.scheduler_info()['workers'].keys())
    n_workers = len(workers)
    return n_workers * nrows


@pytest.mark.parametrize(
    "nrows",
    [unit_param(300), quality_param(1e6),
     stress_param(5e8)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(15)])
@pytest.mark.parametrize(
    "n_neighbors",
    [unit_param(10), quality_param(4),
     stress_param(100)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(1),
     unit_param(5),
     quality_param(7),
     stress_param(50)])
Example #15
test_data = [
    # ((1, 0, 1, 0, 0, 0, 0, 1), test_101c),
    ((0, 0, 2, 0, 0, 0, 0, 1), test_002c),
    ((0, 1, 0, 0, 0, 0, 0, 1), test_010c),
    ((1, 1, 0, 0, 0, 0, 0, 0), test_110),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c_exog),
    ((1, 2, 1, 0, 0, 0, 0, 1), test_121c),
    ((1, 1, 1, 0, 0, 0, 0, 1), test_111c_missing),
    ((1, 0, 1, 1, 1, 1, 4, 0), test_101_111_4),
    ((5, 1, 0, 0, 0, 0, 0, 0), test_510),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing_exog),
    ((1, 1, 2, 0, 1, 2, 4, 0), test_112_012_4),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12_missing),
    stress_param((1, 0, 1, 1, 1, 1, 12, 1), test_111_111_12c_missing_exog),
]

# Dictionary for lazy-loading of datasets
# (name, dtype) -> (pandas dataframe, cuDF dataframe)
lazy_data = {}

# Dictionary for lazy-evaluation of reference fits
# (p, d, q, P, D, Q, s, k, name, dtype) -> SARIMAXResults
lazy_ref_fit = {}


def extract_order(tup):
    """Extract the order from a tuple of parameters"""
Example #16
    n_samples = np_array.shape[0]
    n_samples_per_part = int(n_samples / n_parts)
    chunks = [n_samples_per_part] * n_parts
    # fold any leftover rows into the last chunk
    chunks[-1] += n_samples % n_parts
    chunks = tuple(chunks)
    return da.from_array(np_array, chunks=(chunks, -1))


@pytest.fixture(
    scope="module",
    params=[
        unit_param({'n_samples': 3000, 'n_features': 30,
                    'n_classes': 5, 'n_targets': 2}),
        quality_param({'n_samples': 8000, 'n_features': 35,
                       'n_classes': 12, 'n_targets': 3}),
        stress_param({'n_samples': 20000, 'n_features': 40,
                      'n_classes': 12, 'n_targets': 4})
    ])
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
        length=request.param['n_targets'])
    new_x = []
    new_y = []
    for i in range(y.shape[0]):
        a = np.argwhere(y[i] == 1)[:, 0]
        if len(a) >= request.param['n_targets']:
            new_x.append(i)
            np.random.shuffle(a)
Example #17
import pytest

import numpy as np

from cuml.testing.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.mark.mg
@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [
    unit_param("int32"),
    unit_param(np.int32),
    unit_param("int64"),
    unit_param(np.int64),
    quality_param("int32"),
    stress_param("int32")
])
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN
Example #18
import pytest

import numpy as np

from cuml import TruncatedSVD as cuTSVD
from cuml.testing.utils import get_handle
from cuml.testing.utils import array_equal, unit_param, \
    quality_param, stress_param

from sklearn.datasets import make_blobs
from sklearn.decomposition import TruncatedSVD as skTSVD
from sklearn.utils import check_random_state


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('random'),
             stress_param('blobs')])
def test_tsvd_fit(datatype, name, use_handle):

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.product(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
Example #19
import pytest

from cuml.testing.utils import unit_param
from cuml.testing.utils import quality_param
from cuml.testing.utils import stress_param

import dask.array as da

from cuml.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "nrows",
    [unit_param(1e3), quality_param(1e5),
     stress_param(5e6)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(50)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(None), quality_param(7),
     stress_param(50)])
@pytest.mark.parametrize("delayed_predict", [True, False])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):

    from cuml.dask.cluster import KMeans as cumlKMeans
Example #20
import pytest

from cuml.dask.linear_model import ElasticNet
from cuml.dask.linear_model import Lasso
from cuml.metrics import r2_score
from cuml.testing.utils import unit_param, quality_param, stress_param

import numpy as np


@pytest.mark.mg
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize('alpha', [0.001])
@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
@pytest.mark.parametrize(
    'nrows',
    [unit_param(50), quality_param(5000),
     stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([100, 50]),
    stress_param([1000, 500])
])
@pytest.mark.parametrize(
    'n_parts',
    [unit_param(4), quality_param(32),
     stress_param(64)])
@pytest.mark.parametrize("delayed", [True, False])
def test_lasso(dtype, alpha, algorithm, nrows, column_info, n_parts, delayed,
               client):
    ncols, n_info = column_info

    X, y = make_regression(n_samples=nrows,
Example #21
import pytest

import numpy as np
from cuml.testing.utils import array_equal, \
    unit_param, stress_param
import cupy as cp

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "data_info",
    [unit_param([1000, 20, 30]),
     stress_param([int(9e6), 5000, 30])])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_pca_fit(data_info, input_type, client):

    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD