Example #1
import pytest

import dask.array as da
import numpy as np
import cupy as cp

from dask.distributed import Client

from cuml.dask.datasets.blobs import make_blobs
from cuml.dask.common.input_utils import DistributedDataHandler

from cuml.test.utils import unit_param, quality_param, stress_param

from cuml.dask.common.part_utils import _extract_partitions


@pytest.mark.parametrize('nrows', [unit_param(1e3), quality_param(1e5),
                                   stress_param(1e6)])
@pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100),
                                   stress_param(1000)])
@pytest.mark.parametrize('centers', [10])
@pytest.mark.parametrize("cluster_std", [0.1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("nparts", [unit_param(1), unit_param(7),
                                    quality_param(100),
                                    stress_param(1000)])
@pytest.mark.parametrize("order", ['F', 'C'])
def test_make_blobs(nrows,
                    ncols,
                    centers,
                    cluster_std,
                    dtype,
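
# unit_param, quality_param and stress_param (imported above) come from
# cuml.test.utils. A minimal sketch of how they behave -- assumed shape, not
# the verbatim implementation: each wraps its value in pytest.param and
# attaches a size marker, so a run can select one tier at a time
# (e.g. `pytest -m unit`, assuming the markers are registered).
def unit_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.unit)


def quality_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.quality)


def stress_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.stress)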
Example #2
import numpy as np
import pytest

from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive'])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet'])
@pytest.mark.parametrize('loss', ['hinge', 'log', 'squared_loss'])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([100, 50]),
    stress_param([1000, 500])
])
def test_mbsgd_classifier(datatype, lrate, input_type, penalty, loss, nrows,
                          column_info):
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows,
                               n_informative=n_info,
                               n_features=ncols,
                               random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
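
    # A sketch of how the truncated body plausibly continues -- assumptions:
    # cuml.linear_model.MBSGDClassifier and its learning_rate/eta0/epochs/
    # penalty/loss arguments; the hyperparameter values are illustrative.
    from cuml.linear_model import MBSGDClassifier as cumlMBSGDClassifier

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd = cumlMBSGDClassifier(learning_rate=lrate, eta0=0.005,
                                   epochs=100, penalty=penalty, loss=loss)
    cu_mbsgd.fit(X_train, y_train)
    cu_pred = cu_mbsgd.predict(X_test)
    # the prediction is then scored against a fitted SGDClassifier baseline
    # via accuracy_score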
Example #3
import pytest

from cuml.test.utils import unit_param, quality_param, stress_param
import cuml.common.logger as logger

from sklearn.ensemble import RandomForestClassifier as skrfc
from sklearn.ensemble import RandomForestRegressor as skrfr
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import fetch_california_housing, \
    make_classification, make_regression
from sklearn.model_selection import train_test_split


@pytest.fixture(
    scope="session",
    params=[
        unit_param({'n_samples': 350, 'n_features': 20, 'n_informative': 10}),
        quality_param({'n_samples': 5000, 'n_features': 200,
                      'n_informative': 80}),
        stress_param({'n_samples': 500000, 'n_features': 400,
                     'n_informative': 180})
    ])
def small_clf(request):
    X, y = make_classification(n_samples=request.param['n_samples'],
                               n_features=request.param['n_features'],
                               n_clusters_per_class=1,
                               n_informative=request.param['n_informative'],
                               random_state=123, n_classes=2)
    return X, y


@pytest.fixture(
    scope="session",
    params=[
Example #4
    else:
        params['eval_metric'] = 'error'
        params['objective'] = 'reg:squarederror'
        params['base_score'] = 0.0

    params['max_depth'] = 25
    params.update(xgboost_params)

    bst = xgb.train(params, dtrain, num_rounds)
    bst.save_model(model_path)
    return bst


@pytest.mark.parametrize(
    'n_rows', [unit_param(1000),
               quality_param(10000),
               stress_param(500000)])
@pytest.mark.parametrize(
    'n_columns',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize(
    'num_rounds',
    [unit_param(1),
     unit_param(5),
     quality_param(50),
     stress_param(90)])
@pytest.mark.skipif(has_xgboost() is False, reason="need to install xgboost")
def test_fil_classification(n_rows, n_columns, num_rounds, tmp_path):
    # settings
    classification = True  # change this to false to use regression
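
    # Sketch of the FIL half of such a test -- assumptions: cuml's
    # ForestInference.load with output_class/model_type arguments, a
    # model_path produced by the training helper above, and a hypothetical
    # X_validation array.
    from cuml import ForestInference

    fil_model = ForestInference.load(model_path,
                                     output_class=classification,
                                     model_type="xgboost")
    fil_preds = fil_model.predict(X_validation)  # X_validation is illustrative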
Example #5
import numpy as np
import pytest

from cuml.test.utils import get_handle, array_equal, unit_param, \
    quality_param, stress_param

from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets import make_blobs
from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('digits'),
             stress_param('blobs')])
def test_pca_fit(datatype, input_type, name, use_handle):

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
Example #6
                       normalize=normalize,
                       solver="eig")

    assert getattr(cu_clf, 'score', False)
    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)

    gdf_data = cudf.DataFrame(X_train)
    gdf_train = cudf.DataFrame(dict(train=y_train))

    sk_cu_grid.fit(gdf_data, gdf_train.train)
    assert sk_cu_grid.best_params_ == {'alpha': 0.1}


@pytest.mark.parametrize(
    'nrows',
    [unit_param(30), quality_param(5000),
     stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(10), quality_param(100),
     stress_param(200)])
@pytest.mark.parametrize(
    'n_info',
    [unit_param(7), quality_param(50),
     stress_param(100)])
@pytest.mark.parametrize('datatype', [np.float32])
def test_accuracy(nrows, ncols, n_info, datatype):

    use_handle = True
    train_rows = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows,
Example #7
        n_classes=num_classes,
        random_state=0,
    )
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=10
    )

    return X_train, X_test, y_train, y_test


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("algorithm", ["eig", "svd"])
@pytest.mark.parametrize(
    "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)]
)
@pytest.mark.parametrize(
    "column_info",
    [
        unit_param([20, 10]),
        quality_param([100, 50]),
        stress_param([1000, 500])
    ],
)
def test_linear_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for the data that has more"
                    "than 46340 rows or columns if you are using CUDA version"
                    "10.x")
Example #8
    knn_cu = cuKNN()
    knn_cu.fit(X)

    ret = knn_cu.kneighbors(X, k, return_distance=False)
    assert not isinstance(ret, tuple)
    assert ret.shape == (n_samples, k)

    ret = knn_cu.kneighbors(X, k, return_distance=True)
    assert isinstance(ret, tuple)
    assert len(ret) == 2


@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'n_feats',
    [unit_param(3), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize(
    'k', [unit_param(3), quality_param(30),
          stress_param(50)])
@pytest.mark.parametrize("metric", valid_metrics())
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]
Example #9
    X_train_df, = dask_utils.persist_across_workers(c,
                                                    [X_train_df],
                                                    workers=workers)

    return X_train_df


def test_011_exception():
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    with pytest.raises(NotImplementedError):
        cumlModel = daskNN()  # noqa: F841


@pytest.mark.parametrize("nrows", [unit_param(1e3), unit_param(1e4),
                                   quality_param(1e6),
                                   stress_param(5e8)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize("nclusters", [unit_param(5), quality_param(10),
                                       stress_param(15)])
@pytest.mark.parametrize("n_neighbors", [unit_param(10), quality_param(4),
                                         stress_param(100)])
@pytest.mark.parametrize("n_parts", [unit_param(1), unit_param(5),
                                     quality_param(7), stress_param(50)])
@pytest.mark.parametrize("streams_per_handle", [1, 5])
@pytest.mark.skip("MNMG KNN available in cuML 0.12+")
def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors,
                     streams_per_handle, cluster):

    client = Client(cluster)
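
# These multi-GPU tests normally release the client in a finally block. A
# self-contained sketch of the pattern (LocalCluster stands in for the GPU
# cluster fixture the real tests receive):
from dask.distributed import Client, LocalCluster


def run_distributed_body():
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)
    try:
        pass  # distributed test body goes here
    finally:
        client.close()
        cluster.close()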
Example #10
import numpy as np
import pytest

from cuml import DBSCAN as cuDBSCAN
from cuml.test.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [
    unit_param("int32"),
    unit_param(np.int32),
    unit_param("int64"),
    unit_param(np.int64),
    quality_param("int32"),
    stress_param("int32")
])
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
Example #11
import numpy as np
import pytest

from cuml import TruncatedSVD as cuTSVD
from cuml.test.utils import get_handle
from cuml.test.utils import array_equal, unit_param, \
    quality_param, stress_param

from sklearn.datasets import make_blobs
from sklearn.decomposition import TruncatedSVD as skTSVD
from sklearn.utils import check_random_state


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('random'),
             stress_param('blobs')])
def test_tsvd_fit(datatype, name, use_handle):

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
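        # Sketch of a plausible continuation for this branch and the shared
        # tail -- assumptions: get_handle returning (handle, stream) and
        # cuTSVD accepting a `handle` argument; component counts are
        # illustrative.
        rng = check_random_state(42)
        X = rng.randn(n, p).astype(datatype)

    handle, stream = get_handle(use_handle)
    cu_tsvd = cuTSVD(n_components=2, handle=handle)
    sk_tsvd = skTSVD(n_components=2, random_state=42)
    cu_tsvd.fit(X)
    sk_tsvd.fit(X)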
Example #12
import numpy as np
import pytest

from cuml.ensemble import RandomForestClassifier as curfc
from cuml.ensemble import RandomForestRegressor as curfr
from cuml.metrics import r2_score
from cuml.test.utils import get_handle, unit_param, \
    quality_param, stress_param

from sklearn.ensemble import RandomForestClassifier as skrfc
from sklearn.ensemble import RandomForestRegressor as skrfr
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_california_housing, \
    make_classification, make_regression
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([200, 100]),
    stress_param([500, 350])
])
@pytest.mark.parametrize(
    'rows_sample',
    [unit_param(1.0), quality_param(0.90),
     stress_param(0.95)])
@pytest.mark.parametrize('datatype', [np.float32])
@pytest.mark.parametrize('split_algo', [0, 1])
@pytest.mark.parametrize('max_features', [1.0, 'auto', 'log2', 'sqrt'])
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
Example #13
import pytest

from sklearn.datasets import make_classification, load_boston
from sklearn.model_selection import train_test_split

import treelite


pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)"
                                        "::cuml[.*]")


@pytest.fixture(
    scope="session",
    params=[
        unit_param({"n_samples": 350, "n_features": 20, "n_informative": 10}),
        quality_param(
            {"n_samples": 5000, "n_features": 200, "n_informative": 80}
        ),
        stress_param(
            {"n_samples": 500000, "n_features": 400, "n_informative": 180}
        ),
    ],
)
def small_clf(request):
    X, y = make_classification(
        n_samples=request.param["n_samples"],
        n_features=request.param["n_features"],
        n_clusters_per_class=1,
        n_informative=request.param["n_informative"],
        random_state=123,
        n_classes=2,
    )
Example #14
import pytest

import dask.array as da
import numpy as np
import cupy as cp

from cuml.dask.datasets.blobs import make_blobs
from cuml.dask.common.input_utils import DistributedDataHandler

from cuml.test.utils import unit_param, quality_param, stress_param

from cuml.dask.common.part_utils import _extract_partitions


@pytest.mark.parametrize(
    'nrows',
    [unit_param(1e3), quality_param(1e5),
     stress_param(1e6)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(10), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('centers', [10])
@pytest.mark.parametrize("cluster_std", [0.1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "nparts",
    [unit_param(1),
     unit_param(7),
     quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize("order", ['F', 'C'])
Example #15
                      n_features=n_feats, random_state=0)

    knn_cu = cuKNN()
    knn_cu.fit(X)

    ret = knn_cu.kneighbors(X, k, return_distance=False)
    assert not isinstance(ret, tuple)
    assert ret.shape == (n_samples, k)

    ret = knn_cu.kneighbors(X, k, return_distance=True)
    assert isinstance(ret, tuple)
    assert len(ret) == 2


@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                         stress_param(70000)])
@pytest.mark.parametrize('n_feats', [unit_param(3), quality_param(100),
                         stress_param(1000)])
@pytest.mark.parametrize('k', [unit_param(3), quality_param(30),
                         stress_param(50)])
@pytest.mark.parametrize("metric", valid_metrics())
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows,
                      n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)  # Testing
Example #16
import numpy as np
import pytest

from cuml.solvers import SGD as cumlSGD
from cuml.test.utils import unit_param, quality_param, \
    stress_param

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn import datasets


@pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive'])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet'])
@pytest.mark.parametrize('loss', ['hinge', 'log', 'squared_loss'])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('iris'),
             stress_param('blobs')])
def test_svd(datatype, lrate, penalty, loss, name):

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
        X = X.astype(datatype)
        y = y.astype(datatype)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=0.8)

    elif name == 'iris':
        iris = datasets.load_iris()
        X = (iris.data).astype(datatype)
        y = (iris.target).astype(datatype)
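        # (this branch would then split X and y, mirroring the blobs branch)

    # Sketch of the solver call these tests build toward -- assumptions:
    # cuml.solvers.SGD's learning_rate/eta0/epochs/penalty/loss arguments;
    # the values are illustrative and X_train/y_train come from the branch
    # taken above.
    cu_sgd = cumlSGD(learning_rate=lrate, eta0=0.005, epochs=2000,
                     penalty=penalty, loss=loss)
    cu_sgd.fit(X_train, y_train)
    cu_pred = cu_sgd.predict(X_test)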
Example #17
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest

from dask.distributed import Client, wait

import numpy as np

from cuml.test.utils import unit_param, quality_param, stress_param


@pytest.mark.mg
@pytest.mark.parametrize("nrows", [unit_param(1e3), quality_param(1e5),
                                   stress_param(5e6)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize("nclusters", [unit_param(5), quality_param(10),
                                       stress_param(50)])
@pytest.mark.parametrize("n_parts", [unit_param(None), quality_param(7),
                                     stress_param(50)])
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):

    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs
Example #18
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))

    return X_train_df


def _scale_rows(client, nrows):
    workers = list(client.scheduler_info()['workers'].keys())
    n_workers = len(workers)
    return n_workers * nrows


@pytest.mark.parametrize(
    "nrows",
    [unit_param(300), quality_param(1e6),
     stress_param(5e8)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(15)])
@pytest.mark.parametrize(
    "n_neighbors",
    [unit_param(10), quality_param(4),
     stress_param(100)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(1),
     unit_param(5),
     quality_param(7),
Example #19
import pytest

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import fetch_california_housing, \
    make_classification, make_regression
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="session",
                params=[
                    unit_param({
                        'n_samples': 350,
                        'n_features': 20,
                        'n_informative': 10
                    }),
                    quality_param({
                        'n_samples': 5000,
                        'n_features': 200,
                        'n_informative': 80
                    }),
                    stress_param({
                        'n_samples': 500000,
                        'n_features': 400,
                        'n_informative': 180
                    })
                ])
def small_clf(request):
    X, y = make_classification(n_samples=request.param['n_samples'],
                               n_features=request.param['n_features'],
                               n_clusters_per_class=1,
                               n_informative=request.param['n_informative'],
                               random_state=123,
                               n_classes=2)
    return X, y
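
# How a test consumes such a session-scoped fixture: pytest injects the
# fixture's return value through the matching parameter name. A hypothetical
# consumer (estimator choice and threshold are illustrative):
def test_small_clf_is_learnable(small_clf):
    from sklearn.ensemble import RandomForestClassifier

    X, y = small_clf
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)
    clf = RandomForestClassifier(n_estimators=40, random_state=0)
    clf.fit(X_train, y_train)
    assert accuracy_score(y_test, clf.predict(X_test)) > 0.7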
Example #20
import dask.array as da
import pytest

from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.datasets import make_multilabel_classification


def generate_dask_array(np_array, n_parts):
    n_samples = np_array.shape[0]
    n_samples_per_part = int(n_samples / n_parts)
    chunks = [n_samples_per_part] * n_parts
    chunks[-1] += n_samples % n_samples_per_part
    chunks = tuple(chunks)
    return da.from_array(np_array, chunks=(chunks, -1))


@pytest.fixture(scope="module",
                params=[
                    unit_param({
                        'n_samples': 1000,
                        'n_features': 30,
                        'n_classes': 5,
                        'n_targets': 2
                    }),
                    quality_param({
                        'n_samples': 5000,
                        'n_features': 100,
                        'n_classes': 12,
                        'n_targets': 4
                    }),
                    stress_param({
                        'n_samples': 12000,
                        'n_features': 40,
                        'n_classes': 5,
                        'n_targets': 2
                    })
                ])
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
Example #21
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_train, y_train)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True)


@pytest.mark.parametrize('params', [
    {'kernel': 'linear', 'C': 1},
    {'kernel': 'rbf', 'C': 1, 'gamma': 1},
    {'kernel': 'poly', 'C': 1, 'gamma': 1},
])
@pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs'])
@pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000),
                                    stress_param(5000)])
@pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000),
                         stress_param(1000)])
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    if (params['kernel'] == 'linear' and
            dataset in ['gaussian', 'classification2'] and
            n_rows > 1000 and n_cols >= 1000):
        # linear kernel will not fit the gaussian dataset, but takes very long
        return
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):

        cuSVC = cu_svm.SVC(**params)
Example #22
import cupy as cp
import numpy as np
import pytest

from cuml.linear_model import MBSGDRegressor as cumlMBSGRegressor
from cuml.metrics import r2_score
from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDRegressor
from cuml.datasets import make_regression
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="module",
                params=[
                    unit_param([500, 20, 10, np.float32]),
                    unit_param([500, 20, 10, np.float64]),
                    quality_param([5000, 100, 50, np.float32]),
                    quality_param([5000, 100, 50, np.float64]),
                    stress_param([500000, 1000, 500, np.float32]),
                    stress_param([500000, 1000, 500, np.float64]),
                ],
                ids=[
                    '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32',
                    '5000-100-50-f64', '500000-1000-500-f32',
                    '500000-1000-500-f64'
                ])
def make_dataset(request):
    nrows, ncols, n_info, datatype = request.param
    if nrows == 500000 and datatype == np.float64 and \
            pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
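
# pytest.max_gpu_memory and pytest.adapt_stress_test above are custom
# attributes that the project's conftest.py attaches to the pytest module.
# A hypothetical sketch of that mechanism (names kept, values illustrative):
def pytest_configure(config):
    pytest.max_gpu_memory = 16       # assumed: total GPU memory in GB
    pytest.adapt_stress_test = True  # assumed: shrink stress tests to fit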
Example #23
import numpy as np
import pytest

from cuml.linear_model import MBSGDRegressor as cumlMBSGRegressor
from cuml.metrics import r2_score
from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive'])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet'])
@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                         stress_param(500000)])
@pytest.mark.parametrize('column_info', [unit_param([20, 10]),
                         quality_param([100, 50]),
                         stress_param([1000, 500])])
def test_mbsgd_regressor(datatype, lrate, input_type, penalty,
                         nrows, column_info):
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
Example #24
        shuffle=False,
        random_state=random_state,
    )

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')

    preds = cuml_kmeans.fit_predict(X)

    assert adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y)) >= 0.99


@pytest.mark.parametrize('name', dataset_names)
@pytest.mark.parametrize('nrows', [unit_param(1000), quality_param(5000)])
def test_kmeans_sklearn_comparison(name, nrows):

    random_state = 12

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)
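
    # A minimal cu-vs-sk comparison in the same spirit -- assumptions:
    # sklearn KMeans as the CPU reference, with blobs data standing in for
    # the pattern returned by get_pattern.
    from sklearn.cluster import KMeans as skKMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows, centers=default_base['n_clusters'],
                      random_state=random_state)
    cu_labels = cuml.KMeans(n_clusters=default_base['n_clusters'],
                            random_state=random_state,
                            output_type='numpy').fit_predict(X)
    sk_labels = skKMeans(n_clusters=default_base['n_clusters'],
                         random_state=random_state).fit_predict(X)
    # label agreement is then scored with adjusted_rand_score, as above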
Example #25
import pytest

from cuml.test.utils import unit_param
from cuml.test.utils import quality_param
from cuml.test.utils import stress_param

import dask.array as da

from cuml.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "nrows",
    [unit_param(1e3), quality_param(1e5),
     stress_param(5e6)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(50)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(None), quality_param(7),
     stress_param(50)])
@pytest.mark.parametrize("delayed_predict", [True, False])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
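
    # Sketch of the end-to-end body -- assumptions: cuml.dask's make_blobs
    # and KMeans APIs with n_parts and delayed-prediction support; argument
    # names mirror the decorators above.
    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows), n_features=ncols,
                      centers=nclusters, n_parts=n_parts)
    model = cumlKMeans(n_clusters=nclusters)
    model.fit(X)
    preds = model.predict(X, delayed=delayed_predict)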
Example #26
import pytest

from cuml.manifold.umap import UMAP as cuUMAP
from cuml.test.utils import unit_param, quality_param, stress_param

import joblib

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.manifold import trustworthiness
from sklearn.metrics import adjusted_rand_score

dataset_names = ['iris', 'digits', 'wine', 'blobs']


@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'n_feats',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
def test_blobs_cluster(nrows, n_feats):
    data, labels = datasets.make_blobs(n_samples=nrows,
                                       n_features=n_feats,
                                       centers=5,
                                       random_state=0)
    embedding = cuUMAP(verbose=False).fit_transform(data, convert_dtype=True)

    if nrows < 500000:
        score = adjusted_rand_score(labels, KMeans(5).fit_predict(embedding))
        assert score == 1.0
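
# trustworthiness (imported above) is the other metric these UMAP tests rely
# on; a minimal sketch of its use (the threshold is illustrative):
def check_embedding_quality(data, embedding):
    score = trustworthiness(data, embedding, n_neighbors=10)
    assert score > 0.90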
Example #27
    knn_cu = cuKNN()
    knn_cu.fit(X)

    ret = knn_cu.kneighbors(X, k, return_distance=False)
    assert not isinstance(ret, tuple)
    assert ret.shape == (n_samples, k)

    ret = knn_cu.kneighbors(X, k, return_distance=True)
    assert isinstance(ret, tuple)
    assert len(ret) == 2


@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(70000)])
@pytest.mark.parametrize('n_feats', [unit_param(3), stress_param(1000)])
@pytest.mark.parametrize('k', [unit_param(3), stress_param(50)])
@pytest.mark.parametrize("metric", valid_metrics())
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)  # Testing
    knn_sk.fit(X_index.get())
    D_sk, I_sk = knn_sk.kneighbors(X_search.get(), k)
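
    # Sketch of the matching cuml side -- assumption: the distances and
    # indices are then compared against D_sk/I_sk with the array_equal-style
    # helpers these tests import.
    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X_index)
    D_cu, I_cu = knn_cu.kneighbors(X_search, k)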
Example #28
    params = {'alpha': np.logspace(-3, -1, 10)}
    cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept,
                       normalize=normalize, solver="eig")

    assert getattr(cu_clf, 'score', False)
    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)

    gdf_data = cudf.DataFrame(X_train)
    gdf_train = cudf.DataFrame(dict(train=y_train))

    sk_cu_grid.fit(gdf_data, gdf_train.train)
    assert sk_cu_grid.best_params_ == {'alpha': 0.1}


@pytest.mark.parametrize('nrows', [unit_param(30), quality_param(5000),
                         stress_param(500000)])
@pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100),
                         stress_param(200)])
@pytest.mark.parametrize('n_info', [unit_param(7), quality_param(50),
                         stress_param(100)])
@pytest.mark.parametrize('datatype', [np.float32])
def test_accuracy(nrows, ncols, n_info, datatype):

    use_handle = True
    train_rows = np.int32(nrows*0.8)
    X, y = make_classification(n_samples=nrows, n_features=ncols,
                               n_clusters_per_class=1, n_informative=n_info,
                               random_state=123, n_classes=5)

    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
Example #29
def generate_dask_array(np_array, n_parts):
    n_samples = np_array.shape[0]
    n_samples_per_part = int(n_samples / n_parts)
    chunks = [n_samples_per_part] * n_parts
    chunks[-1] += n_samples % n_samples_per_part
    chunks = tuple(chunks)
    return da.from_array(np_array, chunks=(chunks, -1))
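
# Illustrative use of the helper above (shapes are hypothetical): a 1000x10
# NumPy array split row-wise into 4 Dask chunks.
_example = np.random.rand(1000, 10)
_example_da = generate_dask_array(_example, n_parts=4)
assert _example_da.numblocks == (4, 1)  # 4 row chunks, 1 column chunk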


@pytest.fixture(
    scope="module",
    params=[
        unit_param({'n_samples': 3000, 'n_features': 30,
                    'n_classes': 5, 'n_targets': 2}),
        quality_param({'n_samples': 8000, 'n_features': 35,
                       'n_classes': 12, 'n_targets': 3}),
        stress_param({'n_samples': 20000, 'n_features': 40,
                      'n_classes': 12, 'n_targets': 4})
    ])
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
        length=request.param['n_targets'])
    new_x = []
    new_y = []
    for i in range(y.shape[0]):
        a = np.argwhere(y[i] == 1)[:, 0]
        if len(a) >= request.param['n_targets']:
Example #30
import numpy as np
import pytest

from cuml.test.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


@pytest.mark.mg
@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                         stress_param(500000)])
@pytest.mark.parametrize('ncols', [unit_param(20), quality_param(100),
                         stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [unit_param("int32"),
                                       unit_param(np.int32),
                                       unit_param("int64"),
                                       unit_param(np.int64),
                                       quality_param("int32"),
                                       stress_param("int32")])
def test_dbscan(datatype, nrows, ncols,
                max_mbytes_per_batch, out_dtype, client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    n_samples = nrows
    n_feats = ncols
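
    # Sketch of a plausible continuation: build blobs, scale them, compute a
    # CPU reference with scikit-learn, then fit the dask DBSCAN and compare
    # labels. Assumptions: the dask DBSCAN takes `client` in its constructor
    # and `out_dtype` at fit time; eps/min_samples values are illustrative.
    X, y = make_blobs(n_samples=n_samples, n_features=n_feats, random_state=0)
    X = StandardScaler().fit_transform(X).astype(datatype)

    sk_labels = skDBSCAN(eps=0.5, min_samples=5).fit_predict(X)
    cu_labels = cuDBSCAN(eps=0.5, min_samples=5,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         client=client).fit_predict(X, out_dtype=out_dtype)
    # agreement is then checked with assert_dbscan_equal, imported above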