# NOTE(review): collapsed chunk from a dask make_blobs test module; the
# signature of test_make_blobs is truncated mid-parameter-list at the end
# of this fragment.  `pytest` is referenced but not imported in the
# visible portion — presumably imported earlier in the original file.
import dask.array as da
import numpy as np
import cupy as cp

from dask.distributed import Client

from cuml.dask.datasets.blobs import make_blobs
from cuml.dask.common.input_utils import DistributedDataHandler
from cuml.test.utils import unit_param, quality_param, stress_param
from cuml.dask.common.part_utils import _extract_partitions


# Tiered sizes: unit/quality/stress presumably select which value is active
# for a given test profile (NOTE(review): semantics of the cuml.test.utils
# helpers assumed — confirm against that module).
@pytest.mark.parametrize('nrows', [unit_param(1e3), quality_param(1e5),
                                   stress_param(1e6)])
@pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100),
                                   stress_param(1000)])
@pytest.mark.parametrize('centers', [10])
@pytest.mark.parametrize("cluster_std", [0.1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("nparts", [unit_param(1), unit_param(7),
                                    quality_param(100), stress_param(1000)])
@pytest.mark.parametrize("order", ['F', 'C'])
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype,
# NOTE(review): collapsed chunk from an MBSGD classifier test module; the
# body of test_mbsgd_classifier is truncated at the end of this fragment.
# `pytest` and `np` are used but not imported in the visible portion.
from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDClassifier
# NOTE(review): sklearn.datasets.samples_generator is the pre-0.22 private
# path; newer sklearn exposes make_classification from sklearn.datasets.
from sklearn.datasets.samples_generator import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive'])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet'])
@pytest.mark.parametrize('loss', ['hinge', 'log', 'squared_loss'])
@pytest.mark.parametrize(
    'nrows', [unit_param(500), quality_param(5000), stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([100, 50]),
    stress_param([1000, 500])
])
def test_mbsgd_classifier(datatype, lrate, input_type, penalty, loss, nrows,
                          column_info):
    # column_info packs (n_features, n_informative).
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows, n_informative=n_info,
                               n_features=ncols, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
quality_param, stress_param import cuml.common.logger as logger from sklearn.ensemble import RandomForestClassifier as skrfc from sklearn.ensemble import RandomForestRegressor as skrfr from sklearn.metrics import accuracy_score, mean_squared_error from sklearn.datasets import fetch_california_housing, \ make_classification, make_regression from sklearn.model_selection import train_test_split @pytest.fixture( scope="session", params=[ unit_param({'n_samples': 350, 'n_features': 20, 'n_informative': 10}), quality_param({'n_samples': 5000, 'n_features': 200, 'n_informative': 80}), stress_param({'n_samples': 500000, 'n_features': 400, 'n_informative': 180}) ]) def small_clf(request): X, y = make_classification(n_samples=request.param['n_samples'], n_features=request.param['n_features'], n_clusters_per_class=1, n_informative=request.param['n_informative'], random_state=123, n_classes=2) return X, y @pytest.fixture( scope="session", params=[
else: params['eval_metric'] = 'error' params['objective'] = 'reg:squarederror' params['base_score'] = 0.0 params['max_depth'] = 25 params.update(xgboost_params) bst = xgb.train(params, dtrain, num_rounds) bst.save_model(model_path) return bst @pytest.mark.parametrize( 'n_rows', [unit_param(1000), quality_param(10000), stress_param(500000)]) @pytest.mark.parametrize( 'n_columns', [unit_param(20), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize( 'num_rounds', [unit_param(1), unit_param(5), quality_param(50), stress_param(90)]) @pytest.mark.skipif(has_xgboost() is False, reason="need to install xgboost") def test_fil_classification(n_rows, n_columns, num_rounds, tmp_path): # settings classification = True # change this to false to use regression
# NOTE(review): collapsed chunk from a PCA test module; the else-branch
# call to make_multilabel_classification is truncated mid-argument-list at
# the end of this fragment.  `pytest` and `np` are not imported in the
# visible portion.
from cuml.test.utils import get_handle, array_equal, unit_param, \
    quality_param, stress_param

from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets import make_blobs

from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None), quality_param('digits'), stress_param('blobs')])
def test_pca_fit(datatype, input_type, name, use_handle):
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # NOTE(review): unreachable — pytest.skip raises, so this
        # make_blobs call never executes.
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(n_samples=500, n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
normalize=normalize, solver="eig") assert getattr(cu_clf, 'score', False) sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False) gdf_data = cudf.DataFrame.from_gpu_matrix(cuda.to_device(X_train)) gdf_train = cudf.DataFrame(dict(train=y_train)) sk_cu_grid.fit(gdf_data, gdf_train.train) assert sk_cu_grid.best_params_ == {'alpha': 0.1} @pytest.mark.parametrize( 'nrows', [unit_param(30), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'ncols', [unit_param(10), quality_param(100), stress_param(200)]) @pytest.mark.parametrize( 'n_info', [unit_param(7), quality_param(50), stress_param(100)]) @pytest.mark.parametrize('datatype', [np.float32]) def test_accuracy(nrows, ncols, n_info, datatype): use_handle = True train_rows = np.int32(nrows * 0.8) X, y = make_classification(n_samples=nrows,
n_classes=num_classes, random_state=0, ) X = X.astype(datatype) y = y.astype(np.int32) X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8, random_state=10 ) return X_train, X_test, y_train, y_test @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("algorithm", ["eig", "svd"]) @pytest.mark.parametrize( "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)] ) @pytest.mark.parametrize( "column_info", [ unit_param([20, 10]), quality_param([100, 50]), stress_param([1000, 500]) ], ) def test_linear_regression_model(datatype, algorithm, nrows, column_info): if algorithm == "svd" and nrows > 46340: pytest.skip("svd solver is not supported for the data that has more" "than 46340 rows or columns if you are using CUDA version" "10.x")
knn_cu = cuKNN() knn_cu.fit(X) ret = knn_cu.kneighbors(X, k, return_distance=False) assert not isinstance(ret, tuple) assert ret.shape == (n_samples, k) ret = knn_cu.kneighbors(X, k, return_distance=True) assert isinstance(ret, tuple) assert len(ret) == 2 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'n_feats', [unit_param(3), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize( 'k', [unit_param(3), quality_param(30), stress_param(50)]) @pytest.mark.parametrize("metric", valid_metrics()) def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric): X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0) X_index = X[:100] X_search = X[101:]
X_train_df, = dask_utils.persist_across_workers(c, [X_train_df], workers=workers) return X_train_df def test_011_exception(): from cuml.dask.neighbors import NearestNeighbors as daskNN with pytest.raises(NotImplementedError): cumlModel = daskNN() # noqa: F841 @pytest.mark.parametrize("nrows", [unit_param(1e3), unit_param(1e4), quality_param(1e6), stress_param(5e8)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize("nclusters", [unit_param(5), quality_param(10), stress_param(15)]) @pytest.mark.parametrize("n_neighbors", [unit_param(10), quality_param(4), stress_param(100)]) @pytest.mark.parametrize("n_parts", [unit_param(1), unit_param(5), quality_param(7), stress_param(50)]) @pytest.mark.parametrize("streams_per_handle", [1, 5]) @pytest.mark.skip("MNMG KNN available in cuML 0.12+") def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors, streams_per_handle, cluster): client = Client(cluster)
from cuml import DBSCAN as cuDBSCAN from cuml.test.utils import get_pattern, unit_param, \ quality_param, stress_param, array_equal, assert_dbscan_equal from sklearn.cluster import DBSCAN as skDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import pairwise_distances from sklearn.preprocessing import StandardScaler @pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'ncols', [unit_param(20), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('out_dtype', [ unit_param("int32"), unit_param(np.int32), unit_param("int64"), unit_param(np.int64), quality_param("int32"), stress_param("int32") ]) def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch, out_dtype):
# NOTE(review): collapsed chunk from a TruncatedSVD test module; the final
# else-branch is truncated right after `n, p = 500, 5`.  `pytest` and `np`
# are not imported in the visible portion.
from cuml import TruncatedSVD as cuTSVD
from cuml.test.utils import get_handle
from cuml.test.utils import array_equal, unit_param, \
    quality_param, stress_param

from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import TruncatedSVD as skTSVD
from sklearn.utils import check_random_state


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None), quality_param('random'), stress_param('blobs')])
def test_tsvd_fit(datatype, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # NOTE(review): unreachable — pytest.skip raises before these run.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.product(shape)).reshape(shape)
    else:
        n, p = 500, 5
from cuml.ensemble import RandomForestRegressor as curfr from cuml.metrics import r2_score from cuml.test.utils import get_handle, unit_param, \ quality_param, stress_param from sklearn.ensemble import RandomForestClassifier as skrfc from sklearn.ensemble import RandomForestRegressor as skrfr from sklearn.metrics import accuracy_score from sklearn.datasets import fetch_california_housing, \ make_classification, make_regression from sklearn.model_selection import train_test_split @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize('column_info', [ unit_param([20, 10]), quality_param([200, 100]), stress_param([500, 350]) ]) @pytest.mark.parametrize( 'rows_sample', [unit_param(1.0), quality_param(0.90), stress_param(0.95)]) @pytest.mark.parametrize('datatype', [np.float32]) @pytest.mark.parametrize('split_algo', [0, 1]) @pytest.mark.parametrize('max_features', [1.0, 'auto', 'log2', 'sqrt']) def test_rf_classification(datatype, split_algo, rows_sample, nrows, column_info, max_features):
load_boston from sklearn.model_selection import train_test_split import treelite pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)" "::cuml[.*]") @pytest.fixture( scope="session", params=[ unit_param({"n_samples": 350, "n_features": 20, "n_informative": 10}), quality_param( {"n_samples": 5000, "n_features": 200, "n_informative": 80} ), stress_param( {"n_samples": 500000, "n_features": 400, "n_informative": 180} ), ], ) def small_clf(request): X, y = make_classification( n_samples=request.param["n_samples"], n_features=request.param["n_features"], n_clusters_per_class=1, n_informative=request.param["n_informative"], random_state=123, n_classes=2, )
# NOTE(review): collapsed chunk from a dask make_blobs test module; the
# decorated test function itself begins after the end of this fragment.
# `pytest` is referenced but not imported in the visible portion.
import dask.array as da
import numpy as np
import cupy as cp

from cuml.dask.datasets.blobs import make_blobs
from cuml.dask.common.input_utils import DistributedDataHandler
from cuml.test.utils import unit_param, quality_param, stress_param
from cuml.dask.common.part_utils import _extract_partitions


@pytest.mark.parametrize(
    'nrows', [unit_param(1e3), quality_param(1e5), stress_param(1e6)])
@pytest.mark.parametrize(
    'ncols', [unit_param(10), quality_param(100), stress_param(1000)])
@pytest.mark.parametrize('centers', [10])
@pytest.mark.parametrize("cluster_std", [0.1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "nparts",
    [unit_param(1), unit_param(7), quality_param(100), stress_param(1000)])
@pytest.mark.parametrize("order", ['F', 'C'])
n_features=n_feats, random_state=0) knn_cu = cuKNN() knn_cu.fit(X) ret = knn_cu.kneighbors(X, k, return_distance=False) assert not isinstance(ret, tuple) assert ret.shape == (n_samples, k) ret = knn_cu.kneighbors(X, k, return_distance=True) assert isinstance(ret, tuple) assert len(ret) == 2 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) @pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000), stress_param(70000)]) @pytest.mark.parametrize('n_feats', [unit_param(3), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('k', [unit_param(3), quality_param(30), stress_param(50)]) @pytest.mark.parametrize("metric", valid_metrics()) def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric): X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0) X_index = X[:100] X_search = X[101:] p = 5 # Testing 5-norm of the minkowski metric only knn_sk = skKNN(metric=metric, p=p) # Testing
from cuml.solvers import SGD as cumlSGD from cuml.test.utils import unit_param, quality_param, \ stress_param from sklearn.datasets.samples_generator import make_blobs from sklearn.model_selection import train_test_split from sklearn import datasets @pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive']) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet']) @pytest.mark.parametrize('loss', ['hinge', 'log', 'squared_loss']) @pytest.mark.parametrize( 'name', [unit_param(None), quality_param('iris'), stress_param('blobs')]) def test_svd(datatype, lrate, penalty, loss, name): if name == 'blobs': X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0) X = X.astype(datatype) y = y.astype(datatype) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8) elif name == 'iris': iris = datasets.load_iris() X = (iris.data).astype(datatype) y = (iris.target).astype(datatype)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import pytest from dask.distributed import Client, wait import numpy as np from cuml.test.utils import unit_param, quality_param, stress_param @pytest.mark.mg @pytest.mark.parametrize("nrows", [unit_param(1e3), quality_param(1e5), stress_param(5e6)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize("nclusters", [unit_param(5), quality_param(10), stress_param(50)]) @pytest.mark.parametrize("n_parts", [unit_param(None), quality_param(7), stress_param(50)]) def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster): client = Client(cluster) try: from cuml.dask.cluster import KMeans as cumlKMeans from cuml.dask.datasets import make_blobs
X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions) X_train_df, = dask_utils.persist_across_workers(c, [X_train_df], workers=list(workers)) return X_train_df def _scale_rows(client, nrows): workers = list(client.scheduler_info()['workers'].keys()) n_workers = len(workers) return n_workers * nrows @pytest.mark.parametrize( "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize( "nclusters", [unit_param(5), quality_param(10), stress_param(15)]) @pytest.mark.parametrize( "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)]) @pytest.mark.parametrize( "n_parts", [unit_param(1), unit_param(5), quality_param(7),
from sklearn.metrics import accuracy_score, mean_squared_error from sklearn.datasets import fetch_california_housing, \ make_classification, make_regression from sklearn.model_selection import train_test_split @pytest.fixture(scope="session", params=[ unit_param({ 'n_samples': 350, 'n_features': 20, 'n_informative': 10 }), quality_param({ 'n_samples': 5000, 'n_features': 200, 'n_informative': 80 }), stress_param({ 'n_samples': 500000, 'n_features': 400, 'n_informative': 180 }) ]) def small_clf(request): X, y = make_classification(n_samples=request.param['n_samples'], n_features=request.param['n_features'], n_clusters_per_class=1, n_informative=request.param['n_informative'], random_state=123, n_classes=2)
chunks[-1] += n_samples % n_samples_per_part chunks = tuple(chunks) return da.from_array(np_array, chunks=(chunks, -1)) @pytest.fixture(scope="module", params=[ unit_param({ 'n_samples': 1000, 'n_features': 30, 'n_classes': 5, 'n_targets': 2 }), quality_param({ 'n_samples': 5000, 'n_features': 100, 'n_classes': 12, 'n_targets': 4 }), stress_param({ 'n_samples': 12000, 'n_features': 40, 'n_classes': 5, 'n_targets': 2 }) ]) def dataset(request): X, y = make_multilabel_classification( n_samples=int(request.param['n_samples'] * 1.2), n_features=request.param['n_features'], n_classes=request.param['n_classes'], n_labels=request.param['n_classes'],
cuSVC = cu_svm.SVC(**params) cuSVC.fit(X_train, y_train) sklSVC = svm.SVC(**params) sklSVC.fit(X_train, y_train) compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True) @pytest.mark.parametrize('params', [ {'kernel': 'linear', 'C': 1}, {'kernel': 'rbf', 'C': 1, 'gamma': 1}, {'kernel': 'poly', 'C': 1, 'gamma': 1}, ]) @pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs']) @pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000), stress_param(5000)]) @pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000), stress_param(1000)]) def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols): if (params['kernel'] == 'linear' and dataset in ['gaussian', 'classification2'] and n_rows > 1000 and n_cols >= 1000): # linear kernel will not fit the gaussian dataset, but takes very long return X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols) # Default to numpy for testing with cuml.using_output_type("numpy"): cuSVC = cu_svm.SVC(**params)
import cupy as cp from cuml.linear_model import MBSGDRegressor as cumlMBSGRegressor from cuml.metrics import r2_score from cuml.test.utils import unit_param, quality_param, stress_param from sklearn.linear_model import SGDRegressor from cuml.datasets import make_regression from sklearn.model_selection import train_test_split @pytest.fixture(scope="module", params=[ unit_param([500, 20, 10, np.float32]), unit_param([500, 20, 10, np.float64]), quality_param([5000, 100, 50, np.float32]), quality_param([5000, 100, 50, np.float64]), stress_param([500000, 1000, 500, np.float32]), stress_param([500000, 1000, 500, np.float64]), ], ids=[ '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32', '5000-100-50-f64', '500000-1000-500-f32', '500000-1000-500-f64' ]) def make_dataset(request): nrows, ncols, n_info, datatype = request.param if nrows == 500000 and datatype == np.float64 and \ pytest.max_gpu_memory < 32: if pytest.adapt_stress_test: nrows = nrows * pytest.max_gpu_memory // 32
# NOTE(review): collapsed chunk from an MBSGD regressor test module; the
# cumlMBSGRegressor(...) constructor call is truncated mid-argument-list
# at the end of this fragment.  `np` is used but not imported in the
# visible portion.
import pytest

from cuml.linear_model import MBSGDRegressor as cumlMBSGRegressor
from cuml.metrics import r2_score
from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive'])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet'])
@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                                   stress_param(500000)])
@pytest.mark.parametrize('column_info', [unit_param([20, 10]),
                                         quality_param([100, 50]),
                                         stress_param([1000, 500])])
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows,
                         column_info):
    # column_info packs (n_features, n_informative).
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)
    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
shuffle=False, random_state=random_state, ) cuml_kmeans = cuml.KMeans(init="k-means||", n_clusters=nclusters, random_state=random_state, output_type='numpy') preds = cuml_kmeans.fit_predict(X) assert adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y)) >= 0.99 @pytest.mark.parametrize('name', dataset_names) @pytest.mark.parametrize('nrows', [unit_param(1000), quality_param(5000)]) def test_kmeans_sklearn_comparison(name, nrows): random_state = 12 default_base = { 'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 10, 'n_clusters': 3 } pat = get_pattern(name, nrows)
from cuml.test.utils import unit_param from cuml.test.utils import quality_param from cuml.test.utils import stress_param import dask.array as da from cuml.metrics import adjusted_rand_score from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score from cuml.dask.common.dask_arr_utils import to_dask_cudf @pytest.mark.mg @pytest.mark.parametrize( "nrows", [unit_param(1e3), quality_param(1e5), stress_param(5e6)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize( "nclusters", [unit_param(5), quality_param(10), stress_param(50)]) @pytest.mark.parametrize( "n_parts", [unit_param(None), quality_param(7), stress_param(50)]) @pytest.mark.parametrize("delayed_predict", [True, False]) @pytest.mark.parametrize("input_type", ["dataframe", "array"]) def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict, input_type, client):
quality_param, stress_param import joblib from sklearn import datasets from sklearn.cluster import KMeans from sklearn.datasets.samples_generator import make_blobs from sklearn.manifold.t_sne import trustworthiness from sklearn.metrics import adjusted_rand_score dataset_names = ['iris', 'digits', 'wine', 'blobs'] @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'n_feats', [unit_param(20), quality_param(100), stress_param(1000)]) def test_blobs_cluster(nrows, n_feats): data, labels = datasets.make_blobs(n_samples=nrows, n_features=n_feats, centers=5, random_state=0) embedding = cuUMAP(verbose=False).fit_transform(data, convert_dtype=True) if nrows < 500000: score = adjusted_rand_score(labels, KMeans(5).fit_predict(embedding)) assert score == 1.0
knn_cu = cuKNN() knn_cu.fit(X) ret = knn_cu.kneighbors(X, k, return_distance=False) assert not isinstance(ret, tuple) assert ret.shape == (n_samples, k) ret = knn_cu.kneighbors(X, k, return_distance=True) assert isinstance(ret, tuple) assert len(ret) == 2 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(70000)]) @pytest.mark.parametrize('n_feats', [unit_param(3), stress_param(1000)]) @pytest.mark.parametrize('k', [unit_param(3), stress_param(50)]) @pytest.mark.parametrize("metric", valid_metrics()) def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric): X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0) X_index = X[:100] X_search = X[101:] p = 5 # Testing 5-norm of the minkowski metric only knn_sk = skKNN(metric=metric, p=p) # Testing knn_sk.fit(X_index.get()) D_sk, I_sk = knn_sk.kneighbors(X_search.get(), k)
params = {'alpha': np.logspace(-3, -1, 10)} cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver="eig") assert getattr(cu_clf, 'score', False) sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False) gdf_data = cudf.DataFrame(X_train) gdf_train = cudf.DataFrame(dict(train=y_train)) sk_cu_grid.fit(gdf_data, gdf_train.train) assert sk_cu_grid.best_params_ == {'alpha': 0.1} @pytest.mark.parametrize('nrows', [unit_param(30), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100), stress_param(200)]) @pytest.mark.parametrize('n_info', [unit_param(7), quality_param(50), stress_param(100)]) @pytest.mark.parametrize('datatype', [np.float32]) def test_accuracy(nrows, ncols, n_info, datatype): use_handle = True train_rows = np.int32(nrows*0.8) X, y = make_classification(n_samples=nrows, n_features=ncols, n_clusters_per_class=1, n_informative=n_info, random_state=123, n_classes=5) X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
def generate_dask_array(np_array, n_parts):
    """Wrap *np_array* as a dask array split row-wise into ``n_parts`` chunks.

    The first ``n_parts - 1`` chunks hold ``n_samples // n_parts`` rows
    each; leftover rows are folded into the last chunk.  Columns stay in a
    single chunk (``-1``).
    """
    n_samples = np_array.shape[0]
    n_samples_per_part = int(n_samples / n_parts)
    chunks = [n_samples_per_part] * n_parts
    # Fold the leftover rows into the final chunk so the chunk sizes sum
    # to n_samples.
    # NOTE(review): suspect this should be ``n_samples % n_parts`` — the
    # rows not covered by the equal chunks.  ``n_samples %
    # n_samples_per_part`` differs whenever n_samples_per_part divides
    # n_samples but n_parts does not (e.g. n_samples=10, n_parts=4 gives
    # chunks summing to 8, not 10).  Confirm before changing.
    chunks[-1] += n_samples % n_samples_per_part
    chunks = tuple(chunks)
    return da.from_array(np_array, chunks=(chunks, -1))


# NOTE(review): module-scoped fixture producing a tiered multilabel
# dataset; the body is truncated mid-loop at the end of this fragment.
# Oversampling by 20% presumably leaves headroom for the row filtering
# begun below — confirm against the original module.
@pytest.fixture(
    scope="module",
    params=[
        unit_param({'n_samples': 3000, 'n_features': 30,
                    'n_classes': 5, 'n_targets': 2}),
        quality_param({'n_samples': 8000, 'n_features': 35,
                       'n_classes': 12, 'n_targets': 3}),
        stress_param({'n_samples': 20000, 'n_features': 40,
                      'n_classes': 12, 'n_targets': 4})
    ])
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
        length=request.param['n_targets'])
    new_x = []
    new_y = []
    for i in range(y.shape[0]):
        # Indices of the positive labels for row i.
        a = np.argwhere(y[i] == 1)[:, 0]
        if len(a) >= request.param['n_targets']:
# NOTE(review): collapsed chunk from a multi-GPU DBSCAN test module; the
# body of test_dbscan is truncated after the local aliases at the end of
# this fragment.  The `client` parameter is presumably a dask Client
# fixture — confirm against the original conftest.
import numpy as np
import pytest

from cuml.test.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


@pytest.mark.mg
@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                                   stress_param(500000)])
@pytest.mark.parametrize('ncols', [unit_param(20), quality_param(100),
                                   stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [unit_param("int32"),
                                       unit_param(np.int32),
                                       unit_param("int64"),
                                       unit_param(np.int64),
                                       quality_param("int32"),
                                       stress_param("int32")])
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    n_samples = nrows
    n_feats = ncols