def test_categorical_parameters(class_prior, alpha, fit_prior, is_sparse,
                                nlp_20news):
    """Check cuML CategoricalNB against scikit-learn for given settings.

    ``class_prior`` may be the string ``'balanced'`` or ``'unbalanced'``, in
    which case it is expanded to a 20-entry prior vector; any other value is
    forwarded unchanged to both estimators. Both models are fitted on the
    same 2000 x 500 slice of the ``nlp_20news`` fixture, and their log
    probabilities and predicted labels must agree.
    """
    n_rows, n_cols = 2000, 500

    data, target = nlp_20news
    data = sparse_scipy_to_cp(data, cp.float32).tocsr()
    data = data[:n_rows, :n_cols]
    if not is_sparse:
        data = data.todense()
    target = target.astype(cp.int32)[:n_rows]

    # Expand the symbolic prior names into explicit probability vectors.
    if class_prior == 'balanced':
        class_prior = np.full(20, 1 / 20)
    elif class_prior == 'unbalanced':
        class_prior = np.linspace(0.01, 0.09, 20)

    params = dict(class_prior=class_prior, alpha=alpha, fit_prior=fit_prior)
    cu_model = CategoricalNB(**params)
    sk_model = skCNB(**params)

    cu_model.fit(data, target)
    cu_labels = cu_model.predict(data).get()
    cu_log_proba = cu_model.predict_log_proba(data).get()

    # The scikit-learn model is always given a dense host array; a sparse
    # device matrix is densified before being copied to the host.
    if is_sparse:
        host_data = data.todense().get()
    else:
        host_data = data.get()

    sk_model.fit(host_data, target.get())
    sk_labels = sk_model.predict(host_data)
    sk_log_proba = sk_model.predict_log_proba(host_data)

    assert_allclose(cu_log_proba, sk_log_proba, rtol=1e-4)
    assert_array_equal(cu_labels, sk_labels)
def test_basic_fit_predict_sparse(x_dtype, y_dtype, nlp_20news):
    """Fit MultinomialNB on sparse device input and check train accuracy.

    The features are converted to a CuPy sparse matrix of ``x_dtype`` and the
    labels are cast to ``y_dtype``. The accuracy of the predictions on the
    training data must be at least 0.924.
    """
    features, labels = nlp_20news
    features = sparse_scipy_to_cp(features, x_dtype).astype(x_dtype)
    labels = labels.astype(y_dtype)

    # Warm-up fit: priming seems to lower the end-to-end runtime of the
    # profiled fit below.
    MultinomialNB().fit(features, labels)
    cp.cuda.Stream.null.synchronize()

    # NOTE(review): `cp.prof` is deprecated in recent CuPy releases in favour
    # of `cupyx.profiler` -- confirm against the CuPy version pinned here.
    with cp.prof.time_range(message="start", color_id=10):
        clf = MultinomialNB()
        clf.fit(features, labels)

    predicted = cp.asnumpy(clf.predict(features))
    expected = cp.asnumpy(labels)

    assert accuracy_score(expected, predicted) >= 0.924
def test_categorical(x_dtype, y_dtype, is_sparse, nlp_20news):
    """Compare a default cuML CategoricalNB with scikit-learn's.

    Both estimators are fitted on the same 2000 x 500 slice of the
    ``nlp_20news`` fixture. Class counts must match exactly, class log priors
    and log probabilities within tolerance, and the scores within 1e-3.
    The sparse/int32 combination is skipped.
    """
    if is_sparse and x_dtype == cp.int32:
        pytest.skip("Sparse matrices with integers dtype are not supported")

    n_rows, n_cols = 2000, 500
    tolerance = 1e-3

    data, target = nlp_20news
    data = sparse_scipy_to_cp(data, dtype=cp.float32).tocsr()
    data = data[:n_rows, :n_cols]
    target = target.astype(y_dtype)[:n_rows]

    if not is_sparse:
        data = data.todense()
    data = data.astype(x_dtype)

    cu_clf = CategoricalNB()
    cu_clf.fit(data, target)
    cu_score = cu_clf.score(data, target)
    cu_log_proba = cu_clf.predict_log_proba(data).get()

    # scikit-learn gets a dense host copy of the very same data.
    host_data = data.todense().get() if is_sparse else data.get()
    host_target = target.get()

    sk_clf = skCNB()
    sk_clf.fit(host_data, host_target)
    sk_score = sk_clf.score(host_data, host_target)
    sk_log_proba = sk_clf.predict_log_proba(host_data)

    assert_array_equal(sk_clf.class_count_, cu_clf.class_count_.get())
    assert_allclose(sk_clf.class_log_prior_,
                    cu_clf.class_log_prior_.get(),
                    rtol=1e-6)
    assert_allclose(cu_log_proba, sk_log_proba, atol=1e-2, rtol=1e-2)

    lowest, highest = sk_score - tolerance, sk_score + tolerance
    assert lowest <= cu_score <= highest
def test_gaussian_parameters(priors, var_smoothing, nlp_20news):
    """Check cuML GaussianNB against scikit-learn for given settings.

    ``priors`` may be ``'balanced'`` or ``'unbalanced'`` (expanded to a
    20-entry device vector) or ``None``. Both estimators are fitted on the
    first 150 rows of the ``nlp_20news`` fixture, densified; their
    ``epsilon_`` attributes and predicted labels must agree.
    """
    n_rows = 150

    data, target = nlp_20news
    data = sparse_scipy_to_cp(data[:n_rows], cp.float32).todense()
    target = target.astype(cp.int32)[:n_rows]

    if priors == 'balanced':
        priors = cp.full(20, 1 / 20)
    elif priors == 'unbalanced':
        priors = cp.linspace(0.01, 0.09, 20)

    # scikit-learn needs the priors on the host; None is passed through.
    host_priors = None if priors is None else priors.get()

    cu_model = GaussianNB(priors=priors, var_smoothing=var_smoothing)
    sk_model = skGNB(priors=host_priors, var_smoothing=var_smoothing)

    host_data = data.get()
    cu_model.fit(data, target)
    sk_model.fit(host_data, target.get())

    cu_labels = cp.asnumpy(cu_model.predict(data))
    sk_labels = sk_model.predict(host_data)

    assert_allclose(cu_model.epsilon_.get(), sk_model.epsilon_, rtol=1e-4)
    assert_array_equal(cu_labels, sk_labels)
def test_gaussian_fit_predict(x_dtype, y_dtype, is_sparse, nlp_20news):
    """Fit GaussianNB on sparse device or dense host input.

    When ``is_sparse`` is true the model is fitted on the CuPy sparse matrix
    directly; otherwise it is fitted on a C-contiguous NumPy copy of the
    densified data. Prediction is always run on the device data, and the
    training accuracy must reach 0.99.
    """
    row_limit = 500
    col_limit = int(2e5)

    data, target = nlp_20news
    clf = GaussianNB()

    data = sparse_scipy_to_cp(data, x_dtype).tocsr()
    data = data[:row_limit, :col_limit]
    target = target[:row_limit].astype(y_dtype)

    if is_sparse:
        clf.fit(data, target)
    else:
        data = data.todense()
        host_data = np.ascontiguousarray(cp.asnumpy(data).astype(x_dtype))
        clf.fit(host_data, target)

    predicted = cp.asnumpy(clf.predict(data))
    expected = cp.asnumpy(target)

    assert accuracy_score(expected, predicted) >= 0.99
def test_bernoulli(x_dtype, y_dtype, is_sparse, nlp_20news):
    """Compare a default cuML BernoulliNB with scikit-learn's.

    Both estimators are fitted on the first 500 rows of the ``nlp_20news``
    fixture (sparse or densified). Class counts must match exactly, class
    log priors and log probabilities within tolerance, and the scores
    within 1e-3.
    """
    row_limit = 500
    tolerance = 1e-3

    data, target = nlp_20news
    data = sparse_scipy_to_cp(data, x_dtype).astype(x_dtype)
    data = data.tocsr()[:row_limit]
    target = target.astype(y_dtype)[:row_limit]

    if not is_sparse:
        data = data.todense()

    # Host copies handed to scikit-learn.
    host_data = data.get()
    host_target = target.get()

    sk_clf = skBNB()
    cu_clf = BernoulliNB()

    sk_clf.fit(host_data, host_target)
    cu_clf.fit(data, target)

    sk_score = sk_clf.score(host_data, host_target)
    cu_score = cu_clf.score(data, target)

    cu_log_proba = cu_clf.predict_log_proba(data).get()
    sk_log_proba = sk_clf.predict_log_proba(host_data)

    assert_array_equal(sk_clf.class_count_, cu_clf.class_count_.get())
    assert_allclose(sk_clf.class_log_prior_,
                    cu_clf.class_log_prior_.get(),
                    rtol=1e-6)
    assert_allclose(cu_log_proba, sk_log_proba, atol=1e-2, rtol=1e-2)

    lowest, highest = sk_score - tolerance, sk_score + tolerance
    assert lowest <= cu_score <= highest
def test_gaussian_partial_fit(nlp_20news):
    """Fit GaussianNB incrementally and check accuracy and validation.

    The first 1500 rows of the ``nlp_20news`` fixture are fed to
    ``partial_fit`` in chunks of 250 rows. Predictions on the full slice must
    reach an accuracy of 0.99. Afterwards two error paths are checked:
    ``partial_fit`` must raise ``ValueError`` when ``classes`` does not match
    the labels, and when ``classes`` is omitted on the first call.
    """
    chunk_size = 250
    n_rows = 1500
    x_dtype, y_dtype = cp.float32, cp.int32

    X, y = nlp_20news

    X = sparse_scipy_to_cp(X, x_dtype).tocsr()[:n_rows]
    y = y.astype(y_dtype)[:n_rows]

    model = GaussianNB()
    classes = np.unique(y)

    # Walk over the rows in fixed-size windows; the stop index is clamped so
    # the final (possibly shorter) chunk never reaches past the data.
    n_samples = X.shape[0]
    for start in range(0, n_samples, chunk_size):
        stop = min(start + chunk_size, n_samples)
        model.partial_fit(X[start:stop], y[start:stop], classes=classes)

    y_hat = cp.asnumpy(model.predict(X))
    y = cp.asnumpy(y)
    assert accuracy_score(y, y_hat) >= 0.99

    # Test whether label mismatch between target y and classes raises an Error
    assert_raises(ValueError, GaussianNB().partial_fit, X, y,
                  classes=cp.array([0, 1]))
    # Raise because classes is required on first call of partial_fit
    assert_raises(ValueError, GaussianNB().partial_fit, X, y)
def test_naive_bayes(nlp_20news):
    """Run the MultinomialNB API end to end under ``dummy_allocator``.

    Fit, predict (twice), predict_proba, predict_log_proba and score are all
    executed while ``cupy_using_allocator(dummy_allocator)`` is active. The
    test has no explicit assertion; it passes when none of the calls raise.
    NOTE(review): presumably ``dummy_allocator`` fails on any allocation made
    through CuPy's allocator -- confirm against its definition.
    """
    features, labels = nlp_20news

    features = sparse_scipy_to_cp(features, cp.float32).astype(cp.float32)
    labels = labels.astype(cp.int32)

    with cupy_using_allocator(dummy_allocator):
        clf = MultinomialNB()
        clf.fit(features, labels)

        # predict is deliberately invoked twice, then the probability
        # variants; each result replaces the previous one.
        inference_calls = (
            clf.predict,
            clf.predict,
            clf.predict_proba,
            clf.predict_log_proba,
        )
        for infer in inference_calls:
            result = infer(features)
        result = clf.score(features, labels)

        del result
def test_partial_fit(x_dtype, y_dtype, nlp_20news):
    """Fit MultinomialNB incrementally and check the training accuracy.

    The whole ``nlp_20news`` fixture is fed to ``partial_fit`` in chunks of
    500 rows, with the full set of classes supplied on every call.
    Predictions on the complete data must reach an accuracy of 0.924.
    """
    chunk_size = 500

    X, y = nlp_20news

    X = sparse_scipy_to_cp(X, x_dtype).astype(x_dtype)
    y = y.astype(y_dtype)

    X = X.tocsr()

    model = MultinomialNB()
    classes = np.unique(y)

    # Walk over the rows in fixed-size windows; the stop index is clamped so
    # the final (possibly shorter) chunk never reaches past the data.
    n_samples = X.shape[0]
    for start in range(0, n_samples, chunk_size):
        stop = min(start + chunk_size, n_samples)
        model.partial_fit(X[start:stop], y[start:stop], classes=classes)

    y_hat = cp.asnumpy(model.predict(X))
    y = cp.asnumpy(y)

    assert accuracy_score(y, y_hat) >= 0.924
def test_basic_fit_predict_dense_numpy(x_dtype, y_dtype, nlp_20news):
    """Fit MultinomialNB on dense NumPy input and check train accuracy.

    The first 500 rows of the ``nlp_20news`` fixture are densified; the model
    is fitted on a C-contiguous host copy cast to ``x_dtype`` and then asked
    to predict on the dense device array. The accuracy must be at least
    0.911.
    """
    X, y = nlp_20news

    X = sparse_scipy_to_cp(X, cp.float32)
    y = y.astype(y_dtype)

    X = X.tocsr()[0:500].todense()
    y = y[:500]

    model = MultinomialNB()
    model.fit(np.ascontiguousarray(cp.asnumpy(X).astype(x_dtype)), y)

    y_hat = model.predict(X)

    y_hat = cp.asnumpy(y_hat)
    y = cp.asnumpy(y)

    # The comparison result used to be discarded; it must be asserted for
    # the accuracy threshold to be enforced at all.
    assert accuracy_score(y, y_hat) >= 0.911
def test_multinomial_basic_fit_predict_dense_numpy(x_dtype, y_dtype,
                                                   nlp_20news):
    """Compare cuML MultinomialNB predictions with scikit-learn's.

    The cuML model is fitted on a dense, C-contiguous NumPy copy (cast to
    ``x_dtype``) of the first 500 rows of the ``nlp_20news`` fixture, and the
    scikit-learn model on the host copy of the same sparse rows. The labels
    predicted by the two models must match.
    """
    X, y = nlp_20news
    n_rows = 500

    X = sparse_scipy_to_cp(X, cp.float32).tocsr()[:n_rows]
    y = y[:n_rows].astype(y_dtype)

    model = MultinomialNB()
    model.fit(np.ascontiguousarray(cp.asnumpy(X.todense()).astype(x_dtype)),
              y)

    y_hat = model.predict(X).get()

    modelsk = skNB()
    modelsk.fit(X.get(), y.get())
    # Predict with the scikit-learn reference model. Using the cuML model
    # here would only compare cuML against itself.
    y_sk = modelsk.predict(X.get())

    assert_allclose(y_hat, y_sk)
def test_categorical_partial_fit(x_dtype, y_dtype, is_sparse, nlp_20news):
    """Fit CategoricalNB incrementally and compare to a recorded score.

    The first 5000 rows of the ``nlp_20news`` fixture are fed to
    ``partial_fit`` in chunks of 1000 rows. The sparse path keeps every
    column and expects a score of 0.5414; the dense path keeps the first 500
    columns and expects 0.1040. The score must match within 1e-4. The
    sparse/int32 combination is skipped.
    """
    if is_sparse and x_dtype == cp.int32:
        pytest.skip("Sparse matrices with integers dtype are not supported")

    row_limit = 5000
    col_limit = 500
    batch = 1000
    tolerance = 1e-4

    data, target = nlp_20news
    data = sparse_scipy_to_cp(data, 'float32').tocsr()[:row_limit]

    if is_sparse:
        # Only the stored values are cast; the matrix stays sparse.
        data.data = data.data.astype(x_dtype)
        expected_score = 0.5414
    else:
        data = data[:, :col_limit].todense().astype(x_dtype)
        expected_score = 0.1040
    target = target.astype(y_dtype)[:row_limit]

    clf = CategoricalNB()
    labels = np.unique(target)

    # Feed consecutive row windows; the last one is clamped to the data end.
    total = data.shape[0]
    for begin in range(0, total, batch):
        end = min(begin + batch, total)
        clf.partial_fit(data[begin:end], target[begin:end], classes=labels)

    score = clf.score(data, target)

    lowest, highest = expected_score - tolerance, expected_score + tolerance
    assert lowest <= score <= highest
def test_predict_log_proba(x_dtype, y_dtype, nlp_20news):
    """Compare MultinomialNB log probabilities between cuML and sklearn.

    cuML is fitted on the CSR device matrix cast to ``x_dtype`` with labels
    cast to ``y_dtype``; scikit-learn is fitted on the original fixture
    features and the host copy of the labels. The log probabilities must
    agree within absolute and relative tolerances of 1e-2.
    """
    host_X, labels = nlp_20news

    dev_X = sparse_scipy_to_cp(host_X, x_dtype).astype(x_dtype).tocsr()
    dev_y = labels.astype(y_dtype)
    host_y = labels.get()

    cu_clf = MultinomialNB()
    sk_clf = skNB()

    cu_clf.fit(dev_X, dev_y)
    sk_clf.fit(host_X, host_y)

    cu_log_proba = cu_clf.predict_log_proba(dev_X).get()
    sk_log_proba = sk_clf.predict_log_proba(host_X)

    assert_allclose(cu_log_proba, sk_log_proba, atol=1e-2, rtol=1e-2)
def test_bernoulli_partial_fit(x_dtype, y_dtype, nlp_20news):
    """Fit BernoulliNB incrementally in cuML and sklearn and compare.

    The first 1500 rows of the ``nlp_20news`` fixture are fed to both
    estimators' ``partial_fit`` in chunks of 500 rows; scikit-learn receives
    host copies of each chunk. The labels predicted on the full slice must
    match.
    """
    batch = 500
    row_limit = 1500

    data, target = nlp_20news
    data = sparse_scipy_to_cp(data, x_dtype).astype(x_dtype)
    target = target.astype(y_dtype)[:row_limit]
    data = data.tocsr()[:row_limit]

    cu_clf = BernoulliNB()
    sk_clf = skBNB()

    labels = np.unique(target)
    host_labels = labels.get()

    # Feed consecutive row windows; the last one is clamped to the data end.
    total = data.shape[0]
    for begin in range(0, total, batch):
        end = min(begin + batch, total)
        chunk_X = data[begin:end]
        chunk_y = target[begin:end]
        cu_clf.partial_fit(chunk_X, chunk_y, classes=labels)
        sk_clf.partial_fit(chunk_X.get(), chunk_y.get(), classes=host_labels)

    cu_pred = cu_clf.predict(data).get()
    sk_pred = sk_clf.predict(data.get())

    assert_allclose(cu_pred, sk_pred)
def test_score(x_dtype, y_dtype, nlp_20news):
    """Compare the MultinomialNB score between cuML and scikit-learn.

    cuML is fitted and scored on the CSR device matrix cast to ``x_dtype``
    with labels cast to ``y_dtype``; scikit-learn uses the original fixture
    features and the host copy of the labels. The two scores must differ by
    no more than 1e-4.
    """
    tolerance = 1e-4

    host_X, labels = nlp_20news

    dev_X = sparse_scipy_to_cp(host_X, x_dtype).astype(x_dtype).tocsr()
    dev_y = labels.astype(y_dtype)
    host_y = labels.get()

    cu_clf = MultinomialNB()
    sk_clf = skNB()

    cu_clf.fit(dev_X, dev_y)
    sk_clf.fit(host_X, host_y)

    cu_score = cu_clf.score(dev_X, dev_y)
    sk_score = sk_clf.score(host_X, host_y)

    lowest, highest = sk_score - tolerance, sk_score + tolerance
    assert lowest <= cu_score <= highest