def single_chunk_classification():
    """Return an ``(X, y)`` pair for classification tests.

    The data comes from ``make_classification`` with ``chunks=100`` and a
    fixed ``random_state=0``. A chunk size of 100 is requested so that both
    ``X`` and ``y`` consist of a single block, which is handy when
    exercising ``partial_fit`` methods.

    Returns
    -------
    tuple
        The two objects produced by ``make_classification``, as ``(X, y)``.
    """
    features, target = make_classification(chunks=100, random_state=0)
    return features, target
def single_chunk_binary_classification():
    """Return an ``(X, y)`` classification pair whose features are 0/1.

    The data comes from ``make_classification`` with ``chunks=100`` and a
    fixed ``random_state=0``. A chunk size of 100 is requested so that both
    ``X`` and ``y`` consist of a single block, which is handy when
    exercising ``partial_fit`` methods. The features are then binarized.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds integer indicators and ``y`` is the
        untouched target from ``make_classification``.
    """
    features, target = make_classification(chunks=100, random_state=0)
    # Boolean mask of the non-zero entries, cast to integers.
    # NOTE(review): ``abs(x) > 0`` is true for every non-zero value, so
    # almost all indicators may end up as 1 -- confirm that this (rather
    # than e.g. ``x > 0``) is the intended binarization.
    nonzero_mask = abs(features) > 0
    binary_features = nonzero_mask.astype(int)
    return binary_features, target
from dask.array.utils import assert_eq
from daskml.datasets import make_classification
from daskml import naive_bayes as nb
from sklearn import naive_bayes as nb_

# Module-level data shared by the tests below: the chunked arrays and their
# materialized counterparts (computed once, here).
X, y = make_classification(chunks=50)
X_ = X.compute()
y_ = y.compute()


def test_smoke():
    """Check that daskml's ``GaussianNB`` agrees with scikit-learn's.

    Both estimators are fit on the same data (chunked ``X``/``y`` for
    daskml, materialized ``X_``/``y_`` for scikit-learn) and their fitted
    attributes and predictions are compared with ``assert_eq``.
    """
    a = nb.GaussianNB()
    b = nb_.GaussianNB()
    a.fit(X, y)
    # Reuse the arrays materialized at module level instead of calling
    # ``.compute()`` a second time.
    b.fit(X_, y_)

    assert_eq(a.class_prior_.compute(), b.class_prior_)
    assert_eq(a.class_count_.compute(), b.class_count_)
    assert_eq(a.theta_.compute(), b.theta_)
    # Newer scikit-learn releases expose the per-class variances as ``var_``
    # (``sigma_`` was deprecated and later removed); older releases only
    # provide ``sigma_``. Support both.
    expected_var = b.var_ if hasattr(b, "var_") else b.sigma_
    assert_eq(a.sigma_.compute(), expected_var)

    assert_eq(a.predict_proba(X).compute(), b.predict_proba(X_))
    assert_eq(a.predict(X).compute(), b.predict(X_))
    assert_eq(a.predict_log_proba(X).compute(), b.predict_log_proba(X_))


class TestBigMultinomialNB(object):
    def test_basic(self, single_chunk_count_classification):
        X, y = single_chunk_count_classification
        a = nb.BigMultinomialNB(classes=[0, 1])
        b = nb_.MultinomialNB()