def test_count_vectorizer(give_vocabulary, distributed):
    """Compare dask-ml's CountVectorizer against scikit-learn's.

    Fits both on the same documents (optionally with a pre-supplied
    vocabulary) and checks that the fitted attributes and the transformed
    outputs agree.  When ``distributed`` is true the checks run inside a
    ``distributed.Client`` context.
    """
    sk_model = sklearn.feature_extraction.text.CountVectorizer()
    docs_bag = db.from_sequence(JUNK_FOOD_DOCS, npartitions=2)
    expected = sk_model.fit_transform(JUNK_FOOD_DOCS)

    if give_vocabulary:
        # Re-create the scikit-learn model with a fixed vocabulary so the
        # dask-ml side can be exercised in pure-transform mode.
        vocabulary = sk_model.vocabulary_
        sk_model = sklearn.feature_extraction.text.CountVectorizer(
            vocabulary=vocabulary)
        expected = sk_model.transform(JUNK_FOOD_DOCS)
    else:
        vocabulary = None

    dask_model = dask_ml.feature_extraction.text.CountVectorizer(
        vocabulary=vocabulary)

    client = Client() if distributed else dummy_context()  # noqa

    if give_vocabulary:
        result = dask_model.transform(docs_bag)
    else:
        result = dask_model.fit_transform(docs_bag)

    def check(dask_result):
        # Every dask-ml result must be a lazy dask array backed by sparse
        # CSR chunks that densifies to the scikit-learn output.
        assert isinstance(dask_result, da.Array)
        assert isinstance(dask_result._meta, scipy.sparse.csr_matrix)
        np.testing.assert_array_equal(
            expected.toarray(), dask_result.compute().toarray())

    with client:
        exclude = {"vocabulary_actor_", "stop_words_"}
        if give_vocabulary:
            # In scikit-learn, `.transform()` sets these.
            # This looks buggy.
            exclude |= {"vocabulary_", "fixed_vocabulary_"}
        assert_estimator_equal(sk_model, dask_model, exclude=exclude)

        check(result)
        check(dask_model.transform(docs_bag))
        if give_vocabulary:
            check(dask_model.fit_transform(docs_bag))
def test_pairwise_distances_argmin_min(X_blobs):
    """Check dask-ml's pairwise_distances_argmin_min against scikit-learn's.

    NOTE(review): another definition with this exact name appears later in
    the file and shadows this one, so pytest never collects it — confirm
    which version is intended.
    """
    centers = X_blobs[::100].compute()

    if SK_VERSION >= packaging.version.parse("0.20.0.dev0"):
        # X_blobs has 500 rows per block.
        # Ensure 500 rows in the scikit-learn version too.
        context = sklearn.config_context(working_memory=80 * 500 / 2 ** 20)
    else:
        context = dummy_context()

    with context:
        expected_idx, expected_dist = sm.pairwise_distances_argmin_min(
            X_blobs.compute(), centers)
        got_idx, got_dist = dm.pairwise_distances_argmin_min(X_blobs, centers)
        got_idx, got_dist = dask.compute(got_idx, got_dist)
        npt.assert_array_equal(got_idx, expected_idx)
        npt.assert_array_equal(got_dist, expected_dist)
def test_pairwise_distances_argmin_min(X_blobs):
    """Check dask-ml's pairwise_distances_argmin_min against scikit-learn's.

    Runs the reference computation under a scikit-learn ``working_memory``
    setting (when supported) so both sides process 500-row chunks.
    """
    centers = X_blobs[::100].compute()

    if SK_GE_020:
        # X_blobs has 500 rows per block.
        # Ensure 500 rows in the scikit-learn version too.
        context = sklearn.config_context(working_memory=80 * 500 / 2 ** 20)
    else:
        context = dummy_context()

    with context:
        expected_idx, expected_dist = sklearn.metrics.pairwise_distances_argmin_min(
            X_blobs.compute(), centers
        )
        got_idx, got_dist = dask_ml.metrics.pairwise_distances_argmin_min(
            X_blobs, centers
        )
        got_idx, got_dist = dask.compute(got_idx, got_dist)
        npt.assert_array_equal(got_idx, expected_idx)
        npt.assert_array_equal(got_dist, expected_dist)