Example #1
import dask.array as da
import dask.bag as db
import numpy as np
import pytest
import scipy.sparse
import sklearn.feature_extraction.text
from distributed import Client

import dask_ml.feature_extraction.text
from dask_ml.utils import assert_estimator_equal

# JUNK_FOOD_DOCS (a small tuple of sample documents) and dummy_context
# (a no-op context manager) are helpers defined in the test module.


@pytest.mark.parametrize("distributed", [True, False])
@pytest.mark.parametrize("give_vocabulary", [True, False])
def test_count_vectorizer(give_vocabulary, distributed):
    m1 = sklearn.feature_extraction.text.CountVectorizer()
    b = db.from_sequence(JUNK_FOOD_DOCS, npartitions=2)
    r1 = m1.fit_transform(JUNK_FOOD_DOCS)

    if give_vocabulary:
        vocabulary = m1.vocabulary_
        m1 = sklearn.feature_extraction.text.CountVectorizer(
            vocabulary=vocabulary)
        r1 = m1.transform(JUNK_FOOD_DOCS)
    else:
        vocabulary = None

    m2 = dask_ml.feature_extraction.text.CountVectorizer(vocabulary=vocabulary)

    if distributed:
        client = Client()  # noqa
    else:
        client = dummy_context()

    if give_vocabulary:
        r2 = m2.transform(b)
    else:
        r2 = m2.fit_transform(b)

    with client:
        exclude = {"vocabulary_actor_", "stop_words_"}
        if give_vocabulary:
            # In scikit-learn, `.transform()` sets these.
            # This looks buggy.
            exclude |= {"vocabulary_", "fixed_vocabulary_"}

        assert_estimator_equal(m1, m2, exclude=exclude)
        assert isinstance(r2, da.Array)
        assert isinstance(r2._meta, scipy.sparse.csr_matrix)
        np.testing.assert_array_equal(r1.toarray(), r2.compute().toarray())

        r3 = m2.transform(b)
        assert isinstance(r3, da.Array)
        assert isinstance(r3._meta, scipy.sparse.csr_matrix)
        np.testing.assert_array_equal(r1.toarray(), r3.compute().toarray())

        if give_vocabulary:
            r4 = m2.fit_transform(b)
            assert isinstance(r4, da.Array)
            assert isinstance(r4._meta, scipy.sparse.csr_matrix)
            np.testing.assert_array_equal(r1.toarray(), r4.compute().toarray())
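Stripped of the test scaffolding, the pattern this test exercises is small: fit a dask-ml CountVectorizer on a dask bag of documents and materialize the sparse counts. A minimal sketch, assuming a local scheduler (the toy corpus here is made up for illustration):

import dask.bag as db
import dask_ml.feature_extraction.text

docs = ["coffee and bagels", "tea and toast", "coffee and toast"]  # toy corpus
bag = db.from_sequence(docs, npartitions=2)

vectorizer = dask_ml.feature_extraction.text.CountVectorizer()
counts = vectorizer.fit_transform(bag)   # lazy dask array with sparse blocks
print(counts.compute().toarray())        # materialized term-count matrix
print(sorted(vectorizer.vocabulary_))    # vocabulary learned during fit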
Example #2
import dask
import numpy.testing as npt
import packaging.version
import sklearn
import sklearn.metrics as sm

import dask_ml.metrics as dm

# X_blobs is a pytest fixture (a chunked dask array of blob data) and
# dummy_context is a no-op context manager from the test helpers.
# SK_VERSION is the installed scikit-learn version:
SK_VERSION = packaging.version.parse(sklearn.__version__)


def test_pairwise_distances_argmin_min(X_blobs):
    centers = X_blobs[::100].compute()

    if SK_VERSION >= packaging.version.parse("0.20.0.dev0"):
        # X_blobs has 500 rows per block; pick a working_memory budget (in MiB)
        # that makes scikit-learn chunk in 500-row blocks as well. Each row of
        # the float64 distance matrix against the ~10 centers is roughly 80
        # bytes, so 80 * 500 bytes, expressed in MiB, is the budget.
        working_memory = 80 * 500 / 2**20

        ctx = sklearn.config_context(working_memory=working_memory)
    else:
        ctx = dummy_context()

    with ctx:
        a_, b_ = sm.pairwise_distances_argmin_min(X_blobs.compute(), centers)
        a, b = dm.pairwise_distances_argmin_min(X_blobs, centers)
        a, b = dask.compute(a, b)

    npt.assert_array_equal(a, a_)
    npt.assert_array_equal(b, b_)
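The comparison the test makes can be written out directly: dask-ml's blocked implementation against scikit-learn's on the materialized data. A minimal sketch, assuming a random dask array in place of the X_blobs fixture:

import dask
import dask.array as da
import dask_ml.metrics
import sklearn.metrics

X = da.random.random((1000, 4), chunks=(500, 4))  # two 500-row blocks
centers = X[::100].compute()                      # ten candidate centers

# dask-ml computes block-wise and returns lazy dask arrays
a, b = dask_ml.metrics.pairwise_distances_argmin_min(X, centers)
a, b = dask.compute(a, b)

# scikit-learn baseline on the materialized data
a_, b_ = sklearn.metrics.pairwise_distances_argmin_min(X.compute(), centers)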
Example #3
import dask
import numpy.testing as npt
import packaging.version
import sklearn
import sklearn.metrics

import dask_ml.metrics

# X_blobs and dummy_context are test helpers, as in Example #2.
# Flag for scikit-learn >= 0.20, where `working_memory` is configurable:
SK_GE_020 = packaging.version.parse(sklearn.__version__) >= packaging.version.parse(
    "0.20.0.dev0"
)


def test_pairwise_distances_argmin_min(X_blobs):
    centers = X_blobs[::100].compute()

    if SK_GE_020:
        # X_blobs has 500 rows per block.
        # Ensure 500 rows in the scikit-learn version too.
        working_memory = 80 * 500 / 2 ** 20

        ctx = sklearn.config_context(working_memory=working_memory)
    else:
        ctx = dummy_context()

    with ctx:
        a_, b_ = sklearn.metrics.pairwise_distances_argmin_min(
            X_blobs.compute(), centers
        )
        a, b = dask_ml.metrics.pairwise_distances_argmin_min(X_blobs, centers)
        a, b = dask.compute(a, b)

    npt.assert_array_equal(a, a_)
    npt.assert_array_equal(b, b_)
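Both variants rely on the same idiom: enter a real configuration context on newer scikit-learn, and a do-nothing one elsewhere. `dummy_context` is a test helper; on Python 3.7+ the standard library's contextlib.nullcontext is an equivalent stand-in, as in this sketch:

import contextlib

import packaging.version
import sklearn

new_enough = packaging.version.parse(sklearn.__version__) >= packaging.version.parse(
    "0.20.0.dev0"
)
if new_enough:
    ctx = sklearn.config_context(working_memory=80 * 500 / 2**20)
else:
    ctx = contextlib.nullcontext()  # no-op stand-in for dummy_context

with ctx:
    pass  # run the scikit-learn computation under the chosen settings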