def test_multiclass():
    """ParallelPostFit around a three-class LogisticRegression.

    Fits on dask arrays built from ``sklearn.datasets.make_classification``
    and checks that ``predict`` and ``predict_proba`` of the wrapper return
    ``da.Array`` objects equal to what the wrapped estimator produces.
    """
    features, labels = sklearn.datasets.make_classification(
        n_classes=3, n_informative=4
    )
    features = da.from_array(features, chunks=50)
    labels = da.from_array(labels, chunks=50)

    # ``multi_class`` is passed only when the SK_GE_020 flag is truthy
    # (presumably a scikit-learn >= 0.20 version check -- TODO confirm).
    extra = {"multi_class": "auto"} if SK_GE_020 else {}
    model = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", **extra)
    )
    model.fit(features, labels)

    for method in ("predict", "predict_proba"):
        lazy = getattr(model, method)(features)
        reference = getattr(model.estimator, method)(features)
        assert isinstance(lazy, da.Array)
        assert_eq_ar(lazy, reference)
def test_predict(kind):
    """A wrapped LogisticRegression must predict like an unwrapped one.

    ``kind`` picks the input container: ``"numpy"`` computes the generated
    data eagerly, ``"dask.dataframe"`` converts it with
    ``dd.from_dask_array``; any other value uses the output of
    ``make_classification(chunks=100)`` unchanged.
    """
    features, target = make_classification(chunks=100)
    if kind == "numpy":
        features, target = dask.compute(features, target)
    elif kind == "dask.dataframe":
        features, target = (
            dd.from_dask_array(features),
            dd.from_dask_array(target),
        )

    # Two identically configured estimators: one bare, one wrapped.
    plain = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    wrapped = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    )
    plain.fit(features, target)
    wrapped.fit(features, target)
    assert_estimator_equal(wrapped.estimator, plain)

    labels_wrapped = wrapped.predict(features)
    labels_plain = plain.predict(features)
    assert_eq_ar(labels_wrapped, labels_plain)

    proba_wrapped = wrapped.predict_proba(features)
    proba_plain = plain.predict_proba(features)
    assert_eq_ar(proba_wrapped, proba_plain)
def test_sparse_inputs():
    """Predicting on a dask array backed by a scipy CSR matrix works.

    The computed prediction of the wrapper is compared with the prediction
    of the fitted SGDClassifier on the original sparse matrix.
    """
    sparse_X = csr_matrix((3, 4))
    labels = np.asarray([0, 0, 1], dtype=np.int32)

    fitted = SGDClassifier(tol=1e-3).fit(sparse_X, labels)
    wrapped = ParallelPostFit(fitted)

    # One row per chunk, all four columns in a single chunk.
    chunked_X = da.from_array(sparse_X, chunks=(1, 4))

    from_wrapper = wrapped.predict(chunked_X).compute()
    from_estimator = fitted.predict(sparse_X)
    assert_eq_ar(from_wrapper, from_estimator)
def test_multiclass():
    """Three-class predictions through ParallelPostFit stay lazy and correct.

    Data comes from ``make_classification(chunks=50, ...)``; both
    ``predict`` and ``predict_proba`` must return a ``da.Array`` equal to
    the wrapped estimator's own output.
    """
    data, target = make_classification(
        chunks=50, n_classes=3, n_informative=4
    )
    wrapped = ParallelPostFit(LogisticRegression(random_state=0))
    wrapped.fit(data, target)

    for name in ("predict", "predict_proba"):
        got = getattr(wrapped, name)(data)
        want = getattr(wrapped.estimator, name)(data)
        assert isinstance(got, da.Array)
        assert_eq_ar(got, want)
def test_transform_meta_override():
    """``transform_meta`` lets transform succeed on a value-dependent model.

    A OneHotEncoder fitted on a single categorical column is wrapped twice:
    without ``transform_meta`` the transform of the dask DataFrame is
    expected to raise ``ValueError``; with an explicit float64 meta array
    the result must equal the encoder's output on the pandas frame.
    """
    frame = pd.DataFrame({"cat_s": ["a", "b", "c", "d"]})
    dask_frame = dd.from_pandas(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    encoder.fit(pd.DataFrame(frame))

    # Failure when not providing transform_meta,
    # because the model's output depends on the input values.
    without_meta = ParallelPostFit(encoder)
    with pytest.raises(ValueError):
        without_meta.transform(dask_frame)

    # Success when the output meta is supplied explicitly.
    meta = np.array([[0, 0, 0, 0]], dtype=np.float64)
    with_meta = ParallelPostFit(encoder, transform_meta=meta)

    encoded_lazy = with_meta.transform(dask_frame)
    encoded_eager = encoder.transform(frame)
    assert_eq_ar(encoded_lazy, encoded_eager)
def test_transform(kind):
    """A wrapped PCA must transform like an unwrapped one.

    ``kind`` picks the input container: ``"numpy"`` computes the generated
    data eagerly, ``"dask.dataframe"`` converts it with
    ``dd.from_dask_array``; any other value uses the output of
    ``make_classification(chunks=100)`` unchanged. Both estimators are
    always fitted on computed (in-memory) data.
    """
    features, target = make_classification(chunks=100)
    if kind == "numpy":
        features, target = dask.compute(features, target)
    elif kind == "dask.dataframe":
        features, target = (
            dd.from_dask_array(features),
            dd.from_dask_array(target),
        )

    plain = PCA(random_state=0)
    wrapped = ParallelPostFit(PCA(random_state=0))

    plain.fit(*dask.compute(features, target))
    wrapped.fit(*dask.compute(features, target))
    assert_estimator_equal(wrapped.estimator, plain)

    # The bare estimator sees computed data; the wrapper gets the
    # (possibly lazy) collection directly.
    from_plain = plain.transform(*dask.compute(features))
    from_wrapped = wrapped.transform(features)
    assert_eq_ar(from_plain, from_wrapped)
def test_predict_meta_override():
    """``predict_meta`` lets predict succeed on a value-dependent model.

    A CategoricalNB is fitted on the values 1..4, then the dask DataFrame's
    ``_meta`` is replaced by a frame holding the value 5. Without
    ``predict_meta`` the wrapper's predict is expected to raise
    ``ValueError``; with it, the result must match the bare estimator.
    """
    frame = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    target = np.array([1, 2, 3, 4])

    model = CategoricalNB()
    model.fit(pd.DataFrame(frame), target)

    dask_frame = dd.from_pandas(frame, npartitions=2)
    # Overwrite the meta with a category that was absent during fitting.
    dask_frame._meta = pd.DataFrame({"c_0": [5]})

    # Failure when not providing predict_meta,
    # because the model's output depends on the input values.
    without_meta = ParallelPostFit(model)
    with pytest.raises(ValueError):
        without_meta.predict(dask_frame)

    # Success when providing the meta override.
    with_meta = ParallelPostFit(model, predict_meta=np.array([1]))
    predicted_lazy = with_meta.predict(dask_frame)
    predicted_eager = model.predict(frame)
    assert_eq_ar(predicted_lazy, predicted_eager)
def test_multiclass():
    """Three-class ParallelPostFit check including ``predict_log_proba``.

    The estimator is fitted on computed data, then queried with dask
    arrays. ``predict`` and ``predict_proba`` must return ``da.Array``
    objects equal to the wrapped estimator's output; ``predict_log_proba``
    is compared for equality only.
    """
    features, labels = sklearn.datasets.make_classification(
        n_classes=3, n_informative=4
    )
    features = da.from_array(features, chunks=50)
    labels = da.from_array(labels, chunks=50)

    model = ParallelPostFit(
        LogisticRegression(
            random_state=0, n_jobs=1, solver="lbfgs", multi_class="auto"
        )
    )
    model.fit(*dask.compute(features, labels))

    for method in ("predict", "predict_proba"):
        lazy = getattr(model, method)(features)
        reference = getattr(model.estimator, method)(features)
        assert isinstance(lazy, da.Array)
        assert_eq_ar(lazy, reference)

    # No container-type assertion for the log-probabilities.
    log_lazy = model.predict_log_proba(features)
    log_reference = model.estimator.predict_log_proba(features)
    assert_eq_ar(log_lazy, log_reference)