import dask
import pytest

# `make_classification`, `DoNothingTransformer`, `LogisticRegression`, and
# `to_dask_cupy_array_xy` are fixtures/helpers from this test module;
# hedged sketches of some of them appear below, after the tests that use them.


def test_gridsearch():
    from sklearn.pipeline import make_pipeline

    dcv = pytest.importorskip('dask_searchcv')

    X, y = make_classification(n_samples=100, n_features=5, chunksize=10)
    grid = {'logisticregression__lamduh': [0.001, 0.01, 0.1, 0.5]}

    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    search = dcv.GridSearchCV(pipe, grid, cv=3)
    search.fit(X, y)
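# A minimal sketch of the DoNothingTransformer stub the pipeline tests rely
# on. Only the name appears in the original tests; this pass-through
# implementation is an assumption, not the original helper.
import sklearn.base


class DoNothingTransformer(sklearn.base.BaseEstimator,
                           sklearn.base.TransformerMixin):
    def fit(self, X, y=None):
        # Nothing to learn; just satisfy the transformer contract.
        return self

    def transform(self, X):
        # Pass the data through unchanged.
        return X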
def test_big(fit_intercept, is_sparse, is_cupy):
    with dask.config.set(scheduler='synchronous'):
        X, y = make_classification(is_sparse=is_sparse)
        if is_cupy and not is_sparse:
            cupy = pytest.importorskip('cupy')
            X, y = to_dask_cupy_array_xy(X, y, cupy)
        lr = LogisticRegression(fit_intercept=fit_intercept)
        lr.fit(X, y)
        lr.predict(X)
        lr.predict_proba(X)
        if fit_intercept:
            assert lr.intercept_ is not None
def test_fit(fit_intercept, is_sparse, is_cupy):
    X, y = make_classification(n_samples=100, n_features=5,
                               chunksize=10, is_sparse=is_sparse)
    if is_cupy and not is_sparse:
        cupy = pytest.importorskip('cupy')
        X, y = to_dask_cupy_array_xy(X, y, cupy)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
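# A minimal sketch of the to_dask_cupy_array_xy helper used above: move
# NumPy-backed Dask arrays onto the GPU block by block. Only the call
# signature is visible in these tests; this body is an assumption based on
# the arr.map_blocks(cp.asarray) idiom the estimator itself recommends.
def to_dask_cupy_array_xy(X, y, cupy):
    return X.map_blocks(cupy.asarray), y.map_blocks(cupy.asarray)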
def test_in_pipeline():
    from sklearn.pipeline import make_pipeline

    X, y = make_classification(n_samples=100, n_features=5, chunksize=10)
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    pipe.fit(X, y)
def test_lr_init(solver):
    LogisticRegression(solver=solver)
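# A hedged sketch of the make_classification fixture these tests assume:
# wrap scikit-learn's generator in Dask arrays with the requested chunking.
# Only the signature is visible above; the defaults and the sparse path
# here are assumptions.
import dask.array as da
import scipy.sparse
import sklearn.datasets


def make_classification(n_samples=1000, n_features=20, chunksize=100,
                        is_sparse=False):
    X, y = sklearn.datasets.make_classification(n_samples=n_samples,
                                                n_features=n_features)
    X = da.from_array(X, chunks=(chunksize, n_features))
    y = da.from_array(y, chunks=chunksize)
    if is_sparse:
        # Swap each dense chunk for a scipy.sparse one.
        X = X.map_blocks(scipy.sparse.csr_matrix)
    return X, y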
import numpy as np

import cupy as cp
import cudf
from dask.utils import is_arraylike, is_dataframe_like, is_series_like

# BaseEstimator, with_cupy_rmm, and has_daskglm are assumed to come from
# cuml's Dask utilities elsewhere in this package.


class LogisticRegression(BaseEstimator):
    """
    Distributed Logistic Regression for binary classification.

    Parameters
    ----------
    fit_intercept : boolean (default = True)
        If True, the model tries to correct for the global mean of y.
        If False, the model expects that you have centered the data.
    solver : 'admm'
        Solver to use. Only 'admm' is currently supported.
    penalty : {'l1', 'l2', 'elastic_net'} (default = 'l2')
        Regularization technique for the solver.
    C : float (default = 1.0)
        Inverse of regularization strength; must be a positive float.
    max_iter : int (default = 100)
        Maximum number of iterations taken for the solver to converge.
    verbose : int or boolean (default = False)
        Sets logging level. It must be one of `cuml.common.logger.level_*`.
        See :ref:`verbosity-levels` for more info.

    Attributes
    ----------
    coef_ : device array (n_features, 1)
        The estimated coefficients for the logistic regression model.
    intercept_ : device array (1,)
        The independent term. If `fit_intercept` is False, will be 0.
    solver : string
        Algorithm used in the optimization process. Currently only
        'admm' is supported.

    Notes
    -----
    This estimator is a wrapper class around Dask-GLM's Logistic
    Regression estimator. Several methods in this wrapper class duplicate
    code from Dask-GLM to create a user-friendly namespace.
    """

    def __init__(self, *, client=None, fit_intercept=True, solver="admm",
                 penalty="l2", C=1.0, max_iter=100, verbose=False, **kwargs):
        super().__init__(client=client, verbose=verbose, **kwargs)

        if not has_daskglm("0.2.1.dev"):
            raise ImportError(
                "dask-glm >= 0.2.1.dev was not found, please install it"
                " to use multi-GPU logistic regression."
            )
        from dask_glm.estimators import (
            LogisticRegression as LogisticRegressionGLM,
        )

        self.fit_intercept = fit_intercept
        self.solver = solver
        self.penalty = penalty
        self.C = C
        self.max_iter = max_iter

        if self.penalty not in ("l2", "l1", "elastic_net"):
            raise ValueError(
                "Only l2, l1, and elastic_net penalties are"
                " currently supported."
            )

        self.solver_model = LogisticRegressionGLM(
            solver=self.solver,
            fit_intercept=self.fit_intercept,
            regularizer=self.penalty,
            max_iter=self.max_iter,
            lamduh=1 / self.C,  # dask-glm's lamduh is the inverse of C
        )

    @with_cupy_rmm
    def fit(self, X, y):
        """
        Fit the model with X and y.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
            Features for regression
        y : Dask cuDF Series or CuPy backed Dask Array (n_rows,)
            Label (outcome values)
        """
        X = self._input_to_dask_cupy_array(X)
        y = self._input_to_dask_cupy_array(y)
        self.solver_model.fit(X, y)
        self._finalize_coefs()
        return self

    @with_cupy_rmm
    def predict(self, X):
        """
        Predicts ŷ for X.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
            Distributed dense matrix (floats or doubles) of shape
            (n_samples, n_features).

        Returns
        -------
        y : Dask cuDF Series or CuPy backed Dask Array (n_rows,)
        """
        # Threshold the predicted probability of the positive class at 0.5.
        return self.predict_proba(X) > 0.5

    @with_cupy_rmm
    def predict_proba(self, X):
        from dask_glm.utils import sigmoid
        X = self._input_to_dask_cupy_array(X)
        return sigmoid(self.decision_function(X))

    @with_cupy_rmm
    def decision_function(self, X):
        X = self._input_to_dask_cupy_array(X)
        X_ = self._maybe_add_intercept(X)
        return np.dot(X_, self._coef)

    @with_cupy_rmm
    def score(self, X, y):
        from dask_glm.utils import accuracy_score
        X = self._input_to_dask_cupy_array(X)
        y = self._input_to_dask_cupy_array(y)
        return accuracy_score(y, self.predict(X))

    @with_cupy_rmm
    def _finalize_coefs(self):
        # _coef contains the coefficients and (potentially) the intercept,
        # with the intercept stored as the last entry.
        self._coef = cp.asarray(self.solver_model._coef)
        if self.fit_intercept:
            self.coef_ = self._coef[:-1]
            self.intercept_ = self.solver_model._coef[-1]
        else:
            self.coef_ = self._coef

    @with_cupy_rmm
    def _maybe_add_intercept(self, X):
        from dask_glm.utils import add_intercept
        if self.fit_intercept:
            return add_intercept(X)
        else:
            return X

    @with_cupy_rmm
    def _input_to_dask_cupy_array(self, X):
        if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):
            if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
                raise TypeError("Please convert your Dask DataFrame"
                                " to a Dask-cuDF DataFrame using dask_cudf.")
            X = X.values
            X._meta = cp.asarray(X._meta)
        elif is_arraylike(X) and hasattr(X, "dask"):
            if not isinstance(X._meta, cp.ndarray):
                raise TypeError("Please convert your CPU Dask Array"
                                " to a GPU Dask Array using"
                                " arr.map_blocks(cp.asarray).")
        else:
            raise TypeError("Please pass a GPU backed Dask DataFrame"
                            " or Dask Array.")

        X.compute_chunk_sizes()
        return X

    def get_param_names(self):
        # kwargs are captured and stored by the base estimator.
        return list(self.kwargs.keys())
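# A hedged end-to-end sketch of how the wrapper above is intended to be
# used. The data below is illustrative and assumed (any CuPy-backed Dask
# array of 0/1 labels would do); a dask-CUDA cluster/client is expected
# to be available in practice.
if __name__ == "__main__":
    import dask.array as da

    # Build CuPy-backed Dask arrays, block by block.
    X = da.random.random((1000, 5), chunks=(100, 5)).map_blocks(cp.asarray)
    y = (da.random.random(1000, chunks=100) > 0.5)
    y = y.astype("float64").map_blocks(cp.asarray)

    clf = LogisticRegression(fit_intercept=True, penalty="l2", C=1.0)
    clf.fit(X, y)
    preds = clf.predict(X)  # lazy boolean Dask array
    print(preds[:10].compute())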