示例#1
0
def test_big(fit_intercept, is_sparse):
    """Smoke-test fit / predict / predict_proba on the synchronous scheduler.

    Parameters
    ----------
    fit_intercept : bool
        Passed to ``LogisticRegression``; when truthy, the fitted model
        must expose a non-``None`` ``intercept_``.
    is_sparse : bool
        Passed to ``make_classification`` to select sparse or dense data.
    """
    # The scheduler override only applies inside this ``with`` block.
    with dask.config.set(scheduler='synchronous'):
        features, labels = make_classification(is_sparse=is_sparse)
        model = LogisticRegression(fit_intercept=fit_intercept)
        model.fit(features, labels)
        # The return values are discarded: the calls only have to succeed.
        model.predict(features)
        model.predict_proba(features)

    if not fit_intercept:
        return
    assert model.intercept_ is not None
示例#2
0
def test_fit(fit_intercept, is_sparse):
    """Fit and predict on a small 100 x 5 dataset split into chunks of 10.

    Parameters
    ----------
    fit_intercept : bool
        Passed to ``LogisticRegression``.
    is_sparse : bool
        Passed to ``make_classification`` to select sparse or dense data.
    """
    dataset_options = dict(
        n_samples=100,
        n_features=5,
        chunksize=10,
        is_sparse=is_sparse,
    )
    features, labels = make_classification(**dataset_options)

    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(features, labels)
    # The return values are discarded: the calls only have to succeed.
    model.predict(features)
    model.predict_proba(features)
示例#3
0
def test_big(fit_intercept):
    """Smoke-test fit / predict / predict_proba on the synchronous scheduler.

    Parameters
    ----------
    fit_intercept : bool
        Passed to ``LogisticRegression``; when truthy, the fitted model
        must expose a non-``None`` ``intercept_``.
    """
    import dask

    # ``dask.set_options(get=dask.get)`` is the legacy spelling of this
    # setting and it changed the scheduler globally. ``dask.config.set``
    # used as a context manager selects the same single-threaded scheduler
    # and restores the previous configuration on exit, so the override
    # cannot leak into other tests.
    with dask.config.set(scheduler='synchronous'):
        X, y = make_classification()
        lr = LogisticRegression(fit_intercept=fit_intercept)
        lr.fit(X, y)
        # The return values are discarded: the calls only have to succeed.
        lr.predict(X)
        lr.predict_proba(X)
    if fit_intercept:
        assert lr.intercept_ is not None
示例#4
0
def test_gridsearch():
    """Run a 3-fold grid search over ``lamduh`` on a pipeline-wrapped model.

    The test is skipped when ``dask_searchcv`` cannot be imported.
    """
    from sklearn.pipeline import make_pipeline
    dask_searchcv = pytest.importorskip('dask_searchcv')

    features, labels = make_classification(
        n_samples=100, n_features=5, chunksize=10)

    pipeline = make_pipeline(DoNothingTransformer(), LogisticRegression())
    # ``make_pipeline`` names each step after its lower-cased class name,
    # which is where the ``logisticregression__`` prefix comes from.
    param_grid = {'logisticregression__lamduh': [.001, .01, .1, .5]}

    searcher = dask_searchcv.GridSearchCV(pipeline, param_grid, cv=3)
    searcher.fit(features, labels)
示例#5
0
def test_big(fit_intercept, is_sparse, is_cupy):
    """Smoke-test fit / predict / predict_proba on the synchronous scheduler.

    Parameters
    ----------
    fit_intercept : bool
        Passed to ``LogisticRegression``; when truthy, the fitted model
        must expose a non-``None`` ``intercept_``.
    is_sparse : bool
        Passed to ``make_classification`` to select sparse or dense data.
    is_cupy : bool
        When truthy and the data is not sparse, the data is converted with
        ``to_dask_cupy_array_xy`` (the test is skipped if ``cupy`` cannot
        be imported).
    """
    # The scheduler override only applies inside this ``with`` block.
    with dask.config.set(scheduler='synchronous'):
        features, labels = make_classification(is_sparse=is_sparse)

        # The CuPy conversion is only attempted for dense data.
        use_cupy = is_cupy and not is_sparse
        if use_cupy:
            cupy_module = pytest.importorskip('cupy')
            features, labels = to_dask_cupy_array_xy(
                features, labels, cupy_module)

        model = LogisticRegression(fit_intercept=fit_intercept)
        model.fit(features, labels)
        # The return values are discarded: the calls only have to succeed.
        model.predict(features)
        model.predict_proba(features)

    if not fit_intercept:
        return
    assert model.intercept_ is not None
示例#6
0
def test_fit(fit_intercept, is_sparse, is_cupy):
    """Fit and predict on a small 100 x 5 dataset split into chunks of 10.

    Parameters
    ----------
    fit_intercept : bool
        Passed to ``LogisticRegression``.
    is_sparse : bool
        Passed to ``make_classification`` to select sparse or dense data.
    is_cupy : bool
        When truthy and the data is not sparse, the data is converted with
        ``to_dask_cupy_array_xy`` (the test is skipped if ``cupy`` cannot
        be imported).
    """
    dataset_options = dict(
        n_samples=100,
        n_features=5,
        chunksize=10,
        is_sparse=is_sparse,
    )
    features, labels = make_classification(**dataset_options)

    # The CuPy conversion is only attempted for dense data.
    use_cupy = is_cupy and not is_sparse
    if use_cupy:
        cupy_module = pytest.importorskip('cupy')
        features, labels = to_dask_cupy_array_xy(
            features, labels, cupy_module)

    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(features, labels)
    # The return values are discarded: the calls only have to succeed.
    model.predict(features)
    model.predict_proba(features)
示例#7
0
    def __init__(self,
                 *,
                 client=None,
                 fit_intercept=True,
                 solver="admm",
                 penalty="l2",
                 C=1.0,
                 max_iter=100,
                 verbose=False,
                 **kwargs):
        """
        Store the hyper-parameters and build the wrapped Dask-GLM estimator.

        Parameters
        ----------
        client : optional (default = None)
            Forwarded unchanged to the parent constructor.
        fit_intercept : boolean (default = True)
            Forwarded to the Dask-GLM estimator as ``fit_intercept``.
        solver : str (default = "admm")
            Forwarded to the Dask-GLM estimator as ``solver``.
        penalty : {"l2", "l1", "elastic_net"} (default = "l2")
            Forwarded to the Dask-GLM estimator as ``regularizer``.
        C : float (default = 1.0)
            Its reciprocal is forwarded to the Dask-GLM estimator as
            ``lamduh``.
        max_iter : int (default = 100)
            Forwarded to the Dask-GLM estimator as ``max_iter``.
        verbose : int or boolean (default = False)
            Forwarded unchanged to the parent constructor.
        **kwargs
            Forwarded unchanged to the parent constructor.

        Raises
        ------
        ImportError
            If ``has_daskglm("0.2.1.dev")`` reports that the required
            dask-glm version is unavailable.
        TypeError
            If ``penalty`` is not one of the supported names.
        """
        super().__init__(client=client, verbose=verbose, **kwargs)

        daskglm_available = has_daskglm("0.2.1.dev")
        if not daskglm_available:
            message = ("dask-glm >= 0.2.1.dev was not found, please install it"
                       " to use multi-GPU logistic regression.")
            raise ImportError(message)

        # Imported only after the availability check above has passed.
        from dask_glm.estimators import (
            LogisticRegression as LogisticRegressionGLM,
        )

        self.fit_intercept = fit_intercept
        self.solver = solver
        self.penalty = penalty
        self.C = C
        self.max_iter = max_iter

        supported_penalties = ("l2", "l1", "elastic_net")
        if self.penalty not in supported_penalties:
            raise TypeError("Only l2, l1, and elastic_net penalties are"
                            " currently supported.")

        # The wrapped estimator takes ``lamduh``; this class exposes its
        # reciprocal as ``C``.
        glm_options = dict(
            solver=self.solver,
            fit_intercept=self.fit_intercept,
            regularizer=self.penalty,
            max_iter=self.max_iter,
            lamduh=1 / self.C,
        )
        self.solver_model = LogisticRegressionGLM(**glm_options)
示例#8
0
def test_in_pipeline():
    """Fit the estimator as the final step of a scikit-learn pipeline."""
    from sklearn.pipeline import make_pipeline

    features, labels = make_classification(
        n_samples=100, n_features=5, chunksize=10)

    pipeline_steps = (DoNothingTransformer(), LogisticRegression())
    pipeline = make_pipeline(*pipeline_steps)
    pipeline.fit(features, labels)
示例#9
0
def test_lr_init(solver):
    """Check that the estimator can be built with the given ``solver``."""
    # Construction is the whole test: it passes as long as nothing is
    # raised. The instance itself is not needed afterwards.
    LogisticRegression(solver=solver)
示例#10
0
class LogisticRegression(BaseEstimator):
    """
    Distributed Logistic Regression for Binary classification.


    Parameters
    ----------
    client : optional (default = None)
        Forwarded unchanged to the base estimator's constructor.
    fit_intercept: boolean (default = True)
       If True, the model tries to correct for the global mean of y.
       If False, the model expects that you have centered the data.
    solver : 'admm'
        Solver to use. Only admm is supported currently.
    penalty : {'l1', 'l2', 'elastic_net'} (default = 'l2')
        Regularization technique for the solver.
    C: float (default = 1.0)
       Inverse of regularization strength; must be a positive float.
       Its reciprocal is passed to the wrapped estimator as ``lamduh``.
    max_iter: int (default = 100)
        Maximum number of iterations taken for the solvers to converge.
    verbose : int or boolean (default=False)
        Sets logging level. It must be one of `cuml.common.logger.level_*`.
        See :ref:`verbosity-levels` for more info.
    **kwargs
        Forwarded unchanged to the base estimator's constructor.

    Attributes
    ----------
    coef_: device array
        The estimated coefficients for the logistic regression model
        (the solver's coefficient vector without the trailing intercept
        entry when `fit_intercept` is True).
    intercept_: device array or scalar
        The independent term. When `fit_intercept` is True this is the last
        entry of the solver's coefficient vector; when `fit_intercept` is
        False it is a zero-filled device array of shape (1,).
    solver: string
        Algorithm to use in the optimization process. Currently only `admm` is
        supported.
    solver_model
        The wrapped Dask-GLM ``LogisticRegression`` instance.

    Notes
    -----

    This estimator is a wrapper class around Dask-GLM's
    Logistic Regression estimator. Several methods in this wrapper class
    duplicate code from Dask-GLM to create a user-friendly namespace.
    """
    def __init__(self,
                 *,
                 client=None,
                 fit_intercept=True,
                 solver="admm",
                 penalty="l2",
                 C=1.0,
                 max_iter=100,
                 verbose=False,
                 **kwargs):
        """
        Store the hyper-parameters and build the wrapped Dask-GLM estimator.

        Raises
        ------
        ImportError
            If ``has_daskglm("0.2.1.dev")`` reports that the required
            dask-glm version is unavailable.
        TypeError
            If ``penalty`` is not one of 'l2', 'l1' or 'elastic_net'.
        """
        super(LogisticRegression, self).__init__(client=client,
                                                 verbose=verbose,
                                                 **kwargs)

        if not has_daskglm("0.2.1.dev"):
            raise ImportError(
                "dask-glm >= 0.2.1.dev was not found, please install it"
                " to use multi-GPU logistic regression.")

        # Imported only after the availability check above has passed.
        from dask_glm.estimators import LogisticRegression \
            as LogisticRegressionGLM

        self.fit_intercept = fit_intercept
        self.solver = solver
        self.penalty = penalty
        self.C = C
        self.max_iter = max_iter

        if self.penalty not in ("l2", "l1", "elastic_net"):
            raise TypeError("Only l2, l1, and elastic_net penalties are"
                            " currently supported.")

        # The wrapped estimator takes ``lamduh``; this class exposes its
        # reciprocal as ``C``.
        self.solver_model = LogisticRegressionGLM(
            solver=self.solver,
            fit_intercept=self.fit_intercept,
            regularizer=self.penalty,
            max_iter=self.max_iter,
            lamduh=1 / self.C,
        )

    @with_cupy_rmm
    def fit(self, X, y):
        """
        Fit the model with X and y.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
            Features for regression
        y : Dask cuDF Series or CuPy backed Dask Array (n_rows,)
            Label (outcome values)

        Returns
        -------
        self : LogisticRegression
            The fitted estimator, with ``coef_`` and ``intercept_`` set.

        Raises
        ------
        TypeError
            If X or y is not a GPU backed Dask collection.
        """

        X = self._input_to_dask_cupy_array(X)
        y = self._input_to_dask_cupy_array(y)
        self.solver_model.fit(X, y)
        self._finalize_coefs()
        return self

    @with_cupy_rmm
    def predict(self, X):
        """
        Predicts the ŷ for X.

        Parameters
        ----------
        X : Dask cuDF dataframe  or CuPy backed Dask Array (n_rows, n_features)
            Distributed dense matrix (floats or doubles) of shape
            (n_samples, n_features).

        Returns
        -------
        y : boolean array (n_rows,)
            True where the predicted probability is greater than 0.5.
        """
        return self.predict_proba(X) > 0.5

    @with_cupy_rmm
    def predict_proba(self, X):
        """
        Return the sigmoid of the decision function for X.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)

        Returns
        -------
        The result of applying Dask-GLM's ``sigmoid`` to
        ``decision_function(X)``.
        """
        from dask_glm.utils import sigmoid

        X = self._input_to_dask_cupy_array(X)
        return sigmoid(self.decision_function(X))

    @with_cupy_rmm
    def decision_function(self, X):
        """
        Return the dot product of X with the fitted coefficient vector.

        When `fit_intercept` is True, X is first passed through Dask-GLM's
        ``add_intercept`` so that it lines up with the stored vector, which
        then also holds the intercept.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
        """
        X = self._input_to_dask_cupy_array(X)
        X_ = self._maybe_add_intercept(X)
        return np.dot(X_, self._coef)

    @with_cupy_rmm
    def score(self, X, y):
        """
        Return Dask-GLM's ``accuracy_score`` of ``predict(X)`` against y.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
        y : Dask cuDF Series or CuPy backed Dask Array (n_rows,)
        """
        from dask_glm.utils import accuracy_score

        X = self._input_to_dask_cupy_array(X)
        y = self._input_to_dask_cupy_array(y)
        return accuracy_score(y, self.predict(X))

    @with_cupy_rmm
    def _finalize_coefs(self):
        """
        Copy the solver's coefficients onto this estimator after fitting.

        Sets ``_coef``, ``coef_`` and ``intercept_``. ``intercept_`` is set
        in both branches so that reading it after ``fit`` never raises
        AttributeError.
        """
        # _coef contains coefficients and (potentially) intercept
        self._coef = cp.asarray(self.solver_model._coef)
        if self.fit_intercept:
            self.coef_ = self._coef[:-1]
            self.intercept_ = self.solver_model._coef[-1]
        else:
            self.coef_ = self._coef
            # No intercept was fitted: expose the documented zero instead
            # of leaving the attribute undefined.
            self.intercept_ = cp.zeros(1, dtype=self._coef.dtype)

    @with_cupy_rmm
    def _maybe_add_intercept(self, X):
        """
        Return ``add_intercept(X)`` when `fit_intercept` is True, else X.
        """
        from dask_glm.utils import add_intercept

        if self.fit_intercept:
            return add_intercept(X)
        else:
            return X

    @with_cupy_rmm
    def _input_to_dask_cupy_array(self, X):
        """
        Validate X and return it as a CuPy backed Dask Array.

        Dask dataframes and series must be cuDF backed; they are converted
        through ``.values`` and their ``_meta`` is replaced by a CuPy array.
        Dask arrays must already have a CuPy ``_meta``. In both cases
        ``compute_chunk_sizes()`` is called on the array before returning.

        Raises
        ------
        TypeError
            If X is a Dask dataframe/series that is not cuDF backed, a Dask
            array that is not CuPy backed, or not a Dask collection at all.
        """
        if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):

            if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
                raise TypeError("Please convert your Dask DataFrame"
                                " to a Dask-cuDF DataFrame using dask_cudf.")
            X = X.values
            X._meta = cp.asarray(X._meta)

        elif is_arraylike(X) and hasattr(X, "dask"):
            if not isinstance(X._meta, cp.ndarray):
                raise TypeError("Please convert your CPU Dask Array"
                                " to a GPU Dask Array using"
                                " arr.map_blocks(cp.asarray).")
        else:
            raise TypeError("Please pass a GPU backed Dask DataFrame"
                            " or Dask Array.")

        X.compute_chunk_sizes()
        return X

    def get_param_names(self):
        """
        Return the keys of ``self.kwargs`` as a list.

        NOTE(review): ``self.kwargs`` is not assigned in this class;
        presumably the base estimator stores the extra constructor keyword
        arguments there. Confirm against the base class.
        """
        return list(self.kwargs.keys())