Example #1
def logistic_regression(X,
                        y,
                        coef_only=False,
                        alpha=0.05,
                        as_dataframe=True,
                        remove_na=False,
                        **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples).
        Must be binary.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        CI = [alpha / 2 ; 1 - alpha / 2]
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed).
    **kwargs : optional
        Optional arguments passed to sklearn.linear_model.LogisticRegression.

    Returns
    -------
    stats : dataframe or dict
        Logistic regression summary::

        'names' : name of variable(s) in the model (e.g. x1, x2...)
        'coef' : regression coefficients
        'se' : standard error
        'z' : z-scores
        'pval' : two-tailed p-values
        'CI[2.5%]' : lower confidence interval
        'CI[97.5%]' : upper confidence interval

    Notes
    -----
    This is a wrapper around the
    :py:class:`sklearn.linear_model.LogisticRegression` class.

    Results have been compared against statsmodels and JASP.

    Note that the first coefficient is always the constant term (intercept) of
    the model.

    This function will not run if NaN values are present in either the target
    or predictor variables. Please remove them before running the function.

    Adapted from code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> lom.round(2)
           names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0  Intercept -0.27  0.37 -0.73  0.46     -0.99       0.45
    1         x1  0.06  0.32  0.19  0.85     -0.56       0.68

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.34933805 -0.0226106  -0.39453532]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.34933805 -0.0226106  -0.39453532]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.34933805, -0.0226106 , -0.39453532])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000)
    >>> print(lom['coef'].values)
    [-0.34941889 -0.02261911 -0.39451064]
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Convert to numpy array (y may be a list)
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN /  Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, 'Target variable contains NaN or Inf. Please remove them.'
    assert X_gd, 'Predictors contain NaN or Inf. Please remove them.'

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # Add intercept in names
    names.insert(0, "Intercept")

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher Information Matrix
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.inv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = np.array([2 * norm.sf(abs(z)) for z in z_scores])

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {
        'names': names,
        'coef': coef,
        'se': se,
        'z': z_scores,
        'pval': pval,
        ll_name: ll,
        ul_name: ul
    }
    if as_dataframe:
        return pd.DataFrame.from_dict(stats)
    else:
        return stats
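
A side note on the Fisher information step above: the weight `1 / (2 * (1 + np.cosh(z)))` is an algebraically equivalent (and numerically convenient) form of the usual logistic weight `p * (1 - p)` with `p = sigmoid(z)`, so `fim` is the familiar X^T diag(p(1-p)) X. A minimal standalone NumPy sketch, not part of pingouin, verifying that identity:

import numpy as np

rng = np.random.default_rng(0)
z = rng.normal(size=100)          # decision-function values (log-odds)
p = 1.0 / (1.0 + np.exp(-z))      # predicted probabilities (sigmoid)

# Weight used above inside the Fisher information matrix X' W X
w_cosh = 1.0 / (2.0 * (1.0 + np.cosh(z)))
w_prob = p * (1.0 - p)            # textbook logistic-regression weight
assert np.allclose(w_cosh, w_prob)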
Example #2
def linear_regression(X,
                      y,
                      add_intercept=True,
                      coef_only=False,
                      alpha=0.05,
                      as_dataframe=True,
                      remove_na=False):
    """(Multiple) Linear regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples).
    add_intercept : bool
        If False, assume that the data are already centered. If True, add a
        constant term to the model. In this case, the first value in the
        output dict is the intercept of the model.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        CI = [alpha / 2 ; 1 - alpha / 2]
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed).

    Returns
    -------
    stats : dataframe or dict
        Linear regression summary::

        'names' : name of variable(s) in the model (e.g. x1, x2...)
        'coef' : regression coefficients
        'se' : standard error of the estimate
        'T' : T-values
        'pval' : p-values
        'r2' : coefficient of determination (R2)
        'adj_r2' : adjusted R2
        'CI[2.5%]' : lower confidence interval
        'CI[97.5%]' : upper confidence interval

    Notes
    -----
    The beta coefficients of the regression are estimated using the
    :py:func:`numpy.linalg.lstsq` function.

    It is generally recommended to include a constant term (intercept) in the
    model to limit the bias and force the residual mean to equal zero.
    Note that the intercept coefficient and p-values are, however, rarely
    meaningful.

    The standard error of the estimates is a measure of the accuracy of the
    prediction defined as:

    .. math:: se = \\sqrt{MSE \\cdot (X^TX)^{-1}}

    where :math:`MSE` is the mean squared error,

    .. math:: MSE = \\frac{\\sum{(true - pred)^2}}{n - p - 1}

    :math:`p` is the total number of explanatory variables in the model
    (excluding the intercept) and :math:`n` is the sample size.

    Using the coefficients and the standard errors, the T-values can be
    obtained:

    .. math:: T = \\frac{coef}{se}

    and the p-values can then be approximated using a T-distribution
    with :math:`n - p - 1` degrees of freedom.

    The coefficient of determination (:math:`R^2`) is defined as:

    .. math:: R^2 = 1 - (\\frac{SS_{resid}}{SS_{total}})

    The adjusted :math:`R^2` is defined as:

    .. math:: \\overline{R}^2 = 1 - (1 - R^2) \\frac{n - 1}{n - p - 1}

    Results have been compared against sklearn, statsmodels and JASP.

    This function will not run if NaN values are present in either the target
    or predictor variables. Please remove them before running the function.

    Examples
    --------
    1. Simple linear regression

    >>> import numpy as np
    >>> from pingouin import linear_regression
    >>> np.random.seed(123)
    >>> mean, cov, n = [4, 6], [[1, 0.5], [0.5, 1]], 30
    >>> x, y = np.random.multivariate_normal(mean, cov, n).T
    >>> lm = linear_regression(x, y)
    >>> lm.round(2)
           names  coef    se     T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  4.40  0.54  8.16  0.00  0.24    0.21      3.29       5.50
    1         x1  0.39  0.13  2.99  0.01  0.24    0.21      0.12       0.67

    2. Multiple linear regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=n)
    >>> X = np.column_stack((x, z))
    >>> lm = linear_regression(X, y)
    >>> print(lm['coef'].values)
    [4.54123324 0.36628301 0.17709451]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lm = linear_regression(df[['x', 'z']], df['y'])
    >>> print(lm['coef'].values)
    [4.54123324 0.36628301 0.17709451]

    4. No intercept and return coef only

    >>> linear_regression(X, y, add_intercept=False, coef_only=True)
    array([ 1.40935593, -0.2916508 ])

    5. Return a dictionary instead of a DataFrame

    >>> lm_dict = linear_regression(X, y, as_dataframe=False)

    6. Remove missing values

    >>> X[4, 1] = np.nan
    >>> y[7] = np.nan
    >>> linear_regression(X, y, remove_na=True, coef_only=True)
    array([4.64069731, 0.35455398, 0.1888135 ])
    """
    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Convert input to numpy array (y may be a list)
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'

    if X.ndim == 1:
        # Convert to (n_samples, n_features) shape
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, 'Target (y) contains NaN or Inf. Please remove them.'
    assert X_gd, 'Predictors (X) contain NaN or Inf. Please remove them.'

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    if add_intercept:
        # Add intercept
        X = np.column_stack((np.ones(X.shape[0]), X))
        names.insert(0, "Intercept")

    # Compute beta coefficient and predictions
    coef = np.linalg.lstsq(X, y, rcond=None)[0]
    if coef_only:
        return coef
    pred = np.dot(X, coef)
    resid = np.square(y - pred)
    ss_res = resid.sum()

    n, p = X.shape[0], X.shape[1]
    # Degrees of freedom should not include the intercept
    dof = n - p if add_intercept else n - p - 1
    # Compute mean squared error, variance and SE
    MSE = ss_res / dof
    beta_var = MSE * (np.linalg.pinv(np.dot(X.T, X)).diagonal())
    beta_se = np.sqrt(beta_var)

    # Compute R2, adjusted R2 and RMSE
    ss_tot = np.square(y - y.mean()).sum()
    # ss_exp = np.square(pred - y.mean()).sum()
    r2 = 1 - (ss_res / ss_tot)
    adj_r2 = 1 - (1 - r2) * (n - 1) / dof

    # Compute T and p-values
    T = coef / beta_se
    pval = np.array([2 * t.sf(np.abs(i), dof) for i in T])

    # Compute confidence intervals
    crit = t.ppf(1 - alpha / 2, dof)
    marg_error = crit * beta_se
    ll = coef - marg_error
    ul = coef + marg_error

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {
        'names': names,
        'coef': coef,
        'se': beta_se,
        'T': T,
        'pval': pval,
        'r2': r2,
        'adj_r2': adj_r2,
        ll_name: ll,
        ul_name: ul
    }

    if as_dataframe:
        return pd.DataFrame.from_dict(stats)
    else:
        return stats
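
For the single-predictor case, the standard error, T-value and confidence interval formulas used above are the textbook OLS ones, so they can be sanity-checked against `scipy.stats.linregress`. The sketch below is illustrative only; the random data and the comparison are mine, not taken from pingouin's test suite:

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
x = rng.normal(size=50)
y = 2.0 + 0.5 * x + rng.normal(scale=0.3, size=50)

# Manual OLS with an explicit intercept column, mirroring the function above
X = np.column_stack((np.ones(x.size), x))
coef = np.linalg.lstsq(X, y, rcond=None)[0]
resid = y - X @ coef
dof = x.size - 2                                  # n - p - 1 with one predictor
mse = np.sum(resid**2) / dof
se = np.sqrt(mse * np.linalg.pinv(X.T @ X).diagonal())

lr = stats.linregress(x, y)
assert np.allclose(coef, [lr.intercept, lr.slope])
assert np.isclose(se[1], lr.stderr)               # slope standard error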
Example #3
def logistic_regression(X,
                        y,
                        coef_only=False,
                        alpha=0.05,
                        as_dataframe=True,
                        remove_na=False,
                        **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples).
        ``y`` must be binary, i.e. it can only contain 0 or 1. Multinomial
        logistic regression is not supported.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression` (see Notes).

    Returns
    -------
    stats : dataframe or dict
        Logistic regression summary::

        'names' : name of variable(s) in the model (e.g. x1, x2...)
        'coef' : regression coefficients (log-odds)
        'se' : standard error
        'z' : z-scores
        'pval' : two-tailed p-values
        'CI[2.5%]' : lower confidence interval
        'CI[97.5%]' : upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    This is a wrapper around the
    :py:class:`sklearn.linear_model.LogisticRegression` class. Importantly,
    Pingouin automatically disables the L2 regularization applied by
    scikit-learn. This can be modified by changing the ``penalty`` argument.

    The logistic regression assumes that the log-odds (the logarithm of the
    odds) for the value labeled "1" in the response variable is a linear
    combination of the predictor variables. The log-odds are given by the
    `logit <https://en.wikipedia.org/wiki/Logit>`_ function,
    which maps a probability :math:`p` of the response variable being "1"
    from :math:`(0, 1)` to :math:`(-\\infty, +\\infty)`.

    .. math:: \\text{logit}(p) = \\ln \\frac{p}{1 - p} = \\beta_0 + \\beta X

    The odds of the response variable being "1" can be obtained by
    exponentiating the log-odds:

    .. math:: \\frac{p}{1 - p} = e^{\\beta_0 + \\beta X}

    and the probability of the response variable being "1" is given by:

    .. math:: p = \\frac{1}{1 + e^{-(\\beta_0 + \\beta X)}}

    Note that the above function that converts log-odds to probability is
    called the `logistic function
    <https://en.wikipedia.org/wiki/Logistic_function>`_.

    The first coefficient is always the constant term (intercept) of
    the model. Scikit-learn will automatically add the intercept
    to your predictor(s) matrix; therefore, :math:`X` should not include a
    constant term. Pingouin will remove any constant term (e.g. a column with
    only one unique value) as well as any duplicate columns from :math:`X`.

    The calculation of the p-values and confidence intervals is adapted from
    code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> lom.round(2)
           names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0  Intercept -0.27  0.37 -0.74  0.46     -1.00       0.45
    1         x1  0.07  0.32  0.21  0.84     -0.55       0.68

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.36736745, -0.04374684, -0.47829392])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42)
    >>> print(lom['coef'].values)
    [-0.36751796 -0.04367056 -0.47841908]


    **How to interpret the log-odds coefficients?**

    We'll use the `Wikipedia example
    <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_
    of the probability of passing an exam
    versus the hours of study:

    *A group of 20 students spends between 0 and 6 hours studying for an
    exam. How does the number of hours spent studying affect the
    probability of the student passing the exam?*

    >>> # First, let's create the dataframe
    >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
    ...          2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
    >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
    >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass})
    >>> # And then run the logistic regression
    >>> lr = logistic_regression(df['HoursStudy'], df['PassExam']).round(3)
    >>> lr
            names   coef     se      z   pval  CI[2.5%]  CI[97.5%]
    0   Intercept -4.078  1.761 -2.316  0.021    -7.529     -0.626
    1  HoursStudy  1.505  0.629  2.393  0.017     0.272      2.737

    The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1``
    when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating
    the log-odds:

    >>> np.exp(-4.078)
    0.016941314421496552

    i.e. :math:`0.017:1`. Conversely the odds of failing the exam are
    :math:`(1/0.017) \\approx 59:1`.

    The probability can then be obtained with the following equation

    .. math:: p = \\frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}}

    >>> 1 / (1 + np.exp(-(-4.078)))
    0.016659087580814722

    The ``HoursStudy`` coefficient (1.505) means that for each additional hour
    of study, the log-odds of passing the exam increase by 1.505, and the odds
    are multiplied by :math:`e^{1.505} \\approx 4.50`.

    For example, a student who studies 2 hours has a probability of passing
    the exam of 25%:

    >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505)))
    0.2557836148964987

    The table below shows the probability of passing the exam for several
    values of ``HoursStudy``:

    +----------------+----------+----------------+------------------+
    | Hours of Study | Log-odds | Odds           | Probability      |
    +================+==========+================+==================+
    | 0              | −4.08    | 0.017 ≈ 1:59   | 0.017            |
    +----------------+----------+----------------+------------------+
    | 1              | −2.57    | 0.076 ≈ 1:13   | 0.07             |
    +----------------+----------+----------------+------------------+
    | 2              | −1.07    | 0.34 ≈ 1:3     | 0.26             |
    +----------------+----------+----------------+------------------+
    | 3              | 0.44     | 1.55           | 0.61             |
    +----------------+----------+----------------+------------------+
    | 4              | 1.94     | 6.96           | 0.87             |
    +----------------+----------+----------------+------------------+
    | 5              | 3.45     | 31.4           | 0.97             |
    +----------------+----------+----------------+------------------+
    | 6              | 4.96     | 141.4          | 0.99             |
    +----------------+----------+----------------+------------------+
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN /  Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    names.insert(0, "Intercept")
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher Information Matrix
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {
        'names': names,
        'coef': coef,
        'se': se,
        'z': z_scores,
        'pval': pval,
        ll_name: ll,
        ul_name: ul
    }
    if as_dataframe:
        return pd.DataFrame(stats)
    else:
        return stats
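
Because the returned coefficients are log-odds, odds ratios and their Wald confidence bounds follow by exponentiating the relevant columns, as the docstring explains. A short follow-up sketch; the column names assume the default `alpha=0.05`, and `odds` is just a local variable introduced here for illustration:

import numpy as np
from pingouin import logistic_regression

np.random.seed(123)
x = np.random.normal(size=30)
y = np.random.randint(0, 2, size=30)

lom = logistic_regression(x, y)
# Exponentiate the log-odds and their Wald CI bounds to get odds ratios
odds = lom[['coef', 'CI[2.5%]', 'CI[97.5%]']].apply(np.exp)
odds.insert(0, 'names', lom['names'])
print(odds)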
Example #4
def linear_regression(X,
                      y,
                      add_intercept=True,
                      coef_only=False,
                      alpha=0.05,
                      as_dataframe=True,
                      remove_na=False,
                      relimp=False):
    """(Multiple) Linear regression.

    Parameters
    ----------
    X : pd.DataFrame, np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : pd.Series, np.array or list
        Dependent variable. Shape = (n_samples).
    add_intercept : bool
        If False, assume that the data are already centered. If True, add a
        constant term to the model. In this case, the first value in the
        output dict is the intercept of the model.

        .. note:: It is generally recommended to include a constant term
            (intercept) in the model to limit the bias and force the residual
            mean to equal zero. Note that the intercept coefficient and
            p-values are, however, rarely meaningful.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    relimp : bool
        If True, returns the relative importance (= contribution) of
        predictors. This is irrelevant when the predictors are uncorrelated:
        the total :math:`R^2` of the model is simply the sum of each univariate
        regression :math:`R^2`-values. However, this does not apply when
        predictors are correlated. Instead, the total :math:`R^2` of the model
        is partitioned by averaging over all combinations of predictors,
        as done in the `relaimpo
        <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_
        R package (``calc.relimp(type="lmg")``).

        .. warning:: The computation time roughly doubles for each
            additional predictor and therefore this can be extremely slow for
            models with more than 12-15 predictors.

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : dataframe or dict
        Linear regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients
        * ``'se'``: standard error of the estimate
        * ``'T'``: T-values
        * ``'pval'``: p-values
        * ``'r2'``: coefficient of determination (:math:`R^2`)
        * ``'adj_r2'``: adjusted :math:`R^2`
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval
        * ``'residuals'``: residuals (only if ``as_dataframe=False``)
        * ``'relimp'``: relative contribution of each predictor to the final\
                        :math:`R^2` (only if ``relimp=True``).
        * ``'relimp_perc'``: percent relative contribution

    See also
    --------
    logistic_regression, mediation_analysis, corr

    Notes
    -----
    The :math:`\\beta` coefficients are estimated using an ordinary least
    squares (OLS) regression, as implemented in the
    :py:func:`numpy.linalg.lstsq` function. The OLS method minimizes
    the sum of squared residuals, and leads to a closed-form expression for
    the estimated :math:`\\beta`:

    .. math:: \\hat{\\beta} = (X^TX)^{-1} X^Ty

    It is generally recommended to include a constant term (intercept) in the
    model to limit the bias and force the residual mean to equal zero.
    Note that the intercept coefficient and p-values are, however, rarely
    meaningful.

    The standard error of the estimates is a measure of the accuracy of the
    prediction defined as:

    .. math:: \\sigma = \\sqrt{\\text{MSE} \\cdot (X^TX)^{-1}}

    where :math:`\\text{MSE}` is the mean squared error,

    .. math::

        \\text{MSE} = \\frac{SS_{\\text{resid}}}{n - p - 1}
         = \\frac{\\sum{(\\text{true} - \\text{pred})^2}}{n - p - 1}

    :math:`p` is the total number of predictor variables in the model
    (excluding the intercept) and :math:`n` is the sample size.

    Using the :math:`\\beta` coefficients and the standard errors,
    the T-values can be obtained:

    .. math:: T = \\frac{\\beta}{\\sigma}

    and the p-values approximated using a T-distribution with
    :math:`n - p - 1` degrees of freedom.

    The coefficient of determination (:math:`R^2`) is defined as:

    .. math:: R^2 = 1 - (\\frac{SS_{\\text{resid}}}{SS_{\\text{total}}})

    The adjusted :math:`R^2` is defined as:

    .. math:: \\overline{R}^2 = 1 - (1 - R^2) \\frac{n - 1}{n - p - 1}

    The residuals can be accessed via :code:`stats.residuals_` if ``stats``
    is a pandas DataFrame or :code:`stats['residuals']` if ``stats`` is a
    dict.

    The relative importance (``relimp``) column is a partitioning of the
    total :math:`R^2` of the model into individual :math:`R^2` contribution.
    This is calculated by taking the average over average contributions in
    models of different sizes. For more details, please refer to
    `Groemping et al. 2006 <http://dx.doi.org/10.18637/jss.v017.i01>`_
    and the R package `relaimpo
    <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_.

    Note that Pingouin will automatically remove any duplicate columns
    from :math:`X`, as well as any column with only one unique value
    (constant), excluding the intercept.

    Results have been compared against sklearn, statsmodels and JASP.

    Examples
    --------
    1. Simple linear regression

    >>> import numpy as np
    >>> from pingouin import linear_regression
    >>> np.random.seed(123)
    >>> mean, cov, n = [4, 6], [[1, 0.5], [0.5, 1]], 30
    >>> x, y = np.random.multivariate_normal(mean, cov, n).T
    >>> lm = linear_regression(x, y)
    >>> lm.round(2)
           names  coef    se     T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  4.40  0.54  8.16  0.00  0.24    0.21      3.29       5.50
    1         x1  0.39  0.13  2.99  0.01  0.24    0.21      0.12       0.67

    2. Multiple linear regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=n)
    >>> X = np.column_stack((x, z))
    >>> lm = linear_regression(X, y)
    >>> print(lm['coef'].values)
    [4.54123324 0.36628301 0.17709451]

    3. Get the residuals

    >>> np.round(lm.residuals_, 2)
    array([ 1.18, -1.17,  1.32,  0.76, -1.25,  0.34, -1.54, -0.2 ,  0.36,
           -0.39,  0.69,  1.39,  0.2 , -1.14, -0.21, -1.68,  0.67, -0.69,
            0.62,  0.92, -1.  ,  0.64, -0.21, -0.78,  1.08, -0.03, -1.3 ,
            0.64,  0.81, -0.04])

    4. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lm = linear_regression(df[['x', 'z']], df['y'])
    >>> print(lm['coef'].values)
    [4.54123324 0.36628301 0.17709451]

    5. No intercept and return coef only

    >>> linear_regression(X, y, add_intercept=False, coef_only=True)
    array([ 1.40935593, -0.2916508 ])

    6. Return a dictionary instead of a DataFrame

    >>> lm_dict = linear_regression(X, y, as_dataframe=False)

    7. Remove missing values

    >>> X[4, 1] = np.nan
    >>> y[7] = np.nan
    >>> linear_regression(X, y, remove_na=True, coef_only=True)
    array([4.64069731, 0.35455398, 0.1888135 ])

    8. Get the relative importance of predictors

    >>> lm = linear_regression(X, y, remove_na=True, relimp=True)
    >>> lm[['names', 'relimp', 'relimp_perc']]
           names    relimp  relimp_perc
    0  Intercept       NaN          NaN
    1         x1  0.217265    82.202201
    2         x2  0.047041    17.797799

    The ``relimp`` column is a partitioning of the total :math:`R^2` of the
    model into individual contribution. Therefore, it sums to the :math:`R^2`
    of the full model. The ``relimp_perc`` is normalized to sum to 100%. See
    `Groemping 2006 <https://www.jstatsoft.org/article/view/v017i01>`_
    for more details.

    >>> lm[['relimp', 'relimp_perc']].sum()
    relimp           0.264305
    relimp_perc    100.000000
    dtype: float64
    """
    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert input to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1

    if X.ndim == 1:
        # Convert to (n_samples, n_features) shape
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    if add_intercept:
        # Add intercept
        X = np.column_stack((np.ones(X.shape[0]), X))
        names.insert(0, "Intercept")

    # FINAL CHECKS BEFORE RUNNING LEAST SQUARES REGRESSION
    # 1. Let's remove the column with only zero, otherwise the regression fails
    n_nonzero = np.count_nonzero(X, axis=0)
    idx_zero = np.flatnonzero(n_nonzero == 0)  # Find columns that are only 0
    if len(idx_zero):
        X = np.delete(X, idx_zero, 1)
        names = np.delete(names, idx_zero)

    # 2. We also want to make sure that there is no more than one column
    # (= Intercept) with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique) > 1:
        # Houston, we have a problem!
        # We remove all but the first "Intercept" column.
        X = np.delete(X, idx_unique[1:], 1)
        names = np.delete(names, idx_unique[1:])

    # 3. Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate)

    # LEAST-SQUARE REGRESSION + STATISTICS
    # Compute beta coefficient and predictions
    n, p = X.shape[0], X.shape[1]
    assert n >= 3, 'At least three valid samples are required in X.'
    assert p >= 1, 'X must have at least one valid column.'
    coef, ss_res, _, _ = np.linalg.lstsq(X, y, rcond=None)
    if coef_only:
        return coef
    ss_res = np.squeeze(ss_res)
    pred = np.dot(X, coef)
    resid = y - pred
    # Degrees of freedom should not include the intercept
    dof = n - p if add_intercept else n - p - 1
    # Compute mean squared error, variance and SE
    MSE = ss_res / dof
    beta_var = MSE * (np.linalg.pinv(np.dot(X.T, X)).diagonal())
    beta_se = np.sqrt(beta_var)

    # Compute R2, adjusted R2 and RMSE
    ss_tot = np.square(y - y.mean()).sum()
    # ss_exp = np.square(pred - y.mean()).sum()
    r2 = 1 - (ss_res / ss_tot)
    adj_r2 = 1 - (1 - r2) * (n - 1) / dof

    # Compute T and p-values
    T = coef / beta_se
    pval = 2 * t.sf(np.fabs(T), dof)

    # Compute confidence intervals
    crit = t.ppf(1 - alpha / 2, dof)
    marg_error = crit * beta_se
    ll = coef - marg_error
    ul = coef + marg_error

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {
        'names': names,
        'coef': coef,
        'se': beta_se,
        'T': T,
        'pval': pval,
        'r2': r2,
        'adj_r2': adj_r2,
        ll_name: ll,
        ul_name: ul
    }

    # Relative importance
    if relimp:
        data = pd.concat(
            [pd.DataFrame(y, columns=['y']),
             pd.DataFrame(X, columns=names)],
            sort=False,
            axis=1)
        if 'Intercept' in names:
            # Intercept is the first column
            reli = _relimp(data.drop(columns=['Intercept']).cov())
            reli['names'] = ['Intercept'] + reli['names']
            reli['relimp'] = np.insert(reli['relimp'], 0, np.nan)
            reli['relimp_perc'] = np.insert(reli['relimp_perc'], 0, np.nan)
        else:
            reli = _relimp(data.cov())
        stats.update(reli)

    if as_dataframe:
        stats = pd.DataFrame(stats)
        stats.residuals_ = 0  # Trick to avoid Pandas warning
        stats.residuals_ = resid  # Residuals is a hidden attribute
    else:
        stats['residuals'] = resid
    return stats
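
The `_relimp` helper used above is internal to pingouin, but the LMG idea it implements can be stated directly: average, over all orderings of the predictors, the increase in R^2 obtained when each predictor enters the model. Below is a naive, exponential-time sketch of that idea; `lmg_relimp` is a hypothetical function written here for illustration, not pingouin's implementation:

import itertools
import numpy as np

def lmg_relimp(X, y):
    """Naive LMG: average R2 gain of each predictor over all orderings."""
    n, k = X.shape

    def r2(cols):
        # R2 of an OLS fit (with intercept) using only the given columns
        if not cols:
            return 0.0
        A = np.column_stack((np.ones(n), X[:, cols]))
        pred = A @ np.linalg.lstsq(A, y, rcond=None)[0]
        return 1 - np.sum((y - pred) ** 2) / np.sum((y - y.mean()) ** 2)

    contrib = np.zeros(k)
    perms = list(itertools.permutations(range(k)))
    for order in perms:
        included = []
        for j in order:
            before = r2(included)
            included.append(j)
            contrib[j] += r2(included) - before
    return contrib / len(perms)   # sums to the R2 of the full model

# Illustrative usage on synthetic data
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([0.5, 0.2, 0.0]) + rng.normal(size=100)
print(lmg_relimp(X, y), lmg_relimp(X, y).sum())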
Example #5
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, remove_na=False, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
        ``y`` must be binary, i.e. it can only contain 0 or 1. Multinomial
        logistic regression is not supported.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression` (see Notes).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Logistic regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients (log-odds)
        * ``'se'``: standard error
        * ``'z'``: z-scores
        * ``'pval'``: two-tailed p-values
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    .. caution:: This function is a wrapper around the
        :py:class:`sklearn.linear_model.LogisticRegression` class. However,
        Pingouin internally disables the L2 regularization and changes the
        default solver in order to get results that are similar to R and
        statsmodels.

    The logistic regression assumes that the log-odds (the logarithm of the
    odds) for the value labeled "1" in the response variable is a linear
    combination of the predictor variables. The log-odds are given by the
    `logit <https://en.wikipedia.org/wiki/Logit>`_ function,
    which maps a probability :math:`p` of the response variable being "1"
    from :math:`(0, 1)` to :math:`(-\\infty, +\\infty)`.

    .. math:: \\text{logit}(p) = \\ln \\frac{p}{1 - p} = \\beta_0 + \\beta X

    The odds of the response variable being "1" can be obtained by
    exponentiating the log-odds:

    .. math:: \\frac{p}{1 - p} = e^{\\beta_0 + \\beta X}

    and the probability of the response variable being "1" is given by the
    `logistic function <https://en.wikipedia.org/wiki/Logistic_function>`_:

    .. math:: p = \\frac{1}{1 + e^{-(\\beta_0 + \\beta X)}}

    The first coefficient is always the constant term (intercept) of
    the model. Pingouin will automatically add the intercept
    to your predictor(s) matrix; therefore, :math:`X` should not include a
    constant term. Pingouin will remove any constant term (e.g. a column with
    only one unique value) as well as any duplicate columns from :math:`X`.

    The calculation of the p-values and confidence intervals is adapted from
    `code by Rob Speare
    <https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d>`_.
    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression.

    In this first example, we'll use the
    `penguins dataset <https://github.com/allisonhorst/palmerpenguins>`_
    to see how well we can predict the sex of penguins based on their
    body mass.

    >>> import numpy as np
    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pg.read_dataset('penguins')
    >>> # Let's first convert the target variable from string to boolean:
    >>> df['male'] = (df['sex'] == 'male').astype(int)  # male: 1, female: 0
    >>> # Since there are missing values in our outcome variable, we need to
    >>> # set `remove_na=True` otherwise regression will fail.
    >>> lom = pg.logistic_regression(df['body_mass_g'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
             names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0    Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_g  0.00  0.00  7.24   0.0      0.00       0.00

    Body mass is a significant predictor of sex (p<0.001). Here, it
    could be useful to rescale our predictor variable from *g* to *kg*
    (e.g. divide by 1000) in order to get more intuitive coefficients and
    confidence intervals:

    >>> df['body_mass_kg'] = df['body_mass_g'] / 1000
    >>> lom = pg.logistic_regression(df['body_mass_kg'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
              names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0     Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_kg  1.23  0.17  7.24   0.0      0.89       1.56

    2. Multiple binary logistic regression

    We'll now add the species as a categorical predictor in our model. To do
    so, we first need to dummy-code our categorical variable, dropping the
    first level of our categorical variable (species = Adelie) which will be
    used as the reference level:

    >>> df = pd.get_dummies(df, columns=['species'], drop_first=True)
    >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
    >>> y = df['male']
    >>> lom = pg.logistic_regression(X, y, remove_na=True)
    >>> lom.round(2)
                   names   coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0          Intercept -26.24  2.84 -9.24  0.00    -31.81     -20.67
    1       body_mass_kg   7.10  0.77  9.23  0.00      5.59       8.61
    2  species_Chinstrap  -0.13  0.42 -0.31  0.75     -0.96       0.69
    3     species_Gentoo  -9.72  1.12 -8.65  0.00    -11.92      -7.52

    3. Using a NumPy array and returning only the coefficients

    >>> pg.logistic_regression(X.to_numpy(), y.to_numpy(), coef_only=True,
    ...                        remove_na=True)
    array([-26.23906892,   7.09826571,  -0.13180626,  -9.71718529])

    4. Passing custom parameters to sklearn

    >>> lom = pg.logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42, remove_na=True)
    >>> print(lom['coef'].to_numpy())
    [-25.98248153   7.02881472  -0.13119779  -9.62247569]

    **How to interpret the log-odds coefficients?**

    We'll use the `Wikipedia example
    <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_
    of the probability of passing an exam
    versus the hours of study:

    *A group of 20 students spends between 0 and 6 hours studying for an
    exam. How does the number of hours spent studying affect the
    probability of the student passing the exam?*

    >>> # First, let's create the dataframe
    >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
    ...          2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
    >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
    >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass})
    >>> # And then run the logistic regression
    >>> lr = pg.logistic_regression(df['HoursStudy'], df['PassExam']).round(3)
    >>> lr
            names   coef     se      z   pval  CI[2.5%]  CI[97.5%]
    0   Intercept -4.078  1.761 -2.316  0.021    -7.529     -0.626
    1  HoursStudy  1.505  0.629  2.393  0.017     0.272      2.737

    The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1``
    when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating
    the log-odds:

    >>> np.exp(-4.078)
    0.016941314421496552

    i.e. :math:`0.017:1`. Conversely the odds of failing the exam are
    :math:`(1/0.017) \\approx 59:1`.

    The probability can then be obtained with the following equation

    .. math:: p = \\frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}}

    >>> 1 / (1 + np.exp(-(-4.078)))
    0.016659087580814722

    The ``HoursStudy`` coefficient (1.505) means that for each additional hour
    of study, the log-odds of passing the exam increase by 1.505, and the odds
    are multiplied by :math:`e^{1.505} \\approx 4.50`.

    For example, a student who studies 2 hours has a probability of passing
    the exam of 25%:

    >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505)))
    0.2557836148964987

    The table below shows the probability of passing the exam for several
    values of ``HoursStudy``:

    +----------------+----------+----------------+------------------+
    | Hours of Study | Log-odds | Odds           | Probability      |
    +================+==========+================+==================+
    | 0              | −4.08    | 0.017 ≈ 1:59   | 0.017            |
    +----------------+----------+----------------+------------------+
    | 1              | −2.57    | 0.076 ≈ 1:13   | 0.07             |
    +----------------+----------+----------------+------------------+
    | 2              | −1.07    | 0.34 ≈ 1:3     | 0.26             |
    +----------------+----------+----------------+------------------+
    | 3              | 0.44     | 1.55           | 0.61             |
    +----------------+----------+----------------+------------------+
    | 4              | 1.94     | 6.96           | 0.87             |
    +----------------+----------+----------------+------------------+
    | 5              | 3.45     | 31.4           | 0.97             |
    +----------------+----------+----------------+------------------+
    | 6              | 4.96     | 141.4          | 0.99             |
    +----------------+----------+----------------+------------------+
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN /  Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        # https://stats.stackexchange.com/a/204324/253579
        # Updated in Pingouin > 0.3.6 to be consistent with R
        kwargs['solver'] = 'newton-cg'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)

    if lom.get_params()['fit_intercept']:
        names.insert(0, "Intercept")
        X_design = np.column_stack((np.ones(X.shape[0]), X))
        coef = np.append(lom.intercept_, lom.coef_)
    else:
        # sklearn's coef_ has shape (1, n_features); flatten to 1D
        coef = lom.coef_.flatten()
        X_design = X

    if coef_only:
        return coef

    # Fisher Information Matrix
    n, p = X_design.shape
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = (X_design / denom).T @ X_design
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Wald Confidence intervals
    # In R: this is equivalent to confint.default(model)
    # Note that confint(model) will however return the profile CI
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return pd.DataFrame(stats)
    else:
        return stats
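
The comment above notes that these Wald intervals correspond to R's confint.default(). If statsmodels is available, a similar cross-check can be sketched in Python, since statsmodels' Logit also reports Wald standard errors and confidence intervals. The data and the (loose) tolerances below are arbitrary; this is an illustration, not part of pingouin's code or tests:

import numpy as np
import statsmodels.api as sm
from pingouin import logistic_regression

np.random.seed(123)
x = np.random.normal(size=100)
y = (x + np.random.normal(size=100) > 0).astype(int)   # binary target

pg_lom = logistic_regression(x, y)

# statsmodels: add the intercept explicitly and fit an unpenalized Logit
sm_lom = sm.Logit(y, sm.add_constant(x)).fit(disp=0)

# Both fits target the same maximum likelihood, so estimates, Wald SEs and
# Wald CIs should agree up to solver tolerance.
assert np.allclose(pg_lom['coef'], sm_lom.params, atol=1e-3)
assert np.allclose(pg_lom['se'], sm_lom.bse, atol=1e-3)
assert np.allclose(pg_lom[['CI[2.5%]', 'CI[97.5%]']],
                   sm_lom.conf_int(alpha=0.05), atol=1e-3)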
Example #6
def linear_regression(X, y, add_intercept=True, weights=None, coef_only=False,
                      alpha=0.05, as_dataframe=True, remove_na=False,
                      relimp=False):
    """(Multiple) Linear regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
    add_intercept : bool
        If False, assume that the data are already centered. If True, add a
        constant term to the model. In this case, the first value in the
        output dict is the intercept of the model.

        .. note:: It is generally recommended to include a constant term
            (intercept) in the model to limit the bias and force the residual
            mean to equal zero. The intercept coefficient and p-values
            are, however, rarely meaningful.
    weights : array_like
        An optional vector of sample weights to be used in the fitting
        process, of shape *(n_samples)*. Missing or negative weights are not
        allowed. If not None, a weighted least squares fit is calculated.

        .. versionadded:: 0.3.5
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    relimp : bool
        If True, returns the relative importance (= contribution) of
        predictors. This is irrelevant when the predictors are uncorrelated:
        the total :math:`R^2` of the model is simply the sum of each univariate
        regression :math:`R^2`-values. However, this does not apply when
        predictors are correlated. Instead, the total :math:`R^2` of the model
        is partitioned by averaging over all combinations of predictors,
        as done in the `relaimpo
        <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_
        R package (``calc.relimp(type="lmg")``).

        .. warning:: The computation time roughly doubles for each
            additional predictor and therefore this can be extremely slow for
            models with more than 12-15 predictors.

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Linear regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients
        * ``'se'``: standard errors
        * ``'T'``: T-values
        * ``'pval'``: p-values
        * ``'r2'``: coefficient of determination (:math:`R^2`)
        * ``'adj_r2'``: adjusted :math:`R^2`
        * ``'CI[2.5%]'``: lower confidence intervals
        * ``'CI[97.5%]'``: upper confidence intervals
        * ``'relimp'``: relative contribution of each predictor to the final\
                        :math:`R^2` (only if ``relimp=True``).
        * ``'relimp_perc'``: percent relative contribution

        In addition, the output dataframe comes with hidden attributes such as
        the residuals and the degrees of freedom of the model and residuals,
        which can be accessed as follows, respectively:

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm.residuals_, lm.df_model_, lm.df_resid_ # doctest: +SKIP

        Note that, following the scikit-learn convention, these hidden
        attributes end with an "_". When ``as_dataframe=False``, however, these
        attributes are no longer hidden and can be accessed like any other key
        in the output dictionary:

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm['residuals'], lm['df_model'], lm['df_resid'] # doctest: +SKIP

    See also
    --------
    logistic_regression, mediation_analysis, corr

    Notes
    -----
    The :math:`\\beta` coefficients are estimated using an ordinary least
    squares (OLS) regression, as implemented in the
    :py:func:`scipy.linalg.lstsq` function. The OLS method minimizes
    the sum of squared residuals, and leads to a closed-form expression for
    the estimated :math:`\\beta`:

    .. math:: \\hat{\\beta} = (X^TX)^{-1} X^Ty
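
    As an illustration (not part of this function), the closed-form
    estimate can be reproduced directly with NumPy, assuming a design
    matrix ``X`` that already includes a column of ones and a target
    vector ``y``::

        import numpy as np
        # Normal equations (pinv for numerical robustness)
        beta_hat = np.linalg.pinv(X.T @ X) @ X.T @ y
        # Equivalent least-squares solution
        beta_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]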

    It is generally recommended to include a constant term (intercept) in the
    model to limit the bias and force the residual mean to equal zero.
    Note, however, that the intercept coefficient and its p-value are rarely
    meaningful.

    The standard error of the estimates is a measure of the accuracy of the
    prediction defined as:

    .. math:: \\sigma = \\sqrt{\\text{MSE} \\cdot (X^TX)^{-1}}

    where :math:`\\text{MSE}` is the mean squared error,

    .. math::

        \\text{MSE} = \\frac{SS_{\\text{resid}}}{n - p - 1}
         = \\frac{\\sum{(\\text{true} - \\text{pred})^2}}{n - p - 1}

    :math:`p` is the total number of predictor variables in the model
    (excluding the intercept) and :math:`n` is the sample size.

    Using the :math:`\\beta` coefficients and the standard errors,
    the T-values can be obtained:

    .. math:: T = \\frac{\\beta}{\\sigma}

    and the p-values approximated using a T-distribution with
    :math:`n - p - 1` degrees of freedom.

    The coefficient of determination (:math:`R^2`) is defined as:

    .. math:: R^2 = 1 - (\\frac{SS_{\\text{resid}}}{SS_{\\text{total}}})

    The adjusted :math:`R^2` is defined as:

    .. math:: \\overline{R}^2 = 1 - (1 - R^2) \\frac{n - 1}{n - p - 1}
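
    A minimal NumPy/SciPy sketch of the formulas above (standard errors,
    T-values, p-values, :math:`R^2` and adjusted :math:`R^2`), assuming
    the same ``X`` (with an intercept column), ``y`` and ``beta_hat`` as
    in the previous sketch::

        import numpy as np
        from scipy.stats import t
        n, k = X.shape[0], X.shape[1] - 1        # k excludes the intercept
        resid = y - X @ beta_hat
        mse = (resid ** 2).sum() / (n - k - 1)
        se = np.sqrt(mse * np.diag(np.linalg.pinv(X.T @ X)))
        T = beta_hat / se
        pval = 2 * t.sf(np.abs(T), n - k - 1)
        r2 = 1 - (resid ** 2).sum() / ((y - y.mean()) ** 2).sum()
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)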

    The relative importance (``relimp``) column is a partitioning of the
    total :math:`R^2` of the model into individual :math:`R^2` contributions.
    This is calculated by taking the average over average contributions in
    models of different sizes. For more details, please refer to
    `Groemping 2006 <http://dx.doi.org/10.18637/jss.v017.i01>`_
    and the R package `relaimpo
    <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_.
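
    For the two-predictor case, the LMG partition reduces to averaging
    each predictor's sequential :math:`R^2` gain over the two possible
    orderings. A rough sketch, where ``Xp`` holds only the two predictor
    columns (no intercept) and ``r2`` is a small helper defined just for
    this illustration (not part of Pingouin)::

        import numpy as np

        def r2(Xp, y):
            # R^2 of an OLS fit with an intercept (illustration only)
            Xd = np.column_stack((np.ones(len(y)), Xp))
            b = np.linalg.lstsq(Xd, y, rcond=None)[0]
            res = y - Xd @ b
            return 1 - (res ** 2).sum() / ((y - y.mean()) ** 2).sum()

        x1, x2 = Xp[:, [0]], Xp[:, [1]]
        full = r2(Xp, y)
        lmg_x1 = 0.5 * (r2(x1, y) + (full - r2(x2, y)))
        lmg_x2 = 0.5 * (r2(x2, y) + (full - r2(x1, y)))
        # lmg_x1 + lmg_x2 equals the full model's R^2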

    Note that Pingouin will automatically remove any duplicate columns
    from :math:`X`, as well as any column with only one unique value
    (constant), excluding the intercept.

    Results have been compared against sklearn, R, statsmodels and JASP.

    Examples
    --------
    1. Simple linear regression using columns of a pandas dataframe

    In this first example, we'll use the tips dataset to see how well we
    can predict the waiter's tip (in dollars) based on the total bill (also
    in dollars).

    >>> import numpy as np
    >>> import pingouin as pg
    >>> df = pg.read_dataset('tips')
    >>> # Let's predict the tip ($) based on the total bill (also in $)
    >>> lm = pg.linear_regression(df['total_bill'], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.92  0.16   5.76   0.0  0.46    0.45      0.61       1.23
    1  total_bill  0.11  0.01  14.26   0.0  0.46    0.45      0.09       0.12

    It comes as no surprise that total bill is indeed a significant predictor
    of the waiter's tip (T=14.26, p<0.05). The :math:`R^2` of the model is 0.46
    and the adjusted :math:`R^2` is 0.45, which means that our model roughly
    explains ~45% of the total variance in the tip amount.

    2. Multiple linear regression

    We can also have more than one predictor and run a multiple linear
    regression. Below, we add the party size as a second predictor of tip.

    >>> # We'll add a second predictor: the party size
    >>> lm = pg.linear_regression(df[['total_bill', 'size']], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1  total_bill  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2        size  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    The party size is also a significant predictor of tip (T=2.26, p=0.02).
    Note, however, that adding this new predictor only improved the
    :math:`R^2` of our model by ~1%.

    This function also works with numpy arrays:

    >>> X = df[['total_bill', 'size']].to_numpy()
    >>> y = df['tip'].to_numpy()
    >>> pg.linear_regression(X, y).round(2)
           names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1         x1  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2         x2  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    3. Get the residuals

    >>> # For clarity, only display the first 9 values
    >>> np.round(lm.residuals_, 2)[:9]
    array([-1.62, -0.55,  0.31,  0.06, -0.11,  0.93,  0.13, -0.81, -0.49])

    Using pandas, we can show a summary of the distribution of the residuals:

    >>> import pandas as pd
    >>> pd.Series(lm.residuals_).describe().round(2)
    count    244.00
    mean      -0.00
    std        1.01
    min       -2.93
    25%       -0.55
    50%       -0.09
    75%        0.51
    max        4.04
    dtype: float64

    4. No intercept and return only the regression coefficients

    Sometimes it may be useful to remove the constant term from the
    regression, or to return only the regression coefficients without
    calculating the standard errors or p-values. The latter can save you a
    lot of time if you need to compute hundreds of regressions and only care
    about the coefficients!

    >>> pg.linear_regression(X, y, add_intercept=False, coef_only=True)
    array([0.1007119 , 0.36209717])

    5. Return a dictionary instead of a dataframe

    >>> lm_dict = pg.linear_regression(X, y, as_dataframe=False)
    >>> lm_dict.keys()
    dict_keys(['names', 'coef', 'se', 'T', 'pval', 'r2', 'adj_r2', 'CI[2.5%]',
               'CI[97.5%]', 'df_model', 'df_resid', 'residuals'])

    6. Remove missing values

    >>> X[4, 1] = np.nan
    >>> y[7] = np.nan
    >>> pg.linear_regression(X, y, remove_na=True, coef_only=True)
    array([0.65749955, 0.09262059, 0.19927529])

    7. Get the relative importance of predictors

    >>> lm = pg.linear_regression(X, y, remove_na=True, relimp=True)
    >>> lm[['names', 'relimp', 'relimp_perc']]
           names    relimp  relimp_perc
    0  Intercept       NaN          NaN
    1         x1  0.342503    73.045583
    2         x2  0.126386    26.954417

    The ``relimp`` column is a partitioning of the total :math:`R^2` of the
    model into individual contributions; it therefore sums to the :math:`R^2`
    of the full model. The ``relimp_perc`` column is normalized to sum to
    100%. See
    `Groemping 2006 <https://www.jstatsoft.org/article/view/v017i01>`_
    for more details.

    >>> lm[['relimp', 'relimp_perc']].sum()
    relimp           0.468889
    relimp_perc    100.000000
    dtype: float64

    8. Weighted linear regression

    >>> X = [1, 2, 3, 4, 5, 6]
    >>> y = [10, 22, 11, 13, 13, 16]
    >>> w = [1, 0.1, 1, 1, 0.5, 1]  # Array of weights. Must be >= 0.
    >>> lm = pg.linear_regression(X, y, weights=w)
    >>> lm.round(2)
           names  coef    se     T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  9.00  2.03  4.42  0.01  0.51    0.39      3.35      14.64
    1         x1  1.04  0.50  2.06  0.11  0.51    0.39     -0.36       2.44
    """
    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert input to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1

    if X.ndim == 1:
        # Convert to (n_samples, n_features) shape
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    if add_intercept:
        # Add intercept
        X = np.column_stack((np.ones(X.shape[0]), X))
        names.insert(0, "Intercept")

    # FINAL CHECKS BEFORE RUNNING LEAST SQUARES REGRESSION
    # 1. Let's remove column(s) with only zero, otherwise the regression fails
    n_nonzero = np.count_nonzero(X, axis=0)
    idx_zero = np.flatnonzero(n_nonzero == 0)  # Find columns that are only 0
    if len(idx_zero):
        X = np.delete(X, idx_zero, 1)
        names = np.delete(names, idx_zero)

    # 2. We also want to make sure that there is no more than one constant
    # column (= intercept), otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique) > 1:
        # We remove all but the first "Intercept" column.
        X = np.delete(X, idx_unique[1:], 1)
        names = np.delete(names, idx_unique[1:])
    # Is there a constant in our predictor matrix? Useful for dof and R^2.
    constant = 1 if len(idx_unique) > 0 else 0

    # 3. Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate)

    # 4. Check that we have enough samples / features
    n, p = X.shape[0], X.shape[1]
    assert n >= 3, 'At least three valid samples are required in X.'
    assert p >= 1, 'X must have at least one valid column.'

    # 5. Handle weights
    if weights is not None:
        if relimp:
            raise ValueError("relimp = True is not supported when using "
                             "weights.")
        w = np.asarray(weights)
        assert w.ndim == 1, 'weights must be a 1D array.'
        assert w.size == n, 'weights must be of shape n_samples.'
        assert not np.isnan(w).any(), 'Missing weights are not accepted.'
        assert not (w < 0).any(), 'Negative weights are not accepted.'
        # Do not count weights == 0 in dof
        # This gives similar results as R lm() but different from statsmodels
        n = np.count_nonzero(w)
        # Rescale (whitening)
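        # Multiplying by diag(sqrt(w)) turns the weighted problem
        # min sum_i w_i * (y_i - x_i @ beta)**2 into an ordinary
        # least-squares problem on the whitened data Xw, yw solved below.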
        wts = np.diag(np.sqrt(w))
        Xw = wts @ X
        yw = wts @ y
    else:
        # Set all weights to one, [1, 1, 1, ...]
        w = np.ones(n)
        Xw = X
        yw = y

    # FIT (WEIGHTED) LEAST SQUARES REGRESSION USING SCIPY.LINALG.LSTSQ
    coef, ss_res, rank, _ = lstsq(Xw, yw)
    if coef_only:
        return coef

    # Degrees of freedom
    df_model = rank - constant
    df_resid = n - p
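    # Note: p counts the intercept column when one is present, so df_resid
    # equals n - (number of predictors) - 1, matching the docstring formulas.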

    # Calculate predicted values and (weighted) residuals
    pred = Xw @ coef
    resid = yw - pred
    # ss_res = (resid ** 2).sum()

    # Calculate total (weighted) sums of squares and R^2
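    # With a constant term, R^2 is computed against the centered (weighted)
    # total sum of squares (ss_wtot); without one, the uncentered total sum
    # of squares (ss_tot = yw @ yw) is used instead.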
    ss_tot = yw @ yw
    ss_wtot = np.sum(w * (y - np.average(y, weights=w))**2)
    if constant:
        r2 = 1 - ss_res / ss_wtot
    else:
        r2 = 1 - ss_res / ss_tot
    adj_r2 = 1 - (1 - r2) * (n - constant) / df_resid

    # Compute mean squared error, variance and SE
    mse = ss_res / df_resid
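    # beta_var is the diagonal of MSE * (X^T X)^{-1} (see docstring); pinv is
    # used so that (near-)singular design matrices do not raise an error.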
    beta_var = mse * (np.linalg.pinv(Xw.T @ Xw).diagonal())
    beta_se = np.sqrt(beta_var)

    # Compute T and p-values
    T = coef / beta_se
    pval = 2 * t.sf(np.fabs(T), df_resid)

    # Compute confidence intervals
    crit = t.ppf(1 - alpha / 2, df_resid)
    marg_error = crit * beta_se
    ll = coef - marg_error
    ul = coef + marg_error

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': beta_se, 'T': T,
             'pval': pval, 'r2': r2, 'adj_r2': adj_r2, ll_name: ll,
             ul_name: ul}

    # Relative importance
    if relimp:
        data = pd.concat([pd.DataFrame(y, columns=['y']),
                          pd.DataFrame(X, columns=names)], sort=False, axis=1)
        if 'Intercept' in names:
            # Intercept is the first column
            reli = _relimp(data.drop(columns=['Intercept']).cov())
            reli['names'] = ['Intercept'] + reli['names']
            reli['relimp'] = np.insert(reli['relimp'], 0, np.nan)
            reli['relimp_perc'] = np.insert(reli['relimp_perc'], 0, np.nan)
        else:
            reli = _relimp(data.cov())
        stats.update(reli)

    if as_dataframe:
        stats = pd.DataFrame(stats)
        stats.df_model_ = df_model
        stats.df_resid_ = df_resid
        stats.residuals_ = 0  # Trick to avoid Pandas warning
        stats.residuals_ = resid  # Residuals is a hidden attribute
    else:
        stats['df_model'] = df_model
        stats['df_resid'] = df_resid
        stats['residuals'] = resid
    return stats
Example #7
0
def logistic_regression(X,
                        y,
                        coef_only=False,
                        alpha=0.05,
                        as_dataframe=True,
                        remove_na=False,
                        **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples).
        Must be binary.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression`.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Logistic regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients
        * ``'se'``: standard errors
        * ``'z'``: z-scores
        * ``'pval'``: two-tailed p-values
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    This is a wrapper around the
    :py:class:`sklearn.linear_model.LogisticRegression` class. Note that
    Pingouin automatically disables the l2 regularization applied by
    scikit-learn. This can be modified by changing the `penalty` argument.
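
    For example, to keep scikit-learn's default ridge (l2) penalty instead
    of the unpenalized fit used here by default, one could pass (sketch):

    >>> lom = logistic_regression(X, y, penalty='l2', C=1.0) # doctest: +SKIP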

    The calculation of the p-values and confidence intervals is adapted from
    code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d
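
    In short, the standard errors are the square roots of the diagonal of
    the inverse Fisher information matrix evaluated at the fitted
    coefficients, and the z-scores follow a Wald test. A rough sketch,
    assuming a fitted sklearn model ``lom``, the predictor matrix ``X``
    passed to scikit-learn (without intercept) and a design matrix
    ``X_design`` that prepends a column of ones::

        import numpy as np
        prob = lom.predict_proba(X)[:, 1]           # fitted probabilities
        W = prob * (1 - prob)                       # Bernoulli variances
        fim = X_design.T @ (X_design * W[:, None])  # Fisher information
        se = np.sqrt(np.diag(np.linalg.pinv(fim)))  # Wald standard errors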

    Note that the first coefficient is always the constant term (intercept) of
    the model. Scikit-learn automatically adds the intercept to your
    predictor(s) matrix; therefore, :math:`X` should not include a constant
    term. Pingouin will remove any constant term (i.e. a column with only one
    unique value) as well as any duplicate columns from :math:`X`.

    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> lom.round(2)
           names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0  Intercept -0.27  0.37 -0.74  0.46     -1.00       0.45
    1         x1  0.07  0.32  0.21  0.84     -0.55       0.68

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.36736745, -0.04374684, -0.47829392])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42)
    >>> print(lom['coef'].values)
    [-0.36751796 -0.04367056 -0.47841908]
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    names.insert(0, "Intercept")
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher Information Matrix
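    # For a logistic model, Var(y_i) = p_i * (1 - p_i), which can be written
    # as 1 / (2 * (1 + cosh(f_i))) with f_i the decision function (log-odds).
    # Dividing each row of X_design by `denom` is therefore equivalent to
    # weighting it by p_i * (1 - p_i), i.e. FIM = X^T diag(p * (1 - p)) X.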
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {
        'names': names,
        'coef': coef,
        'se': se,
        'z': z_scores,
        'pval': pval,
        ll_name: ll,
        ul_name: ul
    }
    if as_dataframe:
        return pd.DataFrame(stats)
    else:
        return stats