def skipped(x, y, method='spearman'):
    """Skipped correlation (Rousselet and Pernet 2012).

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    method : str
        Method used to compute the correlation after outlier removal. Can be
        either 'spearman' (default) or 'pearson'.

    Returns
    -------
    r : float
        Skipped correlation coefficient.
    pval : float
        Two-tailed p-value.
    outliers : array of bool
        Indicates whether each observation is an outlier.

    Notes
    -----
    The skipped correlation involves multivariate outlier detection using a
    projection technique (Wilcox, 2004, 2005). First, a robust estimator of
    multivariate location and scatter, for instance the minimum covariance
    determinant estimator (MCD; Rousseeuw, 1984; Rousseeuw and van Driessen,
    1999; Hubert et al., 2008), is computed. Second, data points are
    orthogonally projected on lines joining each data point to the location
    estimator. Third, outliers are detected using a robust technique.
    Finally, Spearman (or Pearson) correlations are computed on the remaining
    data points and calculations are adjusted by taking into account the
    dependency among the remaining data points.

    Code inspired by Matlab code from Cyril Pernet and Guillaume
    Rousselet [1]_.

    Requires scikit-learn.

    References
    ----------
    .. [1] Pernet CR, Wilcox R, Rousselet GA. Robust Correlation Analyses:
       False Positive and Power Validation Using a New Open Source Matlab
       Toolbox. Frontiers in Psychology. 2012;3:606.
       doi:10.3389/fpsyg.2012.00606.
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from scipy.stats import chi2
    from sklearn.covariance import MinCovDet
    X = np.column_stack((x, y))
    nrows, ncols = X.shape
    gval = np.sqrt(chi2.ppf(0.975, 2))

    # Compute center and distance to center
    center = MinCovDet(random_state=42).fit(X).location_
    B = X - center
    B2 = B**2
    bot = B2.sum(axis=1)

    # Loop over rows
    dis = np.zeros(shape=(nrows, nrows))
    for i in np.arange(nrows):
        if bot[i] != 0:  # Avoid division by zero error
            dis[i, :] = np.linalg.norm(B * B2[i, :] / bot[i], axis=1)

    # Detect outliers
    def idealf(x):
        """Compute the ideal fourths IQR (Wilcox 2012)."""
        n = len(x)
        j = int(np.floor(n / 4 + 5 / 12))
        y = np.sort(x)
        g = (n / 4) - j + (5 / 12)
        low = (1 - g) * y[j - 1] + g * y[j]
        k = n - j + 1
        up = (1 - g) * y[k - 1] + g * y[k - 2]
        return up - low

    # One can either use the MAD or the IQR (see Wilcox 2012)
    # MAD = mad(dis, axis=1)
    iqr = np.apply_along_axis(idealf, 1, dis)
    thresh = np.median(dis, axis=1) + gval * iqr
    outliers = np.apply_along_axis(np.greater, 0, dis, thresh).any(axis=0)

    # Compute correlation on remaining data
    if method == 'spearman':
        r, pval = spearmanr(X[~outliers, 0], X[~outliers, 1])
    else:
        r, pval = pearsonr(X[~outliers, 0], X[~outliers, 1])
    return r, pval, outliers
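# A minimal usage sketch for ``skipped`` on contaminated synthetic data.
# This is an illustration only: it assumes the module-level imports used
# above (numpy as np, scipy.stats spearmanr/pearsonr) and that scikit-learn
# is installed. Variable names are illustrative.
import numpy as np

rng = np.random.RandomState(42)
x = rng.normal(size=50)
y = x + rng.normal(scale=0.5, size=50)
x[0], y[0] = 10., -10.  # inject one clear bivariate outlier

r, pval, outliers = skipped(x, y, method='spearman')
print(r, pval, outliers.sum())  # the injected point should be flagged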
def test_is_sklearn_installed(self):
    """Test function _is_sklearn_installed."""
    assert isinstance(_is_sklearn_installed(), bool)
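# For context, a dependency guard such as ``_is_sklearn_installed`` usually
# follows the pattern sketched below. This is a plausible sketch under that
# assumption, not pingouin's actual implementation.
def _is_sklearn_installed_sketch(raise_error=False):
    """Return True if scikit-learn can be imported."""
    try:
        import sklearn  # noqa
        is_installed = True
    except ImportError:
        is_installed = False
    if raise_error and not is_installed:
        raise ImportError("scikit-learn is required. "
                          "Please run `pip install scikit-learn`.")
    return is_installed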
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, remove_na=False, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples,). Must be binary.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        CI = [alpha / 2 ; 1 - alpha / 2]
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed).
    **kwargs : optional
        Optional arguments passed to sklearn.linear_model.LogisticRegression.

    Returns
    -------
    stats : dataframe or dict
        Logistic regression summary::

            'names' : name of variable(s) in the model (e.g. x1, x2...)
            'coef' : regression coefficients
            'se' : standard error
            'z' : z-scores
            'pval' : two-tailed p-values
            'CI[2.5%]' : lower confidence interval
            'CI[97.5%]' : upper confidence interval

    Notes
    -----
    This is a wrapper around the
    :py:class:`sklearn.linear_model.LogisticRegression` class. Results have
    been compared against statsmodels and JASP.

    Note that the first coefficient is always the constant term (intercept)
    of the model.

    This function will not run if NaN values are present in either the
    target or predictor variables. Please remove them before running the
    function.

    Adapted from a code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> lom.round(2)
           names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0  Intercept -0.27  0.37 -0.73  0.46     -0.99       0.45
    1         x1  0.06  0.32  0.19  0.85     -0.56       0.68

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.34933805 -0.0226106  -0.39453532]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.34933805 -0.0226106  -0.39453532]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.34933805, -0.0226106 , -0.39453532])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000)
    >>> print(lom['coef'].values)
    [-0.34941889 -0.02261911 -0.39451064]
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array (before the .ndim check, so lists also work)
    X = np.asarray(X)
    y = np.asarray(y)

    assert 0 < alpha < 1
    assert y.ndim == 1, 'y must be one-dimensional.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, 'Target variable contains NaN or Inf. Please remove them.'
    assert X_gd, 'Predictors contain NaN or Inf. Please remove them.'

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], \
        'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # Add intercept in names
    names.insert(0, "Intercept")

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher Information Matrix
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.inv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = np.array([2 * norm.sf(abs(z)) for z in z_scores])

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return pd.DataFrame.from_dict(stats)
    else:
        return stats
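# The docstring above states that results were compared against statsmodels
# and JASP. A minimal sketch of such a cross-check, assuming statsmodels is
# installed; the variable names are illustrative and mirror the docstring
# example.
import numpy as np
import statsmodels.api as sm

np.random.seed(123)
x = np.random.normal(size=30)
y = np.random.randint(0, 2, size=30)

res = sm.Logit(y, sm.add_constant(x)).fit(disp=0)
print(res.params)   # should be close to the 'coef' column
print(res.bse)      # should be close to the 'se' column
print(res.pvalues)  # should be close to the 'pval' column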
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, remove_na=False, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples).
        ``y`` must be binary, i.e. only contains 0 or 1. Multinomial logistic
        regression is not supported.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if
        missing values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression` (see Notes).

    Returns
    -------
    stats : dataframe or dict
        Logistic regression summary::

            'names' : name of variable(s) in the model (e.g. x1, x2...)
            'coef' : regression coefficients (log-odds)
            'se' : standard error
            'z' : z-scores
            'pval' : two-tailed p-values
            'CI[2.5%]' : lower confidence interval
            'CI[97.5%]' : upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    This is a wrapper around the
    :py:class:`sklearn.linear_model.LogisticRegression` class. Importantly,
    Pingouin automatically disables the L2 regularization applied by
    scikit-learn. This can be modified by changing the ``penalty`` argument.

    The logistic regression assumes that the log-odds (the logarithm of the
    odds) for the value labeled "1" in the response variable is a linear
    combination of the predictor variables. The log-odds are given by the
    `logit <https://en.wikipedia.org/wiki/Logit>`_ function, which maps a
    probability :math:`p` of the response variable being "1" from
    :math:`(0, 1)` to :math:`(-\\infty, +\\infty)`.

    .. math:: \\text{logit}(p) = \\ln \\frac{p}{1 - p} = \\beta_0 + \\beta X

    The odds of the response variable being "1" can be obtained by
    exponentiating the log-odds:

    .. math:: \\frac{p}{1 - p} = e^{\\beta_0 + \\beta X}

    and the probability of the response variable being "1" is given by:

    .. math:: p = \\frac{1}{1 + e^{-(\\beta_0 + \\beta X)}}

    Note that the above function that converts log-odds to probability is
    called the `logistic function
    <https://en.wikipedia.org/wiki/Logistic_function>`_.

    The first coefficient is always the constant term (intercept) of the
    model. Scikit-learn will automatically add the intercept to your
    predictor(s) matrix, therefore, :math:`X` should not include a constant
    term. Pingouin will remove any constant term (e.g. column with only one
    unique value), or duplicate columns from :math:`X`.

    The calculation of the p-values and confidence interval is adapted from a
    code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> lom.round(2)
           names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0  Intercept -0.27  0.37 -0.74  0.46     -1.00       0.45
    1         x1  0.07  0.32  0.21  0.84     -0.55       0.68

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.36736745, -0.04374684, -0.47829392])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42)
    >>> print(lom['coef'].values)
    [-0.36751796 -0.04367056 -0.47841908]

    **How to interpret the log-odds coefficients?**

    We'll use the `Wikipedia example
    <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_
    of the probability of passing an exam versus the hours of study:

    *A group of 20 students spends between 0 and 6 hours studying for an
    exam. How does the number of hours spent studying affect the probability
    of the student passing the exam?*

    >>> # First, let's create the dataframe
    >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
    ...          2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
    >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
    >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass})
    >>> # And then run the logistic regression
    >>> lr = logistic_regression(df['HoursStudy'], df['PassExam']).round(3)
    >>> lr
            names   coef     se      z   pval  CI[2.5%]  CI[97.5%]
    0   Intercept -4.078  1.761 -2.316  0.021    -7.529     -0.626
    1  HoursStudy  1.505  0.629  2.393  0.017     0.272      2.737

    The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1``
    when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating
    the log-odds:

    >>> np.exp(-4.078)
    0.016941314421496552

    i.e. :math:`0.017:1`. Conversely the odds of failing the exam are
    :math:`(1/0.017) \\approx 59:1`.

    The probability can then be obtained with the following equation

    .. math:: p = \\frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}}

    >>> 1 / (1 + np.exp(-(-4.078)))
    0.016659087580814722

    The ``HoursStudy`` coefficient (1.505) means that for each additional
    hour of study, the log-odds of passing the exam increase by 1.505, and
    the odds are multiplied by :math:`e^{1.505} \\approx 4.50`.

    For example, a student who studies 2 hours has a probability of passing
    the exam of 25%:

    >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505)))
    0.2557836148964987

    The table below shows the probability of passing the exam for several
    values of ``HoursStudy``:

    +----------------+----------+----------------+------------------+
    | Hours of Study | Log-odds | Odds           | Probability      |
    +================+==========+================+==================+
    | 0              | −4.08    | 0.017 ≈ 1:59   | 0.017            |
    +----------------+----------+----------------+------------------+
    | 1              | −2.57    | 0.076 ≈ 1:13   | 0.07             |
    +----------------+----------+----------------+------------------+
    | 2              | −1.07    | 0.34 ≈ 1:3     | 0.26             |
    +----------------+----------+----------------+------------------+
    | 3              | 0.44     | 1.55           | 0.61             |
    +----------------+----------+----------------+------------------+
    | 4              | 1.94     | 6.96           | 0.87             |
    +----------------+----------+----------------+------------------+
    | 5              | 3.45     | 31.4           | 0.97             |
    +----------------+----------+----------------+------------------+
    | 6              | 4.96     | 141.4          | 0.99             |
    +----------------+----------+----------------+------------------+
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], \
        'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    names.insert(0, "Intercept")
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher Information Matrix
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return pd.DataFrame(stats)
    else:
        return stats
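# Since the reported coefficients are log-odds, odds ratios and their Wald
# confidence intervals can be obtained by exponentiating the relevant
# columns. A minimal sketch reusing the HoursStudy/PassExam dataframe built
# in the docstring example above (column names assume the default
# alpha=0.05):
import numpy as np

lr = logistic_regression(df['HoursStudy'], df['PassExam'])
odds_ratios = np.exp(lr[['coef', 'CI[2.5%]', 'CI[97.5%]']])
print(odds_ratios.round(2))  # e.g. exp(1.505) ~ 4.50 for HoursStudy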
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, remove_na=False, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
        ``y`` must be binary, i.e. only contains 0 or 1. Multinomial logistic
        regression is not supported.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if
        missing values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression` (see Notes).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Logistic regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients (log-odds)
        * ``'se'``: standard error
        * ``'z'``: z-scores
        * ``'pval'``: two-tailed p-values
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    .. caution:: This function is a wrapper around the
        :py:class:`sklearn.linear_model.LogisticRegression` class. However,
        Pingouin internally disables the L2 regularization and changes the
        default solver in order to get results that are similar to R and
        statsmodels.

    The logistic regression assumes that the log-odds (the logarithm of the
    odds) for the value labeled "1" in the response variable is a linear
    combination of the predictor variables. The log-odds are given by the
    `logit <https://en.wikipedia.org/wiki/Logit>`_ function, which maps a
    probability :math:`p` of the response variable being "1" from
    :math:`(0, 1)` to :math:`(-\\infty, +\\infty)`.

    .. math:: \\text{logit}(p) = \\ln \\frac{p}{1 - p} = \\beta_0 + \\beta X

    The odds of the response variable being "1" can be obtained by
    exponentiating the log-odds:

    .. math:: \\frac{p}{1 - p} = e^{\\beta_0 + \\beta X}

    and the probability of the response variable being "1" is given by the
    `logistic function <https://en.wikipedia.org/wiki/Logistic_function>`_:

    .. math:: p = \\frac{1}{1 + e^{-(\\beta_0 + \\beta X)}}

    The first coefficient is always the constant term (intercept) of the
    model. Pingouin will automatically add the intercept to your predictor(s)
    matrix, therefore, :math:`X` should not include a constant term. Pingouin
    will remove any constant term (e.g. column with only one unique value),
    or duplicate columns from :math:`X`.

    The calculation of the p-values and confidence interval is adapted from a
    `code by Rob Speare
    <https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d>`_.
    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression.

    In this first example, we'll use the `penguins dataset
    <https://github.com/allisonhorst/palmerpenguins>`_ to see how well we
    can predict the sex of penguins based on their body mass.

    >>> import numpy as np
    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pg.read_dataset('penguins')
    >>> # Let's first convert the target variable from string to boolean:
    >>> df['male'] = (df['sex'] == 'male').astype(int)  # male: 1, female: 0
    >>> # Since there are missing values in our outcome variable, we need to
    >>> # set `remove_na=True` otherwise regression will fail.
    >>> lom = pg.logistic_regression(df['body_mass_g'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
             names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0    Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_g  0.00  0.00  7.24   0.0      0.00       0.00

    Body mass is a significant predictor of sex (p<0.001). Here, it could be
    useful to rescale our predictor variable from *g* to *kg* (e.g. divide
    by 1000) in order to get more intuitive coefficients and confidence
    intervals:

    >>> df['body_mass_kg'] = df['body_mass_g'] / 1000
    >>> lom = pg.logistic_regression(df['body_mass_kg'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
              names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0     Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_kg  1.23  0.17  7.24   0.0      0.89       1.56

    2. Multiple binary logistic regression

    We'll now add the species as a categorical predictor in our model. To do
    so, we first need to dummy-code our categorical variable, dropping the
    first level of our categorical variable (species = Adelie) which will be
    used as the reference level:

    >>> df = pd.get_dummies(df, columns=['species'], drop_first=True)
    >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
    >>> y = df['male']
    >>> lom = pg.logistic_regression(X, y, remove_na=True)
    >>> lom.round(2)
                   names   coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0          Intercept -26.24  2.84 -9.24  0.00    -31.81     -20.67
    1       body_mass_kg   7.10  0.77  9.23  0.00      5.59       8.61
    2  species_Chinstrap  -0.13  0.42 -0.31  0.75     -0.96       0.69
    3     species_Gentoo  -9.72  1.12 -8.65  0.00    -11.92      -7.52

    3. Using NumPy arrays and returning only the coefficients

    >>> pg.logistic_regression(X.to_numpy(), y.to_numpy(), coef_only=True,
    ...                        remove_na=True)
    array([-26.23906892,   7.09826571,  -0.13180626,  -9.71718529])

    4. Passing custom parameters to sklearn

    >>> lom = pg.logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                              random_state=42, remove_na=True)
    >>> print(lom['coef'].to_numpy())
    [-25.98248153   7.02881472  -0.13119779  -9.62247569]

    **How to interpret the log-odds coefficients?**

    We'll use the `Wikipedia example
    <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_
    of the probability of passing an exam versus the hours of study:

    *A group of 20 students spends between 0 and 6 hours studying for an
    exam. How does the number of hours spent studying affect the probability
    of the student passing the exam?*

    >>> # First, let's create the dataframe
    >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
    ...          2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
    >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
    >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass})
    >>> # And then run the logistic regression
    >>> lr = pg.logistic_regression(df['HoursStudy'],
    ...                             df['PassExam']).round(3)
    >>> lr
            names   coef     se      z   pval  CI[2.5%]  CI[97.5%]
    0   Intercept -4.078  1.761 -2.316  0.021    -7.529     -0.626
    1  HoursStudy  1.505  0.629  2.393  0.017     0.272      2.737

    The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1``
    when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating
    the log-odds:

    >>> np.exp(-4.078)
    0.016941314421496552

    i.e. :math:`0.017:1`. Conversely the odds of failing the exam are
    :math:`(1/0.017) \\approx 59:1`.

    The probability can then be obtained with the following equation

    .. math:: p = \\frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}}

    >>> 1 / (1 + np.exp(-(-4.078)))
    0.016659087580814722

    The ``HoursStudy`` coefficient (1.505) means that for each additional
    hour of study, the log-odds of passing the exam increase by 1.505, and
    the odds are multiplied by :math:`e^{1.505} \\approx 4.50`.

    For example, a student who studies 2 hours has a probability of passing
    the exam of 25%:

    >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505)))
    0.2557836148964987

    The table below shows the probability of passing the exam for several
    values of ``HoursStudy``:

    +----------------+----------+----------------+------------------+
    | Hours of Study | Log-odds | Odds           | Probability      |
    +================+==========+================+==================+
    | 0              | −4.08    | 0.017 ≈ 1:59   | 0.017            |
    +----------------+----------+----------------+------------------+
    | 1              | −2.57    | 0.076 ≈ 1:13   | 0.07             |
    +----------------+----------+----------------+------------------+
    | 2              | −1.07    | 0.34 ≈ 1:3     | 0.26             |
    +----------------+----------+----------------+------------------+
    | 3              | 0.44     | 1.55           | 0.61             |
    +----------------+----------+----------------+------------------+
    | 4              | 1.94     | 6.96           | 0.87             |
    +----------------+----------+----------------+------------------+
    | 5              | 3.45     | 31.4           | 0.97             |
    +----------------+----------+----------------+------------------+
    | 6              | 4.96     | 141.4          | 0.99             |
    +----------------+----------+----------------+------------------+
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], \
        'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        # https://stats.stackexchange.com/a/204324/253579
        # Updated in Pingouin > 0.3.6 to be consistent with R
        kwargs['solver'] = 'newton-cg'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)

    if lom.get_params()['fit_intercept']:
        names.insert(0, "Intercept")
        X_design = np.column_stack((np.ones(X.shape[0]), X))
        coef = np.append(lom.intercept_, lom.coef_)
    else:
        coef = lom.coef_.flatten()  # lom.coef_ is 2D; make it 1D
        X_design = X

    if coef_only:
        return coef

    # Fisher Information Matrix
    n, p = X_design.shape
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = (X_design / denom).T @ X_design
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Wald Confidence intervals
    # In R: this is equivalent to confint.default(model)
    # Note that confint(model) will however return the profile CI
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return pd.DataFrame(stats)
    else:
        return stats
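# The standard errors above come from the inverse of the Fisher information
# matrix X'WX, where W is diagonal with entries p_i * (1 - p_i). The cosh()
# expression used in the function is an algebraically equivalent form of
# those weights, as this small self-contained sketch verifies:
import numpy as np

rng = np.random.RandomState(0)
eta = rng.normal(size=10)              # stand-in for decision_function(X)
w_cosh = 1 / (2 * (1 + np.cosh(eta)))  # form used in the code above
p = 1 / (1 + np.exp(-eta))             # predicted probabilities
w_logit = p * (1 - p)                  # textbook logistic weights
assert np.allclose(w_cosh, w_logit)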
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, remove_na=False, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples,). Must be binary.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if
        missing values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression`.

    Returns
    -------
    stats : dataframe or dict
        Logistic regression summary::

            'names' : name of variable(s) in the model (e.g. x1, x2...)
            'coef' : regression coefficients
            'se' : standard error
            'z' : z-scores
            'pval' : two-tailed p-values
            'CI[2.5%]' : lower confidence interval
            'CI[97.5%]' : upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    This is a wrapper around the
    :py:class:`sklearn.linear_model.LogisticRegression` class. Note that
    Pingouin automatically disables the L2 regularization applied by
    scikit-learn. This can be modified by changing the `penalty` argument.

    The calculation of the p-values and confidence interval is adapted from a
    code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Note that the first coefficient is always the constant term (intercept)
    of the model. Scikit-learn will automatically add the intercept to your
    predictor(s) matrix, therefore, :math:`X` should not include a constant
    term. Pingouin will remove any constant term (e.g. column with only one
    unique value), or duplicate columns from :math:`X`.

    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> lom.round(2)
           names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0  Intercept -0.27  0.37 -0.74  0.46     -1.00       0.45
    1         x1  0.07  0.32  0.21  0.84     -0.55       0.68

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.36736745 -0.04374684 -0.47829392]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.36736745, -0.04374684, -0.47829392])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42)
    >>> print(lom['coef'].values)
    [-0.36751796 -0.04367056 -0.47841908]
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], \
        'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    names.insert(0, "Intercept")
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher Information Matrix
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return pd.DataFrame(stats)
    else:
        return stats
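# The function silently drops constant and duplicate predictor columns
# before fitting, as described in the Notes. A quick sketch of that
# behaviour (data and names are illustrative):
import numpy as np

np.random.seed(123)
x = np.random.normal(size=30)
y = np.random.randint(0, 2, size=30)
# Constant column of ones + an exact duplicate of x:
X = np.column_stack((x, np.ones(30), x))

coef = logistic_regression(X, y, coef_only=True)
print(coef)  # only the intercept and a single 'x' coefficient remain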
def skipped_corr(x, y, vis=False, ax=None, color='blue', return_dist=False):
    """Skipped Pearson correlation with a bootstrapped confidence interval.

    Outliers are detected with the same MCD-based projection technique as in
    ``skipped``, then a Pearson correlation is computed on the remaining
    data points and its 95% CI is estimated by bootstrap resampling.
    """
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from scipy.stats import chi2
    from sklearn.covariance import MinCovDet
    X = np.column_stack((x, y))
    nrows, ncols = X.shape
    gval = np.sqrt(chi2.ppf(0.975, 2))

    # Compute center and distance to center
    center = MinCovDet(random_state=42).fit(X).location_
    B = X - center
    B2 = B**2
    bot = B2.sum(axis=1)

    # Loop over rows
    dis = np.zeros(shape=(nrows, nrows))
    for i in np.arange(nrows):
        if bot[i] != 0:  # Avoid division by zero error
            dis[i, :] = np.linalg.norm(B * B2[i, :] / bot[i], axis=1)

    def idealf(x):
        """Compute the ideal fourths IQR (Wilcox 2012)."""
        n = len(x)
        j = int(np.floor(n / 4 + 5 / 12))
        y = np.sort(x)
        g = (n / 4) - j + (5 / 12)
        low = (1 - g) * y[j - 1] + g * y[j]
        k = n - j + 1
        up = (1 - g) * y[k - 1] + g * y[k - 2]
        return up - low

    # One can either use the MAD or the IQR (see Wilcox 2012)
    # MAD = mad(dis, axis=1)
    iqr = np.apply_along_axis(idealf, 1, dis)
    thresh = np.median(dis, axis=1) + gval * iqr
    outliers = np.apply_along_axis(np.greater, 0, dis, thresh).any(axis=0)
    cloud = X[~outliers]

    # Bootstrap the Pearson correlation on the cleaned data
    R = np.random.RandomState(42)
    rs = np.zeros(10000)
    for i in range(10000):
        # _samp = np.random.choice(range(len(cloud)), size=len(cloud))
        _samp = R.choice(range(len(cloud)), size=len(cloud))
        rs[i] = pearsonr(cloud[_samp, 0], cloud[_samp, 1])[0]
    if rs.mean() > 0:
        p = (1 - np.mean(rs > 0)) * 2
    else:
        p = (1 - np.mean(rs < 0)) * 2
    r_pearson, _ = pearsonr(x[~outliers], y[~outliers])
    ci_l, ci_u = np.percentile(rs, [2.5, 97.5])

    # Scatter plot and regression lines
    if vis and ax is None:
        fig, ax = plt.subplots()
    if vis:
        sns.regplot(x[~outliers], y[~outliers], ax=ax, color=color,
                    scatter=False)
        ax.scatter(x, y, color=color, edgecolor=color)
    print('Skipped Pearson r = {}\n95% CI = [{}, {}], P = {}'.format(
        r_pearson.round(2), ci_l.round(2), ci_u.round(2), p.round(4)))
    if return_dist:
        return rs
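# A minimal usage sketch for ``skipped_corr`` on contaminated data. It
# assumes the module-level imports used above (numpy as np, pearsonr, and
# matplotlib/seaborn for the optional plot); data are illustrative.
import numpy as np

rng = np.random.RandomState(123)
x = rng.normal(size=100)
y = x + rng.normal(scale=0.5, size=100)
x[:5] += 5  # contaminate a few points

rs = skipped_corr(x, y, vis=False, return_dist=True)  # prints r, CI and p
print(rs.mean())  # mean of the bootstrap distribution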
def plot_full_skipped_corr(x, y, title, xlab, ylab):
    """Plot a full skipped-correlation summary: a scatter plot with the
    robust regression line and outliers highlighted, plus the bootstrap
    distribution of the Pearson correlation with its 95% CI."""
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from scipy.stats import chi2
    from sklearn.covariance import MinCovDet
    X = np.column_stack((x, y))
    nrows, ncols = X.shape
    gval = np.sqrt(chi2.ppf(0.975, 2))

    # Compute center and distance to center
    center = MinCovDet(random_state=42).fit(X).location_
    B = X - center
    B2 = B**2
    bot = B2.sum(axis=1)

    # Loop over rows
    dis = np.zeros(shape=(nrows, nrows))
    for i in np.arange(nrows):
        if bot[i] != 0:  # Avoid division by zero error
            dis[i, :] = np.linalg.norm(B * B2[i, :] / bot[i], axis=1)

    def idealf(x):
        """Compute the ideal fourths IQR (Wilcox 2012)."""
        n = len(x)
        j = int(np.floor(n / 4 + 5 / 12))
        y = np.sort(x)
        g = (n / 4) - j + (5 / 12)
        low = (1 - g) * y[j - 1] + g * y[j]
        k = n - j + 1
        up = (1 - g) * y[k - 1] + g * y[k - 2]
        return up - low

    # One can either use the MAD or the IQR (see Wilcox 2012)
    # MAD = mad(dis, axis=1)
    iqr = np.apply_along_axis(idealf, 1, dis)
    thresh = np.median(dis, axis=1) + gval * iqr
    outliers = np.apply_along_axis(np.greater, 0, dis, thresh).any(axis=0)
    cloud = X[~outliers]

    # Bootstrap the Pearson correlation on the cleaned data
    R = np.random.RandomState(42)
    rs = np.zeros(10000)
    for i in range(10000):
        _samp = R.choice(range(len(cloud)), size=len(cloud))
        rs[i] = pearsonr(cloud[_samp, 0], cloud[_samp, 1])[0]
    if rs.mean() > 0:
        p = (1 - np.mean(rs > 0)) * 2
    else:
        p = (1 - np.mean(rs < 0)) * 2
    r_pearson, _ = pearsonr(x[~outliers], y[~outliers])
    ci_l, ci_u = np.percentile(rs, [2.5, 97.5])

    fig, (ax1, ax3) = plt.subplots(2, figsize=(6, 10))
    # plt.subplots_adjust(wspace=0.3)
    sns.despine()

    # Scatter plot and regression lines
    sns.regplot(x[~outliers], y[~outliers], ax=ax1, color='darkcyan')
    ax1.scatter(x[outliers], y[outliers], color='indianred',
                label='outliers')
    ax1.scatter(x[~outliers], y[~outliers], color='seagreen', label='good')

    # Bootstrap distribution of the correlation coefficient
    sns.distplot(rs, kde=True, ax=ax3, color='steelblue')
    for i in [ci_l, ci_u]:
        ax3.axvline(x=i, color='coral', lw=2)
    ax3.axvline(x=0, color='k', ls='--', lw=1.5)
    ax3.set_xlabel('Correlation coefficient')
    ax3.set_title(
        'Skipped Pearson r = {}\n95% CI = [{}, {}], P = {}'.format(
            r_pearson.round(2), ci_l.round(2), ci_u.round(2), p.round(4)),
        y=1.05)

    ax1.set_xlim([i * 1.2 for i in ax1.get_xlim()])
    ax1.set_title(title)
    ax1.set_xlabel(xlab)
    ax1.set_ylabel(ylab)

    # Optimize layout
    plt.tight_layout()
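# A minimal usage sketch for ``plot_full_skipped_corr``, assuming matplotlib
# and seaborn are importable as the module-level ``plt`` and ``sns`` used
# above; data and labels are illustrative.
import numpy as np

rng = np.random.RandomState(42)
x = rng.normal(size=100)
y = x + rng.normal(scale=0.5, size=100)

plot_full_skipped_corr(x, y, title='Skipped correlation',
                       xlab='x', ylab='y')
# plt.show()  # uncomment when running outside a notebook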