def plot_kde(df, treatment, probability, measure='probability', bw_method='scott', fill=True, color_e='b',
             color_u='r'):
    """Draw kernel density curves of the predicted treatment probabilities, split by treatment received.

    The figure is a qualitative check for possible positivity violations and is an alternative to the
    stratified boxplot. Densities are estimated with SciPy's ``gaussian_kde``.

    Parameters
    ------------
    df : DataFrame
        Pandas dataframe containing the variables of interest
    treatment : str
        Column name of the treatment variable. Rows equal to 1 form one curve, rows equal to 0 the other
    probability : str
        Column name of the predicted probability of treatment. Missing values are dropped before estimation
    measure : str, optional
        Scale to plot. 'probability' (default) evaluates the densities on [0, 1]; 'logit' converts the
        probabilities to log-odds and evaluates the densities from (smallest log-odds - 1) to
        (largest log-odds + 1)
    bw_method : str, optional
        Bandwidth method handed to ``gaussian_kde``. Default is 'scott'; 'silverman' is the other named rule
    fill : bool, optional
        Whether to shade the area under the density curves. Default is True
    color_e : str, optional
        Color of the line/area for the treated group (treatment == 1). Default is 'b'
    color_u : str, optional
        Color of the line/area for the untreated group (treatment == 0). Default is 'r'

    Returns
    ---------------
    matplotlib axes

    Raises
    ---------------
    ValueError
        If `measure` is neither 'probability' nor 'logit'
    """
    on_logit = measure == 'logit'
    if not on_logit and measure != 'probability':
        raise ValueError("Only plots of probabilities or log-odds are supported. Please specify either "
                         "'probability' or 'logit'")

    def _stratum(level):
        # Values to smooth for the rows whose treatment equals `level`, on the requested scale
        values = df.loc[df[treatment] == level][probability].dropna()
        return np.log(probability_to_odds(values)) if on_logit else values

    treated = _stratum(1)
    kde_treated = gaussian_kde(treated, bw_method=bw_method)
    untreated = _stratum(0)
    kde_untreated = gaussian_kde(untreated, bw_method=bw_method)

    if on_logit:
        # Log-odds are unbounded, so pad the observed range by one unit on each side
        lower = np.min((np.min(treated), np.min(untreated))) - 1
        upper = np.max((np.max(treated), np.max(untreated))) + 1
        grid = np.linspace(lower, upper, 10000)
        x_label = 'Log-Odds'
    else:
        grid = np.linspace(0, 1, 10000)
        x_label = 'Probability'

    ax = plt.gca()
    # Evaluate each density once and reuse it for both the shaded area and the outline
    curves = (
        (kde_treated(grid), color_e, 'Treat = 1'),
        (kde_untreated(grid), color_u, 'Treat = 0'),
    )
    if fill:
        for heights, shade, _ in curves:
            ax.fill_between(grid, heights, color=shade, alpha=0.2, label=None)
    for heights, shade, name in curves:
        ax.plot(grid, heights, color=shade, label=name)

    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')
    ax.legend()
    return ax
def plot_boxplot(df, treatment, probability, measure='probability'):
    """Generates a stratified boxplot that can be used to visually check whether positivity may be violated,
    qualitatively. Alternative option to the kernel density plot.

    Parameters
    ----------
    df : DataFrame
        Pandas dataframe containing the variables of interest
    treatment : str
        Column name of the treatment variable. One box is drawn for rows equal to 1 and one for rows equal to 0
    probability : str
        Column name of the predicted probability of treatment. Missing values are dropped
    measure : str, optional
        Measure to plot. Options include either the probabilities or log-odds stratified by treatment received.
        Default is probabilities (measure='probability'). Log-odds can be requested via measure='logit'

    Returns
    -------------
    matplotlib axes

    Raises
    -------------
    ValueError
        If `measure` is neither 'probability' nor 'logit'
    """
    # Validate first so an unsupported measure fails before any data access or plotting
    if measure not in ('probability', 'logit'):
        raise ValueError("Only plots of probabilities or log-odds are supported. Please specify either "
                         "'probability' or 'logit'")

    treated = df.loc[df[treatment] == 1, probability].dropna()
    untreated = df.loc[df[treatment] == 0, probability].dropna()
    if measure == 'logit':
        treated = np.log(probability_to_odds(treated))
        untreated = np.log(probability_to_odds(untreated))
    boxes = (treated, untreated)

    labs = ['A = 1', 'A = 0']
    # The mean of each box is marked with a solid black diamond
    meanpointprops = dict(marker='D', markeredgecolor='black', markerfacecolor='black')
    ax = plt.gca()
    ax.boxplot(boxes, labels=labs, meanprops=meanpointprops, showmeans=True)
    if measure == 'probability':
        ax.set_ylabel('Probability')
        ax.set_ylim([0, 1])  # probabilities are shown on their full [0, 1] range
    else:
        ax.set_ylabel('Log-Odds')
    return ax
def plot_boxplot(self, measure='probability'):
    """Generates a stratified boxplot that can be used to visually check whether positivity may be violated,
    qualitatively. Alternative option to the kernel density plot.

    The plotted values are read from the '__denom__' column of `self.df`, split by whether the column named by
    `self.ex` equals 1 or 0. Missing values are dropped.

    Parameters
    ----------
    measure : str, optional
        Measure to plot. Options include either the probabilities or log-odds stratified by treatment received.
        Default is probabilities (measure='probability'). Log-odds can be requested via measure='logit'

    Returns
    -------------
    matplotlib axes

    Raises
    -------------
    ValueError
        If `measure` is neither 'probability' nor 'logit'
    """
    # Validate first so an unsupported measure fails before any data access or plotting
    if measure not in ('probability', 'logit'):
        raise ValueError("Only plots of probabilities or log-odds are supported. Please specify either "
                         "'probability' or 'logit'")

    # NOTE(review): '__denom__' is presumably filled by an earlier model-fitting step - confirm against the class
    treated = self.df.loc[self.df[self.ex] == 1, '__denom__'].dropna()
    untreated = self.df.loc[self.df[self.ex] == 0, '__denom__'].dropna()
    if measure == 'logit':
        treated = np.log(probability_to_odds(treated))
        untreated = np.log(probability_to_odds(untreated))
    boxes = (treated, untreated)

    labs = ['Treat = 1', 'Treat = 0']
    # The mean of each box is marked with a solid black diamond
    meanpointprops = dict(marker='D', markeredgecolor='black', markerfacecolor='black')
    ax = plt.gca()
    ax.boxplot(boxes, labels=labs, meanprops=meanpointprops, showmeans=True)
    if measure == 'probability':
        ax.set_ylabel('Probability')
    else:
        ax.set_ylabel('Log-Odds')
    return ax
def fit(self):
    """Estimates risk difference, risk ratio, and odds ratio based on the gAW and QAW. If a continuous outcome,
    then the average treatment effect is returned. Confidence intervals come from influence curves

    When missing outcome data is flagged and a missing-data model has been fit, the exposure probabilities
    are multiplied by the corresponding `m1W` / `m0W` values before the clever covariates are built. When
    missing data is flagged but no missing-data model has been fit, a `UserWarning` is issued and the
    exposure probabilities are used unchanged.

    Returns
    -------
    `TMLE` gains `risk_difference`, `risk_ratio`, and `odds_ratio` (each with `_se` and `_ci` companions) for
    binary outcomes and `average_treatment_effect` (with `_se` and `_ci`) for continuous outcomes. The
    attributes `g1W_total`, `g0W_total` and `_epsilon` are also set

    Raises
    ------
    ValueError
        If either the exposure model or the outcome model flag is False
    """
    if (self._fit_exposure_model is False) or (self._fit_outcome_model is False):
        raise ValueError('The exposure and outcome models must be specified before the psi estimate can '
                         'be generated')
    if self._miss_flag and not self._fit_missing_model:
        # Adjacent literals are concatenated as-is, so each piece must carry its own trailing space
        warnings.warn("No missing data model has been specified. All missing outcome data is assumed to be "
                      "missing completely at random. To relax this assumption to outcome data is missing at random "
                      "please use the `missing_model()` function", UserWarning)

    # Step 4) Calculating clever covariate (HAW)
    if self._miss_flag and self._fit_missing_model:
        self.g1W_total = self.g1W * self.m1W
        self.g0W_total = self.g0W * self.m0W
    else:
        self.g1W_total = self.g1W
        self.g0W_total = self.g0W
    H1W = self.df[self.exposure] / self.g1W_total
    H0W = -(1 - self.df[self.exposure]) / self.g0W_total
    HAW = H1W + H0W

    # Step 5) Estimating TMLE
    # Intercept-free logistic model on the two clever covariates with logit(QAW) as the offset
    f = sm.families.family.Binomial()
    y = self.df[self.outcome]
    log = sm.GLM(y, np.column_stack((H1W, H0W)), offset=np.log(probability_to_odds(self.QAW)),
                 family=f, missing='drop').fit()
    self._epsilon = log.params
    Qstar1 = logistic.cdf(np.log(probability_to_odds(self.QA1W)) + self._epsilon[0] / self.g1W_total)
    Qstar0 = logistic.cdf(np.log(probability_to_odds(self.QA0W)) - self._epsilon[1] / self.g0W_total)
    Qstar = log.predict(np.column_stack((H1W, H0W)), offset=np.log(probability_to_odds(self.QAW)))

    # Step 6) Calculating Psi
    if self.alpha == 0.05:  # Without this, won't match R exactly. R relies on 1.96, while I use SciPy
        zalpha = 1.96
    else:
        zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

    # p-values are not implemented (doing my part to enforce CL over p-values)
    # delta selects, row by row, which influence-curve expression is used below
    delta = np.where(self.df[self._missing_indicator] == 1, 1, 0)
    if self._continuous_outcome:
        # Calculating Average Treatment Effect
        Qstar = self._unit_unbound(Qstar, mini=self._continuous_min, maxi=self._continuous_max)
        Qstar1 = self._unit_unbound(Qstar1, mini=self._continuous_min, maxi=self._continuous_max)
        Qstar0 = self._unit_unbound(Qstar0, mini=self._continuous_min, maxi=self._continuous_max)

        self.average_treatment_effect = np.nanmean(Qstar1 - Qstar0)
        # Influence Curve for CL
        y_unbound = self._unit_unbound(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max)
        ic = np.where(delta == 1,
                      HAW * (y_unbound - Qstar) + (Qstar1 - Qstar0) - self.average_treatment_effect,
                      Qstar1 - Qstar0 - self.average_treatment_effect)
        seIC = np.sqrt(np.nanvar(ic, ddof=1) / self.df.shape[0])
        self.average_treatment_effect_se = seIC
        self.average_treatment_effect_ci = [self.average_treatment_effect - zalpha * seIC,
                                            self.average_treatment_effect + zalpha * seIC]

    else:
        # Calculating Risk Difference
        self.risk_difference = np.nanmean(Qstar1 - Qstar0)
        # Influence Curve for CL
        ic = np.where(delta == 1,
                      HAW * (y - Qstar) + (Qstar1 - Qstar0) - self.risk_difference,
                      (Qstar1 - Qstar0) - self.risk_difference)
        seIC = np.sqrt(np.nanvar(ic, ddof=1) / self.df.shape[0])
        self.risk_difference_se = seIC
        self.risk_difference_ci = [self.risk_difference - zalpha * seIC,
                                   self.risk_difference + zalpha * seIC]

        # Calculating Risk Ratio
        self.risk_ratio = np.nanmean(Qstar1) / np.nanmean(Qstar0)
        # Influence Curve for CL
        # NOTE(review): this expression uses np.mean while the point estimate uses np.nanmean - confirm that
        # Qstar1 / Qstar0 cannot contain NaN here
        # NOTE(review): the delta != 1 branch adds the two centered terms without the 1 / mean scaling used in
        # the delta == 1 branch - confirm this is intended
        ic = np.where(delta == 1,
                      (1 / np.mean(Qstar1) * (H1W * (y - Qstar) + Qstar1 - np.mean(Qstar1)) -
                       (1 / np.mean(Qstar0)) * (-1 * H0W * (y - Qstar) + Qstar0 - np.mean(Qstar0))),
                      (Qstar1 - np.mean(Qstar1)) + Qstar0 - np.mean(Qstar0))
        seIC = np.sqrt(np.nanvar(ic, ddof=1) / self.df.shape[0])
        self.risk_ratio_se = seIC
        # Limits are built on the log scale and exponentiated
        self.risk_ratio_ci = [np.exp(np.log(self.risk_ratio) - zalpha * seIC),
                              np.exp(np.log(self.risk_ratio) + zalpha * seIC)]

        # Calculating Odds Ratio
        self.odds_ratio = (np.nanmean(Qstar1) / (1 - np.nanmean(Qstar1))) / (
            np.nanmean(Qstar0) / (1 - np.nanmean(Qstar0)))
        # Influence Curve for CL
        ic = np.where(delta == 1,
                      ((1 / (np.nanmean(Qstar1) * (1 - np.nanmean(Qstar1))) *
                        (H1W * (y - Qstar) + Qstar1)) -
                       (1 / (np.nanmean(Qstar0) * (1 - np.nanmean(Qstar0))) *
                        (-1 * H0W * (y - Qstar) + Qstar0))),
                      ((1 / (np.nanmean(Qstar1) * (1 - np.nanmean(Qstar1))) * Qstar1 -
                        (1 / (np.nanmean(Qstar0) * (1 - np.nanmean(Qstar0))) * Qstar0))))
        seIC = np.sqrt(np.nanvar(ic, ddof=1) / self.df.shape[0])
        self.odds_ratio_se = seIC
        # Limits are built on the log scale and exponentiated
        self.odds_ratio_ci = [np.exp(np.log(self.odds_ratio) - zalpha * seIC),
                              np.exp(np.log(self.odds_ratio) + zalpha * seIC)]
def fit(self):
    """Run the targeting step and compute the risk difference, risk ratio and odds ratio.

    The clever covariates are built from `g1W` and `g0W`, a logistic model with logit(QAW) as its offset is
    fit to obtain epsilon, and the updated predictions are averaged into the three measures. Confidence
    limits are computed from the sample variance of each influence curve; the ratio measures have their
    limits built on the log scale and exponentiated.

    Returns
    -------
    TMLE gains `risk_difference`, `risk_ratio`, `odds_ratio` and a matching `*_ci` list for each. `_epsilon`
    is also set

    Raises
    ------
    ValueError
        If either the exposure model or the outcome model flag is False
    """
    if (self._fit_exposure_model is False) or (self._fit_outcome_model is False):
        raise ValueError('The exposure and outcome models must be specified before the psi estimate can '
                         'be generated')

    exposure = self.df[self._exposure]
    outcome = self.df[self._outcome]
    n_obs = self.df.shape[0]

    # Step 4) Calculating clever covariate (HAW)
    H1W = exposure / self.g1W
    H0W = -(1 - exposure) / (self.g0W)
    HAW = H1W + H0W
    design = np.column_stack((H1W, H0W))
    logit_QAW = np.log(probability_to_odds(self.QAW))

    # Step 5) Estimating TMLE
    targeting = sm.GLM(outcome, design, offset=logit_QAW, family=sm.families.family.Binomial()).fit()
    self._epsilon = targeting.params
    Qstar1 = logistic.cdf(np.log(probability_to_odds(self.QA1W)) + self._epsilon[0] * 1 / self.g1W)
    Qstar0 = logistic.cdf(np.log(probability_to_odds(self.QA0W)) + self._epsilon[1] * -1 / self.g0W)
    Qstar = targeting.predict(design, offset=logit_QAW)

    # Step 6) Calculating Psi
    # Without the hard-coded 1.96, won't match R exactly. R relies on 1.96, while SciPy is used otherwise
    zalpha = 1.96 if self.alpha == 0.05 else norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

    # p-values are not implemented (doing my part to enforce CL over p-values)
    def _standard_error(curve):
        # Standard error from the sample variance of an influence curve
        return math.sqrt(np.var(curve, ddof=1) / n_obs)

    residual = outcome - Qstar
    mean_Q1 = np.mean(Qstar1)
    mean_Q0 = np.mean(Qstar0)

    # Calculating Risk Difference
    self.risk_difference = np.mean(Qstar1 - Qstar0)
    se_rd = _standard_error(HAW * residual + (Qstar1 - Qstar0) - self.risk_difference)
    self.risk_difference_ci = [self.risk_difference - zalpha * se_rd,
                               self.risk_difference + zalpha * se_rd]

    # Calculating Risk Ratio
    self.risk_ratio = mean_Q1 / mean_Q0
    se_rr = _standard_error(1 / mean_Q1 * (H1W * residual + Qstar1 - mean_Q1) -
                            (1 / mean_Q0) * (-1 * H0W * residual + Qstar0 - mean_Q0))
    self.risk_ratio_ci = [np.exp(np.log(self.risk_ratio) - zalpha * se_rr),
                          np.exp(np.log(self.risk_ratio) + zalpha * se_rr)]

    # Calculating Odds Ratio
    self.odds_ratio = (mean_Q1 / (1 - mean_Q1)) / (mean_Q0 / (1 - mean_Q0))
    se_or = _standard_error((1 / (mean_Q1 * (1 - mean_Q1)) * (H1W * residual + Qstar1)) -
                            (1 / (mean_Q0 * (1 - mean_Q0)) * (-1 * H0W * residual + Qstar0)))
    self.odds_ratio_ci = [np.exp(np.log(self.odds_ratio) - zalpha * se_or),
                          np.exp(np.log(self.odds_ratio) + zalpha * se_or)]
def test_forth_and_back_conversions(self):
    """Converting odds to a probability and back should return the starting odds."""
    starting_odds = 1.1
    round_trip = probability_to_odds(odds_to_probability(starting_odds))
    npt.assert_allclose(starting_odds, round_trip)
def test_back_and_forth_conversions(self):
    """Converting a probability to odds and back should return the starting probability."""
    starting_probability = 0.12
    round_trip = odds_to_probability(probability_to_odds(starting_probability))
    npt.assert_allclose(starting_probability, round_trip)
def test_probability_to_odds(self):
    """A probability of 0.5 should convert to odds of exactly 1."""
    assert probability_to_odds(0.5) == 1
def fit(self):
    """Estimates psi based on the gAW and QAW. Confidence intervals come from the influence curve

    The parameter that is estimated is selected by `self._psi_correspond`, which must be one of
    'risk_difference', 'risk_ratio' or 'odds_ratio'. For the two ratio measures the confidence limits are
    built on the log scale and exponentiated.

    Returns
    -------
    TMLE gains `psi` and `confint`. `_epsilon` is also set

    Raises
    ------
    ValueError
        If either the exposure model or the outcome model flag is False, or if `self._psi_correspond` is not
        one of the supported options (this second check happens after the targeting model has been fit)
    """
    # Both nuisance models are required; each flag must be checked
    if (self._fit_exposure_model is False) or (self._fit_outcome_model is False):
        raise ValueError('The exposure and outcome models must be specified before the psi estimate can '
                         'be generated')

    # Calculating clever covariates
    H1W = self.df[self._exposure] / self.gW
    H0W = (1 - self.df[self._exposure]) / (1 - self.gW)

    # Fitting logistic model with QAW offset
    f = sm.families.family.Binomial(sm.families.links.logit)
    log = sm.GLM(self.df[self._outcome], np.column_stack((H1W, H0W)),
                 offset=np.log(probability_to_odds(self.QAW)), family=f).fit()
    self._epsilon = log.params

    # Getting Qn*
    Qstar1 = logistic.cdf(np.log(probability_to_odds(self.QA1W)) + self._epsilon[0] * (1 / self.gW))
    Qstar0 = logistic.cdf(np.log(probability_to_odds(self.QA0W)) + self._epsilon[1] * (1 / (1 - self.gW)))

    # Estimating parameter
    if self._psi_correspond == 'risk_difference':
        self.psi = np.mean(Qstar1 - Qstar0)
    elif self._psi_correspond == 'risk_ratio':
        self.psi = np.mean(Qstar1) / np.mean(Qstar0)
    elif self._psi_correspond == 'odds_ratio':
        self.psi = (np.mean(Qstar1) / (1 - np.mean(Qstar1))) / (np.mean(Qstar0) / (1 - np.mean(Qstar0)))
    else:
        raise ValueError('Specified parameter is not implemented. Please use one of the available options')

    # Getting influence curve
    # Any other value of _psi_correspond has already raised above, so these three branches are exhaustive
    zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
    if self._psi_correspond == 'risk_difference':
        ic = ((self.df[self._exposure] / self.gW - (1 - self.df[self._exposure]) / (1 - self.gW)) *
              (self.df[self._outcome] - self.QAW) + self.QA1W - self.QA0W - (np.mean(Qstar1) - np.mean(Qstar0)))
        varIC = np.var(ic, ddof=1) / self.df.shape[0]
        self.confint = [self.psi - zalpha * math.sqrt(varIC),
                        self.psi + zalpha * math.sqrt(varIC)]
    elif self._psi_correspond == 'risk_ratio':
        ic = ((1 / np.mean(Qstar1)) * (self.df[self._exposure] / self.gW * (self.df[self._outcome] - self.QAW) +
                                       self.QA1W - np.mean(Qstar1)) -
              (1 / np.mean(Qstar0)) * ((1 - self.df[self._exposure]) / (1 - self.gW) *
                                       (self.df[self._outcome] - self.QAW) + self.QA0W - np.mean(Qstar0)))
        varIC = np.var(ic, ddof=1) / self.df.shape[0]
        self.confint = [np.exp(np.log(self.psi) - zalpha * math.sqrt(varIC)),
                        np.exp(np.log(self.psi) + zalpha * math.sqrt(varIC))]
    elif self._psi_correspond == 'odds_ratio':
        # NOTE(review): here QA1W / QA0W sit inside the weighted residual term, unlike the risk-ratio
        # expression above where they are added outside it - confirm the parenthesization is intended
        ic = (1 / (np.mean(Qstar1) * (1 - np.mean(Qstar1))) *
              (self.df[self._exposure] / self.gW * (self.df[self._outcome] - self.QAW + self.QA1W)) -
              1 / (np.mean(Qstar0) * (1 - np.mean(Qstar0))) *
              ((1 - self.df[self._exposure]) / (1 - self.gW) * (self.df[self._outcome] - self.QAW + self.QA0W)))
        varIC = np.var(ic, ddof=1) / self.df.shape[0]
        self.confint = [np.exp(np.log(self.psi) - zalpha * math.sqrt(varIC)),
                        np.exp(np.log(self.psi) + zalpha * math.sqrt(varIC))]