def exposure_model(self, model, custom_model=None, bound=False, print_results=True):
    """Estimate the exposure mechanism Pr(A=1|L), written g(A=1|L) in the literature.

    The predicted probabilities of exposure are stored as ``g1W`` and their complements as ``g0W``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure. Example) 'var1 + var2 + var3'
    custom_model : optional
        Custom model used in place of the logit model (default). The model must have the "fit()" and
        "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
        background, TMLE fits the custom model and generates the predicted probabilities
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, truncating weights leads to additional confounding. Default is False, meaning no truncation
        of predicted probabilities occurs. Providing a single float assumes symmetric truncation, where
        values below or above the threshold are set to the threshold value. Alternatively a list of floats
        can be provided for asymmetric truncation, with the first value being the lower bound and the second
        being the upper bound
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints results)
    """
    self._exp_model = self.exposure + ' ~ ' + model
    self.__mweight = model

    # Step 3) Estimation of g-model (exposure model)
    if custom_model is not None:
        # User-specified prediction model
        # TODO need to create smart warning system (TMLE can result in confidence intervals below nominal
        #  coverage when used with certain machine learning algorithms)
        self._exp_model_custom = True
        # '- 1' removes the intercept column from the design matrix handed to the custom learner
        design = patsy.dmatrix(model + ' - 1', self.df)
        self.g1W = exposure_machine_learner(xdata=np.asarray(design),
                                            ydata=np.asarray(self.df[self.exposure]),
                                            ml_model=custom_model,
                                            print_results=print_results)
    else:
        # Default: logistic regression for the propensity score
        logit_fit = propensity_score(self.df, self._exp_model, print_results=print_results)
        self.g1W = logit_fit.predict(self.df)

    # The complement is taken BEFORE bounding, so each probability is truncated on its own
    self.g0W = 1 - self.g1W
    if bound:
        self.g1W = _bounding_(self.g1W, bounds=bound)
        self.g0W = _bounding_(self.g0W, bounds=bound)

    self._fit_exposure_model = True
def exposure_model(self, model, bound=False, print_results=True):
    r"""Specify the propensity score / inverse probability weight model.

    The exposure is predicted via a logistic regression model, which estimates

    .. math::

        \widehat{\Pr}(A=1|L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta} L)

    The predicted probabilities are written to the ``'_g1_'`` column of the data and their complements to
    the ``'_g0_'`` column.

    Parameters
    ----------
    model : str
        Independent variables to predict the exposure. For example, 'var1 + var2 + var3'
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, truncating weights leads to additional confounding. Default is False, meaning no truncation
        of predicted probabilities occurs. Providing a single float assumes symmetric truncation, where
        values below or above the threshold are set to the threshold value. Alternatively a list of floats
        can be provided for asymmetric truncation, with the first value being the lower bound and the second
        being the upper bound
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints results)
    """
    self.__mweight = model
    self._exp_model = self.exposure + ' ~ ' + model

    # Only the first returned value (the denominator probabilities) is used here; bounds are deliberately
    # not passed to the calculator (bound=None) and are applied below instead
    pr_exposed, _, _ = iptw_calculator(df=self.df,
                                       treatment=self.exposure,
                                       model_denom=model,
                                       model_numer='1',
                                       weight=self._weight_,
                                       stabilized=False,
                                       standardize='population',
                                       bound=None,
                                       print_results=print_results)

    self.df['_g1_'] = pr_exposed
    self.df['_g0_'] = 1 - pr_exposed

    # Applying bounds AFTER extracting g1 and g0
    if bound:
        for column in ('_g1_', '_g0_'):
            self.df[column] = _bounding_(self.df[column], bounds=bound)

    self._fit_exposure_ = True
def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False,
                  print_results=True):
    """Estimate Pr(M=0|A=a,L), the missing data mechanism for the outcome.

    The corresponding observation probabilities are used to account for informative censoring (missing data
    on the outcome) by observed variables. Only missing outcome data is addressed. The parametric model
    should be sufficiently flexible to capture any interaction terms and functional forms of continuous
    variables. The resulting weights are stored as ``ipmw``; rows where the missingness indicator is not 1
    receive NaN.

    Note
    ----
    The treatment variable should be included in the model

    Parameters
    ----------
    model_denominator : str
        String listing variables predicting missingness of outcomes via `patsy` syntax. For example,
        `'var1 + var2 + var3'`. This is for the predicted probabilities of the denominator
    model_numerator : str, optional
        Optional string listing variables for the numerator model, separated by +. Only used to calculate
        the numerator. Default (None) uses the treatment only. In general this is recommended. If assessing
        effect modification, this variable should be included in the numerator as well. Argument is only
        used when calculating stabilized weights
    stabilized : bool, optional
        Whether to use stabilized inverse probability of censoring weights
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, inference becomes limited to the restricted population. Default is False, meaning no
        truncation of predicted probabilities occurs. Providing a single float assumes symmetric truncation.
        A collection of floats can be provided for asymmetric truncation
    print_results : bool, optional
        Whether to print the fitted model results. Default is True

    Raises
    ------
    ValueError
        If no missing outcome data is flagged for the data set
    """
    # Error if no missing outcome data
    if not self._miss_flag:
        raise ValueError("No missing outcome data is present in the data set")

    # Warning if exposure is not included in the missingness of outcome model (plain substring check)
    if self.treatment not in model_denominator:
        warnings.warn("For the specified missing outcome model, the exposure variable should be included in the "
                      "model", UserWarning)

    # Denominator model
    # NOTE(review): unlike the numerator model below, this fit does not pass weights=self._weight_ --
    #  confirm that is intended
    self._miss_model = self._missing_indicator + ' ~ ' + model_denominator
    denominator_fit = propensity_score(self.df, self._miss_model, print_results=print_results)

    # Numerator: a fitted stabilization model, or the constant 1 for unstabilized weights
    if stabilized:
        numerator_terms = self.treatment if model_numerator is None else model_numerator
        numerator_fit = propensity_score(self.df, self._missing_indicator + ' ~ ' + numerator_terms,
                                         weights=self._weight_, print_results=print_results)
        n = numerator_fit.predict(self.df)
    else:
        n = 1

    # Only the denominator probabilities are bounded
    d = denominator_fit.predict(self.df)
    if bound:
        d = _bounding_(d, bounds=bound)

    self.ipmw = np.where(self.df[self._missing_indicator] == 1, n / d, np.nan)
    self._fit_missing_ = True
def outcome_model(self, model, custom_model=None, bound=False, print_results=True,
                  continuous_distribution='gaussian'):
    """Estimate E(Y|A,L,M=1), also written Q(A,W,M=1) or Pr(Y=1|A,W,M=1).

    The model is fit on complete observations only, then used to predict the outcome for every row with the
    exposure set to 1 (stored as ``QA1W``) and to 0 (stored as ``QA0W``). ``QAW`` holds the prediction
    under the observed exposure.

    Parameters
    ----------
    model : str
        Independent variables to predict the outcome. Example) 'var1 + var2 + var3'
    custom_model : optional
        Custom model used in place of the default regression model. The model must have the "fit()" and
        "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
        background, TMLE fits the custom model and generates the predictions
    bound : float, list, optional
        This argument should ONLY be used if the outcome is continuous. Value between 0,1 to truncate the
        bounded predicted outcomes. Default is `False`, in which case the bounds stored on the estimator
        (``_cb``) are applied so that predictions cannot fall outside the bounded continuous outcome range.
        Providing a single float assumes symmetric truncation. A list of floats can be provided for
        asymmetric truncation
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints results)
    continuous_distribution : str, optional
        Distribution to use for continuous outcomes. Options are 'gaussian' (or 'normal') for normal
        distributions and 'poisson' for Poisson distributions. Only consulted when no custom model is given

    Raises
    ------
    ValueError
        If the outcome is continuous and `continuous_distribution` is not a supported option
    """
    self._out_model = self.outcome + ' ~ ' + model

    # Restrict to complete cases when missing outcome data was flagged
    complete_cases = self.df.copy()
    if self._miss_flag:
        complete_cases = complete_cases.dropna()

    def _counterfactual(level):
        # Copy of the full data with every row's exposure set to `level`
        frame = self.df.copy()
        frame[self.exposure] = level
        return frame

    # Step 1) Prediction for Q (estimation of Q-model)
    if custom_model is None:
        self._continuous_type = continuous_distribution
        if not self._continuous_outcome:
            family = sm.families.family.Binomial()
        elif continuous_distribution in ('gaussian', 'normal'):
            family = sm.families.family.Gaussian()
        elif continuous_distribution == 'poisson':
            family = sm.families.family.Poisson()
        else:
            raise ValueError("Only 'gaussian' and 'poisson' distributions are supported")

        fitted = smf.glm(self._out_model, complete_cases, family=family).fit()

        if print_results:
            print('\n----------------------------------------------------------------')
            print('MODEL: ' + self._out_model)
            print('-----------------------------------------------------------------')
            print(fitted.summary())

        # Step 2) Estimation under the scenarios
        self.QA1W = fitted.predict(_counterfactual(1))
        self.QA0W = fitted.predict(_counterfactual(0))

    else:
        # User-specified model
        # TODO need to create smart warning system (TMLE can result in confidence intervals below nominal
        #  coverage when used with certain machine learning algorithms)
        self._out_model_custom = True
        design_formula = model + ' - 1'  # '- 1' removes the intercept column
        observed_design = patsy.dmatrix(design_formula, complete_cases)
        exposed_design = patsy.dmatrix(design_formula, _counterfactual(1))
        unexposed_design = patsy.dmatrix(design_formula, _counterfactual(0))
        self.QA1W, self.QA0W = outcome_machine_learner(xdata=np.asarray(observed_design),
                                                       ydata=np.asarray(complete_cases[self.outcome]),
                                                       all_a=exposed_design,
                                                       none_a=unexposed_design,
                                                       ml_model=custom_model,
                                                       continuous=self._continuous_outcome,
                                                       print_results=print_results)

    # Bounding always happens: user-specified bounds if given, otherwise the estimator's stored bounds.
    # This step prevents continuous outcomes from being outside the range
    limits = bound if bound else self._cb
    self.QA1W = _bounding_(self.QA1W, bounds=limits)
    self.QA0W = _bounding_(self.QA0W, bounds=limits)

    observed_exposure = self.df[self.exposure]
    self.QAW = self.QA1W * observed_exposure + self.QA0W * (1 - observed_exposure)
    self._fit_outcome_model = True
def missing_model(self, model, custom_model=None, bound=False, print_results=True):
    """Estimate Pr(M=1|A,L), the missing data mechanism for the outcome.

    The corresponding observation probabilities are used to update the clever covariates for estimation of
    Qn. The initial estimate of Q is still based on complete observations only. Predictions with the
    exposure set to 1 are stored as ``m1W`` and with the exposure set to 0 as ``m0W``.

    Parameters
    ----------
    model : str
        Independent variables to predict missingness of the outcome. Example) 'var1 + var2 + var3'. The
        treatment must be included for the missing data model
    custom_model : optional
        Custom model used in place of the logit model (default). The model must have the "fit()" and
        "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
        background, TMLE fits the custom model and generates the predicted probabilities
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, truncating weights leads to additional confounding. Default is False, meaning no truncation
        of predicted probabilities occurs. Providing a single float assumes symmetric truncation, where
        values below or above the threshold are set to the threshold value. Alternatively a list of floats
        can be provided for asymmetric truncation, with the first value being the lower bound and the second
        being the upper bound
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints results)

    Raises
    ------
    ValueError
        If no missing outcome data is flagged for the data set
    """
    # Error if no missing outcome data
    if not self._miss_flag:
        raise ValueError("No missing outcome data is present in the data set")

    # Warning if exposure is not included in the missingness of outcome model (plain substring check)
    if self.exposure not in model:
        warnings.warn("For the specified missing outcome model, the exposure variable should be included in the "
                      "model", UserWarning)

    self._miss_model = self._missing_indicator + ' ~ ' + model

    def _with_exposure(level):
        # Copy of the full data with every row's exposure set to `level`
        frame = self.df.copy()
        frame[self.exposure] = level
        return frame

    # Step 3b) Prediction for M if missing outcome data exists
    if custom_model is not None:
        # User-specified model
        # TODO need to create smart warning system (TMLE can result in confidence intervals below nominal
        #  coverage when used with certain machine learning algorithms)
        self._miss_model_custom = True
        design_formula = model + ' - 1'  # '- 1' removes the intercept column
        observed_design = patsy.dmatrix(design_formula, self.df)
        exposed_design = patsy.dmatrix(design_formula, _with_exposure(1))
        unexposed_design = patsy.dmatrix(design_formula, _with_exposure(0))
        self.m1W, self.m0W = missing_machine_learner(xdata=np.array(observed_design),
                                                     mdata=self.df[self._missing_indicator],
                                                     all_a=exposed_design,
                                                     none_a=unexposed_design,
                                                     ml_model=custom_model,
                                                     print_results=print_results)
    else:
        # Logistic regression model for predictions
        logit_fit = propensity_score(self.df, self._miss_model, print_results=print_results)
        self.m1W = logit_fit.predict(_with_exposure(1))
        self.m0W = logit_fit.predict(_with_exposure(0))

    # Bounding predicted probabilities if requested
    if bound:
        self.m1W = _bounding_(self.m1W, bounds=bound)
        self.m0W = _bounding_(self.m0W, bounds=bound)

    self._fit_missing_model = True
def missing_model(self, model, bound=False, print_results=True):
    r"""Estimate Pr(M=0|A,L), the missing data mechanism for the outcome.

    Predicted probabilities are used to create inverse probability of censoring weights to account for
    informative missing data on the outcome. Missing weights take the following form

    .. math::

        \frac{1}{\Pr(C=0|A=a, L)}

    Probabilities are calculated for both A=1 and A=0. This method stores the predicted probabilities from
    the fitted model in the ``'_ipmw_a1_'`` (exposure set to 1) and ``'_ipmw_a0_'`` (exposure set to 0)
    columns of the data; rows where the missingness indicator is not 1 are set to NaN.

    Note
    ----
    The treatment variable should be included in the model

    Parameters
    ----------
    model : str
        Independent variables to predict missingness of the outcome. Example) 'var1 + var2 + var3'. The
        treatment must be included for the missing data model
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, truncating weights leads to additional confounding. Default is False, meaning no truncation
        of predicted probabilities occurs. Providing a single float assumes symmetric truncation, where
        values below or above the threshold are set to the threshold value. Alternatively a list of floats
        can be provided for asymmetric truncation, with the first value being the lower bound and the second
        being the upper bound
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints results)

    Raises
    ------
    ValueError
        If no missing outcome data is flagged for the data set
    """
    # Error if no missing outcome data
    if not self._miss_flag:
        raise ValueError("No missing outcome data is present in the data set")

    # Warning if exposure is not included in the missingness of outcome model. This is a plain substring
    # check, and the warning is issued exactly once per call
    if self.exposure not in model:
        warnings.warn("For the specified missing outcome model, the exposure variable should be included in the "
                      "model", UserWarning)

    self._miss_model = self._missing_indicator + ' ~ ' + model
    fitmodel = propensity_score(self.df, self._miss_model, print_results=print_results)

    # Predict under each exposure level; only rows whose missingness indicator equals 1 keep a value
    for level, column in ((1, '_ipmw_a1_'), (0, '_ipmw_a0_')):
        dfx = self.df.copy()
        dfx[self.exposure] = level
        self.df[column] = np.where(self.df[self._missing_indicator] == 1,
                                   fitmodel.predict(dfx),
                                   np.nan)

    # If bounds are requested
    if bound:
        self.df['_ipmw_a1_'] = _bounding_(self.df['_ipmw_a1_'], bounds=bound)
        self.df['_ipmw_a0_'] = _bounding_(self.df['_ipmw_a0_'], bounds=bound)

    self._fit_missing_ = True
def sampling_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True,
                   print_results=True):
    """Fit the logistic regression model(s) used to estimate sampling weights.

    The model denominator must be specified for both stabilized and unstabilized weights. The optional
    argument 'model_numerator' allows specification of the stabilization factor for the weight numerator.
    By default model results are printed. Models are fit on the full data and predictions are generated
    for the sample; the resulting weights are stored in the sample's ``'__ipsw__'`` column and as ``ipsw``.

    Parameters
    ----------
    model_denominator : str
        String listing variables to predict the selection, separated by +. For example,
        'var1 + var2 + var3'. This is for the predicted probabilities of the denominator
    model_numerator : str, optional
        Optional string listing variables to predict the selection separated by +. Only used to calculate
        the numerator. Default ('1') calculates the overall probability of selection. In general, this is
        recommended. Adding in other variables means they are no longer accounted for in estimation of
        IPSW. Argument is also only used when calculating stabilized weights
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, inference becomes limited to the restricted population. Default is None, meaning no
        truncation of predicted probabilities occurs. Providing a single float assumes symmetric
        truncation. A collection of floats can be provided for asymmetric truncation
    stabilized : bool, optional
        Whether to generate stabilized IPSW. Default is True, which returns the stabilized IPSW
    print_results : bool, optional
        Whether to print the model results from the regression models. Default is True

    Raises
    ------
    ValueError
        If `model_numerator` is specified while `stabilized` is False
    """
    # A numerator specification is meaningless for unstabilized weights; reject before fitting anything
    if not stabilized and model_numerator != '1':
        raise ValueError('Argument for model_numerator is only used for stabilized=True')

    # Denominator probabilities
    denominator_fit = propensity_score(self.df, self.selection + ' ~ ' + model_denominator,
                                       print_results=print_results)
    self.sample['__denom__'] = denominator_fit.predict(self.sample)
    self._denominator_model = True

    # Stabilization factor if valid
    if stabilized:
        numerator_fit = propensity_score(self.df, self.selection + ' ~ ' + model_numerator,
                                         print_results=print_results)
        self.sample['__numer__'] = numerator_fit.predict(self.sample)
    else:
        self.sample['__numer__'] = 1

    # Truncate both probability columns when bounds are requested
    if bound:
        for column in ('__denom__', '__numer__'):
            self.sample[column] = _bounding_(self.sample[column], bounds=bound)

    denom = self.sample['__denom__']
    numer = self.sample['__numer__']
    if self.generalize:
        # Calculate IPSW (generalizability)
        weights = numer / denom
    else:
        # Calculate IOSW (transportability): inverse odds, optionally stabilized by the numerator odds
        inverse_odds = (1 - denom) / denom
        if stabilized:
            weights = inverse_odds * (numer / (1 - numer))
        else:
            weights = inverse_odds

    self.sample['__ipsw__'] = weights
    self.ipsw = self.sample['__ipsw__']
def treatment_model(self, model_denominator, model_numerator='1', stabilized=True, bound=False,
                    print_results=True):
    """Fit the logistic regression model(s) for the propensity score.

    The model denominator must be specified for both stabilized and unstabilized weights. The optional
    argument 'model_numerator' allows specification of the stabilization factor for the weight numerator.
    By default model results are printed. Predicted probabilities are written to the ``'__denom__'`` and
    ``'__numer__'`` columns of the data, and the calculated weights are stored as ``iptw``.

    Parameters
    ------------
    model_denominator : str
        String listing variables to predict the exposure via `patsy` syntax. For example,
        `'var1 + var2 + var3'`. This is for the predicted probabilities of the denominator
    model_numerator : str, optional
        Optional string listing variables to predict the exposure, separated by +. Only used to calculate
        the numerator. Default ('1') calculates the overall probability of exposure. In general this is
        recommended. If confounding variables are included in the numerator, they would later need to be
        adjusted for in the marginal structural model. Additionally, used for assessment of effect measure
        modification. Argument is also only used when calculating stabilized weights
    stabilized : bool, optional
        Whether to return stabilized or unstabilized weights. Default is stabilized weights (True)
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations.
        However, inference becomes limited to the restricted population. Default is False, meaning no
        truncation of predicted probabilities occurs. Providing a single float assumes symmetric
        truncation. A collection of floats can be provided for asymmetric truncation
    print_results : bool, optional
        Whether to print the model results from the regression models. Default is True

    Raises
    ------
    ValueError
        If `model_numerator` is specified while `stabilized` is not True. This is checked before any
        model is fit, so an invalid call leaves the estimator and its data untouched
    """
    # Validate arguments first so an invalid call fails before fitting, printing, or modifying the data.
    # The `is True` identity test is kept so the check agrees exactly with the branch below
    if stabilized is not True and model_numerator != '1':
        raise ValueError('Argument for model_numerator is only used for stabilized=True')

    # Calculating denominator probabilities
    self.__mdenom = model_denominator
    denominator_model = propensity_score(self.df, self.treatment + ' ~ ' + model_denominator,
                                         weights=self._weight_, print_results=print_results)
    self.df['__denom__'] = denominator_model.predict(self.df)

    # Calculating numerator probabilities (if stabilized)
    if stabilized is True:
        numerator_model = propensity_score(self.df, self.treatment + ' ~ ' + model_numerator,
                                           weights=self._weight_, print_results=print_results)
        n = numerator_model.predict(self.df)
    else:
        n = 1
    self.df['__numer__'] = n

    # Bounding predicted probabilities if requested
    # NOTE(review): the numerator column is bounded as well, even when it is the constant 1 for
    #  unstabilized weights -- confirm this is intended
    if bound:
        self.df['__denom__'] = _bounding_(self.df['__denom__'], bounds=bound)
        self.df['__numer__'] = _bounding_(self.df['__numer__'], bounds=bound)

    # Calculating weights
    self.iptw = self._weight_calculator(self.df, denominator='__denom__', numerator='__numer__',
                                        stabilized=stabilized)