def fit(self, X, y, sample_weight=None):
    """
    Fit the least squares model, optionally weighting each sample.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data. A constant column is appended when
        ``self.fit_intercept`` is true.
    y : array-like, shape (n_samples, 1) or (n_samples,)
        Target values; flattened to one dimension before fitting.
    sample_weight : array-like, shape (n_samples,), optional
        Individual weights for each sample. When omitted, the model is
        built without an explicit ``weights`` argument.

    Returns
    -------
    self
        The estimator, with the fitted results stored in ``self.results``.
    """
    target_rank = ndim(y)
    assert target_rank == 1 or (target_rank == 2 and shape(y)[1] == 1)
    y = reshape(y, (-1,))

    if self.fit_intercept:
        X = add_constant(X, has_constant='add')

    # Only forward ``weights`` when the caller actually supplied them.
    wls_kwargs = {'hasconst': self.fit_intercept}
    if sample_weight is not None:
        wls_kwargs['weights'] = sample_weight

    self.results = WLS(y, X, **wls_kwargs).fit(**self.fit_args)
    return self
def compute_WLS_stdevs(X, Y, weights, alpha):
    """Compute standard deviations of Weighted Ridge coefficients.

    Args:
        X: dataset containing the explanatory variables
        Y: vector of the response variable
        weights: vector of weights (one for each row of X)
        alpha: regularization parameter

    Returns:
        list containing the standard deviation of each coefficient
    """
    n_rows, n_cols = X.shape[0], X.shape[1]

    # The weighted residuals come from a WLS fit on X with a constant added;
    # the covariance below is built from X itself (no constant column).
    fitted = WLS(Y, add_constant(X), weights=weights).fit()
    weighted_resid = fitted.wresid

    # Estimate of the sigma squared quantity
    sigma2 = np.dot(weighted_resid.T, weighted_resid) / (n_rows - n_cols)

    # Weights as a diagonal matrix, and the ridge penalty alpha * I
    weight_diag = np.diag(weights)
    ridge_penalty = alpha * np.diag([1] * n_cols)

    # Sandwich form: G^-1 X'WX G^-T with G = X'WX + alpha * I
    gram_inv = np.linalg.inv(
        np.linalg.multi_dot([X.T, weight_diag, X]) + ridge_penalty)
    cov_beta = sigma2 * np.linalg.multi_dot(
        [gram_inv, X.T, weight_diag, X, gram_inv.T])

    return list(np.sqrt(np.diag(cov_beta)))
def _fit(self, polys=1, periods=None, discontinuities=None):
    """
    Fit the series with weighted least squares and return the fit results.

    The design matrix and its parameter names come from
    ``self.getDesignMatrix``; the names are attached to the model so the
    results are labelled with them.

    Parameters
    ----------
    polys, periods, discontinuities
        Forwarded unchanged to ``self.getDesignMatrix``.

    Returns
    -------
    The results object produced by ``WLS(...).fit()``.
    """
    A, parameters = self.getDesignMatrix(polys=polys,
                                         periods=periods,
                                         discontinuities=discontinuities)
    # NOTE(review): ``dy`` is passed directly as the WLS weights. The
    # hand-rolled solver that used to follow the return statement weighted
    # by 1 / dy**2 instead -- confirm which weighting is intended.
    model = WLS(self.series['y'], A, weights=self.series['dy'])
    model.data.xnames = parameters
    return model.fit()
def test_fixed_scale(self):
    """Check the 'fixed_scale' covariance on the OLS and WLS results.

    The two robust-covariance results must agree with each other, and the
    WLS statistics must agree with a t_test that uses the normalized
    covariance -- both when the covariance type is applied after the fit
    and when it is requested in ``fit`` itself.
    """
    cov_type = 'fixed_scale'
    kwds = {}

    def check_against_t_test(res):
        # t_test on every single parameter, with the normalized covariance
        tt = res.t_test(np.eye(len(res.params)),
                        cov_p=res.normalized_cov_params)
        assert_allclose(res.cov_params(), res.normalized_cov_params,
                        rtol=1e-13)
        assert_allclose(res.bse, tt.sd, rtol=1e-13)
        assert_allclose(res.pvalues, tt.pvalue, rtol=1e-13)
        assert_allclose(res.tvalues, tt.tvalue, rtol=1e-13)

    robust_ols = self.res_ols.get_robustcov_results(cov_type, **kwds)
    robust_wls = self.res_wls.get_robustcov_results(cov_type, **kwds)

    assert_allclose(robust_ols.params, robust_wls.params, rtol=1e-13)
    assert_allclose(robust_ols.cov_params(), robust_wls.cov_params(),
                    rtol=1e-13)
    assert_allclose(robust_ols.bse, robust_wls.bse, rtol=1e-13)
    assert_allclose(robust_ols.pvalues, robust_wls.pvalues, rtol=1e-12)

    check_against_t_test(robust_wls)

    # using cov_type in fit
    base_model = self.res_wls.model
    refit = WLS(base_model.endog, base_model.exog,
                weights=base_model.weights).fit(cov_type=cov_type,
                                                cov_kwds=kwds)
    check_against_t_test(refit)
def discontinuitiesSignificanceTest(self,
                                    discontinuities,
                                    polys=1,
                                    periods=None):
    """
    Return the discontinuities whose fitted terms have p-values below 0.05.

    A single weighted least squares fit is made on the design matrix from
    ``self.getDesignMatrix``. The p-values from index
    ``polys + 1 + 2 * len(periods)`` onward are compared with 0.05 and used
    to filter ``discontinuities`` position by position.

    ``periods`` must support ``len()``; the default ``None`` does not.
    """
    design, names = self.getDesignMatrix(polys=polys,
                                         periods=periods,
                                         discontinuities=discontinuities)
    wls_model = WLS(self.series['y'], design, weights=self.series['dy'])
    wls_model.data.xnames = names
    fitted = wls_model.fit()

    # presumably the leading entries are the polynomial and periodic terms,
    # so the remaining ones belong to the discontinuities -- TODO confirm
    first_discontinuity = polys + 1 + 2 * len(periods)
    significant = fitted.pvalues[first_discontinuity:] < 0.05
    return list(compress(discontinuities, significant))
def fit_WLS(self,
            X,
            assignment,
            outcome,
            confounder_types,
            weight_name='weights',
            intercept='True'):
    """
    Fit a weighted least squares regression of the outcome on the
    assignment and the confounders, and keep the result on the instance.

    Parameters
    ----------
    X : pandas.DataFrame
        Data holding the assignment, outcome, confounder and weight columns.
    assignment : str
        Name of the assignment column (first regressor).
    outcome : str
        Name of the outcome column (dependent variable).
    confounder_types : dict
        Maps confounder column name to a type code. Codes ``'o'`` and
        ``'u'`` are dummy-coded with ``pandas.get_dummies``; any other code
        uses the column as is.
    weight_name : str
        Name of the column in ``X`` holding the regression weights.
    intercept : bool or str
        Whether to add a constant ``'intercept'`` column. The historical
        default is the string ``'True'``; the strings ``'false'``, ``'0'``,
        ``'no'`` (any case) and ``''`` are treated as false.

    Returns
    -------
    The fitted results object, also stored as ``self.wls_model``.
    """
    # A plain truthiness test would treat the string 'False' as true.
    if isinstance(intercept, str):
        intercept = intercept.strip().lower() not in ('false', '0', 'no', '')

    df = X[[assignment, outcome]].copy()
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type in ('o', 'u'):
            # Float dummies: boolean dummies (the default in pandas >= 2.0)
            # mixed with float columns would give an object-dtype design
            # matrix.
            c_dummies = pd.get_dummies(X[[confounder]],
                                       prefix=confounder,
                                       dtype=float)
            # Drop the first level as the reference category, unless there
            # is only one column to begin with.
            if len(c_dummies.columns) == 1:
                kept = c_dummies.columns
            else:
                kept = c_dummies.columns[1:]
            df = pd.concat([df, c_dummies[kept]], axis=1)
            regression_confounders.extend(kept)
        else:
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()

    if intercept:
        df.loc[:, 'intercept'] = 1.
        regression_confounders.append('intercept')

    model = WLS(df[outcome],
                df[[assignment] + regression_confounders],
                weights=X[weight_name])
    result = model.fit()
    self.wls_model = result
    return result
def do_egger_regression_single_variance_term(self):
    """
    Does egger regression based on single variance term estimates.

    Each entry of ``self.exposure_tuples`` / ``self.outcome_tuples`` is read
    as ``(beta, se, ...)``. Estimates are oriented so the exposure beta is
    positive, then the outcome betas are regressed on the exposure betas
    (with a constant) using weights ``1 / se_outcome ** 2``.

    Side effects: sets ``self.egger_intercept``, ``self.egger_slope`` and
    ``self.egger_done``.

    :return: tuple of length two, each a tuple of floats
        (beta, se, wald_p_val), for intercept and slope respectively.
    :raises ValueError: with fewer than three estimates, or when the number
        of exposure or outcome tuples differs from the number of estimates.
    """
    num_estimates = len(self.estimation_data)

    # runtime checks.
    if num_estimates < 3:
        raise ValueError(
            "Only {} estimates supplied, need at least three to estimate egger"
            .format(num_estimates))

    if len(self.exposure_tuples) != num_estimates:
        raise ValueError(
            "No exposure data present, cannot do Egger regression.")

    if len(self.outcome_tuples) != num_estimates:
        raise ValueError(
            "No outcome data present, cannot do Egger regression.")

    # Now turn exposure into positive values. Only the sign of the betas is
    # flipped; the standard error (index 1) is kept, because it feeds the
    # regression weights below.
    outcome_tuples = copy.deepcopy(self.outcome_tuples)
    exposure_tuples = copy.deepcopy(self.exposure_tuples)
    for i in range(num_estimates):
        if exposure_tuples[i][0] < 0:
            exposure_tuples[i] = (-1.0 * exposure_tuples[i][0],
                                  exposure_tuples[i][1])
            outcome_tuples[i] = (-1.0 * outcome_tuples[i][0],
                                 outcome_tuples[i][1])

    x_dat = np.asarray([x[0] for x in exposure_tuples], dtype=float)
    x_dat = add_constant(x_dat)

    y_dat = np.asarray([x[0] for x in outcome_tuples], dtype=float)

    # Inverse-variance weights from the outcome standard errors.
    w_dat = np.asarray([1 / (float(x[1]) ** 2) for x in outcome_tuples],
                       dtype=float)

    results = WLS(y_dat, x_dat, weights=w_dat).fit()

    self.egger_intercept = (results.params[0], results.bse[0],
                            results.pvalues[0])
    self.egger_slope = (results.params[1], results.bse[1],
                        results.pvalues[1])
    self.egger_done = True

    return self.egger_intercept, self.egger_slope
def wls(x):
    '''
    Performs weighted least squares on the numpy array x of shape (N,3),
    where the three components are indep variable, dep variable, dep
    variable errors. The weights are 1 / errors**2 and a constant is added
    to the independent variable.

    Returns a 3-tuple: the negated sum of squared residuals divided by the
    squared errors, the second fitted parameter (``params[1]``), and the
    residuals (dep variable minus model prediction).
    '''
    indep, dep, dep_err = x[:, 0], x[:, 1], x[:, 2]

    wls_model = WLS(dep, sm.add_constant(indep), weights=1 / dep_err**2)
    result = wls_model.fit()

    residuals = dep - wls_model.predict(result.params)
    return -np.sum(residuals**2 / dep_err**2), result.params[1], residuals
def blp(y, d, prop, b_hat, s_hat, print_table=True):
    """Return intercept and slope for Best Linear Predictor (BLP)

    The outcome is regressed by weighted least squares on four columns:
    a constant, ``b_hat``, ``d - prop`` and
    ``(d - prop) * (s_hat - mean(s_hat))``, with weights
    ``1 / (prop * (1 - prop))``.

    Parameters
    ----------
    y : ndarray
        vector of outcomes
    d : ndarray
        treatment indicator
    prop : ndarray
        treatment propensity
    b_hat : ndarray
        baseline regressor (column labelled "b0")
    s_hat : ndarray
        score that is centered and interacted with ``d - prop``
        (column labelled "het")
    print_table : bool, optional
        Toggle results table, by default True

    Returns
    -------
    dict
        results for ATE and HET (coefficients of the "ate" and "het"
        columns)
    """
    # Calculate model matrix
    y_reg = y  # outcome
    w_reg = (prop * (1 - prop)) ** (-1)  # weights
    centered_d = d - prop
    x_reg = np.column_stack(
        (
            np.repeat(1, repeats=len(y)),  # constant
            b_hat,  # baseline b0
            centered_d,  # average treatment effect ate
            centered_d * (s_hat - np.mean(s_hat)),  # heterogeneity het
        )
    )
    labels = ["const.", "b0", "ate", "het"]

    # Run weighted least squares. The keyword must be ``weights``: an
    # unrecognized keyword such as ``w`` leaves the weights at their default
    # and the fit is effectively unweighted.
    results = WLS(endog=y_reg, exog=x_reg, weights=w_reg).fit()

    if print_table:
        print(results.summary(xname=labels))

    return {
        "ate": results.params[labels.index("ate")],
        "het": results.params[labels.index("het")],
    }
def test_pm(self):
    """Compare the "pm" random-effects estimate with the metafor reference.

    Three routes are checked against ``results_meta.exk1_metafor``: the
    iterative tau2 fit, a WLS fit with weights 1 / (var_eff + tau2), and
    ``combine_effects`` with ``method_re="pm"``.
    """
    expected = results_meta.exk1_metafor
    eff, var_eff = self.eff, self.var_eff

    tau2, converged = _fit_tau_iterative(eff, var_eff, tau2_start=0.1,
                                         atol=1e-8)
    assert_equal(converged, True)
    assert_allclose(tau2, expected.tau2, atol=1e-10)

    # compare with WLS, PM weights
    pm_weights = 1 / (var_eff + tau2)
    res_wls = WLS(eff, np.ones(len(eff)),
                  weights=pm_weights).fit(cov_type="fixed_scale")

    assert_allclose(res_wls.params, expected.b, atol=1e-13)
    assert_allclose(res_wls.bse, expected.se, atol=1e-10)
    ci_low, ci_upp = res_wls.conf_int()[0]
    assert_allclose(ci_low, expected.ci_lb, atol=1e-10)
    assert_allclose(ci_upp, expected.ci_ub, atol=1e-10)

    # need stricter atol to match metafor,
    # I also used higher precision in metafor
    res3 = combine_effects(eff, var_eff, method_re="pm", atol=1e-7)
    # TODO: asserts below are copy paste, DRY?
    assert_allclose(res3.tau2, expected.tau2, atol=1e-10)
    assert_allclose(res3.mean_effect_re, expected.b, atol=1e-13)
    assert_allclose(res3.sd_eff_w_re, expected.se, atol=1e-10)

    ci_re = res3.conf_int(use_t=False)[1]
    assert_allclose(ci_re[0], expected.ci_lb, atol=1e-10)
    assert_allclose(ci_re[1], expected.ci_ub, atol=1e-10)

    assert_allclose(res3.q, expected.QE, atol=1e-10)
    # the following does not pass yet
    # assert_allclose(res3.i2, res.I2 / 100, atol=1e-10)  # percent in R
    # assert_allclose(res3.h2, res.H2, atol=1e-10)

    homogeneity = res3.test_homogeneity()
    q_stat, p_value = homogeneity
    dof = homogeneity.df
    assert_allclose(p_value, expected.QEp, atol=1e-10)
    assert_allclose(q_stat, expected.QE, atol=1e-10)
    assert_allclose(dof, 9 - 1, atol=1e-10)
def _fit(self, polys=1, periods=None, discontinuities=None):
    """
    Fit the series with weighted least squares.

    Parameters
    ----------
    polys, periods, discontinuities
        Forwarded unchanged to ``self.getDesignMatrix``. ``periods`` must
        support ``len()``; the default ``None`` does not.

    Returns
    -------
    tuple
        ``(result, fitted, continuous)``: the WLS results object, its fitted
        values, and the model evaluated with every design-matrix column from
        index ``polys + 2 * len(periods) + 1`` onward set to zero.
    """
    A, parameters = self.getDesignMatrix(polys=polys,
                                         periods=periods,
                                         discontinuities=discontinuities)
    model = WLS(self.series['y'], A, weights=self.series['dy'])
    model.data.xnames = parameters
    result = model.fit()
    fitted = result.fittedvalues

    npar_without_discontinuities = polys + 2 * len(periods) + 1
    # Zero the trailing columns on a copy: the fitted model may still
    # reference ``A``, and editing it in place could change what the
    # returned results object computes later.
    A_continuous = A.copy()
    A_continuous[:, npar_without_discontinuities:] = 0
    continuous = np.dot(A_continuous, result.params.values)

    return result, fitted, continuous
def setup_class(cls):
    """Fit WLS on simulated heteroscedastic data and keep it as ``res_wls``.

    Adapted from the example wls.py. The noise scale is 1 for the first
    60 percent of the sample and 3 afterwards; the fit uses only the first
    two design columns with weights 1 / scale.
    """
    from statsmodels.tools.tools import add_constant

    nsample = 50
    grid = np.linspace(0, 20, nsample)
    design = add_constant(np.column_stack((grid, (grid - 5)**2)))
    beta = [5., 0.5, -0.01]
    sig = 0.5

    scale = np.ones(nsample)
    scale[int(nsample * 6. / 10):] = 3

    noise = np.random.normal(size=nsample)
    y = np.dot(design, beta) + sig * scale * noise

    # ### WLS knowing the true variance ratio of heteroscedasticity
    cls.res_wls = WLS(y, design[:, [0, 1]], weights=1. / scale).fit()
def setup_class(cls):
    """Simulate the wls.py example data and store the WLS fit on the class."""
    # from example wls.py
    from statsmodels.tools.tools import add_constant

    n_obs = 50
    coefficients = [5., 0.5, -0.01]
    noise_sd = 0.5

    regressor = np.linspace(0, 20, n_obs)
    full_exog = add_constant(
        np.column_stack((regressor, (regressor - 5)**2)))

    # Standard-deviation multiplier: 3 from the 60 percent mark onward
    sd_ratio = np.ones(n_obs)
    cut = int(n_obs * 6. / 10)
    sd_ratio[cut:] = 3

    mean_response = np.dot(full_exog, coefficients)
    draws = np.random.normal(size=n_obs)
    response = mean_response + noise_sd * sd_ratio * draws

    # The model is fitted without the third column of the design
    reduced_exog = full_exog[:, [0, 1]]

    # ### WLS knowing the true variance ratio of heteroscedasticity
    wls_model = WLS(response, reduced_exog, weights=1. / sd_ratio)
    cls.res_wls = wls_model.fit()
class LinearRegression(object):
    '''
    Patsy wrapper for linear estimation and prediction. Uses statsmodels
    WLS to allow weights. If no weights are provided, results are
    equivalent to OLS.
    '''

    def __init__(self, formula=None, data=None, **kwargs):
        '''
        Builds the design matrices from the formula and fits the model.
        Extra keyword arguments are forwarded to WLS. Without a formula an
        empty object is created (see read_pickle).
        '''
        # convert all variables raised to a power to float64
        # this prevents mis-specification of probabilities in cases of
        # variable overflow (if the original var was compressed to a smaller
        # bit integer/float)
        # NOTE: this converts the columns of the caller's DataFrame in place.
        if formula and isinstance(data, pd.DataFrame):
            for var in set(re.findall(r'(?<=power\().+?(?=,)', formula)):
                data[var] = data[var].astype('float64')

        if formula:
            y, X = patsy.dmatrices(formula, data, 1)

            self._y_design_info = y.design_info
            self._X_design_info = X.design_info

            self._model = WLS(y, X, **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._std = np.std(data[self._model.data.ynames].values -
                               self.predict(data))
            self._r2 = self._fit.rsquared
            self._r2_adj = self._fit.rsquared_adj
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._std = None
            self._r2 = None
            self._r2_adj = None

    def __repr__(self):
        # Objects built without a formula (e.g. by read_pickle) carry no fit
        # object, so there is no summary to show.
        if self._fit is None:
            return '<{} (no fit summary available)>'.format(
                type(self).__name__)
        return str(self._fit.summary())

    def predict(self, data):
        '''
        Returns fitted values for the data provided.
        '''
        if len(data) == 0:
            return []

        # identifies exponential variables from the design matrix (via the
        # 'power' flag) and converts to float64
        # this prevents mis-specification of probabilities in cases of
        # variable overflow (if the original var was compressed to a smaller
        # bit integer/float)
        power_vars = {
            re.search(r'(?<=power\().+?(?=,)', column).group()
            for column in self._X_design_info.column_names
            if 'power' in column
        }
        for var in power_vars:
            data[var] = data[var].astype('float64')

        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        return linear_transform(np.asarray(X), self._betas)

    def residuals(self, data):
        '''
        Returns residuals from fitting the model to the data provided.
        Needs the fitted model object to look up the response name.
        '''
        if len(data) == 0:
            return []
        return data[self._model.data.ynames].values - self.predict(data)

    def draw(self, data, rand_engine):
        '''
        Returns fitted values for the data provided plus a random draw from
        a normal distribution with the regression standard error.
        '''
        return self.predict(data) + rand_engine.normal(
            0, self._std, len(data))

    def Rsquared(self, adjusted=True):
        '''
        Returns the model's adjusted R squared. To return unadjusted
        R squared, pass adjusted=False.
        '''
        return self._r2_adj if adjusted else self._r2

    def to_pickle(self, filename):
        '''
        Writes basic model information to a pickle file.
        '''
        state = (self._y_design_info, self._X_design_info, self._betas,
                 self._std, self._r2, self._r2_adj)
        # Context manager so the handle is flushed and closed right away.
        with open(filename, "wb") as handle:
            pickle.dump(state, handle)

    @staticmethod
    def read_pickle(filename):
        '''
        Reads basic model information from a pickle file. Returns a
        LinearRegression object that does not include the model or fit
        object; methods that rely on them (__repr__ summary, residuals) are
        therefore limited on the returned object.
        '''
        # NOTE(review): pickle.load can execute arbitrary code; only read
        # files from trusted sources.
        with open(filename, "rb") as handle:
            (y_design_info, X_design_info, betas, std, r2,
             r2_adj) = pickle.load(handle)

        linear_regression = LinearRegression()
        linear_regression._y_design_info = y_design_info
        linear_regression._X_design_info = X_design_info
        linear_regression._betas = betas
        linear_regression._std = std
        linear_regression._r2 = r2
        linear_regression._r2_adj = r2_adj
        return linear_regression

    def __add__(self, other):
        # Shallow copy whose coefficients are the element-wise sum.
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        # Shallow copy whose coefficients are the element-wise difference.
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        # Shallow copy whose coefficients are scaled by ``other``.
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
def ipw_ra(self, return_results=True, effect_group="all", disp=False):
    """
    ATE and POM from inverse probability weighted regression adjustment.

    \n%(params_returns)s
    See Also
    --------
    TreatmentEffectsResults

    """
    treated = self.treat_mask
    control = ~treated
    endog = self.model_pool.endog
    exog = self.model_pool.exog
    prob = self.prob_select

    prob_control = prob[control]
    prob_treated = prob[treated]

    # Weights for the two outcome regressions and the sample over which the
    # predictions are averaged depend on the requested effect group.
    if effect_group == "all":
        w_control = 1 / (1 - prob_control)
        w_treated = 1 / prob_treated
        exog_target = exog
    elif effect_group in (1, "treated"):
        w_control = prob_control / (1 - prob_control)
        w_treated = prob_treated / prob_treated
        exog_target = exog[treated]
        effect_group = 1  # standardize effect_group name
    elif effect_group in (0, "untreated", "control"):
        w_control = (1 - prob_control) / (1 - prob_control)
        w_treated = (1 - prob_treated) / prob_treated
        exog_target = exog[control]
        effect_group = 0  # standardize effect_group name
    else:
        raise ValueError("incorrect option for effect_group")

    fit_control = WLS(endog[control], exog[control],
                      weights=w_control).fit(cov_type='HC1')
    pom_control = fit_control.predict(exog_target).mean()

    fit_treated = WLS(endog[treated], exog[treated],
                      weights=w_treated).fit(cov_type='HC1')
    pom_treated = fit_treated.predict(exog_target).mean()

    if not return_results:
        return pom_treated - pom_control, pom_control, pom_treated

    # GMM
    mod_gmm = _IPWRAGMM(endog,
                        self.results_select,
                        None,
                        teff=self,
                        effect_group=effect_group)
    start_params = np.concatenate((
        [pom_treated - pom_control, pom_control],
        fit_control.params,
        fit_treated.params,
        np.asarray(self.results_select.params),
    ))
    res_gmm = mod_gmm.fit(
        start_params=start_params,
        inv_weights=np.eye(len(start_params)),
        optim_method='nm',
        optim_args={"maxiter": 2000, "disp": disp},
        maxiter=1,
    )

    return TreatmentEffectResults(
        self,
        res_gmm,
        "IPW",
        start_params=start_params,
        effect_group=effect_group,
    )
def aipw_wls(self, return_results=True, disp=False):
    """
    ATE and POM from double robust augmented inverse probability weighting.

    This uses weighted outcome regression, while `aipw` uses unweighted
    outcome regression.
    Option for effect on treated or on untreated is not available.
    \n%(params_returns)s
    See Also
    --------
    TreatmentEffectsResults

    """
    nobs = self.nobs
    prob = self.prob_select
    endog = self.model_pool.endog
    exog = self.model_pool.exog
    tind = self.treatment
    treated = self.treat_mask
    control = ~treated

    # Treated arm: weighted outcome regression, predicted for every row
    ipw_treated = tind / prob
    ww1 = ipw_treated * (ipw_treated - 1)
    result1 = WLS(endog[treated], exog[treated],
                  weights=ww1[treated]).fit(cov_type='HC1')
    mean1_ipw2 = result1.predict(exog).mean()

    # Control arm, same construction with the complementary indicator
    ipw_control = (1 - tind) / (1 - prob)
    ww0 = ipw_control * (ipw_control - 1)
    result0 = WLS(endog[control], exog[control],
                  weights=ww0[control]).fit(cov_type='HC1')
    mean0_ipw2 = result0.predict(exog).mean()

    self.results_ipwwls0 = result0
    self.results_ipwwls1 = result1

    # Residual-based corrections, scaled by the selection probabilities
    correct0 = (result0.resid / (1 - prob[tind == 0])).sum() / nobs
    correct1 = (result1.resid / (prob[tind == 1])).sum() / nobs

    tmean0 = mean0_ipw2 + correct0
    tmean1 = mean1_ipw2 + correct1
    ate = tmean1 - tmean0

    if not return_results:
        return ate, tmean0, tmean1

    p2_aipw_wls = np.asarray([ate, tmean0]).squeeze()

    # GMM
    mod_gmm = _AIPWWLSGMM(endog, self.results_select, None, teff=self)
    start_params = np.concatenate((
        p2_aipw_wls,
        result0.params,
        result1.params,
        self.results_select.params,
    ))
    res_gmm = mod_gmm.fit(
        start_params=start_params,
        inv_weights=np.eye(len(start_params)),
        optim_method='nm',
        optim_args={"maxiter": 5000, "disp": disp},
        maxiter=1,
    )

    return TreatmentEffectResults(
        self,
        res_gmm,
        "IPW",
        start_params=start_params,
        effect_group="all",
    )
def gates(y, d, prop, s_hat, q=10, print_table=True):
    """Calculate Group Average Treatment Effect

    Observations are grouped with ``quantile_grid`` on ``s_hat`` (a tiny
    random jitter is added to break ties). The outcome is then regressed by
    weighted least squares on the group dummies and on the dummies
    multiplied by ``d - prop``, with weights ``1 / (prop * (1 - prop))``.

    Parameters
    ----------
    y : ndarray
        vector of outcomes
    d : ndarray
        treatment indicator
    prop : ndarray
        treatment propensity
    s_hat : ndarray
        estimated treatment effect
    q : int, optional
        number of groups, by default 10
    print_table : bool, optional
        toggle results table, by default True

    Returns
    -------
    dict
        results with baseline and treatment effect for each group, plus the
        bin edges and the number of observations per bin
    """
    n_obs = len(s_hat)

    # Define groups
    bin_indices, bin_edges, bin_pct = quantile_grid(
        x=s_hat + 1e-16 * np.random.uniform(size=n_obs), q=q  # Break ties
    )

    # Dummy coding
    s_onehot = np.zeros((n_obs, len(bin_edges)))
    s_onehot[np.arange(0, n_obs), bin_indices] = 1

    # Calculate model matrix. The target shape is passed positionally:
    # the ``newshape`` keyword of np.reshape is deprecated in recent NumPy.
    x_reg = np.column_stack(
        (s_onehot, s_onehot * np.reshape(d - prop, (-1, 1)))
    )
    w_reg = (prop * (1 - prop)) ** (-1)  # weights
    y_reg = y

    # One baseline label and one treatment label per bin
    bins = list(zip(bin_pct.tolist(), bin_edges.tolist()))
    labels_baseline = [f"Baseline: p={p / 100:.2f} ({x:.2f})" for p, x in bins]
    labels_treatment = [
        f"Treatment: p={p / 100:.2f} ({x:.2f})" for p, x in bins
    ]
    labels = labels_baseline + labels_treatment

    # Run weighted least squares. The keyword must be ``weights``: an
    # unrecognized keyword such as ``w`` leaves the weights at their default
    # and the fit is effectively unweighted.
    results = WLS(endog=y_reg, exog=x_reg, weights=w_reg).fit()

    if print_table:
        print(results.summary(xname=labels))

    n_baseline = len(labels_baseline)
    return {
        "coef_baseline": results.params[:n_baseline],
        "coef_treatment": results.params[n_baseline:],
        "bin_values": bin_edges,
        "bin_count": np.sum(s_onehot, axis=0),
    }
from statsmodels.tools.tools import add_constant

# Simulate heteroscedastic data: the noise scale is 1 for the first
# 60 percent of the sample and 3 afterwards.
X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
# The slice bound must be an integer; plain ``/`` yields a float on Python 3.
w[int(nsample * 6. / 10):] = 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# ### WLS knowing the true variance ratio of heteroscedasticity

mod_wls = WLS(y, X, weights=1. / w)
res_wls = mod_wls.fit()

# The prediction results must reproduce wls_prediction_std
prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)

from numpy.testing import assert_allclose

assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

# Function-call form of print works on both Python 2 and Python 3
print(pred_res.summary_frame().head())

pred_res2 = res_wls.get_prediction()
ci2 = pred_res2.conf_int(obs=True)
def do_egger_regression_two_variance_term(self):
    """
    Does egger regression based on two variance term estimates.

    Each entry of ``self.exposure_tuples`` / ``self.outcome_tuples`` is read
    as ``(beta, se, ...)``. Estimates are oriented so the exposure beta is
    positive, then the outcome betas are regressed on the exposure betas
    (with a constant). The weights combine the outcome and the exposure
    uncertainty (see the formula in the body).

    Side effects: sets ``self.egger_intercept``, ``self.egger_slope`` and
    ``self.egger_done``.

    :return: tuple of length two, each a tuple of floats
        (beta, se, wald_p_val), for intercept and slope respectively.
    :raises ValueError: with fewer than three estimates, or when the number
        of exposure or outcome tuples differs from the number of estimates.
    """
    num_estimates = len(self.estimation_data)

    # runtime checks.
    if num_estimates < 3:
        raise ValueError(
            "Only {} estimates supplied, need at least three to estimate egger"
            .format(num_estimates))

    if len(self.exposure_tuples) != num_estimates:
        raise ValueError(
            "No exposure data present, cannot do Egger regression.")

    if len(self.outcome_tuples) != num_estimates:
        raise ValueError(
            "No outcome data present, cannot do Egger regression.")

    # Now turn exposure into positive values. Only the sign of the betas is
    # flipped; the standard error (index 1) is kept, because it feeds the
    # regression weights below.
    outcome_tuples = copy.deepcopy(self.outcome_tuples)
    exposure_tuples = copy.deepcopy(self.exposure_tuples)
    for i in range(num_estimates):
        if exposure_tuples[i][0] < 0:
            exposure_tuples[i] = (-1 * exposure_tuples[i][0],
                                  exposure_tuples[i][1])
            outcome_tuples[i] = (-1 * outcome_tuples[i][0],
                                 outcome_tuples[i][1])

    x_dat = np.asarray([x[0] for x in exposure_tuples])
    x_dat = add_constant(x_dat)

    y_dat = np.asarray([x[0] for x in outcome_tuples])

    # checked with the 2015 paper introducing MR-Egger, and it works as
    # expected.
    # NOTE(review): an older comment here said a small constant is added
    # when a value is zero; no such adjustment is made in this function, so
    # a zero beta raises ZeroDivisionError -- confirm the intended handling.
    w_dat = np.zeros(num_estimates)
    for i, (outcome, exposure) in enumerate(
            zip(outcome_tuples, exposure_tuples)):
        w_dat[i] = outcome[0] ** -2 / (
            (outcome[0] ** -2 * outcome[1] ** 2) +
            (exposure[0] ** -2 * exposure[1] ** 2))

    results = WLS(y_dat, x_dat, weights=w_dat).fit()

    self.egger_intercept = (results.params[0], results.bse[0],
                            results.pvalues[0])
    self.egger_slope = (results.params[1], results.bse[1],
                        results.pvalues[1])
    self.egger_done = True

    return self.egger_intercept, self.egger_slope
from statsmodels.tools.tools import add_constant

# Simulate heteroscedastic data: the noise scale is 1 for the first
# 60 percent of the sample and 3 afterwards.
X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
# The slice bound must be an integer; plain ``/`` yields a float on Python 3.
w[int(nsample * 6. / 10):] = 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# ### WLS knowing the true variance ratio of heteroscedasticity

mod_wls = WLS(y, X, weights=1. / w)
res_wls = mod_wls.fit()

# The prediction results must reproduce wls_prediction_std
prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)

from numpy.testing import assert_allclose

assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

print(pred_res.summary_frame().head())

pred_res2 = res_wls.get_prediction()
ci2 = pred_res2.conf_int(obs=True)
class LinearRegression(object):
    """Patsy wrapper for linear estimation and prediction.

    The model is fitted with WLS on design matrices built from a patsy
    formula; extra keyword arguments are forwarded to WLS.
    """

    def __init__(self, formula=None, data=None, **kwargs):
        """Fit the model, or create an empty object when no formula is given."""
        if formula:
            y, X = patsy.dmatrices(formula, data, 1)

            self._y_design_info = y.design_info
            self._X_design_info = X.design_info

            self._model = WLS(y, X, **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._std = numpy.std(data[self._model.data.ynames].values -
                                  self.predict(data))
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._std = None

    def __repr__(self):
        # Objects built without a formula (e.g. by read_pickle) carry no fit
        # object, so there is no summary to show.
        if self._fit is None:
            return '<{} (no fit summary available)>'.format(
                type(self).__name__)
        return str(self._fit.summary())

    def predict(self, data):
        """Return fitted values for the data provided."""
        if len(data) == 0:
            return []

        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        return linear_transform(numpy.asarray(X), self._betas)

    def draw(self, data, rand_engine):
        """Return fitted values plus normal noise with the stored std."""
        return self.predict(data) + rand_engine.normal(
            0, self._std, len(data))

    def to_pickle(self, filename):
        """Write the design infos, coefficients and std to a pickle file."""
        state = (self._y_design_info, self._X_design_info, self._betas,
                 self._std)
        # Context manager so the handle is flushed and closed right away.
        with open(filename, "wb") as handle:
            pickle.dump(state, handle)

    @staticmethod
    def read_pickle(filename):
        """Rebuild an object (without model or fit) from a pickle file."""
        # NOTE(review): pickle.load can execute arbitrary code; only read
        # files from trusted sources.
        with open(filename, "rb") as handle:
            y_design_info, X_design_info, betas, std = pickle.load(handle)

        linear_regression = LinearRegression()
        linear_regression._y_design_info = y_design_info
        linear_regression._X_design_info = X_design_info
        linear_regression._betas = betas
        linear_regression._std = std
        return linear_regression

    def __add__(self, other):
        # Shallow copy whose coefficients are the element-wise sum.
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        # Shallow copy whose coefficients are the element-wise difference.
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        # Shallow copy whose coefficients are scaled by ``other``.
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret