def test_fit_regularized(self): # Data set sizes for n,p in (50,2),(100,5): # Penalty weights for js,s in enumerate([0,0.1]): coef_name = "coef_%d_%d_%d" % (n, p, js) params = getattr(survival_enet_r_results, coef_name) fname = "survival_data_%d_%d.csv" % (n, p) time, status, entry, exog = self.load_file(fname) exog -= exog.mean(0) exog /= exog.std(0, ddof=1) model = PHReg(time, exog, status=status, ties='breslow') sm_result = model.fit_regularized(alpha=s) # The agreement isn't very high, the issue may be on # the R side. See below for further checks. assert_allclose(sm_result.params, params, rtol=0.3) # The penalized log-likelihood that we are maximizing. def plf(params): llf = model.loglike(params) / len(time) L1_wt = 1 llf = llf - s * ((1 - L1_wt)*np.sum(params**2) / 2 + L1_wt*np.sum(np.abs(params))) return llf # Confirm that we are doing better than glmnet. llf_r = plf(params) llf_sm = plf(sm_result.params) assert_equal(np.sign(llf_sm - llf_r), 1)
def test_fit_regularized(self): # Data set sizes for n,p in (50,2),(100,5): # Penalty weights for js,s in enumerate([0,0.1]): coef_name = "coef_%d_%d_%d" % (n, p, js) coef = getattr(survival_enet_r_results, coef_name) fname = "survival_data_%d_%d.csv" % (n, p) time, status, entry, exog = self.load_file(fname) exog -= exog.mean(0) exog /= exog.std(0, ddof=1) mod = PHReg(time, exog, status=status, ties='breslow') rslt = mod.fit_regularized(alpha=s) # The agreement isn't very high, the issue may be on # their side. They seem to use some approximations # that we are not using. assert_allclose(rslt.params, coef, rtol=0.3) # Smoke test for summary smry = rslt.summary()
def test_formula(self): np.random.seed(34234) time = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200,4)) entry = np.zeros_like(time) entry[0:10] = time[0:10] / 2 df = pd.DataFrame({"time": time, "status": status, "exog1": exog[:, 0], "exog2": exog[:, 1], "exog3": exog[:, 2], "exog4": exog[:, 3], "entry": entry}) mod1 = PHReg(time, exog, status, entry=entry) rslt1 = mod1.fit() fml = "time ~ 0 + exog1 + exog2 + exog3 + exog4" mod2 = PHReg.from_formula(fml, df, status=status, entry=entry) rslt2 = mod2.fit() mod3 = PHReg.from_formula(fml, df, status="status", entry="entry") rslt3 = mod3.fit() assert_allclose(rslt1.params, rslt2.params) assert_allclose(rslt1.params, rslt3.params) assert_allclose(rslt1.bse, rslt2.bse) assert_allclose(rslt1.bse, rslt3.bse)
def test_summary(self): # smoke test np.random.seed(34234) time = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200,4)) mod = PHReg(time, exog, status) rslt = mod.fit() rslt.summary()
def test_summary(self): # smoke test np.random.seed(34234) time = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200,4)) mod = PHReg(time, exog, status) rslt = mod.fit() smry = rslt.summary() strata = np.kron(np.arange(50), np.ones(4)) mod = PHReg(time, exog, status, strata=strata) rslt = mod.fit() smry = rslt.summary() msg = "3 strata dropped for having no events" assert_(msg in str(smry)) groups = np.kron(np.arange(25), np.ones(8)) mod = PHReg(time, exog, status) rslt = mod.fit(groups=groups) smry = rslt.summary() entry = np.random.uniform(0.1, 0.8, 200) * time mod = PHReg(time, exog, status, entry=entry) rslt = mod.fit() smry = rslt.summary() msg = "200 observations have positive entry times" assert_(msg in str(smry))
def test_offset(self): np.random.seed(34234) time = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200,4)) mod1 = PHReg(time, exog, status) rslt1 = mod1.fit() offset = exog[:,0] * rslt1.params[0] exog = exog[:, 1:] mod2 = PHReg(time, exog, status, offset=offset) rslt2 = mod2.fit() assert_allclose(rslt2.params, rslt1.params[1:])
def test_predict_formula(self): n = 100 np.random.seed(34234) time = 50 * np.random.uniform(size=n) status = np.random.randint(0, 2, n).astype(np.float64) exog = np.random.uniform(1, 2, size=(n, 2)) df = pd.DataFrame({"time": time, "status": status, "exog1": exog[:, 0], "exog2": exog[:, 1]}) fml = "time ~ 0 + exog1 + np.log(exog2) + exog1*exog2" model1 = PHReg.from_formula(fml, df, status=status) result1 = model1.fit() from patsy import dmatrix dfp = dmatrix(model1.data.design_info.builder, df) pr1 = result1.predict() pr2 = result1.predict(exog=df) pr3 = model1.predict(result1.params, exog=dfp) # No standard errors pr4 = model1.predict(result1.params, cov_params=result1.cov_params(), exog=dfp) prl = (pr1, pr2, pr3, pr4) for i in range(4): for j in range(i): assert_allclose(prl[i].predicted_values, prl[j].predicted_values) prl = (pr1, pr2, pr4) for i in range(3): for j in range(i): assert_allclose(prl[i].standard_errors, prl[j].standard_errors)
def test_predict(self): # All smoke tests. We should be able to convert the lhr and hr # tests into real tests against R. There are many options to # this function that may interact in complicated ways. Only a # few key combinations are tested here. np.random.seed(34234) endog = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200, 4)) mod = PHReg(endog, exog, status) rslt = mod.fit() rslt.predict() for pred_type in "lhr", "hr", "cumhaz", "surv": rslt.predict(pred_type=pred_type) rslt.predict(endog=endog[0:10], pred_type=pred_type) rslt.predict(endog=endog[0:10], exog=exog[0:10, :], pred_type=pred_type)
def do1(self, fname, ties, entry_f, strata_f): # Read the test data. time, status, entry, exog = self.load_file(fname) n = len(time) vs = fname.split("_") n = int(vs[2]) p = int(vs[3].split(".")[0]) ties1 = ties[0:3] # Needs to match the kronecker statement in survival.R strata = np.kron(range(5), np.ones(n // 5)) # No stratification or entry times mod = PHReg(time, exog, status, ties=ties) phrb = mod.fit(**args) coef_r, se_r, time_r, hazard_r = get_results(n, p, None, ties1) assert_allclose(phrb.params, coef_r, rtol=1e-3) assert_allclose(phrb.bse, se_r, rtol=1e-4) time_h, cumhaz, surv = phrb.baseline_cumulative_hazard[0] # Entry times but no stratification phrb = PHReg(time, exog, status, entry=entry, ties=ties).fit(**args) coef, se, time_r, hazard_r = get_results(n, p, "et", ties1) assert_allclose(phrb.params, coef, rtol=1e-3) assert_allclose(phrb.bse, se, rtol=1e-3) # Stratification but no entry times phrb = PHReg(time, exog, status, strata=strata, ties=ties).fit(**args) coef, se, time_r, hazard_r = get_results(n, p, "st", ties1) assert_allclose(phrb.params, coef, rtol=1e-4) assert_allclose(phrb.bse, se, rtol=1e-4) # Stratification and entry times phrb = PHReg(time, exog, status, entry=entry, strata=strata, ties=ties).fit(**args) coef, se, time_r, hazard_r = get_results(n, p, "et_st", ties1) assert_allclose(phrb.params, coef, rtol=1e-3) assert_allclose(phrb.bse, se, rtol=1e-4) #smoke test time_h, cumhaz, surv = phrb.baseline_cumulative_hazard[0]
def test_get_distribution(self): # Smoke test np.random.seed(34234) exog = np.random.normal(size=(200, 2)) lin_pred = exog.sum(1) elin_pred = np.exp(-lin_pred) time = -elin_pred * np.log(np.random.uniform(size=200)) mod = PHReg(time, exog) rslt = mod.fit() dist = rslt.get_distribution() fitted_means = dist.mean() true_means = elin_pred fitted_var = dist.var() fitted_sd = dist.std() sample = dist.rvs()
def test_formula_args(self): np.random.seed(34234) n = 200 time = 50 * np.random.uniform(size=n) status = np.random.randint(0, 2, size=n).astype(np.float64) exog = np.random.normal(size=(200, 2)) offset = np.random.uniform(size=n) entry = np.random.uniform(0, 1, size=n) * time df = pd.DataFrame({"time": time, "status": status, "x1": exog[:, 0], "x2": exog[:, 1], "offset": offset, "entry": entry}) model1 = PHReg.from_formula("time ~ x1 + x2", status="status", offset="offset", entry="entry", data=df) result1 = model1.fit() model2 = PHReg.from_formula("time ~ x1 + x2", status=df.status, offset=df.offset, entry=df.entry, data=df) result2 = model2.fit() assert_allclose(result1.params, result2.params) assert_allclose(result1.bse, result2.bse)
def test_get_distribution(self): np.random.seed(34234) n = 200 exog = np.random.normal(size=(n, 2)) lin_pred = exog.sum(1) elin_pred = np.exp(-lin_pred) time = -elin_pred * np.log(np.random.uniform(size=n)) status = np.ones(n) status[0:20] = 0 strata = np.kron(range(5), np.ones(n // 5)) mod = PHReg(time, exog, status=status, strata=strata) rslt = mod.fit() dist = rslt.get_distribution() fitted_means = dist.mean() true_means = elin_pred fitted_var = dist.var() fitted_sd = dist.std() sample = dist.rvs()
def test_formula_cat_interactions(self): time = np.r_[1, 2, 3, 4, 5, 6, 7, 8, 9] status = np.r_[1, 1, 0, 0, 1, 0, 1, 1, 1] x1 = np.r_[1, 1, 1, 2, 2, 2, 3, 3, 3] x2 = np.r_[1, 2, 3, 1, 2, 3, 1, 2, 3] df = pd.DataFrame({"time": time, "status": status, "x1": x1, "x2": x2}) model1 = PHReg.from_formula("time ~ C(x1) + C(x2) + C(x1)*C(x2)", status="status", data=df) assert_equal(model1.exog.shape, [9, 8])
def test_post_estimation(self): # All regression tests np.random.seed(34234) time = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200,4)) mod = PHReg(time, exog, status) rslt = mod.fit() mart_resid = rslt.martingale_residuals assert_allclose(np.abs(mart_resid).sum(), 120.72475743348433) w_avg = rslt.weighted_covariate_averages assert_allclose(np.abs(w_avg[0]).sum(0), np.r_[7.31008415, 9.77608674,10.89515885, 13.1106801]) bc_haz = rslt.baseline_cumulative_hazard v = [np.mean(np.abs(x)) for x in bc_haz[0]] w = np.r_[23.482841556421608, 0.44149255358417017, 0.68660114081275281] assert_allclose(v, w) score_resid = rslt.score_residuals v = np.r_[ 0.50924792, 0.4533952, 0.4876718, 0.5441128] w = np.abs(score_resid).mean(0) assert_allclose(v, w) groups = np.random.randint(0, 3, 200) mod = PHReg(time, exog, status) rslt = mod.fit(groups=groups) robust_cov = rslt.cov_params() v = [0.00513432, 0.01278423, 0.00810427, 0.00293147] w = np.abs(robust_cov).mean(0) assert_allclose(v, w, rtol=1e-6) s_resid = rslt.schoenfeld_residuals ii = np.flatnonzero(np.isfinite(s_resid).all(1)) s_resid = s_resid[ii, :] v = np.r_[0.85154336, 0.72993748, 0.73758071, 0.78599333] assert_allclose(np.abs(s_resid).mean(0), v)
def _kernel_cumincidence(time, status, exog, kfunc, freq_weights, dimred=True): """ Calculates cumulative incidence functions using kernels. Parameters ---------- time : array_like The observed time values status : array_like The status values. status == 0 indicates censoring, status == 1, 2, ... are the events. exog : array_like Covariates such that censoring becomes independent of outcome times conditioned on the covariate values. kfunc : function A kernel function freq_weights : array_like Optional frequency weights dimred : bool If True, proportional hazards regression models are used to reduce exog to two columns by predicting overall events and censoring in two separate models. If False, exog is used directly for calculating kernel weights without dimension reduction. """ # Reorder so time is ascending ii = np.argsort(time) time = time[ii] status = status[ii] exog = exog[ii, :] nobs = len(time) # Convert the unique times to ranks (0, 1, 2, ...) utime, rtime = np.unique(time, return_inverse=True) # Last index where each unique time occurs. ie = np.searchsorted(time, utime, side='right') - 1 ngrp = int(status.max()) # All-cause status statusa = (status >= 1).astype(np.float64) if freq_weights is not None: freq_weights = freq_weights / freq_weights.sum() ip = [] sp = [None] * nobs n_risk = [None] * nobs kd = [None] * nobs for k in range(ngrp): status0 = (status == k + 1).astype(np.float64) # Dimension reduction step if dimred: sfe = PHReg(time, exog, status0).fit() fitval_e = sfe.predict().predicted_values sfc = PHReg(time, exog, 1 - status0).fit() fitval_c = sfc.predict().predicted_values exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None])) exog2d -= exog2d.mean(0) exog2d /= exog2d.std(0) else: exog2d = exog ip0 = 0 for i in range(nobs): if k == 0: kd1 = exog2d - exog2d[i, :] kd1 = kfunc(kd1) kd[i] = kd1 # Get the local all-causes survival function if k == 0: denom = np.cumsum(kd[i][::-1])[::-1] num = kd[i] * statusa rat = num / denom tr = 1e-15 ii = np.flatnonzero((denom < tr) & (num < tr)) rat[ii] = 0 ratc = 1 - rat ratc = np.clip(ratc, 1e-10, np.inf) lrat = np.log(ratc) prat = np.cumsum(lrat)[ie] sf = np.exp(prat) sp[i] = np.r_[1, sf[:-1]] n_risk[i] = denom[ie] # Number of cause-specific deaths at each unique time. d0 = np.bincount(rtime, weights=status0*kd[i], minlength=len(utime)) # The cumulative incidence function probabilities. Carry # forward once the effective sample size drops below 1. ip1 = np.cumsum(sp[i] * d0 / n_risk[i]) jj = len(ip1) - np.searchsorted(n_risk[i][::-1], 1) if jj < len(ip1): ip1[jj:] = ip1[jj - 1] if freq_weights is None: ip0 += ip1 else: ip0 += freq_weights[i] * ip1 if freq_weights is None: ip0 /= nobs ip.append(ip0) return utime, ip
def _kernel_cumincidence(time, status, exog, kfunc, freq_weights, dimred=True): """ Calculates cumulative incidence functions using kernels. Parameters ---------- time : array-like The observed time values status : array-like The status values. status == 0 indicates censoring, status == 1, 2, ... are the events. exog : array-like Covariates such that censoring becomes independent of outcome times conditioned on the covariate values. kfunc : function A kernel function freq_weights : array-like Optional frequency weights dimred : boolean If True, proportional hazards regression models are used to reduce exog to two columns by predicting overall events and censoring in two separate models. If False, exog is used directly for calculating kernel weights without dimension reduction. """ # Reorder so time is ascending ii = np.argsort(time) time = time[ii] status = status[ii] exog = exog[ii, :] nobs = len(time) # Convert the unique times to ranks (0, 1, 2, ...) utime, rtime = np.unique(time, return_inverse=True) # Last index where each unique time occurs. ie = np.searchsorted(time, utime, side='right') - 1 ngrp = int(status.max()) # All-cause status statusa = (status >= 1).astype(np.float64) if freq_weights is not None: freq_weights = freq_weights / freq_weights.sum() ip = [] sp = [None] * nobs n_risk = [None] * nobs kd = [None] * nobs for k in range(ngrp): status0 = (status == k + 1).astype(np.float64) # Dimension reduction step if dimred: sfe = PHReg(time, exog, status0).fit() fitval_e = sfe.predict().predicted_values sfc = PHReg(time, exog, 1 - status0).fit() fitval_c = sfc.predict().predicted_values exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None])) exog2d -= exog2d.mean(0) exog2d /= exog2d.std(0) else: exog2d = exog ip0 = 0 for i in range(nobs): if k == 0: kd1 = exog2d - exog2d[i, :] kd1 = kfunc(kd1) kd[i] = kd1 # Get the local all-causes survival function if k == 0: denom = np.cumsum(kd[i][::-1])[::-1] num = kd[i] * statusa rat = num / denom tr = 1e-15 ii = np.flatnonzero((denom < tr) & (num < tr)) rat[ii] = 0 ratc = 1 - rat ratc = np.clip(ratc, 1e-10, np.inf) lrat = np.log(ratc) prat = np.cumsum(lrat)[ie] sf = np.exp(prat) sp[i] = np.r_[1, sf[:-1]] n_risk[i] = denom[ie] # Number of cause-specific deaths at each unique time. d0 = np.bincount(rtime, weights=status0*kd[i], minlength=len(utime)) # The cumulative incidence function probabilities. Carry # forward once the effective sample size drops below 1. ip1 = np.cumsum(sp[i] * d0 / n_risk[i]) jj = len(ip1) - np.searchsorted(n_risk[i][::-1], 1) if jj < len(ip1): ip1[jj:] = ip1[jj - 1] if freq_weights is None: ip0 += ip1 else: ip0 += freq_weights[i] * ip1 if freq_weights is None: ip0 /= nobs ip.append(ip0) return utime, ip
def _kernel_survfunc(time, status, exog, kfunc, freq_weights): """ Estimate the marginal survival function under dependent censoring. Parameters ---------- time : array-like The observed times for each subject status : array-like The status for each subject (1 indicates event, 0 indicates censoring) exog : array-like Covariates such that censoring is independent conditional on exog kfunc : function Kernel function freq_weights : array-like Optional frequency weights Returns ------- probs : array-like The estimated survival probabilities times : array-like The times at which the survival probabilities are estimated References ---------- Zeng, Donglin 2004. Estimating Marginal Survival Function by Adjusting for Dependent Censoring Using Many Covariates. The Annals of Statistics 32 (4): 1533 55. doi:10.1214/009053604000000508. http://arxiv.org/pdf/math/0409180.pdf """ # Dimension reduction step sfe = PHReg(time, exog, status).fit() fitval_e = sfe.predict().predicted_values sfc = PHReg(time, exog, 1 - status).fit() fitval_c = sfc.predict().predicted_values exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None])) n = len(time) ixd = np.flatnonzero(status == 1) # For consistency with standard KM, only compute the survival # function at the times of observed events. utime = np.unique(time[ixd]) # Reorder everything so time is ascending ii = np.argsort(time) time = time[ii] status = status[ii] exog2d = exog2d[ii, :] # Last index where each evaluation time occurs. ie = np.searchsorted(time, utime, side='right') - 1 if freq_weights is not None: freq_weights = freq_weights / freq_weights.sum() sprob = 0. for i in range(n): kd = exog2d - exog2d[i, :] kd = kfunc(kd) denom = np.cumsum(kd[::-1])[::-1] num = kd * status rat = num / denom tr = 1e-15 ii = np.flatnonzero((denom < tr) & (num < tr)) rat[ii] = 0 ratc = 1 - rat ratc = np.clip(ratc, 1e-12, np.inf) lrat = np.log(ratc) prat = np.cumsum(lrat)[ie] prat = np.exp(prat) if freq_weights is None: sprob += prat else: sprob += prat * freq_weights[i] if freq_weights is None: sprob /= n return sprob, utime
def _kernel_survfunc(time, status, exog, kfunc, freq_weights): """ Estimate the marginal survival function under dependent censoring. Parameters ---------- time : array_like The observed times for each subject status : array_like The status for each subject (1 indicates event, 0 indicates censoring) exog : array_like Covariates such that censoring is independent conditional on exog kfunc : function Kernel function freq_weights : array_like Optional frequency weights Returns ------- probs : array_like The estimated survival probabilities times : array_like The times at which the survival probabilities are estimated References ---------- Zeng, Donglin 2004. Estimating Marginal Survival Function by Adjusting for Dependent Censoring Using Many Covariates. The Annals of Statistics 32 (4): 1533 55. doi:10.1214/009053604000000508. https://arxiv.org/pdf/math/0409180.pdf """ # Dimension reduction step sfe = PHReg(time, exog, status).fit() fitval_e = sfe.predict().predicted_values sfc = PHReg(time, exog, 1 - status).fit() fitval_c = sfc.predict().predicted_values exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None])) n = len(time) ixd = np.flatnonzero(status == 1) # For consistency with standard KM, only compute the survival # function at the times of observed events. utime = np.unique(time[ixd]) # Reorder everything so time is ascending ii = np.argsort(time) time = time[ii] status = status[ii] exog2d = exog2d[ii, :] # Last index where each evaluation time occurs. ie = np.searchsorted(time, utime, side='right') - 1 if freq_weights is not None: freq_weights = freq_weights / freq_weights.sum() sprob = 0. for i in range(n): kd = exog2d - exog2d[i, :] kd = kfunc(kd) denom = np.cumsum(kd[::-1])[::-1] num = kd * status rat = num / denom tr = 1e-15 ii = np.flatnonzero((denom < tr) & (num < tr)) rat[ii] = 0 ratc = 1 - rat ratc = np.clip(ratc, 1e-12, np.inf) lrat = np.log(ratc) prat = np.cumsum(lrat)[ie] prat = np.exp(prat) if freq_weights is None: sprob += prat else: sprob += prat * freq_weights[i] if freq_weights is None: sprob /= n return sprob, utime
def test_summary(self): # smoke test np.random.seed(34234) time = 50 * np.random.uniform(size=200) status = np.random.randint(0, 2, 200).astype(np.float64) exog = np.random.normal(size=(200, 4)) mod = PHReg(time, exog, status) rslt = mod.fit() smry = rslt.summary() strata = np.kron(np.arange(50), np.ones(4)) mod = PHReg(time, exog, status, strata=strata) rslt = mod.fit() smry = rslt.summary() msg = "3 strata dropped for having no events" assert_(msg in str(smry)) groups = np.kron(np.arange(25), np.ones(8)) mod = PHReg(time, exog, status) rslt = mod.fit(groups=groups) smry = rslt.summary() entry = np.random.uniform(0.1, 0.8, 200) * time mod = PHReg(time, exog, status, entry=entry) rslt = mod.fit() smry = rslt.summary() msg = "200 observations have positive entry times" assert_(msg in str(smry))