def test_fit_regularized(self):
    """Check regularized PHReg fits against stored reference coefficients.

    Every data set is fitted once per penalty weight.  The estimated
    parameters are compared, with a loose relative tolerance, to the
    matching ``coef_<n>_<p>_<index>`` attribute of
    ``survival_enet_r_results``.  The result summary is also built as a
    smoke test.
    """
    # Data set sizes; they select both the data file and the reference.
    dimensions = ((50, 2), (100, 5))
    # Penalty weights; the position in this list is part of the name of
    # the stored reference coefficients.
    penalty_weights = [0, 0.1]

    for n, p in dimensions:
        for idx, alpha in enumerate(penalty_weights):
            expected = getattr(
                survival_enet_r_results, "coef_%d_%d_%d" % (n, p, idx))

            loaded = self.load_file("survival_data_%d_%d.csv" % (n, p))
            time, status, _entry, exog = loaded

            # Standardize exog in place along axis 0: subtract the mean,
            # then divide by the standard deviation computed with ddof=1.
            exog -= exog.mean(0)
            exog /= exog.std(0, ddof=1)

            fitted = PHReg(
                time, exog, status=status, ties="breslow"
            ).fit_regularized(alpha=alpha)

            # Only loose agreement is required.  The reference
            # implementation appears to use approximations that are not
            # used here, so the discrepancy may originate on that side.
            assert_allclose(fitted.params, expected, rtol=0.3)

            # Smoke test: building the summary must not raise.
            fitted.summary()
def test_fit_regularized(self):
    """Compare regularized PHReg fits with stored glmnet coefficients.

    For every data set and penalty weight, the fitted parameters must
    roughly match the reference values stored in
    ``survival_enet_r_results``.  They must also give a strictly larger
    value of the penalized log-likelihood than the reference values do.
    """
    # Data set sizes.
    for n, p in ((50, 2), (100, 5)):
        data_file = "survival_data_%d_%d.csv" % (n, p)

        # Penalty weights; the index is part of the reference name.
        for js, s in enumerate([0, 0.1]):
            r_coefs = getattr(
                survival_enet_r_results, "coef_%d_%d_%d" % (n, p, js))

            time, status, _entry, exog = self.load_file(data_file)

            # Standardize exog in place along axis 0 (ddof=1 for the
            # standard deviation).
            exog -= exog.mean(0)
            exog /= exog.std(0, ddof=1)

            model = PHReg(time, exog, status=status, ties="breslow")
            sm_fit = model.fit_regularized(alpha=s)

            # The agreement is not very high; the issue may be on the
            # R side.  The objective comparison below checks further.
            assert_allclose(sm_fit.params, r_coefs, rtol=0.3)

            def penalized_loglike(coefs):
                # The objective being maximized: the log-likelihood
                # scaled by len(time), minus the penalty weighted by s.
                # With l1_wt equal to 1 only the absolute-value term
                # contributes to the penalty.
                scaled_llf = model.loglike(coefs) / len(time)
                l1_wt = 1
                squared_term = (1 - l1_wt) * np.sum(coefs**2) / 2
                absolute_term = l1_wt * np.sum(np.abs(coefs))
                return scaled_llf - s * (squared_term + absolute_term)

            # Confirm that we are doing better than glmnet: our
            # parameters must give the strictly larger objective value.
            objective_r = penalized_loglike(r_coefs)
            objective_sm = penalized_loglike(sm_fit.params)
            assert_equal(np.sign(objective_sm - objective_r), 1)
def test_fit_regularized(self):
    """Fit PHReg with regularization and compare to reference values.

    Each combination of data set size and penalty weight is one case.
    For each case, the fitted parameters are checked with a loose
    relative tolerance against the stored ``coef_<n>_<p>_<index>``
    values in ``survival_enet_r_results``, and the summary is generated
    as a smoke test.
    """
    sizes = ((50, 2), (100, 5))
    weights = [0, 0.1]

    # One (n, p, weight index, weight) tuple per case, with data set
    # sizes in the outer position and penalty weights in the inner one.
    cases = [
        (n, p, js, s)
        for n, p in sizes
        for js, s in enumerate(weights)
    ]

    for n, p, js, s in cases:
        reference_name = "coef_%d_%d_%d" % (n, p, js)
        reference = getattr(survival_enet_r_results, reference_name)

        csv_name = "survival_data_%d_%d.csv" % (n, p)
        time, status, _entry, exog = self.load_file(csv_name)

        # Center and scale exog in place along axis 0; the standard
        # deviation uses ddof=1.
        exog -= exog.mean(0)
        exog /= exog.std(0, ddof=1)

        model = PHReg(time, exog, status=status, ties="breslow")
        result = model.fit_regularized(alpha=s)

        # The agreement isn't very high; the issue may be on the side
        # of the reference implementation, which seems to use some
        # approximations that are not used here.
        assert_allclose(result.params, reference, rtol=0.3)

        # Smoke test for summary.
        result.summary()
def test_fit_regularized(self):
    """Verify regularized PHReg fits against glmnet reference values.

    Each combination of data set size and penalty weight is one case.
    A case passes when the fitted parameters are within a loose relative
    tolerance of the values stored in ``survival_enet_r_results`` and
    give a strictly larger penalized log-likelihood than those values.
    """
    sizes = ((50, 2), (100, 5))
    weights = [0, 0.1]

    # Data set sizes vary in the outer position, penalty weights in the
    # inner one.
    cases = [
        (n, p, js, s)
        for n, p in sizes
        for js, s in enumerate(weights)
    ]

    for n, p, js, s in cases:
        reference = getattr(
            survival_enet_r_results, "coef_%d_%d_%d" % (n, p, js))

        loaded = self.load_file("survival_data_%d_%d.csv" % (n, p))
        time, status, _entry, exog = loaded

        # In-place standardization of exog along axis 0 (ddof=1).
        exog -= exog.mean(0)
        exog /= exog.std(0, ddof=1)

        model = PHReg(time, exog, status=status, ties="breslow")
        fit = model.fit_regularized(alpha=s)

        # The agreement is not very high; the issue may be on the R
        # side.  The objective comparison below is a further check.
        assert_allclose(fit.params, reference, rtol=0.3)

        def objective(beta):
            # The penalized log-likelihood that is being maximized.
            # It is called only within the current iteration, so it
            # sees this case's model, time and s.
            value = model.loglike(beta) / len(time)
            l1_wt = 1
            penalty = ((1 - l1_wt) * np.sum(beta**2) / 2
                       + l1_wt * np.sum(np.abs(beta)))
            value = value - s * penalty
            return value

        # Confirm that we are doing better than glmnet.
        gap = objective(fit.params) - objective(reference)
        assert_equal(np.sign(gap), 1)