def test_leukemia():
    """Compare Nelson-Aalen output on the leukemia data to published values."""
    data = datasets.leukemia()
    estimator = NelsonAalen(var_type="aalen", tie_break="discrete")
    estimator.fit("time", status="status", group="group", data=data)

    # Reference values for the treatment group, given to 4 decimal places in
    # Table 4.2 on p. 94 of Klein & Moeschberger (2003): the Nelson-Aalen
    # cumulative hazard estimates and their standard errors
    times = np.array([0, 6, 7, 10, 13, 16, 22, 23])
    expected_cum_haz = [
        0., 0.1428, 0.2017, 0.2683, 0.3517, 0.4426, 0.5854, 0.7521
    ]
    expected_std_err = [
        0., 0.0825, 0.1015, 0.1212, 0.1473, 0.1729, 0.2243, 0.2795
    ]

    # Query at the tabulated times and again shifted slightly forward: the
    # estimates should be right-continuous and piecewise constant, so both
    # sets of predictions have to agree with the table
    for shift in (0., 0.5):
        cum_haz, std_err = estimator.predict(times + shift, return_se=True)
        for predicted, expected in ((cum_haz, expected_cum_haz),
                                    (std_err, expected_std_err)):
            np.testing.assert_almost_equal(predicted.treatment, expected,
                                           decimal=3)
def test_init_with_dataframe():
    """Build SurvivalData objects while passing a DataFrame as `data`.

    Each dataset is used twice: once referring to its columns by name and
    once passing the columns themselves alongside the same `data` argument.
    The test passes as long as no constructor call raises.
    """
    # --- Leukemia data ---
    leukemia_df = datasets.leukemia()

    # Columns given by name
    SurvivalData("time", status="status", group="group", data=leukemia_df)

    # Columns given as array-likes
    SurvivalData(
        leukemia_df.time,
        status=leukemia_df.status,
        group=leukemia_df.group,
        data=leukemia_df,
    )

    # --- Channing House data (this one also has an entry-time column) ---
    channing_df = datasets.channing()

    # Columns given by name
    SurvivalData(
        "exit",
        entry="entry",
        status="status",
        group="sex",
        data=channing_df,
        warn=False,
    )

    # Columns given as array-likes
    SurvivalData(
        channing_df.exit,
        entry=channing_df.entry,
        status=channing_df.status,
        group=channing_df.sex,
        data=channing_df,
        warn=False,
    )
def test_formatting():
    """Exercise `SurvivalData.set_format` with a valid and an invalid option."""
    data = datasets.leukemia()
    survival_data = SurvivalData("time", status="status", group="group",
                                 data=data)

    # A supported option is expected to be accepted without raising
    survival_data.set_format(censor_marker="!")

    # An unsupported option is expected to be rejected with a RuntimeError
    with pytest.raises(RuntimeError):
        survival_data.set_format(invalid_option="??")
def test_fit_predict_summary():
    """Smoke-test fit, predict and summary for every parameter combination.

    Iterates over the Cartesian product of the option lists in
    `NA_PARAMETERS` and only checks that none of the calls raises.
    """
    data = datasets.leukemia()
    survival_data = SurvivalData("time", status="status", group="group",
                                 data=data)

    for combination in product(*NA_PARAMETERS):
        conf_type, var_type, tie_break = combination
        estimator = Breslow(
            conf_type=conf_type,
            var_type=var_type,
            tie_break=tie_break,
        )
        estimator.fit(survival_data)

        # TODO: figure out better tests here; for now the results of these
        # calls are not inspected
        estimator.predict([0, 1, 2])
        estimator.predict([0, 1, 2], return_ci=True)
        estimator.summary.table("treatment")
def test_init_with_arrays():
    """Build SurvivalData objects directly from columns, without `data`.

    The test passes as long as no constructor call raises.
    """
    # Leukemia data: time, status and group columns
    leukemia_df = datasets.leukemia()
    SurvivalData(
        leukemia_df.time,
        status=leukemia_df.status,
        group=leukemia_df.group,
    )

    # Channing House data: additionally supplies an entry-time column
    channing_df = datasets.channing()
    SurvivalData(
        channing_df.exit,
        entry=channing_df.entry,
        status=channing_df.status,
        group=channing_df.sex,
        warn=False,
    )
def test_leukemia():
    """Check Kaplan-Meier computed values on the leukemia dataset.

    Three sets of published reference values are compared against:

    1. survival estimates and Greenwood standard errors for the treatment
       group,
    2. survival estimates for the control group, and
    3. confidence interval bounds of type "log", "log-log" and "linear" for
       the treatment group.

    Every comparison is made at the tabulated times and again at those times
    shifted forward by a small amount, to ensure that the estimates are
    right-continuous and piecewise constant.
    """
    leukemia = datasets.leukemia()
    kaplan_meier = KaplanMeier(var_type="greenwood")
    kaplan_meier.fit("time", status="status", group="group", data=leukemia)

    # Table 4.1 on p. 49 in Cox & Oakes (1984) displays the Kaplan-Meier
    # estimates for the treatment group to 4 decimal places, and Table 4.1B on
    # p. 93 in Klein & Moeschberger (2003) lists their Greenwood's
    # formula-based standard errors to 3 decimal places
    times = np.array([0, 6, 7, 10, 13, 16, 22, 23])
    survival_treatment = [
        1., 0.8571, 0.8067, 0.7529, 0.6902, 0.6275, 0.5378, 0.4482
    ]
    std_err_treatment = [0., 0.076, 0.087, 0.096, 0.107, 0.114, 0.128, 0.135]
    for eps in (0., 0.5):
        survival_pred, std_err_pred = \
            kaplan_meier.predict(times + eps, return_se=True)
        np.testing.assert_almost_equal(survival_pred.treatment,
                                       survival_treatment, decimal=3)
        np.testing.assert_almost_equal(std_err_pred.treatment,
                                       std_err_treatment, decimal=3)

    # The Example in the left margin of p. 53 in Kleinbaum & Klein (2005)
    # lists the Kaplan-Meier estimates for the control group. Since there is
    # no censoring in the control group, this is the same as the empirical
    # survival function
    times = np.array([0, 1, 2, 3, 4, 5, 8, 11, 12, 15, 17, 22, 23])
    survival_control = [
        1., 19 / 21, 17 / 21, 16 / 21, 14 / 21, 12 / 21, 8 / 21, 6 / 21,
        4 / 21, 3 / 21, 2 / 21, 1 / 21, 0.
    ]
    for eps in (0., 0.5):
        # Only the survival estimates have reference values here; the
        # standard errors are still requested (as before) but not checked
        survival_pred, _ = kaplan_meier.predict(times + eps, return_se=True)
        np.testing.assert_almost_equal(survival_pred.control,
                                       survival_control, decimal=3)

    # Page 27 in http://www.math.ucsd.edu/~rxu/math284/slect2.pdf lists
    # Kaplan-Meier summary tables from R's survfit function (in the survival
    # package) with three different types of confidence intervals for the
    # treatment group. Each entry is (conf_type, lower bounds, upper bounds).
    times = np.array([6, 7, 10, 13, 16, 22, 23])
    ci_reference = (
        (
            "log",
            [0.7198171, 0.6531242, 0.5859190, 0.5096131, 0.4393939,
             0.3370366, 0.2487882],
            [1.0000000, 0.9964437, 0.9675748, 0.9347692, 0.8959949,
             0.8582008, 0.8073720],
        ),
        (
            "log-log",
            [0.6197180, 0.5631466, 0.5031995, 0.4316102, 0.3675109,
             0.2677789, 0.1880520],
            [0.9515517, 0.9228090, 0.8893618, 0.8490660, 0.8049122,
             0.7467907, 0.6801426],
        ),
        (
            # In R this type is called "plain"
            "linear",
            [0.7074793, 0.6363327, 0.5640993, 0.4808431, 0.4039095,
             0.2864816, 0.1843849],
            [1.0000000, 0.9771127, 0.9417830, 0.8995491, 0.8509924,
             0.7891487, 0.7119737],
        ),
    )
    for conf_type, ci_lower, ci_upper in ci_reference:
        kaplan_meier = KaplanMeier(conf_type=conf_type, var_type="greenwood")
        kaplan_meier.fit("time", status="status", group="group",
                         data=leukemia)
        # Name the interval type in any failure message, since all three
        # types share the same assertion lines
        label = "conf_type={!r}".format(conf_type)
        for eps in (0., 0.5):
            _, ci_lower_pred, ci_upper_pred = \
                kaplan_meier.predict(times + eps, return_ci=True)
            np.testing.assert_almost_equal(ci_lower_pred.treatment, ci_lower,
                                           decimal=7, err_msg=label)
            np.testing.assert_almost_equal(ci_upper_pred.treatment, ci_upper,
                                           decimal=7, err_msg=label)