def test_predict_log_hazard_relative_to_mean_without_normalization(self, rossi):
    """Compare predict_log_hazard_relative_to_mean with a manual computation.

    The model is fitted with ``normalize=False``; the expected frame is the log
    of each subject's partial hazard divided by the partial hazard evaluated at
    the column means of ``rossi``.
    """
    fitter = CoxPHFitter(normalize=False)
    fitter.fit(rossi, 'week', 'arrest')
    actual = fitter.predict_log_hazard_relative_to_mean(rossi)

    mean_row = rossi.mean(0).to_frame().T
    # The partial hazard at the means must differ from 1 for this check to be meaningful.
    assert fitter.predict_partial_hazard(mean_row).values[0][0] != 1.0

    subject_hazards = fitter.predict_partial_hazard(rossi)
    hazard_at_mean = fitter.predict_partial_hazard(mean_row).squeeze()
    assert_frame_equal(actual, np.log(subject_hazards / hazard_at_mean))
def test_coxph_plotting(self, block):
    """Fit a Cox model on the regression dataset and show its default plot."""
    data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(data, "T", "E")

    fitter.plot()
    self.plt.title('test_coxph_plotting')
    self.plt.show(block=block)
def test_print_summary(self, rossi):
    """Capture print_summary() output and compare selected tokens.

    Only the first three and the last two whitespace-separated tokens of the
    printed summary are compared with the reference text.
    """
    import sys

    original_stdout = sys.stdout
    try:
        captured = StringIO()
        sys.stdout = captured

        fitter = CoxPHFitter()
        fitter.fit(rossi, duration_col='week', event_col='arrest')
        fitter.print_summary()
        actual_tokens = captured.getvalue().strip().split()

        reference_text = (
            "n=432, number of events=114 "
            "coef exp(coef) se(coef) z p lower 0.95 upper 0.95 "
            "fin -1.897e-01 8.272e-01 9.579e-02 -1.981e+00 4.763e-02 -3.775e-01 -1.938e-03 * "
            "age -3.500e-01 7.047e-01 1.344e-01 -2.604e+00 9.210e-03 -6.134e-01 -8.651e-02 ** "
            "race 1.032e-01 1.109e+00 1.012e-01 1.020e+00 3.078e-01 -9.516e-02 3.015e-01 "
            "wexp -7.486e-02 9.279e-01 1.051e-01 -7.124e-01 4.762e-01 -2.809e-01 1.311e-01 "
            "mar -1.421e-01 8.675e-01 1.254e-01 -1.134e+00 2.570e-01 -3.880e-01 1.037e-01 "
            "paro -4.134e-02 9.595e-01 9.522e-02 -4.341e-01 6.642e-01 -2.280e-01 1.453e-01 "
            "prio 2.639e-01 1.302e+00 8.291e-02 3.182e+00 1.460e-03 1.013e-01 4.264e-01 ** "
            "--- "
            "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 "
            "Concordance = 0.640"
        )
        expected_tokens = reference_text.strip().split()

        for position in (0, 1, 2, -2, -1):
            assert actual_tokens[position] == expected_tokens[position]
    finally:
        # Always restore the real stdout, even if an assertion fails.
        sys.stdout = original_stdout
def test_coxph_plotting_normalized(self, block):
    """Fit a Cox model on the regression dataset and show plot(True)."""
    data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(data, "T", "E")

    fitter.plot(True)
    self.plt.title("test_coxph_plotting_normalized")
    self.plt.show(block=block)
def test_coxph_plotting_with_subset_of_columns(self, block):
    """Fit a Cox model and plot only the 'var1' and 'var2' columns."""
    data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(data, "T", "E")

    selected_columns = ["var1", "var2"]
    fitter.plot(columns=selected_columns)
    self.plt.title("test_coxph_plotting_with_subset_of_columns")
    self.plt.show(block=block)
def test_log_likelihood_is_available_in_output(self, data_nus):
    """Fit with include_likelihood=True and check the stored log-likelihood."""
    expected_log_likelihood = -12.7601409152

    fitter = CoxPHFitter()
    fitter.fit(data_nus, duration_col='t', event_col='E', include_likelihood=True)

    difference = fitter._log_likelihood - expected_log_likelihood
    assert abs(difference) < 0.001
def test_predict_log_hazard_relative_to_mean_with_normalization(self, rossi):
    """With normalize=True the relative log hazard equals the log partial hazard."""
    fitter = CoxPHFitter(normalize=True)
    fitter.fit(rossi, 'week', 'arrest')

    # The data is normalized, so the covariate means are all 0 and
    # exp(beta * 0) == 1; hence exp(beta * X) / exp(beta * 0) == exp(beta * X).
    relative_log_hazard = fitter.predict_log_hazard_relative_to_mean(rossi)
    log_partial_hazard = np.log(fitter.predict_partial_hazard(rossi))
    assert_frame_equal(relative_log_hazard, log_partial_hazard)
def test_coxph_plotting_normalized(self, block):
    """Fit a Cox model on the regression dataset and show plot(True).

    The figure title is set to this test's own name so the displayed window
    can be told apart from the one produced by ``test_coxph_plotting``.
    """
    df = load_regression_dataset()
    cp = CoxPHFitter()
    cp.fit(df, "T", "E")
    cp.plot(True)
    # Title previously read 'test_coxph_plotting' (copy-paste from the sibling test).
    self.plt.title('test_coxph_plotting_normalized')
    self.plt.show(block=block)
def test_strata_works_if_only_a_single_element_is_in_the_strata(self):
    """Fitting with strata=['Stratum'] on the holly/molly/polly data must not raise."""
    data = load_holly_molly_polly()
    # Drop the columns that are not used by this fit.
    for unused_column in ('Start(days)', 'Stop(days)', 'ID'):
        del data[unused_column]

    fitter = CoxPHFitter()
    fitter.fit(data, 'T', 'Status', strata=['Stratum'])
    # Reaching this point without an exception is the whole test.
    assert True
def test_coxph_plotting_with_subset_of_columns_and_standardized(self, block):
    """Fit a Cox model and show plot(True) restricted to 'var1' and 'var2'."""
    data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(data, "T", "E")

    selected_columns = ['var1', 'var2']
    fitter.plot(True, columns=selected_columns)
    self.plt.title('test_coxph_plotting_with_subset_of_columns_and_standardized')
    self.plt.show(block=block)
def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
    """predict_partial_hazard must give equal frames for array and DataFrame input."""
    fitter = CoxPHFitter()
    fitter.fit(data_pred2, 't', 'E')

    covariates = data_pred2[fitter.data.columns]
    from_array = fitter.predict_partial_hazard(np.array(covariates))
    from_frame = fitter.predict_partial_hazard(covariates)
    assert_frame_equal(from_array, from_frame)
def test_output_with_strata_against_R(self, rossi):
    """Compare stratified coefficients (normalize=False) with R output.

    R code used for the reference values:

        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
        r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
            paro, mar, wexp) + prio, data = rossi)
    """
    r_coefficients = np.array([[-0.335, -0.059, 0.100]])

    fitter = CoxPHFitter(normalize=False)
    fitter.fit(
        rossi,
        duration_col='week',
        event_col='arrest',
        strata=['race', 'paro', 'mar', 'wexp'],
    )
    npt.assert_array_almost_equal(fitter.hazards_.values, r_coefficients, decimal=3)
def test_fit_methods_require_duration_col(self):
    """Calling fit() with only the data must raise TypeError for both fitters."""
    data = load_regression_dataset()
    additive_fitter = AalenAdditiveFitter()
    cox_fitter = CoxPHFitter()

    with pytest.raises(TypeError):
        additive_fitter.fit(data)

    with pytest.raises(TypeError):
        cox_fitter.fit(data)
def test_p_value_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
    """Compare computed p-values on the larynx data with published values.

    Reference: table 8.1 in Survival Analysis by John P. Klein and
    Melvin L. Moeschberger, Second Edition.
    """
    published_p_values = np.array([0.1847, 0.7644, 0.0730, 0.00])

    larynx = load_larynx()
    fitter = CoxPHFitter()
    fitter.fit(larynx, duration_col='time', event_col='death')

    computed_p_values = fitter._compute_p_values()
    npt.assert_array_almost_equal(computed_p_values, published_p_values, decimal=2)
def test_baseline_survival_is_the_same_indp_of_scale(self, regression_dataset):
    """Dividing the covariates by their std must not change baseline_survival_."""
    covariates = ['var1', 'var2', 'var3']

    original = regression_dataset.copy()
    fitter_original = CoxPHFitter()
    fitter_original.fit(original, event_col='E', duration_col='T')

    rescaled = regression_dataset.copy()
    rescaled[covariates] = rescaled[covariates] / rescaled[covariates].std()
    fitter_rescaled = CoxPHFitter()
    fitter_rescaled.fit(rescaled, event_col='E', duration_col='T')

    assert_frame_equal(
        fitter_rescaled.baseline_survival_,
        fitter_original.baseline_survival_,
    )
def test_summary(self, rossi):
    """The summary frame must expose every expected statistic column."""
    required_columns = (
        'coef',
        'exp(coef)',
        'se(coef)',
        'z',
        'p',
        'lower 0.95',
        'upper 0.95',
    )

    fitter = CoxPHFitter()
    fitter.fit(rossi, duration_col='week', event_col='arrest')
    summary_frame = fitter.summary

    assert all(name in summary_frame.columns for name in required_columns)
def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
    """Compare standard errors (normalize=False) on the larynx data with published values.

    Reference: table 8.1 in Survival Analysis by John P. Klein and
    Melvin L. Moeschberger, Second Edition.
    """
    published_se = np.array([[0.0143, 0.4623, 0.3561, 0.4222]])

    larynx = load_larynx()
    fitter = CoxPHFitter(normalize=False)
    fitter.fit(larynx, duration_col='time', event_col='death')

    computed_se = fitter._compute_standard_errors().values
    npt.assert_array_almost_equal(computed_se, published_se, decimal=2)
def test_baseline_cumulative_hazard_is_the_same_indp_of_location(self, regression_dataset):
    """Subtracting the covariate means must not change baseline_cumulative_hazard_."""
    covariates = ['var1', 'var2', 'var3']

    original = regression_dataset.copy()
    fitter_original = CoxPHFitter()
    fitter_original.fit(original, event_col='E', duration_col='T')

    demeaned = regression_dataset.copy()
    demeaned[covariates] = demeaned[covariates] - demeaned[covariates].mean()
    fitter_demeaned = CoxPHFitter()
    fitter_demeaned.fit(demeaned, event_col='E', duration_col='T')

    assert_frame_equal(
        fitter_demeaned.baseline_cumulative_hazard_,
        fitter_original.baseline_cumulative_hazard_,
    )
def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
    """Compare standard errors on the larynx data with published values.

    Reference: table 8.1 in Survival Analysis by John P. Klein and
    Melvin L. Moeschberger, Second Edition.
    """
    published_se = np.array([[0.0143, 0.4623, 0.3561, 0.4222]])

    larynx = load_larynx()
    fitter = CoxPHFitter()
    fitter.fit(larynx, duration_col='time', event_col='death')

    computed_se = fitter._compute_standard_errors().values
    npt.assert_array_almost_equal(computed_se, published_se, decimal=2)
def test_output_with_strata_against_R(self, rossi):
    """Compare stratified coefficients with R output.

    R code used for the reference values:

        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
        r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
            paro, mar, wexp) + prio, data = rossi)
    """
    r_coefficients = np.array([[-0.3355, -0.0590, 0.1002]])

    fitter = CoxPHFitter()
    fitter.fit(
        rossi,
        duration_col='week',
        event_col='arrest',
        strata=['race', 'paro', 'mar', 'wexp'],
    )
    npt.assert_array_almost_equal(fitter.hazards_.values, r_coefficients, decimal=3)
def test_penalized_output_against_R(self, rossi):
    """Compare ridge-penalized coefficients (normalize=False, penalizer=1.0) with R."""
    # R code:
    #
    # rossi <- read.csv('.../lifelines/datasets/rossi.csv')
    # mod.allison <- coxph(Surv(week, arrest) ~ ridge(fin, age, race, wexp, mar, paro, prio,
    #                                                 theta=1.0, scale=FALSE), data=rossi)
    # cat(round(mod.allison$coefficients, 4), sep=", ")
    r_coefficients = np.array(
        [[-0.3641, -0.0580, 0.2894, -0.1496, -0.3837, -0.0822, 0.0913]]
    )

    fitter = CoxPHFitter(normalize=False, penalizer=1.0)
    fitter.fit(rossi, duration_col='week', event_col='arrest')
    npt.assert_array_almost_equal(fitter.hazards_.values, r_coefficients, decimal=3)
def test_predict_methods_in_regression_return_same_types(self):
    """Each predict method of the Aalen fitter must return the Cox fitter's return type."""
    data = load_regression_dataset()

    additive_fitter = AalenAdditiveFitter()
    cox_fitter = CoxPHFitter()
    additive_fitter.fit(data, duration_col='T', event_col='E')
    cox_fitter.fit(data, duration_col='T', event_col='E')

    predict_method_names = (
        'predict_percentile',
        'predict_median',
        'predict_expectation',
        'predict_survival_function',
        'predict_cumulative_hazard',
    )
    for method_name in predict_method_names:
        additive_result = getattr(additive_fitter, method_name)(data)
        cox_result = getattr(cox_fitter, method_name)(data)
        assert isinstance(additive_result, type(cox_result))
def test_penalized_output_against_R(self, rossi):
    """Compare ridge-penalized coefficients (penalizer=1.0) with R output."""
    # R code:
    #
    # rossi <- read.csv('.../lifelines/datasets/rossi.csv')
    # mod.allison <- coxph(Surv(week, arrest) ~ ridge(fin, age, race, wexp, mar, paro, prio,
    #                                                 theta=1.0, scale=TRUE), data=rossi)
    # cat(round(mod.allison$coefficients, 4), sep=", ")
    r_coefficients = np.array(
        [[-0.3761, -0.0565, 0.3099, -0.1532, -0.4295, -0.0837, 0.0909]]
    )

    fitter = CoxPHFitter(penalizer=1.0)
    fitter.fit(rossi, duration_col='week', event_col='arrest')
    npt.assert_array_almost_equal(fitter.hazards_.values, r_coefficients, decimal=3)
def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
    """Compare stratified baseline cumulative hazards (normalize=False) with R.

    R code used for the reference values:

        > library(survival)
        > ross = read.csv('rossi.csv')
        > r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
            paro, mar, wexp) + prio, data = rossi)
        > basehaz(r, centered=FALSE)
    """
    cp = CoxPHFitter(normalize=False)
    cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'])

    # `.ix` was removed from pandas; `.loc` performs the same lookup by label.
    # NOTE(review): assumes the hazard index holds these time points as labels - confirm.
    stratum_0000 = cp.baseline_cumulative_hazard_[(0, 0, 0, 0)]
    npt.assert_almost_equal(
        stratum_0000.loc[[14, 35, 37, 43, 52]].values,
        [0.28665890, 0.63524149, 1.01822603, 1.48403930, 1.48403930],
        decimal=2,
    )

    stratum_0001 = cp.baseline_cumulative_hazard_[(0, 0, 0, 1)]
    npt.assert_almost_equal(
        stratum_0001.loc[[27, 43, 48, 52]].values,
        [0.35738173, 0.76415714, 1.26635373, 1.26635373],
        decimal=2,
    )
def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
    """Compare coefficients (normalize=False) on kidney-transplant data with published values.

    Reference: example 8.3 in Survival Analysis by John P. Klein and
    Melvin L. Moeschberger, Second Edition.
    """
    published_coefs = np.array([[0.1596, 0.2484, 0.6567]])

    columns = ['time', 'death', 'black_male', 'white_male', 'black_female']
    kidney = load_kidney_transplant(usecols=columns)

    fitter = CoxPHFitter(normalize=False)
    fitter.fit(kidney, duration_col='time', event_col='death')

    computed_coefs = fitter.hazards_.values
    npt.assert_array_almost_equal(computed_coefs, published_coefs, decimal=4)
def test_output_against_R(self, rossi):
    """Compare unpenalized coefficients (normalize=False) on rossi with R output."""
    # from http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf
    # Link is now broken, but this is the code:
    #
    # rossi <- read.csv('.../lifelines/datasets/rossi.csv')
    # mod.allison <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio,
    #     data=rossi)
    # cat(round(mod.allison$coefficients, 4), sep=", ")
    r_coefficients = np.array(
        [[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]
    )

    fitter = CoxPHFitter(normalize=False)
    fitter.fit(rossi, duration_col='week', event_col='arrest')
    npt.assert_array_almost_equal(fitter.hazards_.values, r_coefficients, decimal=3)
def test_penalized_output_against_R(self):
    """Compare ridge-penalized coefficients on load_rossi() data with R output."""
    # R code:
    #
    # rossi <- read.csv('.../lifelines/datasets/rossi.csv')
    # mod.allison <- coxph(Surv(week, arrest) ~ ridge(fin, age, race, wexp, mar, paro, prio,
    #                                                 theta=1.0, scale=FALSE), data=rossi)
    # cat(round(mod.allison$coefficients, 4), sep=", ")
    r_coefficients = np.array(
        [[-0.3641, -0.0580, 0.2894, -0.1496, -0.3837, -0.0822, 0.0913]]
    )

    rossi_data = load_rossi()
    fitter = CoxPHFitter(normalize=False, penalizer=1.0)
    fitter.fit(rossi_data, duration_col='week', event_col='arrest')
    npt.assert_array_almost_equal(fitter.hazards_.values, r_coefficients, decimal=3)
def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
    """Compare coefficients on kidney-transplant data with published values.

    Reference: example 8.3 in Survival Analysis by John P. Klein and
    Melvin L. Moeschberger, Second Edition.
    """
    published_coefs = np.array([[0.1596, 0.2484, 0.6567]])

    columns = ['time', 'death', 'black_male', 'white_male', 'black_female']
    kidney = load_kidney_transplant(usecols=columns)

    fitter = CoxPHFitter()
    fitter.fit(kidney, duration_col='time', event_col='death')

    computed_coefs = fitter.hazards_.values
    npt.assert_array_almost_equal(computed_coefs, published_coefs, decimal=4)
def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
    """Compare stratified baseline cumulative hazards with R output.

    R code used for the reference values:

        > library(survival)
        > rossi = read.csv('.../lifelines/datasets/rossi.csv')
        > r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
            paro, mar, wexp) + prio, data = rossi)
        > basehaz(r, centered=TRUE)
    """
    cp = CoxPHFitter()
    cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'])

    # `.ix` was removed from pandas; `.loc` performs the same lookup by label.
    # NOTE(review): assumes the hazard index holds these time points as labels - confirm.
    stratum_0000 = cp.baseline_cumulative_hazard_[(0, 0, 0, 0)]
    npt.assert_almost_equal(
        stratum_0000.loc[[14, 35, 37, 43, 52]].values,
        [0.076600555, 0.169748261, 0.272088807, 0.396562717, 0.396562717],
        decimal=2,
    )

    stratum_0001 = cp.baseline_cumulative_hazard_[(0, 0, 0, 1)]
    npt.assert_almost_equal(
        stratum_0001.loc[[27, 43, 48, 52]].values,
        [0.095499001, 0.204196905, 0.338393113, 0.338393113],
        decimal=2,
    )
def test_summary(self, rossi):
    """The summary frame must expose every expected statistic column."""
    fitter = CoxPHFitter()
    fitter.fit(rossi, duration_col='week', event_col='arrest')
    summary_frame = fitter.summary

    required_columns = ('coef', 'exp(coef)', 'se(coef)', 'z', 'p', 'lower 0.95', 'upper 0.95')
    assert all(name in summary_frame.columns for name in required_columns)
def test_strata_against_r_output(self, rossi):
    """Compare stratified coefficients and log-likelihood (normalize=False) with R.

    R code used for the reference values:

        > r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
            paro, mar, wexp) + prio, data = rossi)
        > r
        > r$loglik
    """
    r_log_likelihood = -436.9339

    fitter = CoxPHFitter(normalize=False)
    fitter.fit(
        rossi,
        'week',
        'arrest',
        strata=['race', 'paro', 'mar', 'wexp'],
        include_likelihood=True,
    )

    npt.assert_almost_equal(fitter.summary['coef'].values, [-0.335, -0.059, 0.100], decimal=3)
    # The log-likelihood is accepted within a 1% relative error.
    relative_error = abs(fitter._log_likelihood - r_log_likelihood) / 436.9339
    assert relative_error < 0.01
def test_warning_is_raised_if_df_has_a_near_constant_column(self, rossi):
    """Fitting data with a constant column must emit exactly one RuntimeWarning.

    A column ``'constant'`` filled with 1.0 is added to ``rossi``; the recorded
    warning must be a ``RuntimeWarning`` whose message mentions "variance".
    """
    cox = CoxPHFitter()
    rossi['constant'] = 1.0

    with warnings.catch_warnings(record=True) as w:
        # Record every warning, including ones that would otherwise be shown only once.
        warnings.simplefilter("always")
        try:
            cox.fit(rossi, 'week', 'arrest')
        except Exception:
            # Only the warning is under test; a failing fit is tolerated here.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
            pass
        assert len(w) == 1
        assert issubclass(w[-1].category, RuntimeWarning)
        assert "variance" in str(w[-1].message)
def test_prediction_methods_respect_index(self, data_pred2):
    """Predictions must keep the index of the input frame.

    The input is the first rows of ``data_pred2`` (labels up to and including 3)
    sorted by descending index, so every prediction must come back indexed
    ``[3, 2, 1, 0]``.
    """
    # `.ix` was removed from pandas; `.loc[:3]` is the label-based, end-inclusive slice.
    # NOTE(review): assumes data_pred2 has an integer index starting at 0 - the
    # expected index below implies it, but confirm against the fixture.
    x = data_pred2[['x1', 'x2']].loc[:3].sort_index(ascending=False)
    expected_index = pd.Index(np.array([3, 2, 1, 0]))

    cph = CoxPHFitter()
    cph.fit(data_pred2, duration_col='t', event_col='E')
    npt.assert_array_equal(cph.predict_partial_hazard(x).index, expected_index)
    npt.assert_array_equal(cph.predict_percentile(x).index, expected_index)
    npt.assert_array_equal(cph.predict_expectation(x).index, expected_index)

    aaf = AalenAdditiveFitter()
    aaf.fit(data_pred2, duration_col='t', event_col='E')
    npt.assert_array_equal(aaf.predict_percentile(x).index, expected_index)
    npt.assert_array_equal(aaf.predict_expectation(x).index, expected_index)
def test_strata_against_R_output(self, rossi):
    """Compare stratified coefficients and log-likelihood with R output.

    R code used for the reference values:

        > library(survival)
        > rossi = read.csv('.../lifelines/datasets/rossi.csv')
        > r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
            paro, mar, wexp) + prio, data = rossi)
        > r$loglik
    """
    r_log_likelihood = -436.9339

    fitter = CoxPHFitter()
    fitter.fit(
        rossi,
        'week',
        'arrest',
        strata=['race', 'paro', 'mar', 'wexp'],
        include_likelihood=True,
    )

    npt.assert_almost_equal(fitter.summary['coef'].values, [-0.335, -0.059, 0.100], decimal=3)
    # The log-likelihood is accepted within a 1% relative error.
    relative_error = abs(fitter._log_likelihood - r_log_likelihood) / 436.9339
    assert relative_error < 0.01
def test_fit_methods_can_accept_optional_event_col_param(self):
    """fit() must honour event_col when given and use all-ones events otherwise.

    The same two checks run for AalenAdditiveFitter and then CoxPHFitter.
    """
    data = load_regression_dataset()
    all_observed = np.ones(data.shape[0])

    for fitter_class in (AalenAdditiveFitter, CoxPHFitter):
        fitter = fitter_class()

        # With event_col, the stored events must equal the 'E' column as booleans.
        fitter.fit(data, 'T', event_col='E')
        assert_series_equal(
            fitter.event_observed.sort_index(),
            data['E'].astype(bool),
            check_names=False,
        )

        # Without event_col, every row must be recorded as observed.
        fitter.fit(data, 'T')
        npt.assert_array_equal(fitter.event_observed.values, all_observed)
def test_survival_prediction_is_the_same_indp_of_scale(self, regression_dataset):
    """Multiplying the covariates by 10 must not change the predicted survival function.

    Two models are fitted, one on the original data and one on the scaled
    copy; the survival functions predicted for row 0 of each must be equal.
    """
    covariates = ['var1', 'var2', 'var3']

    df = regression_dataset.copy()
    df_scaled = regression_dataset.copy()
    df_scaled[covariates] = df_scaled[covariates] * 10.0

    cp1 = CoxPHFitter()
    cp1.fit(df, event_col='E', duration_col='T')

    cp2 = CoxPHFitter()
    cp2.fit(df_scaled, event_col='E', duration_col='T')

    # `.ix` was removed from pandas; `.loc[[0]]` selects the row labelled 0 as a frame.
    # NOTE(review): assumes regression_dataset has an integer index containing 0 - confirm.
    assert_frame_equal(
        cp1.predict_survival_function(df.loc[[0]][covariates]),
        cp2.predict_survival_function(df_scaled.loc[[0]][covariates]),
    )
def test_data_normalization(self, data_pred2):
    """Concordance on the stored training data must equal concordance on the raw input."""
    # During fit, CoxPH copies the training data and normalizes it.
    # Later predictions should be normalized in the same way, and the
    # internal training set should not be kept in a normalized state.
    fitter = CoxPHFitter(normalize=True)
    fitter.fit(data_pred2, duration_col='t', event_col='E')

    # Concordance computed from the fitter's own stored training set.
    stored_scores = -fitter.predict_partial_hazard(fitter.data).values
    concordance_stored = concordance_index(fitter.durations, stored_scores, fitter.event_observed)

    # Concordance computed from the original frame; it should be normalized identically.
    raw_scores = -fitter.predict_partial_hazard(data_pred2[['x1', 'x2']]).values
    concordance_raw = concordance_index(data_pred2['t'], raw_scores, data_pred2['E'])

    assert concordance_raw == concordance_stored
def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
    """Partial hazards must match for DataFrame and array input, with and without normalization."""
    # First without normalization.
    fitter = CoxPHFitter(normalize=False)
    fitter.fit(data_pred2, 't', 'E')

    covariates = data_pred2[fitter.data.columns]
    from_frame = fitter.predict_partial_hazard(covariates)
    # A NumPy array should return the same result.
    from_array = fitter.predict_partial_hazard(np.array(covariates))
    assert np.all(from_frame == from_array)

    # Now with normalization, reusing the same covariate frame.
    fitter = CoxPHFitter(normalize=True)
    fitter.fit(data_pred2, 't', 'E')

    from_frame = fitter.predict_partial_hazard(covariates)
    from_array = fitter.predict_partial_hazard(np.array(covariates))
    assert np.all(from_frame == from_array)
def test_cox_ph_prediction_monotonicity(self, data_pred2):
    """All predict methods must yield the same concordance index.

    Concordance-wise, the prediction methods should be monotonic versions of
    one another unless numerical factors interfere. Checked for both
    normalize=True and normalize=False.
    """
    durations = data_pred2['t']
    events = data_pred2['E']
    covariates = data_pred2[['x1', 'x2']]

    for normalize in (True, False):
        failure_message = (
            "Predict methods should get the same concordance when {}normalizing"
            .format('' if normalize else 'not ')
        )

        fitter = CoxPHFitter(normalize=normalize)
        fitter.fit(data_pred2, duration_col='t', event_col='E')

        # The partial hazard is the baseline the other methods are compared with.
        partial_hazard_scores = -fitter.predict_partial_hazard(covariates).values
        concordance_hazard = concordance_index(durations, partial_hazard_scores, events)

        median_scores = fitter.predict_median(covariates).ravel()
        concordance_median = concordance_index(durations, median_scores, events)
        assert concordance_hazard == concordance_median, failure_message

        expectation_scores = fitter.predict_expectation(covariates).ravel()
        concordance_expectation = concordance_index(durations, expectation_scores, events)
        assert concordance_hazard == concordance_expectation, failure_message
def test_concordance_index_fast_is_same_as_slow():
    """slow_cindex and fast_cindex must agree on random data and on a fitted rossi model."""
    n_samples = 100
    times = np.random.normal(size=n_samples)
    predictions = np.random.normal(size=n_samples)
    censoring = np.random.choice([0, 1], size=n_samples)
    zeros = np.zeros_like(times)

    # Hard to imagine these failing.
    assert slow_cindex(times, zeros, censoring) == fast_cindex(times, zeros, censoring)
    assert slow_cindex(times, times, censoring) == fast_cindex(times, times, censoring)
    # This is the real test though.
    assert slow_cindex(times, predictions, censoring) == fast_cindex(times, predictions, censoring)

    # Repeat the comparison on values taken from a model fitted on rossi.
    fitter = CoxPHFitter()
    rossi_data = load_rossi()
    fitter.fit(rossi_data, duration_col='week', event_col='arrest')

    times = fitter.durations.values.ravel()
    predictions = -fitter.predict_partial_hazard(fitter.data).values.ravel()
    events = fitter.event_observed.values.ravel()
    assert slow_cindex(times, predictions, events) == fast_cindex(times, predictions, events)
def test_strata_removes_variable_from_summary_output(self):
    """A column used as strata must not appear in the summary index."""
    rossi_data = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi_data, 'week', 'arrest', strata=['race'])

    summary_rows = fitter.summary.index
    assert 'race' not in summary_rows
# Fit and plot the Kaplan-Meier estimate over all rows.
# NOTE(review): kaplen_meier, time_of_event, time, event, df and plt are defined
# earlier in the script, outside this view.
kaplen_meier.fit(time_of_event, timeline=time, event_observed=event, label='All patients')
kaplen_meier.plot()
plt.show()

# Stratify by congestive heart complications: boolean mask of rows where chf == 1.
history = df['chf'] == 1;
kaplen_meier = KaplanMeierFitter()
kaplen_meier.fit(time_of_event[history], timeline=time, event_observed=event[history], label='Congestive heart complications')
ax = kaplen_meier.plot()
# Refit on the complementary rows and draw onto the same axes for comparison.
kaplen_meier.fit(time_of_event[~history], timeline=time, event_observed=event[~history], label='No congestive heart complications')
kaplen_meier.plot(ax=ax, c="b")
plt.show()

# Cox proportional hazard model with 'lenfol' as duration and 'fstat' as event.
ph_data = df[["fstat", "lenfol", "bmi", "age"]]
ph = CoxPHFitter()
ph.fit(ph_data, 'lenfol', event_col='fstat')
ph.print_summary()
print(ph.baseline_hazard_.head())

# Use predict_survival_function to get survival probabilities for a few rows,
# keeping only the covariate columns (everything except duration and event).
# NOTE(review): `.ix` was removed in pandas 1.0; `.loc[23:25]` is the likely
# replacement if df has an integer index - confirm before changing.
x = ph_data[ph_data.columns.difference(['lenfol', 'fstat'])].ix[23:25]
print(x)
ph.predict_survival_function(x).plot()
plt.show()
def test_fit_method(self, data_nus):
    """Fit with normalize=False and check the first coefficient against -0.0335."""
    cf = CoxPHFitter(normalize=False)
    cf.fit(data_nus, duration_col='t', event_col='E')

    # `.ix` was removed from pandas; read the first row / first column positionally.
    # NOTE(review): assumes `.ix[0][0]` addressed the first cell of hazards_ - confirm.
    first_coefficient = cf.hazards_.iloc[0, 0]
    assert np.abs(first_coefficient - -0.0335) < 0.0001
def test_input_column_order_is_equal_to_output_hazards_order(self):
    """The columns of hazards_ must follow the covariate order of the input frame."""
    rossi = load_rossi()
    cp = CoxPHFitter()
    expected = ['fin', 'age', 'race', 'wexp', 'mar', 'paro', 'prio']

    # 'week' is the duration and 'arrest' the event indicator, as in every other
    # rossi fit in this file; the two keyword arguments were previously swapped.
    cp.fit(rossi, duration_col='week', event_col='arrest')
    assert list(cp.hazards_.columns) == expected