def test_from_array_int_time(surv_arrays):
    """Integer-cast times given to ``Surv.from_arrays`` must equal a record array with a float ``time`` field."""
    event, time = surv_arrays
    # Augmented assignments: the fixture's own object becomes (time + 1) ** 2.
    time += 1
    time *= time

    event_flags = event.astype(bool)
    int_time = time.astype(int)

    record_dtype = [('event', bool), ('time', float)]
    expected = numpy.empty(shape=100, dtype=record_dtype)
    expected['event'] = event_flags
    expected['time'] = int_time

    y = Surv.from_arrays(event_flags, int_time)
    assert_array_equal(y, expected)
def test_from_array_with_one_name_2(surv_arrays):
    """Overriding only ``name_time`` renames the time field and leaves the event field called ``event``."""
    event, time = surv_arrays
    event_flags = event.astype(bool)

    record_dtype = [('event', bool), ('survival_time', float)]
    expected = numpy.empty(shape=100, dtype=record_dtype)
    expected['event'] = event_flags
    expected['survival_time'] = time

    y = Surv.from_arrays(event_flags, time, name_time='survival_time')
    assert_array_equal(y, expected)
def test_simple(simple_data_km):
    """Compare the functional and the estimator-class Kaplan-Meier results with the fixture's reference curve."""
    time, event, true_x, true_y = simple_data_km

    # Functional interface.
    km_time, km_prob = kaplan_meier_estimator(event, time)
    assert_array_equal(km_time, true_x)
    assert_array_almost_equal(km_prob, true_y)

    # Estimator interface; the first entry of the fitted attributes is skipped
    # before comparing with the reference values.
    fitted = SurvivalFunctionEstimator().fit(Surv.from_arrays(event, time))
    assert_array_equal(fitted.unique_time_[1:], true_x)
    assert_array_almost_equal(fitted.prob_[1:], true_y)

    assert_array_almost_equal(fitted.predict_proba(true_x), true_y)
def test_from_array_with_names(self):
    """Both ``name_event`` and ``name_time`` are used as field names of the resulting record array."""
    event, time = self.arrays
    event_flags = event.astype(bool)

    record_dtype = [('death', bool), ('survival_time', float)]
    expected = numpy.empty(shape=100, dtype=record_dtype)
    expected['death'] = event_flags
    expected['survival_time'] = time

    y = Surv.from_arrays(
        event_flags,
        time,
        name_time='survival_time',
        name_event='death',
    )
    assert_array_equal(y, expected)
def test_dropout_rate(self):
    """``fit`` raises ``ValueError`` for a ``dropout_rate`` of -0.1 and of 1.2."""
    model = self.ESTIMATOR(dropout_rate=-0.1)
    x = numpy.arange(100).reshape(5, 20)
    y = Surv.from_arrays([False, False, True, True, False], [12, 14, 6, 9, 1])

    # Below the lower bound.
    with self.assertRaisesRegex(
            ValueError,
            r"dropout_rate must be within \[0; 1\[, but was -0.1"):
        model.fit(x, y)

    # Above the upper bound.
    model.set_params(dropout_rate=1.2)
    with self.assertRaisesRegex(
            ValueError,
            r"dropout_rate must be within \[0; 1\[, but was 1.2"):
        model.fit(x, y)
def traditional_surv_analysis(datas, opts):
    """Fit a fixed set of survival estimators and score them on train and test data.

    ``datas`` is indexed by split name; the "train" and "test" entries must
    expose ``xs`` and ``ys`` with a ``numpy()`` method. A "val" entry, when
    present, is concatenated onto the training split. Column 0 of ``ys`` is
    used as the time and column 1 (cast to bool) as the event indicator.
    ``opts.random_seed`` is passed as ``random_state`` to every estimator
    except ``CoxPHSurvivalAnalysis``.

    Returns ``(train_scores, test_scores)``, two dicts keyed by estimator name
    that hold the result of each estimator's ``score`` method.
    """
    def as_structured(labels):
        # (time, event) columns -> structured survival array
        return Surv.from_arrays(labels[:, 1].astype("bool"), labels[:, 0])

    # tidy data as ndarray
    train_split, test_split = datas["train"], datas["test"]
    train_X = train_split.xs.numpy()
    train_Y = train_split.ys.numpy()
    test_X = test_split.xs.numpy()
    test_Y = test_split.ys.numpy()
    if "val" in datas.keys():
        val_split = datas["val"]
        train_X = np.concatenate([train_X, val_split.xs])
        train_Y = np.concatenate([train_Y, val_split.ys])

    # construct structured array
    train_Y = as_structured(train_Y)
    test_Y = as_structured(test_Y)

    # construct estimators
    seed = opts.random_seed
    estimators = {
        "CoxPH": CoxPHSurvivalAnalysis(),
        "CGBSA": CGBSA(n_estimators=500, random_state=seed),
        "GBSA": GBSA(n_estimators=500, random_state=seed),
        "FKSVM": FKSVM(random_state=seed),
        "FSVM": FSVM(random_state=seed),
    }

    # training: every estimator is fitted before any of them is scored
    for label, fitted in estimators.items():
        print("%s training." % label)
        fitted.fit(train_X, train_Y)

    # evaluation
    train_scores, test_scores = {}, {}
    for label, fitted in estimators.items():
        print("%s evaluation." % label)
        train_scores[label] = fitted.score(train_X, train_Y)
        test_scores[label] = fitted.score(test_X, test_Y)

    return train_scores, test_scores
def test_n_estimators(self):
    """``fit`` raises ``ValueError`` when ``n_estimators`` is 0 or -1."""
    model = self.ESTIMATOR(n_estimators=0)
    x = numpy.arange(100).reshape(5, 20)
    y = Surv.from_arrays([False, False, True, True, False], [12, 14, 6, 9, 1])

    with self.assertRaisesRegex(
            ValueError, "n_estimators must be greater than 0 but was 0"):
        model.fit(x, y)

    model.set_params(n_estimators=-1)
    with self.assertRaisesRegex(
            ValueError, "n_estimators must be greater than 0 but was -1"):
        model.fit(x, y)
def test_subsample(self):
    """``fit`` raises ``ValueError`` when ``subsample`` is 0 or 1.2."""
    model = self.ESTIMATOR(subsample=0)
    x = numpy.arange(100).reshape(5, 20)
    y = Surv.from_arrays([False, False, True, True, False], [12, 14, 6, 9, 1])

    with self.assertRaisesRegex(
            ValueError, "subsample must be in ]0; 1] but was 0"):
        model.fit(x, y)

    model.set_params(subsample=1.2)
    with self.assertRaisesRegex(
            ValueError, "subsample must be in ]0; 1] but was 1.2"):
        model.fit(x, y)
def toy_data():
    """Return six two-feature samples and a survival array whose event field is named ``status``.

    Times are drawn from an exponential distribution (scale 8) with a
    ``RandomState`` seeded with 0 and arranged in ascending order.
    """
    x = numpy.array([
        [1., 1.],
        [10.2, 15.],
        [20., 5.],
        [40, 30],
        [45, 21],
        [50, 36],
    ])
    n_samples = x.shape[0]

    rng = numpy.random.RandomState(0)
    sorted_times = numpy.sort(rng.exponential(scale=8, size=n_samples))

    status = [True, True, False, True, False, False]
    y = Surv.from_arrays(status, sorted_times, name_event='status')
    return x, y
def test_regression_not_supported(self):
    """With ``rank_ratio=0`` the 'simple' and 'PRSVM' optimizers must be rejected by ``fit``."""
    x = numpy.zeros((100, 10))
    y = Surv.from_arrays(numpy.ones(100, dtype=bool), numpy.arange(100, dtype=float))

    ssvm = FastSurvivalSVM(rank_ratio=0, optimizer='simple')
    with self.assertRaisesRegex(
            ValueError,
            "optimizer 'simple' does not implement regression objective"):
        ssvm.fit(x, y)

    ssvm.set_params(optimizer='PRSVM')
    with self.assertRaisesRegex(
            ValueError,
            "optimizer 'PRSVM' does not implement regression objective"):
        ssvm.fit(x, y)
def test_simple(self):
    """Fit Coxnet with ``l1_ratio=1.0`` on five samples and compare alphas and coefficients with reference values."""
    y = Surv.from_arrays(
        [True, False, False, True, False],
        [7., 8., 11., 11., 23.],
        name_event="D",
        name_time="Y",
    )
    features = pandas.DataFrame({
        "F1": [1, 1, 1, 0, 0],
        "F2": [23, 43, 54, 75, 67],
        "F3": [120, 98, 78, 91, 79],
        "F4": [0.123, 0.541, 0.784, 0.846, 0.331]
    })

    coxnet = CoxnetSurvivalAnalysis(l1_ratio=1.0)
    coxnet.fit(features.values, y)

    expected_alphas = numpy.array([
        7.02666666666667, 6.40243696630484, 5.83366211207401,
        5.31541564828386, 4.84320877198972, 4.41295145312887,
        4.02091700863675, 3.66370982370111, 3.3382359405709,
        3.04167626017436, 2.77146212443153, 2.52525306776672,
        2.30091654511542, 2.09650946083909, 1.91026133856035,
        1.74055898614351, 1.5859325229961, 1.44504264866632,
        1.31666904246323, 1.19969979362274, 1.09312177046848,
        0.996011845149902, 0.907528897950459, 0.826906531910992,
        0.753446434665921, 0.686512329995589, 0.625524466706047,
        0.569954597101554, 0.519321401555745, 0.473186319551291,
        0.431149751078499, 0.392847595491192, 0.357948097841098,
        0.326148975375191, 0.297174799307102, 0.270774609184727,
        0.24671973919085, 0.22480183754923, 0.204831061881182,
        0.186634434881721, 0.170054346072885, 0.154947186657187,
        0.141182105646904, 0.128639876495421, 0.117211864413924,
        0.106799085428826, 0.0973113490299429, 0.0886664769834391,
        0.0807895915432809, 0.0736124668960205, 0.0670729382214382
    ])
    # Only the leading part of the fitted path is compared.
    n_alphas = len(expected_alphas)

    # FIXME
    assert_array_almost_equal(expected_alphas, coxnet.alphas_[:n_alphas])

    coef = pandas.DataFrame(coxnet.coef_[:, :n_alphas], dtype=float)
    expected_coef = pandas.read_csv(SIMPLE_COEF_FILE, header=None, skiprows=1)
    assert_columns_almost_equal(coef, expected_coef)
def test_sample_weight(self):
    """``fit`` must reject a ``sample_weight`` whose length differs from the number of samples.

    The model keeps its default parameters for both checks, so the weight
    vector is the only invalid input and the assertions do not depend on the
    order in which ``fit`` validates its arguments.
    """
    model = self.ESTIMATOR()
    x = numpy.arange(100).reshape(5, 20)
    y = Surv.from_arrays([False, False, True, True, False], [12, 14, 6, 9, 1])

    # Too few weights: 3 for 5 samples.
    self.assertRaisesRegex(
        ValueError,
        r"Found input variables with inconsistent numbers of samples: \[5, 3\]",
        model.fit, x, y, [2, 3, 4])

    # Too many weights: 8 for 5 samples.
    self.assertRaisesRegex(
        ValueError,
        r"Found input variables with inconsistent numbers of samples: \[5, 8\]",
        model.fit, x, y, [2, 4, 5, 6, 7, 1, 2, 7])
def test_rank_ratio_out_of_bounds(self):
    """``fit`` raises ``ValueError`` for ``rank_ratio`` values -1, 1.2, NaN and infinity."""
    x = numpy.zeros((100, 10))
    y = Surv.from_arrays(numpy.ones(100, dtype=bool), numpy.arange(100, dtype=float))
    message = r"rank_ratio must be in \[0; 1\]"

    # The first invalid value goes through the constructor ...
    ssvm = FastSurvivalSVM(rank_ratio=-1)
    with self.assertRaisesRegex(ValueError, message):
        ssvm.fit(x, y)

    # ... the remaining ones are applied to the same instance via set_params.
    for invalid_ratio in (1.2, numpy.nan, numpy.inf):
        ssvm.set_params(rank_ratio=invalid_ratio)
        with self.assertRaisesRegex(ValueError, message):
            ssvm.fit(x, y)
def uno_c_data(request, whas500_pred):
    """Yield ``(y_train, y_test, estimate, expected, tau)`` for the scenario named by ``request.param``.

    Scenarios that define a single survival array use it as both the training
    and the test data. ``tau`` stays ``None`` unless the scenario sets it. An
    unrecognised parameter yields ``None`` for every element.
    """
    scenario = request.param

    # Literals shared by several scenarios.
    base_time = (1, 5, 6, 10, 11, 34, 45, 46, 50, 56)
    base_event = (0, 1, 1, 0, 1, 0, 1, 0, 0, 1)
    train_time = (2, 4, 6, 8, 10, 11, 15, 19)
    train_event = (False, True, False, True, False, False, False, False)

    y = y_train = y_test = None
    estimate = expected = tau = None

    if scenario == 'no_ties':
        y = Surv.from_arrays(
            event=numpy.array(base_event, dtype=bool),
            time=base_time)
        estimate = (5, 8, 11, 19, 34, 12, 3, 9, 12, 20)
        expected = (0.347890360332615, 8, 15, 0, 0)
    elif scenario == 'tied_risk_1':
        y = Surv.from_arrays(
            time=base_time,
            event=numpy.array(base_event, dtype=bool))
        estimate = (5, 8, 11, 11, 34, 12, 3, 9, 12, 20)
        expected = (0.365629810028969, 8, 14, 1, 0)
    elif scenario == 'tied_risk_2':
        y = Surv.from_arrays(
            time=base_time,
            event=numpy.array(base_event, dtype=bool))
        estimate = (5, 8, 11, 11, 34, 12, 11, 9, 12, 20)
        expected = (0.387865723332956, 7, 14, 2, 0)
    elif scenario == 'truncated_1':
        y_train = Surv.from_arrays(time=train_time, event=train_event)
        y_test = Surv.from_arrays(
            time=(1, 3, 5, 8, 12, 13),
            event=(True, False, False, True, True, True))
        estimate = (5, 8, 13, 11, 9, 4)
        expected = (0.7543736528146774, 4, 4, 0, 0)
        tau = 19
    elif scenario == 'truncated_2':
        y = Surv.from_arrays(
            time=base_time,
            event=numpy.array((0, 1, 1, 0, 1, 0, 1, 1, 1, 1), dtype=bool))
        estimate = (5, 8, 11, 19, 34, 12, 3, 9, 12, 18)
        expected = (0.347890361949191, 8, 18, 0, 0)
        tau = 45.25
    elif scenario == 'last_time_censored':
        y_train = Surv.from_arrays(time=train_time, event=train_event)
        y_test = Surv.from_arrays(
            time=(1, 3, 5, 7, 12, 13, 20),
            event=(True, False, False, True, True, False, False))
        estimate = (5, 8, 13, 11, 9, 7, 4)
        expected = (0.8126567565914234, 6, 5, 0, 0)
    elif scenario == 'tied_event':
        y = Surv.from_arrays(
            event=[False, True, False, True, True, False, True, False, False, True],
            time=[1, 5, 6, 11, 11, 34, 45, 45, 50, 55])
        estimate = (5, 8, 11, 19, 34, 12, 3, 9, 12, 18)
        expected = (0.4036321031048623, 11, 10, 0, 1)
    elif scenario == 'tied_event_and_time':
        y = Surv.from_arrays(
            event=[True, False, False, False, True, False, True,
                   True, False, False, False, True, True],
            time=[34, 11, 11, 5, 1, 89, 13, 45, 7, 13, 9, 13, 90])
        estimate = (1, 19, 13, 13, 15, 14, 19, 23, 11, 10, 11, 1, 18)
        expected = (0.46795357052737824, 14, 12, 1, 2)
    elif scenario == 'whas500':
        event, time, estimate = whas500_pred
        y = Surv.from_arrays(event, time)
        expected = (0.7929275009049014, 57849, 17300, 0, 14)

    # Scenarios without a dedicated split reuse ``y`` for both roles.
    if y_train is None:
        y_train = y
    if y_test is None:
        y_test = y

    yield y_train, y_test, estimate, expected, tau
def test_from_array_event_value_wrong_5(surv_arrays):
    """An event array holding ``0 .. n-1`` must be rejected as non-binary."""
    event, time = surv_arrays
    n_samples = event.shape[0]
    non_binary_event = numpy.arange(n_samples)

    with pytest.raises(ValueError, match="event indicator must be binary"):
        Surv.from_arrays(non_binary_event, time)
def test_from_array_event_value_wrong_4(surv_arrays):
    """Writing the value 3 into the event indicator must make ``Surv.from_arrays`` raise."""
    event, time = surv_arrays
    # Item assignment on the fixture's own object.
    event[1] = 3

    with pytest.raises(ValueError, match="event indicator must be binary"):
        Surv.from_arrays(event, time)
def sample_gb_class(request):
    """Return ``(request.param, x, y)`` with a 5x20 feature matrix and five survival records."""
    features = numpy.arange(100).reshape(5, 20)
    events = [False, False, True, True, False]
    times = [12, 14, 6, 9, 1]
    return request.param, features, Surv.from_arrays(events, times)
def fake_data():
    """Return a 100x11 standard-normal feature matrix and 100 all-event records with times 1.0 to 100.0."""
    n_samples, n_features = 100, 11
    features = numpy.random.randn(n_samples, n_features)

    all_events = numpy.ones(n_samples, dtype=bool)
    times = numpy.arange(1, n_samples + 1, dtype=float)
    return features, Surv.from_arrays(all_events, times)
def output_bootstrap(model, n_iterations, df_train, data_train, y_train,
                     df_test, name):
    """
    Compute the output of the model on the bootstrapped test set

    # Arguments
        model: neural network model trained with final parameters.
        n_iterations: number of bootstrap iterations
        df_train: training dataset (not read by this function)
        data_train: two columns dataset with survival time and censoring
            status for training samples; passed to the AUC and C-index
            functions
        y_train: survival time; its 's' column supplies the time points for
            models other than "Cox-CC", "CoxTime" and "DeepHit"
        df_test: test dataset; must contain the 'surv_test' and 'cen_test'
            columns
        name: name of the model

    # Returns
        results_final: one row per metric ('auc5', 'auc10', 'unoc5',
            'unoc10', i.e. AUC and Uno C-index at 5 and 10 years) with the
            columns 'mean', 'ci95_lo', 'ci95_hi', 'std' and 'count' computed
            over the bootstrap iterations
    """
    metric_names = ['auc5', 'auc10', 'unoc5', 'unoc10']
    summary_names = ['mean', 'ci95_lo', 'ci95_hi', 'std', 'count']

    if name in ("CoxTime", "Cox-CC"):
        model.compute_baseline_hazards()

    # Collect one dict per iteration and build the frame once afterwards;
    # DataFrame.append no longer exists in pandas >= 2.0.
    bootstrap_rows = []
    for i in range(n_iterations):
        print(i)
        test_boot = resample(df_test, n_samples=len(df_test), replace=True)
        x_test_boot = test_boot.drop(['surv_test', 'cen_test'], axis=1)
        duration_test_b = test_boot['surv_test'].values
        event_test_b = test_boot['cen_test'].values
        data_test_b = skSurv.from_arrays(event=event_test_b,
                                         time=duration_test_b)

        if name in ("Cox-CC", "CoxTime", "DeepHit"):
            surv = model.predict_surv_df(
                np.array(x_test_boot, dtype='float32'))
        else:
            # Stack the bootstrap sample once per distinct training time and
            # attach the matching time value as an extra column.
            n_picktime = int(y_train['s'].nunique())
            x_test_boot_all = pd.concat([x_test_boot] * n_picktime)
            time_test = pd.DataFrame(
                np.repeat(np.unique(y_train[['s']]), len(x_test_boot)))
            x_test_boot_all.reset_index(inplace=True, drop=True)
            x_test_boot_all = pd.concat([x_test_boot_all, time_test], axis=1)
            surv = make_predictions_pseudobs(model, y_train, x_test_boot_all,
                                             x_test_boot, name)

        # Columns: survival probability at the earliest bootstrap time, at 5
        # and at 10; only the last two are used below.
        prob_5_10 = pd.concat(
            [determine_surv_prob(surv, t)
             for t in (duration_test_b.min(), 5, 10)],
            axis=1)
        # Negated survival probability serves as the risk estimate.
        risk_5 = -prob_5_10.iloc[:, 1]
        risk_10 = -prob_5_10.iloc[:, 2]

        auc5 = float(
            cumulative_dynamic_auc(data_train, data_test_b, risk_5, 5)[0])
        auc10 = float(
            cumulative_dynamic_auc(data_train, data_test_b, risk_10, 10)[0])
        unoc5 = float(
            concordance_index_ipcw(data_train, data_test_b, risk_5, 5)[0])
        unoc10 = float(
            concordance_index_ipcw(data_train, data_test_b, risk_10, 10)[0])
        bootstrap_rows.append({
            'auc5': auc5,
            'auc10': auc10,
            'unoc5': unoc5,
            'unoc10': unoc10,
        })

    results_all = pd.DataFrame(bootstrap_rows,
                               columns=metric_names).astype(float)

    summary_rows = []
    for column in metric_names:
        scores = results_all[column]
        stats = scores.agg(['mean', 'count', 'std'])
        sorted_scores = np.sort(scores.to_numpy(), axis=None)
        n_scores = len(sorted_scores)
        if n_scores:
            # NOTE(review): the 5% / 95% order statistics bracket a 90%
            # interval even though the columns are called ci95 - confirm
            # which one is intended.
            ci95_lo = sorted_scores[int(0.05 * n_scores)]
            ci95_hi = sorted_scores[int(0.95 * n_scores)]
        else:
            # No bootstrap iterations were run: there is nothing to index.
            ci95_lo = ci95_hi = np.nan
        summary_rows.append({
            'mean': stats['mean'],
            'ci95_lo': ci95_lo,
            'ci95_hi': ci95_hi,
            'std': stats['std'],
            'count': stats['count'],
        })

    results_final = pd.DataFrame(summary_rows, columns=summary_names,
                                 index=metric_names)
    return results_final