def output_stats(model, surv, X_train, df_train, X_val, df_val):
    """
    Score predicted survival curves against the validation data

    # Arguments
        model: trained model (not used by this function).
        surv: predicted survival curves, passed to `determine_surv_prob`
            and `EvalSurv`.
        X_train: input variables of the training set (not used here).
        df_train: training dataset; columns 'status' and 'yy' are read.
        X_val: input variables of the validation set (not used here).
        df_val: validation dataset; columns 'status' and 'yy' are read.

    # Returns
        One-row DataFrame with columns 'unoc5' and 'unoc10' (Uno C-index
        truncated at 5 and 10) and 'ibs' (integrated Brier score).
    """
    val_time = df_val['yy']
    val_status = df_val['status']

    data_train = skSurv.from_arrays(event=df_train['status'],
                                    time=df_train['yy'])
    data_test = skSurv.from_arrays(event=val_status, time=val_time)

    def _uno_c(horizon):
        # Survival probability is negated so that a higher value means a
        # higher risk, which is what the concordance index expects.
        risk = np.array(-determine_surv_prob(surv, horizon))
        return concordance_index_ipcw(data_train, data_test, risk, horizon)[0]

    c5 = _uno_c(5)
    c10 = _uno_c(10)

    # The Brier score is integrated over 100 points between the 10th and
    # 90th percentile of the validation times.
    lower, upper = np.percentile(val_time, [10, 90])
    evaluator = EvalSurv(surv, np.array(val_time), np.array(val_status),
                         censor_surv='km')
    ibs = evaluator.integrated_brier_score(np.linspace(lower, upper, 100))

    res = pd.DataFrame([c5, c10, ibs]).T
    res.columns = ['unoc5', 'unoc10', 'ibs']
    return res
def compute_CI_scores(model, quantiles, DATA_te, DATA_tr, pretrain_state, risk=0):
    '''Compute Uno's (IPCW) concordance index from predicted CDF values.

    Inputs:
        model: trained model, forwarded to `predict_cdf`.
        quantiles: evaluation times forwarded to `predict_cdf`; the first
            four entries are also used as the truncation time `tau` of the
            corresponding concordance index.
        DATA_te: test data as a 3-tuple (features, times, events).
        DATA_tr: train data as a 3-tuple (features, times, events).
        pretrain_state: forwarded to `predict_cdf`.
        risk: unused; kept so existing callers keep working.

    Returns:
        Tuple of four concordance indices, one per entry of the first four
        predictions. All four are 0 when any of those predictions is empty.
    '''
    cdf_preds = [cdf.numpy()
                 for cdf in predict_cdf(model, DATA_te, quantiles, pretrain_state)]

    _, t_valid, e_valid = DATA_te
    _, t_train, e_train = DATA_tr

    def _structured(events, times):
        # Times are stored as float64. The previous int field silently
        # truncated fractional survival times (and discarded the float cast
        # that was applied just before).
        packed = np.empty(len(events), dtype=[('e', bool), ('t', 'float64')])
        packed['e'] = np.asarray(events).astype('bool')
        packed['t'] = np.asarray(times).astype('float64')
        return packed

    et_train = _structured(e_train, t_train)
    et_valid = _structured(e_valid, t_valid)

    if not all(cdf_preds[k].shape[0] > 0 for k in range(4)):
        return 0, 0, 0, 0

    # The CDF is negated before being passed as the risk estimate.
    return tuple(
        concordance_index_ipcw(et_train, et_valid, -cdf_preds[k],
                               tau=quantiles[k])[0]
        for k in range(4)
    )
def test_uno_c_not_1d(whas500_pred, dim):
    """A risk estimate tiled into `dim` columns must raise ValueError."""
    event, time, risk = whas500_pred

    # Turn the risk vector into a column and repeat it `dim` times.
    risk_2d = numpy.tile(risk[:, numpy.newaxis], (1, dim))
    y = Surv.from_arrays(event, time)

    expected_message = "Expected 1D array, got 2D array instead:"
    with pytest.raises(ValueError, match=expected_message):
        concordance_index_ipcw(y, y, risk_2d)
def _cause_specific_outcome(times, causes, cause):
    """Build a (Status, Survival_in_days) structured array where Status marks `cause`."""
    outcome = np.empty(len(times),
                       dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    outcome['Status'] = np.asarray(causes) == cause
    outcome['Survival_in_days'] = np.asarray(times)
    return outcome


def evaluate_performance(T_train, c_train, T_test, c_test, prediction, time_horizon,
                         num_causes=2, cause_names=None):
    """Print and return per-cause discrimination metrics.

    Causes are numbered 1..num_causes. For each cause the event indicator is
    `c == cause`, and `prediction[cause]` supplies the predictions.

    Args:
        T_train: training times.
        c_train: training cause labels.
        T_test: test times.
        c_test: test cause labels.
        prediction: indexable by cause number. `prediction[cause]` is passed
            to `concordance_index` as-is, and `1 - prediction[cause]` is
            passed to the IPCW concordance index and the dynamic AUC.
        time_horizon: single time at which the dynamic AUC is evaluated.
        num_causes: number of causes to evaluate.
        cause_names: display names, one per cause. Defaults to
            "Cause 1", "Cause 2", ... for `num_causes` causes.

    Returns:
        Tuple `(harrell_c_index, uno_c_index, dynamic_auc)` of lists with one
        entry per cause. A `dynamic_auc` entry is the string "-" when its
        computation raised ValueError.
    """
    if cause_names is None:
        # Generated from num_causes so any number of causes has a label.
        cause_names = ["Cause {}".format(k + 1) for k in range(num_causes)]

    harrell_c_index = []
    uno_c_index = []
    dynamic_auc = []

    for cause_idx in range(num_causes):
        cause = cause_idx + 1
        y_train = _cause_specific_outcome(T_train, c_train, cause)
        y_test = _cause_specific_outcome(T_test, c_test, cause)

        harrell_c_index.append(
            concordance_index(T_test, prediction[cause],
                              event_observed=(c_test == cause) * 1))

        # Truncate Uno's C at the largest training time.
        tau = max(y_train['Survival_in_days'])
        ci_tau = concordance_index_ipcw(y_train, y_test,
                                        1 - prediction[cause], tau=tau)[0]
        uno_c_index.append(ci_tau)

        try:
            dynamic_auc_val = cumulative_dynamic_auc(
                y_train, y_test, 1 - prediction[cause],
                times=[time_horizon])[0][0]
        except ValueError:
            print('*warning: exception while calculating dynamic_auc, dynamic_auc is not calculated*')
            dynamic_auc_val = "-"
            auc_text = "-"
        else:
            auc_text = '{:0.4f}'.format(dynamic_auc_val)
        dynamic_auc.append(dynamic_auc_val)

        print("--- Cause: {} -> [C-index: {:0.4f} ] [Dynamic AUC-ROC: {} ]".format(
            cause_names[cause_idx], ci_tau, auc_text))

    return harrell_c_index, uno_c_index, dynamic_auc
def output_simulations(surv, df_train, x_test, df_test, name):
    """
    Compute AUC and Uno C-index at the median test time

    # Arguments
        surv: predicted survival curves, passed to `determine_surv_prob`.
        df_train: training dataset; columns 'status' and 'yy' are read.
        x_test: simulated input variables (not used by this function).
        df_test: test dataset; columns 'status' and 'yy' are read.
        name: name of the model (not used by this function).

    # Returns
        results_test: one-row DataFrame with columns 't_med' (median test
        time), 'auc_med', 'unoc' and 'cens_rate' (percentage of test rows
        whose 'status' is not set).
    """
    data_train = skSurv.from_arrays(event=df_train['status'],
                                    time=df_train['yy'])
    data_test = skSurv.from_arrays(event=df_test['status'],
                                   time=df_test['yy'])
    cens_test = 100. - df_test['status'].sum() * 100. / df_test['status'].shape[0]

    t_med = np.percentile(data_test['time'], 50)

    # Negated survival probability serves as the risk score for both metrics.
    risk_med = -determine_surv_prob(surv, t_med)

    # The AUC comes back as an array with one entry per time point. Taking
    # the element explicitly avoids float() on a 1-element array, which is
    # deprecated since NumPy 1.25.
    auc_values = cumulative_dynamic_auc(data_train, data_test, risk_med, t_med)[0]
    auc_med = float(np.ravel(auc_values)[0])
    unoc = float(
        concordance_index_ipcw(data_train, data_test, risk_med, t_med)[0])

    results_test = pd.DataFrame({
        't_med': t_med,
        'auc_med': [auc_med],
        'unoc': [unoc],
        'cens_rate': [cens_test]
    })
    return results_test
def ipcw(self, F_train, F_test, T_train, T_test, survival_prob_valid):
    """Return one minus the IPCW concordance index, formatted with '%.5g'.

    F_* are cast to bool and T_* are stored in 32-bit integer fields before
    being passed, together with `survival_prob_valid` as the estimate, to
    `concordance_index_ipcw`. The result is a string.
    """
    def _pack(flags, times, flag_field, time_field):
        packed = np.zeros(len(flags),
                          dtype=[(flag_field, '?'), (time_field, 'i4')])
        packed[flag_field] = flags.astype('bool')
        packed[time_field] = times
        return packed

    train_outcome = _pack(F_train, T_train, 'F_train', 'T_train')
    test_outcome = _pack(F_test, T_test, 'F_test', 'T_test')

    concordance = concordance_index_ipcw(train_outcome, test_outcome,
                                         survival_prob_valid)[0]
    return '%.5g' % (1 - concordance)
def test_uno_c_all_censored():
    """IPCW and plain censored concordance must agree for this data.

    Every training sample is an event, and the test asserts that
    `concordance_index_ipcw` returns exactly what
    `concordance_index_censored` returns on the test data.
    """
    train_times = (2, 4, 6, 8, 10, 11, 15, 19)
    train_events = (True,) * 8
    y_train = Surv.from_arrays(time=train_times, event=train_events)

    test_times = (1, 3, 5, 7, 12, 13, 20)
    test_events = (True, False, False, True, True, False, False)
    y_test = Surv.from_arrays(time=test_times, event=test_events)

    estimate = (5, 8, 13, 11, 9, 7, 4)

    uno = concordance_index_ipcw(y_train, y_test, estimate)
    harrell = concordance_index_censored(y_test['event'], y_test['time'], estimate)
    assert uno == harrell
def calc_metrics(tr_t_, tr_y_, te_t_, te_y_, preds, eval_time):
    """Return (IPCW C-index, weighted Brier score) at `int(eval_time)`.

    The first column of `tr_y_`/`te_y_` supplies the status and the first
    column of `tr_t_`/`te_t_` the time of each row.
    """
    outcome_dtype = [('status', 'bool'), ('time', '<f8')]

    def _as_structured(status_frame, time_frame):
        pairs = list(zip(status_frame.iloc[:, 0], time_frame.iloc[:, 0]))
        return np.array(pairs, dtype=outcome_dtype)

    train_outcome = _as_structured(tr_y_, tr_t_)
    test_outcome = _as_structured(te_y_, te_t_)
    horizon = int(eval_time)

    c_index = concordance_index_ipcw(train_outcome, test_outcome, preds, horizon)[0]
    brier = weighted_brier_score(
        np.asarray(tr_t_), np.asarray(tr_y_), preds,
        np.asarray(te_t_), np.asarray(te_y_), horizon)
    return c_index, brier
def simulation(n_samples, hazard_ratio, n_repeats=100):
    """Measure how far Harrell's and Uno's C deviate from the actual C.

    For each censoring level, `n_repeats` data sets are drawn with
    `generate_survival_data` from one shared, fixed-seed random state.
    Returns `(data_mean, data_std)`: two DataFrames with one row per
    censoring level and columns "censoring" (observed censoring percentage),
    "Harrel's C" and "Uno's C" (actual C minus the estimate).
    """
    measures = ("censoring", "Harrel's C", "Uno's C")
    data_mean = {name: [] for name in measures}
    data_std = {name: [] for name in measures}

    rnd = np.random.RandomState(seed=987)

    # One aggregated row per amount of censoring.
    for cens in (.1, .25, .4, .5, .6, .7):
        data = {name: [] for name in measures}

        for _ in range(n_repeats):
            X_test, y_test, y_train, actual_c = generate_survival_data(
                n_samples, hazard_ratio,
                baseline_hazard=0.1,
                percentage_cens=cens,
                rnd=rnd)

            c_harrell = concordance_index_censored(
                y_test["event"], y_test["time"], X_test)
            c_uno = concordance_index_ipcw(y_train, y_test, X_test)

            data["censoring"].append(
                100. - y_test["event"].sum() * 100. / y_test.shape[0])
            data["Harrel's C"].append(actual_c - c_harrell[0])
            data["Uno's C"].append(actual_c - c_uno[0])

        for key, values in data.items():
            data_mean[key].append(np.mean(values))
            data_std[key].append(np.std(values, ddof=1))

    return pd.DataFrame.from_dict(data_mean), pd.DataFrame.from_dict(data_std)
def train_model():
    """Cross-validate a Weibull Deep Survival Machines model and print metrics.

    Reads './new_survival_data.csv' (column 0: time, column 1: event) and
    './new_features.csv', runs 2-fold cross-validation, and prints the
    fold-averaged IPCW concordance index and Brier score at the 25%, 50% and
    75% quantiles of the observed event times.
    """
    from dsm import DeepSurvivalMachines
    import numpy as np
    from sksurv.metrics import concordance_index_ipcw, brier_score

    survival_data = np.loadtxt('./new_survival_data.csv', delimiter=',')
    x = np.loadtxt('./new_features.csv', delimiter=',')
    t = survival_data[:, 0]
    e = survival_data[:, 1]

    times = np.quantile(t[e == 1], [0.25, 0.5, 0.75]).tolist()

    cv_folds = 2
    # Round-robin fold assignment that works for any number of rows. The
    # previous fixed-size list stopped covering the data beyond 20000 rows.
    folds = np.arange(len(x)) % cv_folds

    def _structured(events, durations):
        # Times are kept as floats. An int field would truncate fractional
        # durations before they reach the metrics.
        packed = np.empty(len(events), dtype=[('e', bool), ('t', float)])
        packed['e'] = events
        packed['t'] = durations
        return packed

    cis = []
    brs = []
    for fold in range(cv_folds):
        print("On Fold:", fold)

        in_test = folds == fold
        x_train, t_train, e_train = x[~in_test], t[~in_test], e[~in_test]
        x_test, t_test, e_test = x[in_test], t[in_test], e[in_test]
        print(x_train.shape)

        model = DeepSurvivalMachines(distribution='Weibull', layers=[100])
        model.fit(x_train, t_train, e_train, iters=10, learning_rate=1e-3, batch_size=10)

        et_train = _structured(e_train, t_train)
        et_test = _structured(e_test, t_test)

        out_risk = model.predict_risk(x_test, times)
        out_survival = model.predict_survival(x_test, times)

        cis.append([
            concordance_index_ipcw(et_train, et_test, out_risk[:, i], horizon)[0]
            for i, horizon in enumerate(times)
        ])
        brs.append(brier_score(et_train, et_test, out_survival, times)[1])

    print("Concordance Index:", np.mean(cis, axis=0))
    print("Brier Score:", np.mean(brs, axis=0))
def test_uno_c_failure(uno_c_failure_data):
    """The fixture's input must raise ValueError matching the fixture's pattern."""
    train, test, scores, expected_message = uno_c_failure_data

    with pytest.raises(ValueError, match=expected_message):
        concordance_index_ipcw(train, test, scores)
def assert_uno_c_almost_equal(y_train, y_test, estimate, expected, tau=None):
    """Check a `concordance_index_ipcw` result against `expected`.

    Everything after the first element must match exactly; the first element
    (the index itself) is compared approximately.
    """
    observed = concordance_index_ipcw(y_train, y_test, estimate, tau=tau)

    observed_rest, expected_rest = observed[1:], expected[1:]
    assert_array_equal(observed_rest, expected_rest)

    observed_index, expected_index = observed[0], expected[0]
    assert_almost_equal(observed_index, expected_index)
def main(args):
    """
    Runs evaluation for the data set
        1. Loads model from tar.gz
        2. Reads in test features
        3. Runs an accuracy report
        4. Generates feature importance with SHAP

    Args:
        model-name (str): Name of the trained model, default xgboost
        test-features (str): preprocessed test features for evaluation,
                             default test_features.csv
        train-features (str): preprocessed train features for SHAP,
                              default train_features.csv
        report-name (str): Name of the evaluation output,
                           default evaluation.json
        shap-name (str): Name of the SHAP feature importance output file,
                         default shap.csv
        threshold (float): Threshold to cut probabilities at, default 0.5
        tau (int): time range for the c-index will be from 0 to tau,
                   default 100
    """
    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")

    logger.info(f"Extracting model from path: {model_path}")

    # NOTE(security): extractall() trusts the member paths stored in the
    # archive; only run this on model artifacts from a trusted source.
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    logger.info("Loading model")
    # NOTE(security): unpickling executes arbitrary code from the file; the
    # model artifact must come from a trusted training job.
    with open(args.model_name, "rb") as f:
        model = pickle.load(f)

    logger.info("Loading train and test data")

    test_features_data = os.path.join("/opt/ml/processing/test", args.test_features)
    train_features_data = os.path.join("/opt/ml/processing/train", args.train_features)

    X_test = pd.read_csv(test_features_data, header=0)
    X_train = pd.read_csv(train_features_data, header=0)

    # The first column of each file holds the signed label.
    y_test = X_test.iloc[:, 0]
    y_train = X_train.iloc[:, 0]

    # Reverse transform to event and duration columns: a positive label
    # becomes event=1 and its absolute value becomes the duration.
    y_test_df = pd.DataFrame(
        np.vstack((np.where(y_test > 0, 1, 0), np.abs(y_test))).T,
        columns=["event", "duration"],
    )

    y_train_df = pd.DataFrame(
        np.vstack((np.where(y_train > 0, 1, 0), np.abs(y_train))).T,
        columns=["event", "duration"],
    )

    X_test.drop(X_test.columns[0], axis=1, inplace=True)
    # Drop X_train's own label column. Using X_test.columns[0] here (after
    # X_test had already lost its label) removed the first feature instead.
    X_train.drop(X_train.columns[0], axis=1, inplace=True)

    logger.info("Running inference")

    # NOTE(review): the label column is already dropped above, so [:, 1:]
    # skips one more column - confirm this matches how the model was trained.
    predictions = model.predict(xgboost.DMatrix(X_test.values[:, 1:]), output_margin=False)

    logger.info("Creating evaluation report")

    # NOTE: technical evaluation is really not as a classifier
    # TO DO: Normalize to 0 to 1 scale
    report_dict = classification_report(y_test_df["event"], predictions > args.threshold, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test_df["event"], predictions > args.threshold)

    _, y_train_tuple = get_x_y(y_train_df, ["event", "duration"], pos_label=True)
    _, y_test_tuple = get_x_y(y_test_df, ["event", "duration"], pos_label=True)

    concordance_index = concordance_index_ipcw(
        y_train_tuple,
        y_test_tuple,
        predictions,
        tau=args.tau,  # default within 100 days
    )

    report_dict["concordance_index"] = {
        "cindex": float(concordance_index[0]),
        "concordant": int(concordance_index[1]),
        "discordant": int(concordance_index[2]),
        "tied_risk": int(concordance_index[3]),
        "tied_time": int(concordance_index[4]),
    }

    times, score = brier_score(y_train_tuple, y_test_tuple, predictions, y_test_df["duration"].max() - 1)

    report_dict["brier_score"] = {
        "times": times.astype(np.int32).tolist(),
        "score": score.astype(np.float32).tolist(),
    }

    logger.info(f"Classification report:\n{report_dict}")

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", args.report_name)
    logger.info(f"Saving classification report to {evaluation_output_path}")

    logger.debug(report_dict)

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

    # SHAP
    latest_job_debugger_artifacts_path = "/opt/ml/processing/debug/debug-output"
    trial = create_trial(latest_job_debugger_artifacts_path)

    shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)

    pd.DataFrame(shap_values).to_csv(
        os.path.join("/opt/ml/processing/evaluation", args.shap_name))

    # Skip the first row and the last column of the stored SHAP tensor.
    shap_no_base = shap_values[1:, :-1]
    feature_names = X_train.columns
    os.makedirs("/opt/ml/processing/plot/", exist_ok=True)
    # logging needs a format string as its first argument; passing the shape
    # tuples directly made the logging module report a formatting error.
    logger.info("SHAP shapes: %s %s %s", shap_values.shape, shap_no_base.shape, X_train.shape)
    shap.summary_plot(shap_no_base, features=X_train, feature_names=feature_names, show=False)
    plt.savefig("/opt/ml/processing/plot/feature_importance.png", bbox_inches="tight")
print(times)


def plot_cumulative_dynamic_auc(risk_score, label, color=None):
    """Plot the time-dependent AUC of `risk_score` over the global `times`.

    The AUC is computed against the global `y_train`/`y_test`; its mean is
    drawn as a dashed horizontal line in the same color.
    """
    auc_per_time, auc_mean = cumulative_dynamic_auc(y_train, y_test, risk_score, times)

    plt.plot(times, auc_per_time, marker="o", color=color, label=label)
    plt.xlabel("days from enrollment")
    plt.ylabel("time-dependent AUC")
    plt.axhline(auc_mean, color=color, linestyle="--")
    plt.legend()


# One curve per numeric column, each column used directly as the risk score.
for i, col in enumerate(num_columns):
    plot_cumulative_dynamic_auc(x_test[:, i], col, color=f"C{i}")
    ret = concordance_index_ipcw(
        y_train, y_test, x_test[:, i], tau=times[-1])

from sksurv.datasets import load_veterans_lung_cancer

va_x, va_y = load_veterans_lung_cancer()

cph = make_pipeline(OneHotEncoder(), CoxPHSurvivalAnalysis())
cph.fit(va_x, va_y)

va_times = np.arange(7, 183, 7)
# Performance is estimated on the training data, hence `va_y` appears twice.
va_auc, va_mean_auc = cumulative_dynamic_auc(
    va_y, va_y, cph.predict(va_x), va_times)

plt.plot(va_times, va_auc, marker="o")
plt.axhline(va_mean_auc, linestyle="--")
def output_bootstrap(model, n_iterations, df_train, data_train, y_train,
                     df_test, name):
    """
    Compute the output of the model on the bootstrapped test set

    # Arguments
        model: neural network model trained with final parameters.
        n_iterations: number of bootstrap iterations
        df_train: training dataset (not used by this function)
        data_train: two columns dataset with survival time and censoring
            status for training samples
        y_train: survival time; column 's' is read when `name` is not one
            of "Cox-CC", "CoxTime", "DeepHit"
        df_test: test dataset; must contain the columns 'surv_test' and
            'cen_test'
        name: name of the model

    # Returns
        results_final: one row per metric ('auc5', 'auc10', 'unoc5',
        'unoc10') with columns 'mean', 'ci95_lo', 'ci95_hi', 'std', 'count'
        computed over the bootstrap iterations.
    """
    if name == "CoxTime" or name == "Cox-CC":
        _ = model.compute_baseline_hazards()

    metric_names = ['auc5', 'auc10', 'unoc5', 'unoc10']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0.
    bootstrap_rows = []

    for iteration in range(n_iterations):
        print(iteration)
        test_boot = resample(df_test, n_samples=len(df_test), replace=True)
        x_test_boot = test_boot.drop(['surv_test', 'cen_test'], axis=1)
        duration_test_b = test_boot['surv_test'].values
        event_test_b = test_boot['cen_test'].values
        data_test_b = skSurv.from_arrays(event=event_test_b,
                                         time=duration_test_b)

        if name in ("Cox-CC", "CoxTime", "DeepHit"):
            surv = model.predict_surv_df(np.array(x_test_boot,
                                                  dtype='float32'))
        else:
            # Stack the bootstrap inputs once per distinct value of
            # y_train['s'] and attach that value as an extra column.
            n_picktime = int(y_train['s'].nunique())
            x_test_boot_all = pd.concat([x_test_boot] * n_picktime)
            time_test = pd.DataFrame(
                np.repeat(np.unique(y_train[['s']]), len(x_test_boot)))
            x_test_boot_all.reset_index(inplace=True, drop=True)
            x_test_boot_all = pd.concat([x_test_boot_all, time_test], axis=1)
            surv = make_predictions_pseudobs(model, y_train, x_test_boot_all,
                                             x_test_boot, name)

        # Columns: earliest bootstrap time (not used below), 5 and 10.
        prob_5_10 = pd.concat([
            determine_surv_prob(surv, horizon)
            for horizon in (duration_test_b.min(), 5, 10)
        ], axis=1)
        risk5 = -prob_5_10.iloc[:, 1]
        risk10 = -prob_5_10.iloc[:, 2]

        # cumulative_dynamic_auc returns one AUC per time point as an array;
        # take the element explicitly instead of calling float() on the
        # array (deprecated since NumPy 1.25).
        auc5 = float(np.ravel(
            cumulative_dynamic_auc(data_train, data_test_b, risk5, 5)[0])[0])
        auc10 = float(np.ravel(
            cumulative_dynamic_auc(data_train, data_test_b, risk10, 10)[0])[0])
        unoc5 = float(
            concordance_index_ipcw(data_train, data_test_b, risk5, 5)[0])
        unoc10 = float(
            concordance_index_ipcw(data_train, data_test_b, risk10, 10)[0])

        bootstrap_rows.append({
            'auc5': auc5,
            'auc10': auc10,
            'unoc5': unoc5,
            'unoc10': unoc10
        })

    results_all = pd.DataFrame(bootstrap_rows, columns=metric_names)

    summary_rows = []
    for column in results_all:
        stats = results_all[column].agg(['mean', 'count', 'std'])
        sorted_scores = np.sort(np.array(results_all[column]), axis=None)
        # NOTE(review): these are the 5th and 95th percentiles, i.e. a 90%
        # interval, although the columns are named ci95 - confirm intent.
        ci95_lo = sorted_scores[int(0.05 * len(sorted_scores))]
        ci95_hi = sorted_scores[int(0.95 * len(sorted_scores))]
        summary_rows.append({
            'mean': stats['mean'],
            'ci95_lo': ci95_lo,
            'ci95_hi': ci95_hi,
            'std': stats['std'],
            'count': stats['count']
        })

    results_final = pd.DataFrame(
        summary_rows,
        columns=['mean', 'ci95_lo', 'ci95_hi', 'std', 'count'],
        index=results_all.columns.tolist())
    return results_final
def test_uno_c_no_comparable(no_comparable_pairs):
    """The fixture's data must raise NoComparablePairException."""
    survival, risk_scores = no_comparable_pairs

    with pytest.raises(NoComparablePairException):
        concordance_index_ipcw(survival, survival, risk_scores)
def __call__(self, E_y_true, y_pred):
    """Return the IPCW concordance index of `y_pred` against `E_y_true`.

    `y_pred` is validated with `check_y_pred_dimensions`, converted to a
    risk score with `_survival_to_risk`, and scored against the structured
    form of `E_y_true`, using `self.struct_E_y_train` as the training data.
    """
    self.check_y_pred_dimensions(E_y_true, y_pred)

    risk_scores = self._survival_to_risk(y_pred)
    test_outcome = to_structured_array(E_y_true)

    # Only the index itself (first element of the result) is returned.
    return concordance_index_ipcw(
        self.struct_E_y_train, test_outcome, risk_scores)[0]
# Keep the raw predictions and labels on `dd` before converting to tensors.
dd.output = pt_output
dd.event = pt_event
dd.time = pt_time

pt_output = torch.tensor(pt_output).cuda()
pt_event = torch.tensor(pt_event).cuda()
pt_time = torch.tensor(pt_time).cuda()

cindex, concordant, discordant, tied_risk, tied_time, _, _ = concordance_index_censored(
    pt_event, pt_time, pt_output, tied_tol=1e-8)

# `!s` forces str(), exactly like the string concatenation it replaces.
print(f"Harrell's C-index = {cindex!s}")
print(f"Concordant = {concordant!s}")
print(f"Discordant = {discordant!s}")
print(f"Tied risk = {tied_risk!s}")
print(f"Tied time = {tied_time!s}")

# Train and validation labels together form the reference data for the
# IPCW index.
dev_event = np.concatenate(
    [datasets['train'].df.event.values, datasets['val'].df.event.values])
dev_time = np.concatenate(
    [datasets['train'].df.time.values, datasets['val'].df.time.values])

_dev_event = list(map(bool, dev_event))
dev_data = np.array(list(zip(_dev_event, dev_time)),
                    dtype=[('event', '?'), ('time', '<f8')])

_pt_event = list(map(bool, pt_event.cpu()))
pt_data = np.array(list(zip(_pt_event, pt_time.cpu())),
                   dtype=[('event', '?'), ('time', '<f8')])

cindex2, concordant2, discordant2, tied_risk2, tied_time2 = concordance_index_ipcw(
    dev_data, pt_data, pt_output.cpu(), tau=None, tied_tol=1e-08)

print(f"Uno's C-index = {cindex2!s}")
print(f"Concordant = {concordant2!s}")
print(f"Discordant = {discordant2!s}")
print(f"Tied risk = {tied_risk2!s}")
print(f"Tied time = {tied_time2!s}")