def calculate_ipcw(dataset, time_of_censoring):
    """Calculate inverse-probability-of-censoring weights (IPCW) with a Cox model.

    A ``CoxPHFitter`` is fitted on every column of ``dataset["X"]`` except the
    last one, using ``dataset["t"]`` as the duration and ``1 - dataset["y"]``
    as the event indicator (rows with ``y == 0`` are the model's events).
    Each row's weight is the reciprocal of its own predicted survival
    probability, read at its own time ``t`` when ``y == 1`` and at
    ``time_of_censoring`` otherwise.

    Args:
        dataset: Mapping with keys ``"X"`` (2-D array), ``"t"`` and ``"y"``.
        time_of_censoring: Time at which rows with ``y != 1`` are evaluated.

    Returns:
        A 1-D ``numpy`` array holding one weight per row of ``dataset["X"]``.
    """
    covariates = dataset["X"][:, :-1]
    df = pd.DataFrame(covariates)
    df["t"] = dataset["t"]
    df["e"] = 1 - dataset["y"]

    cph = CoxPHFitter()
    cph.fit(df, "t", "e")
    # Rows of `sf` are time points, column i is the curve of sample i.
    sf = cph.predict_survival_function(covariates)

    last_row = len(sf.index) - 1
    weights = np.zeros(len(dataset["X"]))
    for i in range(len(dataset["X"])):
        if dataset["y"][i] == 1:
            lookup_time = dataset["t"][i]
        else:
            lookup_time = time_of_censoring
        # searchsorted returns len(index) for a time beyond the last fitted
        # time, which would overrun `sf`; the curve is flat after its last
        # point, so clamp to the final row instead of raising IndexError.
        idx = min(np.searchsorted(sf.index, lookup_time), last_row)
        weights[i] = 1 / sf.iloc[idx, i]
    return weights
def calculate_ipcw(dataset, time_of_censoring):
    """Calculate inverse propensity of censorship weights for the dataset.

    Uses a Cox model to model the censoring distribution: the model is fitted
    on all columns of ``dataset["X"]`` except the last, with ``dataset["t"]``
    as duration and ``1 - dataset["y"]`` as the event column. The weight of
    row ``i`` is ``1 / S_i(time)``, where ``time`` is ``dataset["t"][i]`` for
    rows with ``y == 1`` and ``time_of_censoring`` for all other rows.

    Args:
        dataset: Mapping with keys ``"X"`` (2-D array), ``"t"`` and ``"y"``.
        time_of_censoring: Evaluation time used for rows with ``y != 1``.

    Returns:
        A 1-D ``numpy`` array with one weight per row of ``dataset["X"]``.
    """
    features = dataset["X"][:, :-1]
    df = pd.DataFrame(features)
    df["t"] = dataset["t"]
    df["c"] = 1 - dataset["y"]

    cph = CoxPHFitter()
    cph.fit(df, "t", "c")
    sf = cph.predict_survival_function(features)

    # Highest valid positional row in `sf`; np.searchsorted may return one
    # past it when the requested time exceeds every fitted time point.
    max_idx = len(sf.index) - 1
    weights = np.zeros(len(dataset["X"]))
    for i in range(len(dataset["X"])):
        if dataset["y"][i] == 1:
            idx = np.searchsorted(sf.index, dataset["t"][i])
        else:
            idx = np.searchsorted(sf.index, time_of_censoring)
        # Clamp instead of raising IndexError: the survival curve is constant
        # after its last time point.
        idx = min(idx, max_idx)
        weights[i] = 1 / sf.iloc[idx, i]
    return weights
def run_baseline(runs=10):
    """Fit and evaluate a Cox PH baseline ``runs`` times and report mean scores.

    Every run reloads the data from ``./summaries/survival_data``, transforms
    it with ``transform_data``, fits a ``CoxPHFitter`` on the transformed
    training features and scores the test predictions with ``EvalSurv``
    (time-dependent concordance and integrated Brier score). The mean of both
    metrics is printed, and the model of the last run is passed to
    ``plot_survival``.

    Args:
        runs: Number of repetitions; must be at least 1.

    Raises:
        ValueError: If ``runs`` is smaller than 1 (there would be no model to
            plot and no scores to average).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1, got %r" % (runs,))

    concordance = []
    ibs = []
    for _ in tqdm(range(runs)):
        df_train, df_test, df_val = load_data("./summaries/survival_data")
        x_mapper, labtrans, train, val, x_test, durations_test, events_test, pca = transform_data(
            df_train, df_test, df_val, 'LogisticHazard', "standard",
            cols_standardize, log_columns, num_durations=100)
        x_train, y_train = train

        pc_col = ['PC' + str(i) for i in range(x_train.shape[1])]
        cox_train = pd.DataFrame(x_train, columns=pc_col)
        cox_test = pd.DataFrame(x_test, columns=pc_col)
        cox_train.loc[:, ["duration"]] = y_train[0]
        cox_train.loc[:, 'event'] = y_train[1]

        cox_train = cox_train.dropna()
        # NOTE(review): dropping test rows here does not drop the matching
        # entries of durations_test / events_test - confirm the transformed
        # test features can never contain NaN.
        cox_test = cox_test.dropna()

        cph = CoxPHFitter().fit(cox_train, 'duration', 'event')
        surv = cph.predict_survival_function(cox_test)

        ev = EvalSurv(surv, durations_test, events_test, censor_surv='km')
        concordance.append(ev.concordance_td('antolini'))
        time_grid = np.linspace(durations_test.min(), durations_test.max(), 100)
        ibs.append(ev.integrated_brier_score(time_grid))

    print("Average concordance: %s"%np.mean(concordance))
    print("Average IBS: %s"%np.mean(ibs))
    # Only the training frame and model of the final run are plotted.
    plot_survival(cox_train, pc_col, cph, './survival/cox', baseline=True)
import numpy as np
import seaborn as sns

sns.set()

### avoid coding problems ####
import sys

# `reload` as a builtin and `sys.setdefaultencoding` only exist on Python 2;
# on Python 3 these two calls raise, so run them only where they are valid.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - Python 2 builtin
    sys.setdefaultencoding('gbk')
##############################

# load data
from lifelines.datasets import load_regression_dataset
regression_dataset = load_regression_dataset()

# fit
from lifelines import CoxPHFitter
cph = CoxPHFitter()
cph.fit(regression_dataset, 'T', event_col='E')
# Covariates only: the duration ('T') and event ('E') columns are removed.
X = regression_dataset.drop(['E', 'T'], axis=1)

# draw the predicted survival curves for rows 1..4
cph.predict_survival_function(X.iloc[1:5]).plot()
import matplotlib.pyplot as plt
plt.xlim(0, 10)
plt.ylim(0.2, 1)
plt.title('survival curves for events')
plt.show()
cph.fit(_X_train, duration_col='Survival', event_col='Event') # cph.print_summary() # access the results using cph.summary # Validation # _X_valid = _X_valid.drop(["Survival", "Event"], axis=1) # Testing input after preprocessing. _X_valid_1 = _X_valid_1.drop(["Survival", "Event"], axis=1) _X_valid_2 = _X_valid_2.drop(["Survival", "Event"], axis=1) _X_valid_3 = _X_valid_3.drop(["Survival", "Event"], axis=1) _X_valid_4 = _X_valid_4.drop(["Survival", "Event"], axis=1) _X_valid_5 = _X_valid_5.drop(["Survival", "Event"], axis=1) # _seq_pred_y_valid_1 = cph.predict_survival_function(_X_valid_1, np.arange(before_steps)).as_matrix() _seq_pred_y_valid = np.array([ cph.predict_survival_function(_X_valid_1, np.arange(before_steps + 1)).as_matrix()[1, :], cph.predict_survival_function(_X_valid_2, np.arange(before_steps + 1)).as_matrix()[2, :], cph.predict_survival_function(_X_valid_3, np.arange(before_steps + 1)).as_matrix()[3, :], cph.predict_survival_function(_X_valid_4, np.arange(before_steps + 1)).as_matrix()[4, :], cph.predict_survival_function(_X_valid_5, np.arange(before_steps + 1)).as_matrix()[5, :] ]) _seq_pred_y_valid = _seq_pred_y_valid.transpose() thrld_score = dict() for sur_thrld_valid in np.arange(1.0, 0.0, -0.01):
random_state=42) train = pd.concat([a_train, b_train], axis=1) train = train.reset_index(drop=True) test = pd.concat([a_test, b_test], axis=1) test = test.reset_index(drop=True) test_X = test.drop(droplist, axis=1) test_y = test['术后住院时间'] from lifelines import CoxPHFitter cph = CoxPHFitter() train = train.drop('神经系统-膈肌麻痹(可能膈神经损伤)', axis=1) test_X = test_X.drop('神经系统-膈肌麻痹(可能膈神经损伤)', axis=1) train = train.drop('Id', axis=1) test_X = test_X.drop('Id', axis=1) cph.fit(train, duration_col='术后住院时间', event_col='出院时状态', show_progress=True) cph.predict_partial_hazard(test_X) survival_result = cph.predict_survival_function(test_X) survival_result = survival_result[survival_result <= 0.5] LOSResult = pd.DataFrame(np.arange(1354).reshape((677, 2)), columns=['Id', 'LOS']) i = 0 for c in survival_result.columns: item = survival_result[c].idxmax() LOSResult.iloc[i, 0] = i LOSResult.iloc[i, 1] = item i = i + 1 test['Id'] = test.index test1 = pd.merge(test, LOSResult, on='Id') fig, ax = plt.subplots(figsize=(12, 12)) from lifelines import KaplanMeierFitter kmf_control = KaplanMeierFitter() ax = kmf_control.fit(test1['术后住院时间'], label='Real').plot(ax=ax,
# In[ ]:

import matplotlib
from matplotlib import pyplot as plt
import lifelines
from lifelines import KaplanMeierFitter  # survival analysis library
from lifelines.statistics import logrank_test  # survival statistical testing
from lifelines import CoxPHFitter

df['churn'] = df1.fuga

cph = CoxPHFitter()
# `duration_col` / `event_col` take the *names* of columns inside `df`, not
# Series objects; 'enddt' and 'FUGA' are the columns dropped from `df` below
# to obtain the covariates.
# NOTE(review): `df` still contains 'churn' (and 'time_to_fuga', used below),
# which the model will treat as covariates - confirm that is intended.
cph.fit(df, duration_col='enddt', event_col='FUGA', show_progress=True)
cph.print_summary()
cph.plot()

# In[ ]:

# Covariate frame: everything except the duration and event columns.
df_2 = df.drop(['enddt', 'FUGA'], axis=1)
cph.predict_partial_hazard(df_2)
cph.predict_survival_function(df_2, times=[5., 25., 50.])
# The original passed an undefined name `X`; use the same covariate frame as
# the other predictions in this cell.
cph.predict_median(df_2)

kmf = KaplanMeierFitter()
T = df['time_to_fuga']  # duration
C = df['churn']  # event indicator: 1 if death/churn is seen, 0 if censored
# Create dummy (one-hot) columns for `score_factor` for the survival analysis.
dummies0 = pd.get_dummies(df3['score_factor'])
df3 = pd.concat([df3, dummies0], axis=1)

##### How good is the categorization into high, medium and low?
# Only 'High' and 'Medium' enter the model; any other dummy column is left out.
factor_df = df3[['duration', 'event', 'High', 'Medium']]
cph = CoxPHFitter()
cph.fit(df = factor_df, duration_col = 'duration', event_col = 'event')
cph.print_summary()
cph.plot()
# predict_survival_function requires the covariate frame as its first
# argument; the original called it with no arguments right after an identical,
# redundant refit.
cph.predict_survival_function(df3[['High', 'Medium']])

# How good is the numeric decile_score?
df4 = df3[['duration', 'event', 'decile_score']]
cph.fit(df = df4, duration_col = 'duration', event_col = 'event')
cph.print_summary()
cph.predict_survival_function(X = df4)
def train_cox(x_train0, ix_in, y_per_pt, y_int, metric = 'auc', feature_grid = None):
    """Pick an L1 penalty for a Cox model by inner resampling, refit, and predict.

    For every penalty in ``feature_grid`` a ``CoxPHFitter(l1_ratio=1.)`` is
    fitted on 10 randomly chosen splits produced by ``leave_one_out_cv`` and
    scored with a concordance index and a ROC AUC. The penalty with the best
    ``metric`` score is then refitted on the full training fold and used to
    predict for the held-out fold.

    Args:
        x_train0: DataFrame that is split positionally by ``ix_in``; the code
            reads its 'week' and 'outcome' columns.
            # presumably index labels look like '<patient>-<timepoint>' (they
            # are split on '-') - verify against caller.
        ix_in: Pair ``(train_index, test_index)`` of positional indices.
        y_per_pt: Indexable by the patient id; compared against 'Cleared'.
        y_int: Indexable by the index labels of the training rows; passed to
            ``get_class_weights``.
        metric: Key of the score table used for model selection,
            'auc' or 'ci'.
        feature_grid: Penalties to try; defaults to ``np.logspace(7, 20, 14)``.

    Returns:
        A dict with keys 'model', 'survival', 'survival_function',
        'prob_true', 'times_hazards_outcomes' and 'lambdas', or an empty dict
        when the final fit raises.
    """
    if feature_grid is None:
        feature_grid = np.logspace(7, 20, 14)
    survival = {}
    # for ic_in, ix_in in enumerate(ix_inner):
    train_index, test_index = ix_in
    x_train, x_test = x_train0.iloc[train_index, :], x_train0.iloc[test_index, :]
    # Scores per penalty value, one table per metric.
    lamb_dict = {}
    lamb_dict['auc'] = {}
    lamb_dict['ci'] = {}
    for il, lamb in enumerate(feature_grid):
        ix_inner2 = leave_one_out_cv(x_train, x_train['outcome'], ddtype='all_data')
        # NOTE(review): sampling 10 splits without replacement raises when
        # fewer than 10 splits exist.
        ix_rand_samp = np.random.choice(np.arange(len(ix_inner2)), 10, replace=False)
        ix_inner2_samp = np.array(ix_inner2, dtype='object')[ix_rand_samp]
        # ix_inner2_rand_samp = np.random.choice(ix_inner2, 10, replace = False)
        counter = 0  # number of inner fits that raised
        start = time.time()
        hazards = []
        event_times = []
        event_outcomes = []
        probs_in = []
        true = []
        model = CoxPHFitter(penalizer=lamb, l1_ratio=1.)
        for ic_in2, ix_in2 in enumerate(ix_inner2_samp):
            start_inner = time.time()
            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
            tmpts_in = [xx.split('-')[1] for xx in x_tr2.index.values]
            samp_weights = get_class_weights(np.array(y_int[x_tr2.index.values]), tmpts_in)
            # Non-positive weights are replaced by 1 before fitting.
            samp_weights[samp_weights <= 0] = 1
            x_tr2.insert(x_tr2.shape[1], 'weights', samp_weights)
            try:
                model.fit(x_tr2, duration_col='week', event_col='outcome', weights_col='weights', robust=True, show_progress = False)
            except:
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # a failed fit just skips this split.
                counter += 1
                continue
            pred_f = model.predict_survival_function(x_ts2.iloc[0, :])
            # Event probability = 1 - survival at index label 4.0.
            probs_in.append(1 - pred_f.loc[4.0].item())
            true.append(x_ts2['outcome'].iloc[-1])
            hazard = model.predict_partial_hazard(x_ts2)
            hazards.append(hazard)
            event_times.append(x_ts2['week'])
            event_outcomes.append(x_ts2['outcome'])
            end_inner = time.time()
            # print('Inner ix ' + str(ic_in2) + ' complete in ' + str(end_inner - start_inner))
        # if metric == 'CI':
        try:
            score = concordance_index(pd.concat(event_times), pd.concat(hazards), pd.concat(event_outcomes))
            lamb_dict['ci'][lamb] = score
            end_t = time.time()
            print(str(il) + ' complete')
            print((end_t - start)/60)
        except:
            # A failed concordance computation also skips the AUC below for
            # this penalty.
            print('No score available')
            continue
        # elif metric == 'auc':
        try:
            score = sklearn.metrics.roc_auc_score(true, probs_in)
            lamb_dict['auc'][lamb] = score
        except:
            continue
    # NOTE(review): raises if no penalty produced a score for `metric`
    # (unpacking an empty zip).
    lambdas, aucs_in = list(zip(*lamb_dict[metric].items()))
    ix_max = np.argmax(aucs_in)
    best_lamb = lambdas[ix_max]
    model_out = CoxPHFitter(penalizer=best_lamb, l1_ratio=1.)
    tmpts_in = [xx.split('-')[1] for xx in x_train.index.values]
    samp_weights = get_class_weights(np.array(y_int[x_train.index.values]), tmpts_in)
    samp_weights[samp_weights<=0] = 1
    x_train.insert(x_train.shape[1], 'weights', samp_weights)
    # Redundant with the insert above: writes the same values again.
    x_train['weights'] = samp_weights
    try:
        model_out.fit(x_train, duration_col='week', event_col='outcome', weights_col='weights', robust=True)
    except:
        return {}
    pred_f = model_out.predict_survival_function(x_test.iloc[0, :])
    pt = x_test.index.values[0].split('-')[0]
    hazard_out = model_out.predict_partial_hazard(x_test)
    # NOTE(review): `x` is not defined inside this function; unless a
    # module-level `x` exists these two lines raise NameError - confirm
    # whether `x_train0` was intended.
    pts = [ii.split('-')[0] for ii in x.index.values]
    tmpts = [ii.split('-')[1] for ii in x.index.values]
    # if pt not in survival.keys():
    #     survival[pt] = {}
    ixs = np.where(np.array(pts) == pt)[0]
    # Largest time point recorded for this patient, as a string; a trailing
    # '+' is appended when the patient's label is 'Cleared'.
    survival['actual'] = str(np.max([float(tmpt) for tmpt in np.array(tmpts)[ixs]]))
    if y_per_pt[pt] == 'Cleared':
        survival['actual'] = survival['actual'] + '+'
    probs_sm = 1 - pred_f.loc[4.0].item()
    y_pred_exp = model_out.predict_expectation(x_test.iloc[[0], :])
    survival['predicted'] = str(np.round(y_pred_exp.item(), 3))
    surv_func = pred_f
    # probs_df = pd.Series(probs_sm)
    # y_pp = y_per_pt.replace('Cleared', 0).replace('Recur', 1)
    # final_df = pd.concat([y_pp, probs_df], axis=1).dropna()
    final_dict = {}
    # final_dict['probability_df'] = final_df
    final_dict['model'] = model_out
    final_dict['survival'] = survival
    final_dict['survival_function'] = surv_func
    final_dict['prob_true'] = (probs_sm, y_per_pt[pt])
    final_dict['times_hazards_outcomes'] = (x_test['week'], hazard_out, x_test['outcome'])
    final_dict['lambdas'] = lamb_dict
    # final_dict['auc'] = sklearn.metrics.roc_auc_score(final_df[0], final_df[1])
    return final_dict
def _covariate_stats(cph, summary, norm_mean, name):
    """Return [coef, se, HR, CI lower, CI upper, p, z, norm mean] for one covariate."""
    coef = cph.params_[name]
    se = cph.standard_errors_[name]
    return [
        coef,
        se,
        np.exp(coef),
        # 95% confidence interval of the hazard ratio (normal approximation).
        np.exp(coef - 1.96 * se),
        np.exp(coef + 1.96 * se),
        summary.p[name],
        summary.z[name],
        norm_mean[name],
    ]


def compute_coxhr(pair, df, lag, step_size, is_sex_specific, nindivs, res_writer):
    """Fit a weighted Cox model for one endpoint pair and write one result row.

    Args:
        pair: ``(prior, outcome)``; both values are copied into the output row.
        df: DataFrame passed to ``CoxPHFitter.fit`` with 'duration', 'outcome'
            and 'weight' columns plus the covariates 'BIRTH_TYEAR', 'prior'
            and 'female'.
        lag: ``None`` or a ``(min_lag, max_lag)`` pair. With a lag, the
            absolute risk is predicted at ``max_lag``; otherwise at
            ``STUDY_ENDS - STUDY_STARTS``.
        step_size: Forwarded to ``CoxPHFitter.fit`` and written to the row.
        is_sex_specific: When true, the 'female' covariate is dropped and its
            statistics are written as NaN.
        nindivs: Written to the row unchanged.
        res_writer: Object with a ``writerow`` method that receives the row.
    """
    logger.info("Running Cox regression")
    prior, outcome = pair

    # Handle sex-specific endpoints
    if is_sex_specific:
        df = df.drop(columns=["female"])

    # Fit Cox model
    cph = CoxPHFitter()
    cph.fit(
        df,
        duration_col="duration",
        event_col="outcome",
        step_size=step_size,
        # For the case-cohort study we need weights and robust errors:
        weights_col="weight",
        robust=True)

    # Compute absolute risk for the "mean individual"
    mean_indiv = pd.DataFrame({
        "BIRTH_TYEAR": [MEAN_INDIV_BIRTH_YEAR],
        "prior": [MEAN_INDIV_HAS_PRIOR_ENDPOINT],
        "female": [MEAN_INDIV_FEMALE_RATIO]
    })
    if is_sex_specific:
        mean_indiv.pop("female")

    if lag is None:
        predict_at = STUDY_ENDS - STUDY_STARTS
        lag_value = None
    else:
        _min_lag, max_lag = lag
        predict_at = max_lag
        lag_value = max_lag

    surv_probability = cph.predict_survival_function(
        mean_indiv, times=[predict_at]).values[0][0]
    absolute_risk = 1 - surv_probability

    # Get values out of the fitted model. `cph.summary` is read once here and
    # reused for every covariate instead of being re-read per value.
    norm_mean = cph._norm_mean
    summary = cph.summary
    prior_stats = _covariate_stats(cph, summary, norm_mean, "prior")
    year_stats = _covariate_stats(cph, summary, norm_mean, "BIRTH_TYEAR")
    if not is_sex_specific:
        sex_stats = _covariate_stats(cph, summary, norm_mean, "female")
    else:
        sex_stats = [np.nan] * 8

    # Save the baseline cumulative hazard (bch)
    df_bch = cph.baseline_cumulative_hazard_
    baseline_cumulative_hazard = bch_at(df_bch, predict_at)
    bch_values = {}
    for timepoint in BCH_TIMEPOINTS:
        bch_values[timepoint] = bch_at(df_bch, timepoint)
    # Time points reported in the row; each must be present in BCH_TIMEPOINTS.
    reported_timepoints = (0, 2.5, 5, 7.5, 10, 12.5, 15, 17.5, 20, 21.99)

    # Save values
    res_writer.writerow(
        [prior, outcome, lag_value, step_size, nindivs, absolute_risk]
        + prior_stats
        + year_stats
        + sex_stats
        + [baseline_cumulative_hazard]
        + [bch_values[timepoint] for timepoint in reported_timepoints]
    )
def _roc_balance_point(y_true, scores):
    """Return (AUC %, threshold, sensitivity %, specificity %) where sens ~= spec."""
    fpr, tpr, thresholds = roc_curve(y_true, scores)
    auc_pct = round(auc(fpr, tpr) * 100, 1)
    i = np.arange(len(tpr))
    roc = pd.DataFrame({
        'fpr': pd.Series(fpr, index=i),
        'tpr': pd.Series(tpr, index=i),
        '1-fpr': pd.Series(1 - fpr, index=i),
        'tf': pd.Series(tpr - (1 - fpr), index=i),
        'thresholds': pd.Series(thresholds, index=i)
    })
    # Keep the single ROC point whose sensitivity is closest to its specificity.
    roc = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
    thres = roc.iloc[0, 4]
    sens = round(roc.iloc[0, 1] * 100, 1)  # sensitivity
    spec = round(roc.iloc[0, 2] * 100, 1)  # specificity
    return auc_pct, thres, sens, spec


def roc_cut(df, vars, group, time=0, family='logistic', save_tab=False):
    """Build a table of ROC-based cut-offs, one row per predictor in ``vars``.

    For every predictor a univariate model is fitted (a binomial GLM when
    ``family == 'logistic'``, a Cox model when ``family == 'cox'``), its
    predictions are scored against ``df[group]`` with a ROC curve, and the
    point where sensitivity is closest to specificity is reported.

    Args:
        df: Source DataFrame.
        vars: List of predictor column names.
        group: Name of the outcome / event column.
        time: Name of the duration column (used by the Cox family only).
        family: 'logistic' or 'cox'; anything else prints 'Error' and yields
            an empty table.
        save_tab: When true, the table is written to
            'Пороги по ROC-анализу.xlsx' and ``None`` is returned.

    Returns:
        A DataFrame with the columns 'Фактор', 'AUC, %', 'Порог',
        'Чувствительность, %' and 'Специфичность, %', or ``None`` when
        ``save_tab`` is true.
    """
    columns = [
        'Фактор', 'AUC, %', 'Порог', 'Чувствительность, %', 'Специфичность, %'
    ]
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.
    rows = []

    if family == 'logistic':
        table = df[vars]
        y = df[group]
        # One row per predictor. The original wrapped this body in an unused
        # inner loop that produced len(vars) identical rows per predictor.
        for col in table:
            design = sma.add_constant(table[col])
            pred = sma.GLM(y, design,
                           family=sma.families.Binomial()).fit().predict(design)
            r, thres, sens, spec = _roc_balance_point(y, pred)
            cut = roc_job(pred, predictor=table[col], pos=thres).compute()
            rows.append({
                'Фактор': col,
                'AUC, %': r,
                'Порог': cut,
                'Чувствительность, %': sens,
                'Специфичность, %': spec
            })
    elif family == 'cox':
        cph = CoxPHFitter()
        # Copy so the two columns added below do not write into a slice of `df`.
        table = df[vars].copy()
        table[time] = df[time]
        table[group] = df[group]
        # The last two columns are the duration and event columns added above.
        for col in table.columns[:-2]:
            cph.fit(table[[col, group, time]],
                    duration_col=time,
                    event_col=group)
            # NOTE(review): np.percentile takes q in [0, 100], so 0.99 is the
            # 0.99th percentile, not the 99th - confirm which was intended.
            pred = cph.predict_survival_function(
                table[[col]], np.percentile(df[[time]], 0.99)).T
            r, thres, sens, spec = _roc_balance_point(table[group], pred)
            # NOTE(review): `pred[0]` selects the column labelled 0; the
            # columns of `pred` are the evaluation times - verify.
            cut = roc_job(pred[0], predictor=table[col], pos=thres).compute()
            rows.append({
                'Фактор': col,
                'AUC, %': r,
                'Порог': cut,
                'Чувствительность, %': sens,
                'Специфичность, %': spec
            })
    else:
        print('Error')

    result = pd.DataFrame(rows).reindex(columns=columns)
    if save_tab:
        return pd.DataFrame.to_excel(result, 'Пороги по ROC-анализу.xlsx')
    return result
def create_model(temp_features, current_cluster, use_cluster_as_feature):
    """Train a penalized Cox model on transplant data and report classification metrics.

    The frame is split 75/25, the training part is upsampled with SMOTE,
    columns without variance inside either outcome class are dropped, and a
    ``CoxPHFitter(penalizer=0.1)`` is fitted with 'tenure' as duration and
    'Longterm_TransplantOutcome' as event. The predicted survival at the first
    time point after six years (365 * 6) is thresholded at 0.5 to obtain a
    binary prediction, for which confusion matrices, a classification report
    and a ROC curve are printed or plotted.

    Args:
        temp_features: DataFrame containing at least 'TransplantationID',
            'PatientID', 'cluster', 'tenure' and 'Longterm_TransplantOutcome'.
        current_cluster: Cluster label, only used in the progress message.
        use_cluster_as_feature: One-hot encode 'cluster' when true, drop it
            otherwise.

    Raises:
        ValueError: If the fitted model has no time point later than six years.
    """
    separator = '----------------------------------------------------------------------------------------------------------------------------'
    for _ in range(5):
        print(separator)

    # =============================================================================
    # Labels and feature clean-up
    # =============================================================================
    temp_labels = np.array(temp_features['Longterm_TransplantOutcome'])
    temp_features = temp_features.drop('TransplantationID', axis=1)
    temp_features = temp_features.drop('PatientID', axis=1)
    if use_cluster_as_feature:
        temp_features = pd.get_dummies(data=temp_features, columns=['cluster'])
        print('Creating model for all clusters with cluster as feature')
    else:
        temp_features = temp_features.drop('cluster', axis=1)
        print('Creating model for cluster ' + str(current_cluster))

    # =============================================================================
    # Splitting datasets into train and test sets
    # =============================================================================
    from sklearn.model_selection import train_test_split
    train_features, test_features, train_labels, test_labels = train_test_split(
        temp_features, temp_labels, test_size=0.25, random_state=42)

    # =============================================================================
    # SMOTE for upsampling
    # =============================================================================
    from imblearn.over_sampling import SMOTE
    train_features, train_labels = SMOTE().fit_resample(train_features, train_labels)

    # =============================================================================
    # Drop features with no variance
    # =============================================================================
    events = train_labels.astype(bool)
    for col in train_features.columns:
        no_variance = (train_features.loc[events, col].var() == 0.0
                       or train_features.loc[~events, col].var() == 0.0)
        # The event column is constant within each class by construction, so
        # it must be exempt from this filter.
        if no_variance and col != 'Longterm_TransplantOutcome':
            train_features.drop([col], axis=1, inplace=True)
            test_features.drop([col], axis=1, inplace=True)

    # =============================================================================
    # Cox Regression model
    # =============================================================================
    cph = CoxPHFitter(penalizer=0.1)
    cph.fit(train_features, 'tenure', event_col='Longterm_TransplantOutcome',
            show_progress=False, step_size=0.1)
    print('concordance index: ' + str(cph.concordance_index_))
    tr_rows = test_features.loc[:, test_features.columns != 'Longterm_TransplantOutcome'].iloc[:, :]
    predictions = cph.predict_survival_function(tr_rows)
    # After the transpose: one row per test sample, one column per time point.
    predictions = predictions.transpose()

    # =============================================================================
    # Error analysis
    # =============================================================================
    horizon = 365 * 6
    col_use = next(
        (col for col in predictions.columns if float(col) > horizon), None)
    if col_use is None:
        raise ValueError(
            'no predicted time point later than %d days' % horizon)
    print(col_use)
    predictions = predictions[col_use]
    predictions = predictions.to_frame(name='prediction')
    # Predicted class is 1 (event) when the survival probability at the
    # horizon is at most 0.5, otherwise 0.
    predictions = (predictions <= 0.5).astype(int)
    labels = pd.DataFrame(test_labels, columns=['label'])
    predictions.reset_index(drop=True, inplace=True)
    labels.reset_index(drop=True, inplace=True)

    # =============================================================================
    # Confusion matrix
    # =============================================================================
    from sklearn.metrics import confusion_matrix
    conf_mat = confusion_matrix(labels, predictions)
    print(conf_mat)
    import seaborn
    seaborn.heatmap(conf_mat)
    labels_desc = [1, 0]
    # `labels` is passed by keyword: current scikit-learn releases no longer
    # accept it positionally. Argument order (predictions first) is kept from
    # the original, so this matrix is the transpose of `conf_mat`'s layout.
    cm = confusion_matrix(predictions, labels, labels=labels_desc)
    print_cm(cm, labels_desc)

    # =============================================================================
    # Precision, Recall, F1-Score
    # =============================================================================
    import sklearn.metrics as metrics
    print(metrics.classification_report(labels, predictions))

    # =============================================================================
    # ROC curve
    # =============================================================================
    fpr, tpr, threshold = metrics.roc_curve(labels, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    import matplotlib.pyplot as plt
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
''' # 1. Kaplan Meier Survivor Function kmf = KaplanMeierFitter() T = data['dur'] C = data['evt'] kmf.fit(T, event_observed=C) fig1 = kmf.plot(title='Survivor Function, Drop Out') fig1.savefig('fig1.png') # 2. Nelson Aalen Cumulative Hazard Function naf = NelsonAalenFitter() naf.fit(T, event_observed=C) fig2 = naf.plot(title='Cumulative Hazard Function, Drop Out') fig2.savefig('fig2.png') # 3. Cox Proportional Hazard Model cph = CoxPHFitter() cph.fit(data, 'sex', event_col='evt') fig3 = cph.predict_survival_function(data).plot() fig3.savefig('fig3.png') ''' I couldn't make this one give me the result I wanted. The functioning Stata code is: stphplot, by(sex) nolntime and the resulting visualization is... ''' img = mpimg.imread('cph.png') imgplot = plt.imshow(img) plt.show()
train_data_df = \ pd.DataFrame(np.hstack((X_train_standardized, y_train)), columns=feature_names + ['time', 'status']) surv_model = CoxPHFitter() surv_model.fit(train_data_df, duration_col='time', event_col='status', show_progress=False, step_size=.1) sorted_y_test = np.sort(np.unique(y_test[:, 0])) if sorted_y_test[0] != 0: mesh_points = np.concatenate(([0.], sorted_y_test)) else: mesh_points = sorted_y_test surv = \ surv_model.predict_survival_function(X_test_standardized, mesh_points) surv = surv.values.T # --------------------------------------------------------------------- # compute c-index # if cindex_method == 'cum_haz': cum_haz = \ surv_model.predict_cumulative_hazard(X_test_standardized, sorted_y_test) cum_haz = cum_haz.values.T cum_hazard_scores = cum_haz.sum(axis=1) test_cindex = concordance_index(y_test[:, 0], -cum_hazard_scores, y_test[:, 1]) elif cindex_method == 'cum_haz_from_surv':
# Subjects that have not experienced the event yet (DEAD == 0).
censored_subjects = df.loc[df['DEAD'] == 0]
num_cs = len(censored_subjects)
print(num_cs)

# Add tailor made truck driven nnn km
d = [[1000, 10000, 0, 0, 0, 0, 2], [1000, 10000, 0, 1, 0, 0, 2],
     [1000, 10000, 0, 0, 1, 0, 2], [1000, 10000, 0, 1, 1, 0, 2]]
num_d = len(d)
dfn = pd.DataFrame(
    d, columns=["ID", "KM", "DEAD", "ENGINE", "MOUNTAIN", "CITY", "MONDAY"])
print(dfn)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
censored_subjects = pd.concat([censored_subjects, dfn], ignore_index=True)
print(censored_subjects)

unconditioned_sf = cph.predict_survival_function(censored_subjects)
print(unconditioned_sf)

from lifelines.utils import median_survival_times, qth_survival_times
predictions_75 = qth_survival_times(0.75, unconditioned_sf)
predictions_25 = qth_survival_times(0.25, unconditioned_sf)
predictions_50 = median_survival_times(unconditioned_sf)
print(predictions_50)

# One curve per subject (each column of `unconditioned_sf`).
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for f in unconditioned_sf:
    ax.plot(unconditioned_sf[f], alpha=.5, label=f)
#ax.legend()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
#print cancer['T'].unique() #print cancer['E'].unique() #cancer = cancer.dropna() # the '-1' term # refers to not adding an intercept column (a column of all 1s). # It can be added to the Fitter class. covMatrix = cancer.cov() cf = CoxPHFitter() cf.fit(covMatrix, 'T', event_col= 'E') #extra paramater for categorical , strata=catVar cf.print_summary() curve = cf.predict_survival_function(cancer) curve.plot() plt.show() print "hazard coeff",cf.hazards_ print "baseline ", cf.baseline_hazard_ ''' scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3) print scores print np.mean(scores) print np.std(scores) '''
pred_surv = [] for i in range(y_pred.shape[0]): pred_surv.append(np.interp(fu_time, times, y_pred[i, :])) pred_surv = np.array(pred_surv) (pred, actual) = calib_plot(fu_time, n_bins, pred_surv, data_test.time.as_matrix(), data_test.dead.as_matrix(), CB_color_cycle[3], 'Deepsurv', alpha=my_alpha, markersize=my_markersize, markertype='s') #mse_array[2, fu_time_i] = ((pred-actual)**2).mean() y_pred = cph.predict_survival_function(data_test) times = y_pred.index.values.astype('float64') y_pred = y_pred.as_matrix().transpose() pred_surv = [] for i in range(y_pred.shape[0]): pred_surv.append(np.interp(fu_time, times, y_pred[i, :])) pred_surv = np.array(pred_surv) (pred, actual) = calib_plot(fu_time, n_bins, pred_surv, data_test.time.as_matrix(), data_test.dead.as_matrix(), CB_color_cycle[2], 'Cox PH model', alpha=my_alpha, markersize=my_markersize,
# Drop the last feature column from both design matrices and from the name list.
X_train_std = X_train_std[:, :-1]
X_test_std = X_test_std[:, :-1]
feature_names = feature_names[:-1]

# Training frame: standardized features followed by the two label columns.
train_columns = feature_names + ['time', 'status']
train_data_df = pd.DataFrame(np.hstack((X_train_std, y_train)),
                             columns=train_columns)

surv_model = CoxPHFitter()
surv_model.fit(
    train_data_df,
    duration_col='time',
    event_col='status',
    show_progress=False,
    step_size=.1,
)
# Rows of `surv_df` are the requested times; transpose to one row per sample.
surv_df = surv_model.predict_survival_function(X_test_std, sorted_y_test)
surv = surv_df.values.T

print()
print('[Test data statistics]')
observed_times = y_test[:, 0]
sorted_y_test_times = np.sort(observed_times)
num_test = len(sorted_y_test_times)
print('Quartiles:')
print('- Min observed time:', np.min(observed_times))
print('- Q1 observed time:', sorted_y_test_times[int(0.25 * num_test)])
print('- Median observed time:', np.median(observed_times))
print('- Q3 observed time:', sorted_y_test_times[int(0.75 * num_test)])
print('- Max observed time:', np.max(observed_times))
print('Mean observed time:', np.mean(observed_times))
# Column 1 of y_test is averaged as the observed-event indicator.
print('Fraction censored:', 1. - np.mean(y_test[:, 1]))
# Organize the data: data.loc[data.status == 1, 'dead'] = 0 data.loc[data.status == 2, 'dead'] = 1 data.head() # Fit data into our object: kmf.fit(durations=data["time"], event_observed=data["dead"]) # Get the event table: kmf.event_table # Get required columns from the data: data = data[[ 'time', 'age', 'sex', 'ph.ecog', 'ph.karno', 'pat.karno', 'meal.cal', 'wt.loss', 'dead' ]] # Get the summary using CoxPHFitter: cph = CoxPHFitter() cph.fit(data, "time", event_col="dead") cph.print_summary() # Plot the result on graph: cph.plot() data.iloc[10:15, :] # Plotting the data: d_data = data.iloc[10:15, :] cph.predict_survival_function(d_data).plot()
# Standardize the two remaining radiomic features with fixed offsets and scales.
c = (wavelet_HLL_glszm_LargeAreaHighGrayLevelEmphasis - 12543870000) / 14860100000
d = (wavelet_LLL_gldm_LargeDependenceHighGrayLevelEmphasis - 27874.14) / 11933.33

## rad_score calculation: fixed linear combination of the four scaled features
rad_score = a * (-1.3008) + b * 0.6083 + c * (-0.4295) + d * 0.3595

## build a one-row DataFrame with the test patient's information
test_patient = pd.DataFrame(
    [(rad_score, age, FVC, LDH_rate)],
    columns=['rad_score', 'age', 'FVC<50', 'LDH_rate'],
)

## fit the Cox model
train = pd.read_csv('train.plus.rad_score_renew+HRCTscore.csv')
from lifelines import CoxPHFitter
cph = CoxPHFitter()
# keep only the duration, the event and the four covariates
model_columns = [
    'Survival', 'CustomLabel', 'rad_score', 'age', 'FVC<50', 'LDH_rate'
]
feature_tr = train[model_columns]
cph.fit(feature_tr, duration_col='Survival', event_col='CustomLabel')
#cph.plot(hazard_ratios=True)
#cph.predict_median(feature_te)

# predicted survival of the test patient at time 24
cph.predict_survival_function(test_patient, 24)

# baseline hazard functions of the fitted model
cph.baseline_hazard_
cph.baseline_cumulative_hazard_
cph.baseline_survival_
# 4. We get the concordance. Our model has a concordance of .929 out of 1, so it’s a very good Cox model. We can use
# this to compare between models, kind of like accuracy in Logistic Regression.

# let's actually plot all of this to get a better picture
cph.plot()
cph.plot_covariate_groups('TotalCharges', values=[0,4000], cmap='coolwarm')
# you can see in the survival curve plot that customers that have Total charges closer to 0 are at a higher risk of
# churning compared to those with charges closer to 4000.

# now let's do some churn prediction now that we have some useful insights into what makes customers churn.
# let's take all the non-churners, as we can't retain those who have already churned; these are called
# censored_subjects, sticking to Survival Analysis lingo.
censored_subjects = data.loc[data['Churn_Yes'] == 0]

# now we can predict their unconditioned survival curves
unconditioned_sf = cph.predict_survival_function(censored_subjects)
# these are unconditioned because we will predict some churn before the customer's current tenure time.

# let's condition the above prediction: divide each curve by its own value at the customer's current tenure and cap
# the result at 1. `Series.clip_upper` was removed in pandas 1.0; `clip(upper=1)` is the equivalent call.
conditioned_sf = unconditioned_sf.apply(lambda c: (c/c.loc[data.loc[c.name, 'tenure']]).clip(upper=1))

# now we can investigate customers to see how the conditioning has affected their survival over the baseline rate
subject = 12
unconditioned_sf[subject].plot(ls="--", color="#A60628", label="unconditioned")
conditioned_sf[subject].plot(color="#A60628", label="conditioned on $T>58$")
plt.legend()
# we can see that cust 12 is still a customer after 58 months, which means cust 12's survival curve drops slower than
# the baseline for similar custs without that condition.

# the predict_survival_function has created a matrix of survival probabilities for each remaining customer at each
# point in time. what we need to do now is use that to select a single value as prediction for how long a customer
def get_surv_curv(data, player):
    """Plot one player's predicted survival curve against the baseline curve.

    A Cox model is fitted on ``data`` (duration column ``'NBA_Experience'``,
    event column ``'active'``) on every call. The returned Plotly figure has
    two traces, the model's baseline survival and the survival function
    predicted for ``player``, plus a text annotation with the percentile rank
    of the player's predicted expectation among all rows of ``data``.

    Args:
        data: data frame indexed by player, containing the duration, event
            and covariate columns.
        player: index label of the player to highlight.

    Returns:
        The ``go.Figure`` built from the two traces and the layout.
    """
    outcome_columns = ['NBA_Experience', 'active']

    model = CoxPHFitter()
    model.fit(data, duration_col='NBA_Experience', event_col='active')

    # Curves: baseline vs. the selected player.
    player_covariates = data.loc[[player]].drop(outcome_columns, axis=1)
    baseline_curve = model.baseline_survival_
    player_curve = model.predict_survival_function(player_covariates)

    # Percentile of the player's predicted expectation, shown as an
    # annotation on the graph.
    all_covariates = data.drop(outcome_columns, axis=1)
    expectation_rank = model.predict_expectation(all_covariates).rank(pct=True)
    player_rank = expectation_rank.loc[player]
    annotation_text = 'Career Length Prediction Percentile: ' + str(
        round(player_rank.values[0], 2))

    traces = [
        go.Scatter(name='League Average',
                   x=baseline_curve.index,
                   y=baseline_curve['baseline survival'].values,
                   marker={'color': "#253046"}),
        go.Scatter(name=player,
                   x=player_curve.index,
                   y=player_curve[player].values,
                   marker={'color': '#B35E3B'}),
    ]

    percentile_note = {
        'x': 13,
        'y': 0.78,
        'text': annotation_text,
        'showarrow': False,
        'font': {'size': 14, 'color': '#253046'},
    }
    figure_layout = go.Layout({
        "xaxis": {"title": "Years in the NBA", 'color': '#253046'},
        "yaxis": {"title": "Probability of remaining in the NBA",
                  'color': '#253046'},
        'paper_bgcolor': '#F8F3F1',
        'plot_bgcolor': '#F8F3F1',
        'margin': {'t': 50, 'r': 30},
        'annotations': [percentile_note],
        'legend': {'x': .8, 'y': 1, 'traceorder': 'normal'},
    })

    return go.Figure(data=traces, layout=figure_layout)
from lifelines import CoxPHFitter

print('Training Process finished')

# Evaluate the test data: reshape to (samples, time_steps, num_input) and run
# the softmax of the logits plus the accuracy op on the same feed.
x_test = x_test.reshape((x_test.shape[0], time_steps, num_input))
test_feed = {X: x_test, Y: y_test}
predicted_y = sess.run(tf.nn.softmax(logits), feed_dict=test_feed)
print("Test accuracy is:", sess.run(accuracy, feed_dict=test_feed))

# Survival analysis using the deep learning output as features.
# Only the test rows whose second label column equals 1 are kept.
increase_indices = np.where(y_test[:, 1] == 1)[0]
pre_increase_day = np.reshape(xx_test[increase_indices, 90],
                              (increase_indices.shape[0], 1))
pre_increase_x = np.reshape(predicted_y[increase_indices, :], (-1, 3))
# Every selected row is marked as an observed event.
event_col = [1] * pre_increase_x.shape[0]

# Column layout: 0 = duration, 1-3 = softmax outputs, 4 = event flag.
sur_data = np.column_stack((pre_increase_day, pre_increase_x, event_col))
df = pd.DataFrame(data=sur_data)
# Drop the first softmax output before fitting.
df = df.drop(columns=[1])

# Predict survival hazard
cph = CoxPHFitter()
cph.fit(df, duration_col=0, event_col=4)
cph.print_summary()  # access the results using cph.summary
cph.plot()

# NOTE: this rebinds X, which was used as a feed_dict key above.
X = df.drop(columns=[0, 4])
cph.predict_partial_hazard(X)
sur_pred = cph.predict_survival_function(X)
from lifelines import CoxPHFitter

# Kaplan-Meier estimate of the survival function
kmf = KaplanMeierFitter()
kmf.fit(duration, event_observed=not_censor)
kmf.survival_function_.plot()

# Cox-PH Model Regression
cf = CoxPHFitter()
cf.fit(data, duration_col='duration', event_col='event')
cf.print_summary()

## Get Predictions from Model ##

# 24 year old college grad
#college_24 = pd.DataFrame({'age':[24], 'college':[1]})
#cf.predict_survival_function(college_24).plot()

# 65 year old high school grad
#hs_65 = pd.DataFrame({'age':[65], 'college':[0]})
#cf.predict_survival_function(hs_65).plot()

# Predicted Survival for 24yr-old College Grad and 65yr-old HS Grad
# (plus a third "Average" profile); the profile labels become the row names.
mixed = pd.DataFrame(
    {'age': [24, 65, 42], 'college': [1, 0, .4]},
    index=pd.Index(
        ['24yr old College Grad', '65yr old HS Grad', 'Average'],
        name='index',
    ),
)
cf.predict_survival_function(mixed).plot()

# Plotting survival: title and axis labels for the figure drawn above
pl.title('Probability of Survival at Time t')
pl.xlabel('Time t')
pl.ylabel('Probability of Survival')

"""
cf.predict_survival_function without the .plot() option will return a
matrix-like object that has the probability of survival at time t.
"""
#scores.to_csv(r'T:\tbase\feature_importances.csv', quoting=csv.QUOTE_NONNUMERIC)

# =============================================================================
# #Cox Regression Model
# ======================================================
cph = CoxPHFitter(penalizer=0.1)  ## Instantiate the class to create a cph object
cph.fit(train_features,
        'tenure',
        event_col='Longterm_TransplantOutcome',
        show_progress=True,
        step_size=0.1)  ## Fit the data to train the model
cph.summary.to_csv(r'T:\tbase\cph_summary.csv')
print('concordance index: ' + str(cph.concordance_index_))

# Split the test frame into everything except the outcome column
# (model input) and the outcome column alone (ground truth).
tr_rows = test_features.loc[:, test_features.columns != 'Longterm_TransplantOutcome'].iloc[:, :]
tr_rows_res = test_features.loc[:, test_features.columns == 'Longterm_TransplantOutcome'].iloc[:, :]

# Predict once and reuse the result for both the plot and the table below
# (the prediction was previously computed twice on the same rows).
predictions = cph.predict_survival_function(tr_rows)
predictions.plot()
print(tr_rows_res)
# After the transpose: one row per test subject, one column per time point.
predictions = predictions.transpose()
#pd.DataFrame(predictions.columns).to_clipboard()

# =============================================================================
# #Qualitative error analysis
# Pick the first time point beyond 365*6.
# NOTE(review): col_use stays unbound if no column exceeds that threshold.
for col in predictions.columns:
    if float(col) > (365 * 6):
        col_use = col
        print(col_use)
        break
df_dummy.shape

# In[79]:

# Proportional hazards model: duration 'finalTerm', event indicator 'default'
cph = CoxPHFitter().fit(df_dummy, duration_col='finalTerm', event_col='default')
cph.print_summary()

# In[80]:

# First ten rows, used for a quick visual check of the predicted curves
tr_rows = df_dummy.iloc[:10]

# In[81]:

cph.predict_survival_function(tr_rows).plot()

# In[82]:

# Survival functions for every row, transposed so each row is one subject
preds = cph.predict_survival_function(df_dummy).transpose()

# In[87]:

# Attach the predicted survival probabilities to the original frame
# (column-wise concatenation, aligned on the index)
final = pd.concat([df, preds], axis='columns')

# In[88]:

final

# In[105]:
def _covariate_stats(cph, summary, norm_mean, name):
    """Return the eight reported statistics for covariate ``name``.

    Order: coefficient, standard error, hazard ratio, lower and upper bound
    of exp(coef -/+ 1.96 * se), p-value, z-value, normalisation mean.
    """
    coef = cph.params_[name]
    se = cph.standard_errors_[name]
    return (
        coef,
        se,
        np.exp(coef),
        np.exp(coef - 1.96 * se),
        np.exp(coef + 1.96 * se),
        summary.p[name],
        summary.z[name],
        norm_mean[name],
    )


def compute_coxhr(endpoint, df, lag, nindivs, res_writer):
    """Fit a weighted Cox model and write one result row to ``res_writer``.

    Args:
        endpoint: object exposing ``NAME`` and ``SEX``; a non-missing ``SEX``
            marks the endpoint as sex-specific, in which case the ``female``
            covariate is dropped before fitting.
        df: data frame with columns ``duration``, ``death``, ``weight`` and
            the covariates ``endpoint``, ``BIRTH_TYEAR`` and ``female``.
        lag: ``None`` or a ``(min_lag, max_lag)`` pair. With ``None`` the
            absolute risk is evaluated at ``STUDY_ENDS - STUDY_STARTS``,
            otherwise at ``max_lag``.
        nindivs: value copied unchanged into the output row.
        res_writer: object with a ``writerow`` method.

    The written row contains, in order: endpoint name, lag value, nindivs,
    absolute risk, the eight statistics of ``endpoint``, of ``BIRTH_TYEAR``
    and of ``female`` (all NaN for sex-specific endpoints), the baseline
    cumulative hazard at the prediction time, and the baseline cumulative
    hazard at ten fixed time points.
    """
    logger.info("Running Cox regression")

    # Handle sex-specific endpoints
    is_sex_specific = pd.notna(endpoint.SEX)
    if is_sex_specific:
        df = df.drop(columns=["female"])

    # Fit Cox model
    cph = CoxPHFitter()
    cph.fit(
        df,
        duration_col="duration",
        event_col="death",
        # For the case-cohort study we need weights and robust errors:
        weights_col="weight",
        robust=True)

    # Compute absolute risk for a fixed reference individual
    mean_indiv = {"BIRTH_TYEAR": [1959.0], "endpoint": [True], "female": [0.5]}
    if is_sex_specific:
        mean_indiv.pop("female")

    if lag is None:
        predict_at = STUDY_ENDS - STUDY_STARTS
        lag_value = None
    else:
        _min_lag, max_lag = lag
        predict_at = max_lag
        lag_value = max_lag

    surv_probability = cph.predict_survival_function(
        pd.DataFrame(mean_indiv), times=[predict_at]).values[0][0]
    absolute_risk = 1 - surv_probability

    # Get values out of the fitted model. The summary table is fetched once
    # and shared by all covariates instead of being rebuilt on every access.
    norm_mean = cph._norm_mean
    summary = cph.summary
    endp_stats = _covariate_stats(cph, summary, norm_mean, "endpoint")
    year_stats = _covariate_stats(cph, summary, norm_mean, "BIRTH_TYEAR")
    if not is_sex_specific:
        sex_stats = _covariate_stats(cph, summary, norm_mean, "female")
    else:
        # "female" is not in the model: report NaN for all eight statistics.
        sex_stats = (np.nan,) * 8

    # Save the baseline cumulative hazard (bch)
    df_bch = cph.baseline_cumulative_hazard_
    baseline_cumulative_hazard = bch_at(df_bch, predict_at)
    bch_values = {}
    for time in BCH_TIMEPOINTS:
        bch_values[time] = bch_at(df_bch, time)

    # Save values
    res_writer.writerow([
        endpoint.NAME,
        lag_value,
        nindivs,
        absolute_risk,
        *endp_stats,
        *year_stats,
        *sex_stats,
        baseline_cumulative_hazard,
        bch_values[0],
        bch_values[2.5],
        bch_values[5],
        bch_values[7.5],
        bch_values[10],
        bch_values[12.5],
        bch_values[15],
        bch_values[17.5],
        bch_values[20],
        bch_values[21.99],
    ])
    logger.info("done running Cox regression")
# cph.print_summary() # access the results using cph.summary

# Validation
# _X_valid = _X_valid.drop(["Survival", "Event"], axis=1)

# Testing input after preprocessing: remove the duration and event columns
# from each of the five validation frames.
_X_valid_1 = _X_valid_1.drop(["Survival", "Event"], axis=1)
_X_valid_2 = _X_valid_2.drop(["Survival", "Event"], axis=1)
_X_valid_3 = _X_valid_3.drop(["Survival", "Event"], axis=1)
_X_valid_4 = _X_valid_4.drop(["Survival", "Event"], axis=1)
_X_valid_5 = _X_valid_5.drop(["Survival", "Event"], axis=1)
# _seq_pred_y_valid_1 = cph.predict_survival_function(_X_valid_1, np.arange(before_steps)).as_matrix()

# For validation frame k (k = 1..5) keep row k of the predicted survival
# matrix, i.e. the survival evaluated at time k for every subject of that
# frame. DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values
# returns the same array on every pandas version.
_pred_times = np.arange(before_steps + 1)
_seq_pred_y_valid = np.array([
    cph.predict_survival_function(_X_k, _pred_times).values[_k, :]
    for _k, _X_k in enumerate(
        (_X_valid_1, _X_valid_2, _X_valid_3, _X_valid_4, _X_valid_5),
        start=1)
])
# After the transpose: one row per subject, one column per step.
_seq_pred_y_valid = _seq_pred_y_valid.transpose()

thrld_score = dict()
# Sweep the survival threshold from 1.0 downwards in steps of 0.01.
for sur_thrld_valid in np.arange(1.0, 0.0, -0.01):
    yy = []
    pp = []
    # Binarise: 1 where the predicted survival exceeds the threshold.
    seq_pred_y_valid = np.zeros((_seq_pred_y_valid.shape[0], before_steps))
    seq_pred_y_valid[np.where(
        _seq_pred_y_valid[:, :before_steps] > sur_thrld_valid)] = 1
    # Number of matches with the labels, counted per step (column).
    early_correct = np.sum(seq_pred_y_valid == batch_y_valid, axis=0)
#loan_grade = grading("what is the loan grade? Choose from the following Grade 'A','B','C','D','E','F','G'" )
# NOTE(review): loan_grade is used below but its prompt above is commented
# out -- confirm it is assigned earlier.
ownership = home(
    "Loanholder's house ownership. Is it 'Mortgage', 'Own', 'Rent' or 'Any'?")

# One-row covariate frame for the loan being assessed.
d = {
    'annual_inc': inc,
    'loan_amnt': loan,
    'delinq_2yrs': delinq_2yrs,
    'Grade': loan_grade
}
d.update(ownership)
Test = pd.DataFrame([d])

# Predict the survival function once and derive all three tables from it
# (it was previously recomputed for each of them).
surv_fn = cf.predict_survival_function(Test)
survivalProb = pd.DataFrame(np.array(surv_fn))
time = pd.DataFrame(surv_fn.index.values).rename(columns={0: 'Time'})
# Computed as 1 - S(t), i.e. the complement of the survival probability.
hazard = 1 - pd.DataFrame(surv_fn)

payment_term = get_non_negative_int('What is the payment term? ')
payments = get_non_negative_int('What are the series of loan payments? ')

# Element-wise time * survival probability * payment term, truncated to int.
# Multiplying the two frames directly aligns on column labels ('Time' vs 0),
# which yields only NaN and makes astype(int) fail, so the product is taken
# on the underlying arrays instead.
ExpTerms = pd.DataFrame(
    time['Time'].values * survivalProb[0].values * payment_term).astype(int)


def npv(rate, cashflows):
    """Return the net present value of ``cashflows`` discounted at ``rate``.

    The cash flow at position ``i`` is divided by ``(1 + rate) ** i``, so
    the first cash flow is not discounted. Returns 0.0 for an empty sequence.
    """
    return sum(
        (cashflow / (1 + rate)**i for i, cashflow in enumerate(cashflows)),
        0.0)
""" # print cancer['T'].unique() # print cancer['E'].unique() # cancer = cancer.dropna() # the '-1' term # refers to not adding an intercept column (a column of all 1s). # It can be added to the Fitter class. covMatrix = cancer.cov() cf = CoxPHFitter() cf.fit(covMatrix, "T", event_col="E") # extra paramater for categorical , strata=catVar cf.print_summary() curve = cf.predict_survival_function(cancer) curve.plot() plt.show() print "hazard coeff", cf.hazards_ print "baseline ", cf.baseline_hazard_ """ scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3) print scores print np.mean(scores) print np.std(scores) """
lis = [50,140,380,500] #plt.plot(cph.predict_survival_function(new_final.iloc[lis,:]),label = '1') plt.plot(cph.predict_survival_function(new_final.iloc[50:51,:]),label = 'engine 1') plt.plot(cph.predict_survival_function(new_final.iloc[140:141,:]),label = 'engine 2') plt.plot(cph.predict_survival_function(new_final.iloc[380:381,:]),label = 'engine 3') plt.plot(cph.predict_survival_function(new_final.iloc[500:501,:]),label = 'engine 4') plt.xlabel('Remaining Life Cycle') plt.ylabel('Probability') plt.legend() plt.axvline(x=150) plt.grid() ''' # randomize 10 new_lis = [18,20,55,61,86,124,168,229,260,269,362,387,390,437,458,\ 519,530,618,656,667] mat = cph.predict_survival_function(new_final.iloc[new_lis, :]) mat = mat.round(decimals=4) for xxx in [60, 71, 159, 197, 208]: print('%g \n' % xxx) print(mat[xxx].iloc[np.where( np.logical_and(mat[xxx] > 0.48, mat[xxx] < 0.52))]) for bbb in new_lis: print('%g' % bbb) print(final.iloc[bbb, :].time_in_cycle) print('\n') plt.scatter(a, b, label='real engine life cycle') plt.scatter(a, c, label='Predicted to Breakdown Soon')
# Fit the Cox model and record the time elapsed since `tic`
surv_model = CoxPHFitter()
surv_model.fit(
    train_data_df,
    duration_col='time',
    event_col='status',
    show_progress=False,
    step_size=.1,
)
elapsed = time.time() - tic
print('Time elapsed: %f second(s)' % elapsed)
# Stored as a 1x1 array so the file holds a single value
np.savetxt(time_elapsed_filename, np.reshape(elapsed, (1, -1)))

# ---------------------------------------------------------------------
# evaluation
#
# y_test column 0 is passed as the durations and column 1 as the events
test_durations = y_test[:, 0]
test_events = y_test[:, 1]

# Survival curves evaluated at the unique test durations
sorted_y_test = np.unique(test_durations)
surv_df = surv_model.predict_survival_function(X_test_std, sorted_y_test)
surv = surv_df.values.T

ev = EvalSurv(surv_df, test_durations, test_events, censor_surv='km')
cindex_td = ev.concordance_td('antolini')
print('c-index (td):', cindex_td)

# Concordance index computed on the negated log partial hazard
linear_predictors = surv_model.predict_log_partial_hazard(X_test_std)
cindex = concordance_index(test_durations, -linear_predictors, test_events)
print('c-index:', cindex)

# Integrated Brier score on 100 evenly spaced points between the smallest
# and largest unique test duration
time_grid = np.linspace(sorted_y_test[0], sorted_y_test[-1], num=100)
integrated_brier = ev.integrated_brier_score(time_grid)
print('Integrated Brier score:', integrated_brier, flush=True)