def CPH_bootstrap(fp, num=False, sub=None):
    """Compute a CPH model's concordance index with bootstrapping.

    :param fp: (str) filename of the CSV of selected features (also used
        to derive the aggregation-method name printed in the summary)
    :param num: (bool) set True to also include number of mets (col 42)
    :param sub: (DataFrame or None) for sub-analysis; if given, this frame
        is used directly and ``fp`` is not loaded
    :return: (str) summary line "name CPH median (lower-upper)"
    """
    df = sub if sub is not None else pd.read_csv(fp, index_col=0)

    # Configure bootstrap: 100 resamples, each drawing 50% of the rows.
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    # Collect the c-index of each bootstrap resample.
    metrics = []
    for _ in range(n_iterations):
        # Columns :20 are features, 40/41 are Time/Event; 42 is NumMets.
        cols = np.r_[:20, 40, 41, 42] if num else np.r_[:20, 40, 41]
        sample = resample(df.iloc[:, cols], n_samples=n_size)
        # Negate hazards: higher partial hazard means shorter survival.
        cph = CoxPHFitter().fit(sample, 'Time', 'Event')
        score = concordance_index(sample['Time'],
                                  -cph.predict_partial_hazard(sample),
                                  sample['Event'])
        metrics.append(score)

    # 95% percentile confidence interval, clipped to the valid [0, 1] range.
    alpha = 0.95
    lower = max(0.0, np.percentile(metrics, ((1.0 - alpha) / 2.0) * 100))
    upper = min(1.0, np.percentile(metrics, (alpha + (1.0 - alpha) / 2.0) * 100))
    med = np.percentile(metrics, 50)

    # Aggregation method name comes from the file-name prefix.
    name = fp.split('/')[-1].split('_')[0]
    if num:
        name += ' + NumMets'

    # BUG FIX: the original ended with ``return print(...)``, which always
    # returns None while the docstring promised a string. Print the same
    # summary line AND return it.
    result = '%s CPH %.3f (%.3f-%.3f)' % (name, med, lower, upper)
    print(result)
    return result
def Cox_Model(train, test):
    """Fit a penalized Cox proportional-hazards model and score both splits.

    :param train: training survival DataFrame (duration/event columns below)
    :param test: held-out survival DataFrame with the same columns
    :return: (train c-index, test c-index, fitted CoxPHFitter)
    """
    cph = CoxPHFitter(penalizer=15)
    cph.fit(train, duration_col='生存时间(天)', event_col='是否死亡',
            show_progress=True, step_size=1)

    def _cindex(frame):
        # Negate hazards: higher partial hazard means shorter survival.
        return concordance_index(frame['生存时间(天)'],
                                 -cph.predict_partial_hazard(frame),
                                 frame['是否死亡'])

    return _cindex(train), _cindex(test), cph
def cox_Proportional_hazard_model():
    """Fit a standard Cox PH model on the module-level ``data_train`` and
    print the concordance index for the train and test sets."""
    print('---------------------------------------')
    print('Standard Cox proportional hazards model')
    print('---------------------------------------')

    cph = CoxPHFitter()
    cph.fit(data_train, duration_col='duration_d', event_col='CVD',
            show_progress=True)

    # Model discrimination on each split (negated hazard as the score).
    for label, frame in (('train', data_train), ('test', data_test)):
        hazard = cph.predict_partial_hazard(frame)
        cindex = concordance_index(frame.duration_d, -hazard, frame.CVD)
        print("\n" + label + " data c-index = " + str(cindex))
def test_concordance_index_fast_is_same_as_slow():
    """Check the fast c-index implementation against the slow reference,
    on synthetic data and on a fitted Cox model's hazards."""
    n = 100
    times = np.random.normal(size=n)
    preds = np.random.normal(size=n)
    events = np.random.choice([0, 1], size=n)

    def _agree(t, p, e):
        return slow_cindex(t, p, e) == fast_cindex(t, p, e)

    # Degenerate cases: constant and perfectly-correlated predictions.
    assert _agree(times, np.zeros_like(times), events)
    assert _agree(times, times, events)
    # The substantive comparison: random predictions.
    assert _agree(times, preds, events)

    # Repeat on real model output from the rossi dataset.
    model = CoxPHFitter()
    rossi = load_rossi()
    model.fit(rossi, duration_col="week", event_col="arrest")
    covariates = rossi[rossi.columns.difference(["week", "arrest"])]
    t = model.durations.values.ravel()
    p = -model.predict_partial_hazard(covariates).values.ravel()
    e = model.event_observed.values.ravel()
    assert _agree(t, p, e)
def trainCox(dataroot='./data/TCGA_GBMLGG/', ckpt_name='./checkpoints/surv_15_cox/', model='cox_omic', penalizer=1e-4):
    """Cox PH baseline trained over the 15 PNAS train/test splits.

    :param dataroot: directory containing 'pnas_splits.csv' and the dataset
        consumed by ``getCleanAllDataset``
    :param ckpt_name: checkpoint root; pickled per-split predictions are
        written under ``ckpt_name/model``
    :param model: feature-set key (see ``model_feats`` below)
    :param penalizer: l2 penalizer passed to ``CoxPHFitter``
    """
    ### Creates Checkpoint Directory
    if not os.path.exists(ckpt_name): os.makedirs(ckpt_name)
    if not os.path.exists(os.path.join(ckpt_name, model)): os.makedirs(os.path.join(ckpt_name, model))
    ### Load PNAS Splits (one column per split, values 'Train'/'Test')
    pnas_splits = pd.read_csv(dataroot + 'pnas_splits.csv')
    pnas_splits.columns = ['TCGA ID'] + [str(k) for k in range(1, 16)]
    pnas_splits.index = pnas_splits['TCGA ID']
    pnas_splits = pnas_splits.drop(['TCGA ID'], axis=1)
    ### Loads Data
    ignore_missing_moltype = True if model in ['cox_omic', 'cox_moltype', 'cox_grade+moltype', 'all'] else False
    ignore_missing_histype = True if model in ['cox_histype', 'cox_grade', 'cox_grade+moltype', 'all'] else False
    all_dataset = getCleanAllDataset(dataroot=dataroot, ignore_missing_moltype=ignore_missing_moltype, ignore_missing_histype=ignore_missing_histype)[1]
    # For 'cox_omic'/'cox_all' these lists are the columns to DROP (use all
    # remaining features); for the other keys they are the columns to KEEP.
    # NOTE(review): model='all' appears above but has no model_feats key —
    # it would raise KeyError in the loop; confirm intended.
    model_feats = {
        'cox_omic': ['TCGA ID', 'Histology', 'Grade', 'Molecular subtype', 'Histomolecular subtype'],
        'cox_moltype': ['Survival months', 'censored', 'codeletion', 'idh mutation'],
        'cox_histype': ['Survival months', 'censored', 'Histology'],
        'cox_grade': ['Survival months', 'censored', 'Grade'],
        'cox_grade+moltype': ['Survival months', 'censored', 'codeletion', 'idh mutation', 'Grade'],
        'cox_all': ['TCGA ID', 'Histomolecular subtype']
    }
    cv_results = []
    for k in pnas_splits.columns:
        # Patients in this split that are also present in the clean dataset.
        pat_train = list(set(pnas_splits.index[pnas_splits[k] == 'Train']).intersection(all_dataset.index))
        pat_test = list(set(pnas_splits.index[pnas_splits[k] == 'Test']).intersection(all_dataset.index))
        feats = all_dataset.columns.drop(model_feats[model]) if model == 'cox_omic' or model == 'cox_all' else model_feats[model]
        train = all_dataset.loc[pat_train]
        test = all_dataset.loc[pat_test]
        cph = CoxPHFitter(penalizer=penalizer)
        cph.fit(train[feats], duration_col='Survival months', event_col='censored', show_progress=False)
        # C-index on the held-out split; hazards negated so that larger
        # values mean longer predicted survival.
        cin = concordance_index(test['Survival months'], -cph.predict_partial_hazard(test[feats]), test['censored'])
        cv_results.append(cin)
        # NOTE(review): prediction below uses the FULL train/test frames,
        # not train[feats]/test[feats] — confirm intended.
        train.insert(loc=0, column='Hazard', value=-cph.predict_partial_hazard(train))
        test.insert(loc=0, column='Hazard', value=-cph.predict_partial_hazard(test))
        pickle.dump(train, open(os.path.join(ckpt_name, model, '%s_%s_pred_train.pkl' % (model, k)), 'wb'))
        pickle.dump(test, open(os.path.join(ckpt_name, model, '%s_%s_pred_test.pkl' % (model, k)), 'wb'))
    # Persist all split c-indices and report the average.
    pickle.dump(cv_results, open(os.path.join(ckpt_name, model, '%s_results.pkl' % model), 'wb'))
    print("C-Indices across Splits", cv_results)
    print("Average C-Index: %f" % CI_pm(cv_results))
print('Training Process finished') # evaluate our test data x_test = x_test.reshape((x_test.shape[0], time_steps, num_input)) predicted_y = sess.run(tf.nn.softmax(logits), feed_dict={X: x_test, Y: y_test}) print("Test accuracy is:", sess.run(accuracy, feed_dict={X: x_test, Y: y_test})) # Survival analysis using deep learning output as features increase_indices = np.where(y_test[:,1] == 1)[0] pre_increase_day = np.reshape(xx_test[increase_indices,90], (increase_indices.shape[0],1)) pre_increase_x = np.reshape(predicted_y[increase_indices,:], (-1,3)) event_col = [1]*pre_increase_x.shape[0] sur_data = np.column_stack((pre_increase_day, pre_increase_x)) sur_data = np.column_stack((sur_data, event_col)) df = pd.DataFrame(data=sur_data) df = df.drop([1], axis=1) # predict survival hazard from lifelines import CoxPHFitter cph = CoxPHFitter() cph.fit(df, duration_col=0, event_col=4) cph.print_summary() # access the results using cph.summary cph.plot() X = df.drop([0, 4], axis=1) cph.predict_partial_hazard(X) sur_pred=cph.predict_survival_function(X)
print(x_train.shape, x_val.shape) # special for Cox xy_train_df = pd.DataFrame(x_train) xy_train_df['T'] = y_train xy_train_df['E'] = e_train xy_val_df = pd.DataFrame(x_val) xy_val_df['T'] = y_val xy_val_df['E'] = e_val xy_val_df_events = xy_val_df[xy_val_df['E'] == 1] cph = CoxPHFitter(penalizer=0.1).fit(xy_train_df, 'T', 'E') preds = -cph.predict_partial_hazard(xy_val_df) cindex_train = cph.score(xy_train_df, scoring_method='concordance_index') cindex_val = cph.score(xy_val_df, scoring_method='concordance_index') cindex_val_events = cph.score(xy_val_df_events, scoring_method='concordance_index') cph_cindex_trains.append(cindex_train) cph_cindex_vals.append(cindex_val) cph_cindex_vals_events.append(cindex_val_events) ps.append(p_to_drop) print('Train cindex {:.2f}'.format(cindex_train * 100)) print('Test cindex {:.2f}'.format(cindex_val * 100)) print('Test cindex Events Only {:.2f}'.format(cindex_val_events * 100)) print(
# In[ ]:
import matplotlib  # BUG FIX: was "import matplotlip", which raises ImportError
from matplotlib import pyplot as plt
import lifelines
from lifelines import KaplanMeierFitter  # survival analysis library
from lifelines.statistics import logrank_test  # survival statistical testing
from lifelines import CoxPHFitter

df['churn'] = df1.fuga

# Fit the Cox model. BUG FIX: lifelines expects COLUMN NAMES (strings) for
# duration_col/event_col, not Series; 'enddt'/'FUGA' are the columns
# dropped from the covariates below, so those are the intended names.
cph = CoxPHFitter()
cph.fit(df, duration_col='enddt', event_col='FUGA', show_progress=True)
cph.print_summary()
cph.plot()

# In[ ]:
# Covariates only (duration and event columns removed).
df_2 = df.drop(['enddt', 'FUGA'], axis=1)
cph.predict_partial_hazard(df_2)
cph.predict_survival_function(df_2, times=[5., 25., 50.])
# BUG FIX: was ``cph.predict_median(X)`` with ``X`` undefined (NameError);
# predict on the same covariate frame.
cph.predict_median(df_2)

kmf = KaplanMeierFitter()
T = df['time_to_fuga']  # duration
C = df['churn']  # censorship - 1 if death/churn is seen, 0 if censored
def train_cox(x_train0, ix_in, y_per_pt, y_int, metric = 'auc', feature_grid = None):
    """Nested-CV lasso-Cox model: tune the l1 penalizer on inner
    leave-one-out folds, refit on the outer training split, and predict
    the held-out patient.

    :param x_train0: feature DataFrame indexed by '<patient>-<timepoint>'
        strings, with 'week' (duration) and 'outcome' (event) columns
    :param ix_in: (train_index, test_index) pair for the outer split
    :param y_per_pt: per-patient label ('Cleared' marks censored patients)
    :param y_int: integer labels indexed like ``x_train0`` (class weights)
    :param metric: which inner score selects the penalizer ('auc' or 'ci')
    :param feature_grid: candidate penalizer values; defaults to
        ``np.logspace(7, 20, 14)``
    :return: dict with the fitted model, survival summary, survival
        function, probabilities and hazards; empty dict if the final fit fails
    """
    if feature_grid is None:
        feature_grid = np.logspace(7, 20, 14)
    survival = {}
    # for ic_in, ix_in in enumerate(ix_inner):
    train_index, test_index = ix_in
    x_train, x_test = x_train0.iloc[train_index, :], x_train0.iloc[test_index, :]
    lamb_dict = {}
    lamb_dict['auc'] = {}
    lamb_dict['ci'] = {}
    # Inner loop: score each candidate penalizer on 10 random LOO folds.
    for il, lamb in enumerate(feature_grid):
        ix_inner2 = leave_one_out_cv(x_train, x_train['outcome'], ddtype='all_data')
        ix_rand_samp = np.random.choice(np.arange(len(ix_inner2)), 10, replace=False)
        ix_inner2_samp = np.array(ix_inner2, dtype='object')[ix_rand_samp]
        # ix_inner2_rand_samp = np.random.choice(ix_inner2, 10, replace = False)
        counter = 0  # counts inner folds whose fit failed
        start = time.time()
        hazards = []
        event_times = []
        event_outcomes = []
        probs_in = []
        true = []
        model = CoxPHFitter(penalizer=lamb, l1_ratio=1.)  # l1_ratio=1: pure lasso penalty
        for ic_in2, ix_in2 in enumerate(ix_inner2_samp):
            start_inner = time.time()
            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
            # Index format '<patient>-<timepoint>': take the timepoint part.
            tmpts_in = [xx.split('-')[1] for xx in x_tr2.index.values]
            samp_weights = get_class_weights(np.array(y_int[x_tr2.index.values]), tmpts_in)
            samp_weights[samp_weights <= 0] = 1  # guard against non-positive weights
            x_tr2.insert(x_tr2.shape[1], 'weights', samp_weights)
            try:
                model.fit(x_tr2, duration_col='week', event_col='outcome', weights_col='weights', robust=True, show_progress = False)
            except:
                # NOTE(review): bare except hides convergence AND programming errors.
                counter += 1
                continue
            pred_f = model.predict_survival_function(x_ts2.iloc[0, :])
            # Event probability by week 4: 1 - S(4).
            probs_in.append(1 - pred_f.loc[4.0].item())
            true.append(x_ts2['outcome'].iloc[-1])
            hazard = model.predict_partial_hazard(x_ts2)
            hazards.append(hazard)
            event_times.append(x_ts2['week'])
            event_outcomes.append(x_ts2['outcome'])
            end_inner = time.time()
            # print('Inner ix ' + str(ic_in2) + ' complete in ' + str(end_inner - start_inner))
        # if metric == 'CI':
        try:
            score = concordance_index(pd.concat(event_times), pd.concat(hazards), pd.concat(event_outcomes))
            lamb_dict['ci'][lamb] = score
            end_t = time.time()
            print(str(il) + ' complete')
            print((end_t - start)/60)
        except:
            print('No score available')
            continue  # NOTE(review): also skips the AUC computation below
        # elif metric == 'auc':
        try:
            score = sklearn.metrics.roc_auc_score(true, probs_in)
            lamb_dict['auc'][lamb] = score
        except:
            continue
    # Pick the penalizer with the best inner score for the chosen metric.
    lambdas, aucs_in = list(zip(*lamb_dict[metric].items()))
    ix_max = np.argmax(aucs_in)
    best_lamb = lambdas[ix_max]
    # Refit on the full outer training split with the selected penalizer.
    model_out = CoxPHFitter(penalizer=best_lamb, l1_ratio=1.)
    tmpts_in = [xx.split('-')[1] for xx in x_train.index.values]
    samp_weights = get_class_weights(np.array(y_int[x_train.index.values]), tmpts_in)
    samp_weights[samp_weights<=0] = 1
    x_train.insert(x_train.shape[1], 'weights', samp_weights)
    x_train['weights'] = samp_weights  # NOTE(review): redundant with the insert above
    try:
        model_out.fit(x_train, duration_col='week', event_col='outcome', weights_col='weights', robust=True)
    except:
        return {}
    pred_f = model_out.predict_survival_function(x_test.iloc[0, :])
    pt = x_test.index.values[0].split('-')[0]  # patient id of the held-out sample
    hazard_out = model_out.predict_partial_hazard(x_test)
    # NOTE(review): ``x`` is not defined in this function — presumably a
    # module-level frame of all samples; confirm.
    pts = [ii.split('-')[0] for ii in x.index.values]
    tmpts = [ii.split('-')[1] for ii in x.index.values]
    # if pt not in survival.keys():
    #     survival[pt] = {}
    ixs = np.where(np.array(pts) == pt)[0]
    # Last observed timepoint for this patient; '+' marks censoring.
    survival['actual'] = str(np.max([float(tmpt) for tmpt in np.array(tmpts)[ixs]]))
    if y_per_pt[pt] == 'Cleared':
        survival['actual'] = survival['actual'] + '+'
    probs_sm = 1 - pred_f.loc[4.0].item()
    y_pred_exp = model_out.predict_expectation(x_test.iloc[[0], :])
    survival['predicted'] = str(np.round(y_pred_exp.item(), 3))
    surv_func = pred_f
    # probs_df = pd.Series(probs_sm)
    # y_pp = y_per_pt.replace('Cleared', 0).replace('Recur', 1)
    # final_df = pd.concat([y_pp, probs_df], axis=1).dropna()
    final_dict = {}
    # final_dict['probability_df'] = final_df
    final_dict['model'] = model_out
    final_dict['survival'] = survival
    final_dict['survival_function'] = surv_func
    final_dict['prob_true'] = (probs_sm, y_per_pt[pt])
    final_dict['times_hazards_outcomes'] = (x_test['week'], hazard_out, x_test['outcome'])
    final_dict['lambdas'] = lamb_dict
    # final_dict['auc'] = sklearn.metrics.roc_auc_score(final_df[0], final_df[1])
    return final_dict
# NOTE(review): this chunk starts mid-call; the opening of the seaborn
# plotting call (with ``ax=...``) lies outside the visible range.
ax=sns_plot_product_ax)
sns_plot_product_ax.set_title('FDJ_J1 length vs product of complexity')
sns_plot_product_ax.set(xlabel='FDJ_J1 length in days', ylabel='product of complexity scaling')
sns_plot_product.savefig("..\\output\\FDJ_J1 length vs product of complexity.png")

# Histogram of FDJ_J1 durations.
sns_plot_product, sns_plot_product_ax = plt.subplots()
sns.distplot(FDJ_J1.dropna())
sns_plot_product_ax.set_title('FDJ_J1 length histogram')
sns_plot_product_ax.set(xlabel='FDJ_J1 length in days')
sns_plot_product.savefig("..\\output\\FDJ_J1 length histogram.png")

## l460
# Inspect L460 programs sorted by gate date (result is not assigned).
data[data['Program_Display_Name'].str.split(expand=True)[0] == 'L460'].sort_values(by='Actual_FC_Gate')

# Assemble the regression frame: duration, event flag (all observed),
# and the summed-complexity covariate.
regression_data = pd.DataFrame()
regression_data['FDJ_J1'] = FDJ_J1
regression_data['event'] = 1
regression_data['sumcomp'] = data_comp.total_sum_complexity
regression_data = regression_data.dropna()

## model
cph = CoxPHFitter()
cph.fit(regression_data, 'FDJ_J1', event_col='event')
cph.print_summary()
cph.predict_partial_hazard(regression_data[['sumcomp']])
survival = cph.predict_survival_function(regression_data[['sumcomp']])
# Survival curve for a hypothetical sumcomp value of 20 (overwrites above).
survival = cph.predict_survival_function(np.array([[20]]))
def surv_coxph(data_train, x_cols, duration_col, event_col, data_test=None, pt=None, show_extra=True):
    """Fit a Cox model (lifelines.CoxPHFitter) and report CI / AUC metrics.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Full survival data for train.
    x_cols : list of str
        Names of the covariate columns.
    duration_col : str
        Name of column indicating time.
    event_col : str
        Name of column indicating event.
    data_test : pandas.DataFrame, optional
        Full survival data for test, default None.
    pt : float, optional
        Predicted time for AUC.
    show_extra : bool, optional
        If True, also print per-covariate coefficients, p-values and 95% CIs.

    Returns
    -------
    object
        Object of cox model in `lifelines.CoxPHFitter`.

    Examples
    --------
    >>> surv_coxph(train_data, ['x1', 'x2'], 'T', 'E', test_data, pt=5*12)
    """
    y_cols = [event_col, duration_col]
    cph = CoxPHFitter()
    cph.fit(data_train[x_cols + y_cols], duration_col=duration_col, event_col=event_col, show_progress=True)
    # CI of train: hazards negated so higher score = longer survival.
    pred_X_train = cph.predict_partial_hazard(data_train[x_cols])
    pred_X_train.rename(columns={0: 'X'}, inplace=True)
    ci_train = concordance_index(data_train[duration_col], -pred_X_train, data_train[event_col])
    # AUC of train at pt.
    df = pd.concat([data_train[y_cols], pred_X_train], axis=1)
    roc_train = surv_roc(df, 'X', duration_col, event_col, pt=pt)
    if data_test is not None:
        # CI of test.
        pred_X_test = cph.predict_partial_hazard(data_test[x_cols])
        pred_X_test.rename(columns={0: 'X'}, inplace=True)
        ci_test = concordance_index(data_test[duration_col], -pred_X_test, data_test[event_col])
        # AUC of test at pt.
        df = pd.concat([data_test[y_cols], pred_X_test], axis=1)
        roc_test = surv_roc(df, 'X', duration_col, event_col, pt=pt)
    # Print Summary of CPH (Python 2 print statements below).
    cph.print_summary()
    print "__________Metrics CI__________"
    print "CI of train: %.4f" % ci_train
    if data_test is not None:
        print "CI of test : %.4f" % ci_test
    print "__________Metrics AUC__________"
    print "AUC of train: %.4f" % roc_train['AUC']
    if data_test is not None:
        print "AUC of test : %.4f" % roc_test['AUC']
    if not show_extra:
        return cph
    # Print per-covariate coefficients, p-values and confidence bounds.
    print "__________Summary of Coefficients in CPH__________"
    cols = ['coef', 'p', 'lower 0.95', 'upper 0.95']
    print cols[0], ":"
    for i in cph.summary.index:
        print "%.4f" % (cph.summary.loc[i, cols[0]])
    print "__________"
    print cols[1], ":"
    for i in cph.summary.index:
        print "%.4f" % (cph.summary.loc[i, cols[1]])
    print "__________"
    print "95% CI :"
    for i in cph.summary.index:
        print "[%.4f, %.4f]" % (cph.summary.loc[i, cols[2]], cph.summary.loc[i, cols[3]])
    return cph
## Train and Predict
if model_name == 'linearregression' or model_name == 'xgb':
    model.fit(X_train, Y_train)
    Predict = model.predict(X_test)
elif model_name == 'svmlin':
    model.fit(X_train, Y_train)
    # SVM: rank by signed distance to the separating hyperplane.
    Predict = model.decision_function(X_test)
elif model_name == 'coxregression':
    if data_name == 'maggic':
        model.fit(Train_All, duration_col='days_to_fu', event_col='death_all')
        Predict = model.predict_partial_hazard(X_test)
    # BUG FIX: original condition was ``data_name == 'heart_trans' or
    # 'heart_wait'``, which is ALWAYS true because the non-empty string
    # 'heart_wait' is truthy — it matched every dataset name.
    elif data_name in ('heart_trans', 'heart_wait'):
        # NOTE: the quoted column names "'Survival'"/"'Censor'" are literal
        # (quotes included) in the source data — kept byte-for-byte.
        model.fit(Train_All, duration_col="'Survival'", event_col="'Censor'")
        Predict = model.predict_partial_hazard(X_test)
else:
    # Default: probabilistic classifiers — score with P(class 1).
    model.fit(X_train, Y_train)
    Predict = model.predict_proba(X_test)[:, 1]

# Performance metrics for this (j, k) run.
AUC_ar[j][k] = metrics.roc_auc_score(Y_test, Predict)
AUPRC_ar[j][k] = metrics.average_precision_score(Y_test, Predict)
Cind_ar[j][k] = C_index(Y_test, Predict)
def __linear_small(self, is_death, train_data_path, basepath):
    """Cox analysis on the small (patient-level) dataset.

    ``is_death`` selects the endpoint: True -> survivaltime1/outcome1,
    False -> survivaltime2/outcome2. Screens each dummy-coded attribute
    with a univariate Cox fit (keep if p < 0.05), then runs a
    10000-iteration .632 bootstrap and 2000 rounds of 10-fold CV,
    writing the statistics to CSV files.
    """
    small_dataset_file = train_data_path
    small_dataset = pandas.read_csv(small_dataset_file, encoding='UTF-8', index_col=[0])
    # Drop identifier columns that must not enter the model.
    del small_dataset['patient_id']
    del small_dataset['name']
    # Dummy-variable handling: build a patsy formula, wrapping categorical
    # attributes in C(...).
    formular = ''
    classify_attr = {
        'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol',
        'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'HCV', 'anticoagulant',
        'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic', 'LipidD', 'CaPB', 'NCaPB',
        'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
        'hypertension', 'DM', 'cardiovasculardisease', 'cerebrovasculardisease',
        'bleeding', 'malignancy', 'ablocker', 'bblocker'
    }
    for column in small_dataset.columns:
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    formular = formular[:-1]  # strip the trailing '+'
    # '-1' suppresses the intercept column.
    small_dataset = patsy.dmatrix(formular + '-1', small_dataset, return_type='dataframe')
    if is_death:
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1', 'survivaltime2', 'outcome2')
        attr_file, p632_file, var_file, kfold_file = ('lm_significant_attrs.txt', 'lm_stats632.csv', 'lm_statvar.txt', 'lm_statskfold.csv')
        beta_file, p_file = ('lm_coef.csv', 'lm_p.csv')
    else:
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2', 'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = ('lm_significant_attrs_e.txt', 'lm_stats632_e.csv', 'lm_statvar_e.txt', 'lm_statskfold_e.csv')
        beta_file, p_file = ('lm_coef_e.csv', 'lm_p_e.csv')
    # Keep only the selected endpoint's time/event columns.
    del small_dataset[T_false]
    del small_dataset[E_false]
    # Univariate screening: keep attributes significant at p < 0.05.
    significant_attrs = list()
    for column in small_dataset.columns:
        # print('column', column)
        if column in {T_true, E_true}:
            continue
        subset = small_dataset[[column, T_true, E_true]]
        # print('subset', subset)
        try:
            cox = CoxPHFitter()
            cox.fit(subset, T_true, E_true)
            if cox.summary['p'][0] < 0.05:
                significant_attrs.append(column)
        except Exception:
            # Silently skip attributes whose univariate fit fails.
            continue
    # NOTE(review): written to the CWD, while __linear_big reads its attr
    # file from ``basepath`` — confirm the paths agree.
    # NOTE(review): 'output'/'input' below shadow builtins.
    output = open(attr_file, mode='w')
    for attr in significant_attrs:
        output.write(attr + '\n')
    output.close()
    input = open(attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_small ## sign_attr : %d' % len(significant_attrs))
    small_dataset = small_dataset[significant_attrs]
    # 10000 times .632 bootstrap
    count = 0
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    while count < 10000:
        # Linear (Cox) training on a bootstrap resample.
        try:
            train_set = small_dataset.take(numpy.random.randint(0, len(small_dataset), size=len(small_dataset)))
            # Out-of-bag rows form the test set (DataFrame.ix is deprecated pandas).
            test_set = small_dataset.ix[set(small_dataset.index).difference(set(train_set.index))]
            train_set.index = range(len(train_set))
            test_set.index = range(len(test_set))
            cox = CoxPHFitter()
            cox.fit(train_set, T_true, E_true)
            # In-sample c-index; hazards negated so higher = longer survival.
            train_cindex = concordance_index(cox.durations, -cox.predict_partial_hazard(cox.data).values.ravel(), cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            # test_set
            test_actual_T = test_set[T_true].copy()
            test_actual_E = test_set[E_true].copy()
            test_variable = test_set[test_set.columns.difference([T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # whole (small) dataset
            all_actual_T = small_dataset[T_true].copy()
            all_actual_E = small_dataset[E_true].copy()
            all_variable = small_dataset[small_dataset.columns.difference([T_true, E_true])]
            all_predictT = cox.predict_expectation(all_variable)
            try:
                test_cindex = concordance_index(test_actual_T, test_predictT, test_actual_E)
                all_cindex = concordance_index(all_actual_T, all_predictT, all_actual_E)
            except Exception:
                # Fall back to the uncensored form if event data is unusable.
                test_cindex = concordance_index(test_actual_T, test_predictT)
                all_cindex = concordance_index(all_actual_T, all_predictT)
            stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception:
            # NOTE(review): broad retry-on-any-error; a persistent failure
            # would spin forever.
            continue
    stats632_df = pandas.DataFrame(stats632, columns=['train', 'test', 'all'])
    stats632_df.to_csv(p632_file, encoding='UTF-8')
    statscoef_df = pandas.DataFrame(pandas.concat(statscoef, ignore_index=True))
    statscoef_df.to_csv(beta_file, encoding='UTF-8')
    statspvalue_df = pandas.DataFrame(pandas.concat(statspvalue, ignore_index=True))
    statspvalue_df.to_csv(p_file, encoding='UTF-8')
    # 2000 rounds of 10-fold cross-validation
    count = 0
    statskfold = list()
    while count < 2000:
        try:
            cox = CoxPHFitter()
            scores = k_fold_cross_validation(cox, small_dataset, T_true, E_true, 10)
            statskfold.append(scores)
            count += 1
            print('k-fold -> %d' % count)
        except Exception:
            continue
    statskfold_df = pandas.DataFrame(statskfold)
    statskfold_df.to_csv(basepath + "/" + kfold_file, encoding='UTF-8')
def __linear_big(self, is_death, train_data_path, basepath):
    """Cox analysis on the big (dialysis-session level) dataset.

    ``is_death`` selects the endpoint: True -> survivaltime1/outcome1,
    False -> survivaltime2/outcome2. Reads the CSV at ``train_data_path``,
    dummy-codes categorical attributes via a patsy formula, restricts to
    pre-computed significant attributes (read from ``basepath``), then
    runs a 10000-iteration .632-style bootstrap writing coefficient /
    p-value / c-index tables to CSV.
    """
    big_dataset_file = train_data_path
    big_dataset = pandas.read_csv(big_dataset_file, encoding='UTF-8', index_col=[0])
    # Drop identifier columns that must not enter the model.
    del big_dataset['patient_id']
    del big_dataset['name']
    del big_dataset['tx_id']
    # del big_dataset['tx_id.1']
    del big_dataset['tx_date']
    # Build a patsy formula, wrapping categorical attributes in C(...).
    formular = ''
    # classify_attr = {'subject', 'treat_item', 'vascular_access_type',
    #                  'dialysis_machine', 'reuse_times', 'anticoagulation_scope',
    #                  'anticoagulation', 'protamine', 'replacement_way',
    #                  'take_food', 'fluid_infusion', 'blood_pressure_pos',
    #                  'gender', 'smoking', 'highflux', 'payment', 'marital',
    #                  'alcohol', 'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb',
    #                  'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB',
    #                  'diuretic', 'LipidD', 'CaPB', 'NCaPB', 'VitD',
    #                  'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access',
    #                  'ESRDcause', 'hypertension', 'DM', 'cardiovasculardisease',
    #                  'cerebrovasculardisease', 'bleeding', 'malignancy',
    #                  'ablocker', 'bblocker'}
    classify_attr = {
        'subject', 'treat_item', 'vascular_access_type', 'dialysis_machine',
        'anticoagulation_scope', 'anticoagulation', 'protamine',
        'replacement_way', 'take_food', 'fluid_infusion', 'blood_pressure_pos',
        'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol',
        'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'HCV', 'anticoagulant',
        'EPO', 'CCB', 'ACEI', 'ARB', 'blocker', 'blocer', 'diuretic',
        'LipidD', 'CaPB', 'NCaPB', 'VitD', 'mucosaprotect', 'H2RA', 'PPI',
        'APUD', 'access', 'ESRDcause', 'hypertension', 'DM',
        'cardiovasculardisease', 'cerebrovasculardisease', 'bleeding',
        'malignancy'
    }
    # u'\xa6\xc2blocker'
    # NOTE(review): 'blocer' looks like a typo of 'blocker' — confirm the
    # actual column names.
    for column in big_dataset.columns:
        # print("column", column)
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    # print('formular:', formular)
    # Strip the trailing '+'; Python 2: the formula is unicode and is
    # encoded to bytes for patsy.
    formular = formular[:-1].encode('utf-8')
    # print('formular[:-1].type:', type(formular))
    # '-1' suppresses the intercept column.
    big_dataset = patsy.dmatrix(formular + '-1', big_dataset, return_type='dataframe')
    # print(type(big_dataset))
    # print(big_dataset.columns)
    # print('big_dataset:', big_dataset)
    if is_death:
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1', 'survivaltime2', 'outcome2')
        attr_file, p632_file, var_file, kfold_file = ('lb_significant_attrs.txt', 'lb_stats632.csv', 'lb_statvar.txt', 'lb_statskfold.csv')
        beta_file, p_file = ('lb_coef.csv', 'lb_p.csv')
    else:
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2', 'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = ('lb_significant_attrs_e.txt', 'lb_stats632_e.csv', 'lb_statvar_e.txt', 'lb_statskfold_e.csv')
        beta_file, p_file = ('lb_coef_e.csv', 'lb_p_e.csv')
    # Keep only the selected endpoint's time/event columns.
    del big_dataset[T_false]
    del big_dataset[E_false]
    significant_attrs = list()
    # Columns removed because they break the fit (per earlier errors).
    del big_dataset['k_concentration']
    del big_dataset['SDUFR_x']
    del big_dataset['SDUFR_y']
    del big_dataset['SDUFR_y_v']
    del big_dataset['protamine_c']
    del big_dataset['k_concentration_c']
    """如果已经挑选出了具有统计意义的风险因子则不需要执行以下验证风险因子统计学意义的片段 """
    # (Translation of the string above: if statistically significant risk
    # factors have already been selected, the screening fragment below
    # does not need to be run.)
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++
    # for column in big_dataset.columns:
    #     if column in {T_true, E_true}:
    #         continue
    #     subset = big_dataset[[column, T_true, E_true]]
    #     try:
    #         cox = CoxPHFitter()
    #         cox.fit(subset, T_true, E_true)
    #         print('p value:', cox.summary['p'][0])
    #         if cox.summary['p'][0] < 0.05:
    #             significant_attrs.append(column)
    #     except Exception:
    #         continue
    # output = open(basepath+"/"+attr_file, mode='w')
    # for attr in significant_attrs:
    #     output.write(attr + '\n')
    # output.close()
    #++++++++++++++++++++++++++++++++++++++++++++++++++++
    # Load the pre-computed significant attributes instead.
    input = open(basepath + "/" + attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_big ## sign_attr : %d' % len(significant_attrs))
    print(len(significant_attrs), T_true, E_true)
    big_dataset = big_dataset[significant_attrs]
    print(len(big_dataset.columns))
    # exit()
    # 10000 times .632 bootstrap
    count = 0
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    while count < 10000:
        print('count', count)
        try:
            # Previous 70/30 shuffle split kept for reference:
            # big_dataset = big_dataset.take(numpy.random.permutation(len(big_dataset)))
            # big_dataset.index = range(len(big_dataset))
            # percent = int(len(big_dataset) * 0.30)
            # train_set = big_dataset[:-percent]
            # test_set = big_dataset[-percent:]
            # train_set.index = range(len(train_set))
            # test_set.index = range(len(test_set))
            # NOTE(review): both sets are drawn independently WITHOUT
            # replacement, so this is not a true .632 bootstrap and the
            # sets can overlap — confirm intended.
            train_set = big_dataset.sample(1500, replace=False)
            test_set = big_dataset.sample(1500, replace=False)
            print('try fitting......', len(big_dataset), len(train_set), len(test_set))
            cox = CoxPHFitter()
            cox.fit(train_set, T_true, E_true)
            # In-sample c-index; hazards negated so higher = longer survival.
            train_cindex = concordance_index(cox.durations, -cox.predict_partial_hazard(cox.data).values.ravel(), cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            print('try predicting......')
            # test_set
            test_actual_T = test_set[T_true]
            test_actual_E = test_set[E_true]
            test_variable = test_set[test_set.columns.difference([T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # whole dataset
            all_actual_T = big_dataset[T_true]
            all_actual_E = big_dataset[E_true]
            all_variable = big_dataset[big_dataset.columns.difference([T_true, E_true])]
            all_predictT = cox.predict_expectation(all_variable)
            print('try cindexing......')
            try:
                test_cindex = concordance_index(test_actual_T, test_predictT, test_actual_E)
                all_cindex = concordance_index(all_actual_T, all_predictT, all_actual_E)
            except Exception:
                # Fall back to the uncensored form if event data is unusable.
                test_cindex = concordance_index(test_actual_T, test_predictT)
                all_cindex = concordance_index(all_actual_T, all_predictT)
            print(train_cindex, test_cindex, all_cindex)
            # Sample outputs observed during runs:
            # 0.5 0.5 0.5
            # 0.963726363744 0.965792024703 0.964552831227
            # 0.940458783243 0.939660104788 0.940145223899
            # 0.950570809577 0.946854258363 0.949067405671
            stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception as e:
            # NOTE(review): ``e.message`` is Python 2 only; this broad
            # retry loop swallows all errors.
            print(e.message)
            continue
    stats632_df = pandas.DataFrame(stats632, columns=['train', 'test', 'all'])
    stats632_df.to_csv(p632_file, encoding='UTF-8')
    statscoef_df = pandas.DataFrame(pandas.concat(statscoef, ignore_index=True))
    statscoef_df.to_csv(beta_file, encoding='UTF-8')
    statspvalue_df = pandas.DataFrame(pandas.concat(statspvalue, ignore_index=True))
    statspvalue_df.to_csv(p_file, encoding='UTF-8')
    print('10000 times .632 bootstrap has done.')
# NOTE(review): chunk starts mid-call — presumably the tail of a
# train_test_split(...) whose opening lies outside this range.
test_size=0.3, random_state=42)

# Recombine the split halves and reset indices.
train = pd.concat([a_train, b_train], axis=1)
train = train.reset_index(drop=True)
test = pd.concat([a_test, b_test], axis=1)
test = test.reset_index(drop=True)
test_X = test.drop(droplist, axis=1)
test_y = test['术后住院时间']  # duration column: post-operative length of stay

from lifelines import CoxPHFitter
cph = CoxPHFitter()
# Drop a problem covariate (a nervous-system complication column) and Id.
train = train.drop('神经系统-膈肌麻痹(可能膈神经损伤)', axis=1)
test_X = test_X.drop('神经系统-膈肌麻痹(可能膈神经损伤)', axis=1)
train = train.drop('Id', axis=1)
test_X = test_X.drop('Id', axis=1)

# Duration = post-operative length of stay, event = status at discharge.
cph.fit(train, duration_col='术后住院时间', event_col='出院时状态', show_progress=True)
cph.predict_partial_hazard(test_X)
survival_result = cph.predict_survival_function(test_X)
# Keep only the part of each curve at or below 0.5 survival probability.
survival_result = survival_result[survival_result <= 0.5]

# Predicted LOS per patient = earliest time with survival <= 0.5
# (idxmax of the masked column picks that first remaining value).
# assumes exactly 677 test rows — TODO confirm
LOSResult = pd.DataFrame(np.arange(1354).reshape((677, 2)), columns=['Id', 'LOS'])
i = 0
for c in survival_result.columns:
    item = survival_result[c].idxmax()
    LOSResult.iloc[i, 0] = i
    LOSResult.iloc[i, 1] = item
    i = i + 1

test['Id'] = test.index
test1 = pd.merge(test, LOSResult, on='Id')
fig, ax = plt.subplots(figsize=(12, 12))
from lifelines import KaplanMeierFitter
kmf_control = KaplanMeierFitter()
def cv_add_single_feat(k, tumors, survival, data, feat, penalizer):
    """k-fold cross-validation after adding one candidate feature.

    Parameters
    ----------
    k: int
        number of folds for cross-validation
    tumors: list of str
        the list of sample names (TCGA barcode in our case); only its
        length is used to assign folds
    survival: dataframe
        survival data, each row a sample; the columns include event
        status ("status"), time of last follow-up ("months"), and the
        already-selected features (survival.columns rather than
        data.columns)
    data: dataframe
        all the candidate features of samples, each row a sample, each
        column a candidate feature
    feat: str
        a feature to be added to the existing features in `survival` for
        CV; `feat` has to be included in data.columns
    penalizer: float
        l2 penalizer coefficient for CV

    Returns
    -------
    ci: float
        concordance index of cross-validation, or NaN when no events are
        observed (the c-index is undefined in that case)
    T_concat: np.ndarray of float
        ground truth last follow-up time
    E_concat: np.ndarray of 0/1
        event states
    T_pred_concat: np.ndarray of float
        predicted survival score (negative partial hazard)
    """
    # https://github.com/CamDavidsonPilon/lifelines/blob/master/lifelines/utils/__init__.py#L548
    survival = survival.copy()  # never mutate the caller's frame
    T_concat = []
    E_concat = []
    T_pred_concat = []
    assignments = get_assignments(len(tumors), k)
    cph = CoxPHFitter(penalizer=penalizer)
    # add feat to the existing features
    survival[feat] = data[feat]
    feats_columns = survival.columns.drop(["status", "months"])
    for i in range(1, k + 1):
        ix = (assignments == i)
        train_data = survival.loc[~ix]
        test_data = survival.loc[ix]
        # fit the fitter to the training data; a fold with zero events
        # cannot be fit, so flag it instead of crashing inside lifelines
        if np.sum(train_data["status"].values) == 0:
            print("error!")
        else:
            cph.fit(train_data, duration_col="months",
                    event_col="status", show_progress=False,
                    step_size=0.1)  # small step size for convergence
        test_X = test_data[feats_columns]
        test_T = test_data["months"].values
        test_E = test_data["status"].values
        # negate the partial hazard: higher hazard = shorter survival
        T_pred = -cph.predict_partial_hazard(test_X).values
        T_concat.append(test_T)
        E_concat.append(test_E)
        T_pred_concat.append(T_pred)
    T_concat = np.concatenate(T_concat)
    E_concat = np.concatenate(E_concat)
    T_pred_concat = np.concatenate(T_pred_concat)
    if np.sum(E_concat) == 0:
        # BUG FIX: `ci` was previously unbound on this path, so the
        # return below raised NameError; NaN signals an undefined
        # c-index while keeping the 4-tuple return shape.
        print("error")
        ci = float("nan")
    else:
        ci = concordance_index(T_concat, T_pred_concat, E_concat)
    return ci, T_concat, E_concat, T_pred_concat
x_train = data_train.drop(["time", "dead"], axis=1).as_matrix() x_test = data_test.drop(["time", "dead"], axis=1).as_matrix() scaler = StandardScaler().fit(x_train) x_train = scaler.transform(x_train) #Standardize each predictor variable x_test = scaler.transform(x_test) ######################################## #Standard Cox proportional hazards model from lifelines import CoxPHFitter cph = CoxPHFitter() cph.fit(data_train, duration_col='time', event_col='dead') #cph.print_summary() #Cox model discrimination train set prediction = cph.predict_partial_hazard(data_train) print(concordance_index(data_train.time, -prediction, data_train.dead)) #0.735 #Cox model discrimination test set prediction = cph.predict_partial_hazard(data_test) print(concordance_index(data_test.time, -prediction, data_test.dead)) #0.735 ################################ #Nnet-survival / Our model (flexible version to #allow non-proportional hazards) halflife = 365. * 1.4 breaks = -np.log(1 - np.arange(0.0, 0.96, 0.05)) * halflife / np.log(2) #breaks=-np.log(1-np.arange(0.0,1,0.099))*halflife/np.log(2) n_intervals = len(breaks) - 1 timegap = breaks[1:] - breaks[:-1]
# Case 5 y_true = list(reversed([30, 30, 30, 20, 20])) event = [0, 1, 0, 1, 0] scores = list(reversed([15, 10, 5, 15, 20])) print("\nCase 5") print("Expected: 0.583, Output: {}".format(harrell_c(y_true, scores, event))) # Case 6 y_true = [10, 10] event = [0, 1] scores = [4, 5] print("\nCase 6") print(f"Expected: 1.0 , Output:{harrell_c(y_true, scores, event):.4f}") # Train scores = cph.predict_partial_hazard(one_hot_train) cox_train_scores = harrell_c(one_hot_train['time'].values, scores.values, one_hot_train['status'].values) # Validation scores = cph.predict_partial_hazard(one_hot_val) cox_val_scores = harrell_c(one_hot_val['time'].values, scores.values, one_hot_val['status'].values) # Test scores = cph.predict_partial_hazard(one_hot_test) cox_test_scores = harrell_c(one_hot_test['time'].values, scores.values, one_hot_test['status'].values) print("Train:", cox_train_scores) print("Val:", cox_val_scores) print("Test:", cox_test_scores)
def __predict_individual(self, is_death, train_data_path, basepath):
    """Fit a Cox model on the full dialysis dataset and print/plot
    predictions for a single randomly sampled patient.

    Parameters:
        is_death: True -> model the first endpoint (survivaltime1/outcome1)
            and use the plain stats filenames; False -> model the second
            endpoint (survivaltime2/outcome2) and use the "_e" filenames.
        train_data_path (str): path to the training-data CSV.
        basepath (str): directory holding the significant-attribute list
            written by a previous feature-selection run.

    NOTE(review): this is Python 2-era code (``e.message``, the formula is
    ``.encode('utf-8')``-ed before patsy, and lifelines'
    ``predict_log_hazard_relative_to_mean`` is long removed); it will not
    run unchanged on Python 3 / current lifelines.
    """
    big_dataset_file = train_data_path
    big_dataset = pd.read_csv(big_dataset_file, encoding='UTF-8',
                              index_col=[0])
    # drop identifier columns that must not enter the model
    del big_dataset['patient_id']
    del big_dataset['name']
    del big_dataset['tx_id']
    # del big_dataset['tx_id.1']
    del big_dataset['tx_date']
    # Build a patsy formula: columns in classify_attr are treated as
    # categorical via C(...), everything else is used as-is, joined by '+'.
    formular = ''
    classify_attr = {
        'subject', 'treat_item', 'vascular_access_type',
        'dialysis_machine', 'anticoagulation_scope', 'anticoagulation',
        'protamine', 'replacement_way', 'take_food', 'fluid_infusion',
        'blood_pressure_pos', 'gender', 'smoking', 'highflux', 'payment',
        'marital', 'alcohol', 'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb',
        'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'blocker',
        'blocer', 'diuretic', 'LipidD', 'CaPB', 'NCaPB', 'VitD',
        'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
        'hypertension', 'DM', 'cardiovasculardisease',
        'cerebrovasculardisease', 'bleeding', 'malignancy'
    }  # NOTE(review): 'blocer' looks like a typo of 'blocker', but it
    #    must match the CSV header, so it is left untouched
    for column in big_dataset.columns:
        # print("column", column)
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    formular = formular[:-1].encode('utf-8')  # '-1': do not add an intercept column
    big_dataset = patsy.dmatrix(formular + '-1', big_dataset,
                                return_type='dataframe')
    if is_death:
        # first endpoint: time/event columns and plain output filenames
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                            'survivaltime2', 'outcome2')
        attr_file, p632_file, var_file, kfold_file = (
            'lb_significant_attrs.txt', 'lb_stats632.csv',
            'lb_statvar.txt', 'lb_statskfold.csv')
        beta_file, p_file = ('lb_coef.csv', 'lb_p.csv')
    else:
        # second endpoint: the "_e" filename variants
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                            'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = (
            'lb_significant_attrs_e.txt', 'lb_stats632_e.csv',
            'lb_statvar_e.txt', 'lb_statskfold_e.csv')
        beta_file, p_file = ('lb_coef_e.csv', 'lb_p_e.csv')
    # drop the other endpoint's time/event columns
    del big_dataset[T_false]
    del big_dataset[E_false]
    significant_attrs = list()
    # columns deleted because they previously caused fitting errors
    del big_dataset['k_concentration']
    del big_dataset['SDUFR_x']
    del big_dataset['SDUFR_y']
    del big_dataset['SDUFR_y_v']
    del big_dataset['protamine_c']
    del big_dataset['k_concentration_c']
    """如果已经挑选出了具有统计意义的风险因子则不需要执行以下验证风险因子统计学意义的片段 """
    # (The string above says: if the statistically significant risk factors
    # have already been selected, the commented-out univariate screening
    # below does not need to be run again.)
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++
    # for column in big_dataset.columns:
    #     if column in {T_true, E_true}:
    #         continue
    #     subset = big_dataset[[column, T_true, E_true]]
    #     # print('subset', subset)
    #     try:
    #         # print('start fitting ')
    #         cox = CoxPHFitter()
    #         cox.fit(subset, T_true, E_true)
    #         help(cox)
    #         print('cox value:', cox.print_summary())
    #         print('p value:', cox.summary['p'][0])
    #         if cox.summary['p'][0] < 0.05:
    #             # print(column, cox.summary['p'][0])
    #             significant_attrs.append(column)
    #     except Exception:
    #         continue
    # output = open(basepath+"/"+attr_file, mode='w')
    # for attr in significant_attrs:
    #     output.write(attr + '\n')
    # output.close()
    #++++++++++++++++++++++++++++++++++++++++++++++++++++
    # load the previously selected significant attributes from disk
    input = open(basepath + "/" + attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_big ## sign_attr : %d' % len(significant_attrs))
    print(len(significant_attrs), T_true, E_true)
    big_dataset = big_dataset[significant_attrs]
    print(len(big_dataset.columns))
    # 10000 times .632 bootstrap
    # NOTE(review): count starts at 9999 and the loop is an `if`, so the
    # body runs at most once -- a single demo iteration, not a bootstrap.
    count = 9999
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    cox = CoxPHFitter()
    if count < 10000:
        print('count', count)
        try:
            train_set = big_dataset.sample(1500, replace=False)
            test_set = big_dataset.sample(1, replace=False)  # one patient
            print('try fitting......', len(big_dataset), len(train_set),
                  len(test_set))
            # cox = CoxPHFitter()
            cox = cox.fit(train_set, T_true, E_true)
            print(test_set)
            # plot the predicted survival curve for the sampled patient
            cox.predict_survival_function(test_set).plot()
            print(cox.predict_log_hazard_relative_to_mean(test_set))
            # for t_index,t_item in test_set.iterrows:
            #     print(str(t_index)+"predict_survival_function")
            #     print(cox.predict_survival_function(t_item))
            #     cox.predict_survival_function(t_item).plot()
            #     print(str(t_index)+"predict_survival_function")
            #     print(cox.predict_survival_function(t_item))
            # in-sample c-index (hazards negated: higher hazard = shorter survival)
            train_cindex = concordance_index(
                cox.durations,
                -cox.predict_partial_hazard(cox.data).values.ravel(),
                cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            print('try predicting......')
            # test_set
            test_actual_T = test_set[T_true]
            test_actual_E = test_set[E_true]
            test_variable = test_set[test_set.columns.difference(
                [T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # small_set
            # all_actual_T = big_dataset[T_true]
            # all_actual_E = big_dataset[E_true]
            # all_variable = big_dataset[big_dataset.columns.difference([T_true, E_true])]
            # all_predictT = cox.predict_expectation(all_variable)
            #
            print('try cindexing......')
            try:
                test_cindex = concordance_index(test_actual_T,
                                                test_predictT,
                                                test_actual_E)
                # all_cindex = concordance_index(all_actual_T, all_predictT, all_actual_E)
            except Exception:
                test_cindex = concordance_index(test_actual_T,
                                                test_predictT)
                # all_cindex = concordance_index(all_actual_T, all_predictT)
            #
            # NOTE(review): all_cindex is never assigned (its lines are
            # commented out above), so this append raises NameError, which
            # the outer except then swallows.
            stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception as e:
            # NOTE(review): e.message is Python 2 only
            print(e.message)
        mean_patient = self.__filter_dt(test_set)
        print(cox.predict_log_hazard_relative_to_mean(test_set))
        # mean_hazard = cox.predict_expectation(mean_patient)
        # NOTE(review): mean_hazard's assignment above is commented out,
        # so this print raises NameError at runtime.
        print(mean_hazard)