def test_cross_validator_returns_k_results():
    """k_fold_cross_validation should return exactly one score per fold."""
    fitter = CoxPHFitter()
    for n_folds in (3, 5):
        fold_scores = utils.k_fold_cross_validation(
            fitter, load_regression_dataset(),
            duration_col='T', event_col='E', k=n_folds)
        assert len(fold_scores) == n_folds
def train(self, X, Y):
    """Fit ``self.model`` on features X and outcome frame Y.

    Y is expected to carry 'SurvivalTime' and 'Event' columns — TODO confirm
    against callers.  Dispatch is done on the *string* of the model's type,
    which is brittle; NOTE(review): consider isinstance checks instead.
    """
    # the fit method depend on the model type
    if ('semi_parametric' in str(type(self.model))) or ('multi_task' in str(type(self.model))):
        # pysurvival-style fitters take init_method/num_epochs — presumably; verify.
        self.model.fit(X=X, T=Y['SurvivalTime'], E=Y['Event'], init_method='zeros', num_epochs=500)
    if 'survival_forest' in str(type(self.model)):
        self.model.fit(X=X, T=Y['SurvivalTime'], E=Y['Event'], max_features='all', max_depth=20, sample_size_pct=0.33)
    if 'lifelines' in str(type(self.model)):
        # else we want to fit a model from lifeline library using cross validation
        # NOTE(review): the CV scores are discarded; only the side effect of
        # fitting on the final fold persists — confirm this is intended.
        k_fold_cross_validation(self.model, pd.concat([X, Y], axis=1), 'SurvivalTime', event_col='Event', k=5)
def test_cross_validator_returns_k_results():
    """k_fold_cross_validation should return exactly one score per fold."""
    fitter = CoxPHFitter()
    for n_folds in (3, 5):
        fold_scores = utils.k_fold_cross_validation(
            fitter, load_regression_dataset(),
            duration_col="T", event_col="E", k=n_folds)
        assert len(fold_scores) == n_folds
def test_cross_validator_with_specific_loss_function():
    """A user-supplied evaluation_measure should change the returned scores."""
    def square_loss(y_actual, y_pred):
        return ((y_actual - y_pred) ** 2).mean()

    fitter = CoxPHFitter()
    squared_scores = utils.k_fold_cross_validation(
        fitter, load_regression_dataset(),
        evaluation_measure=square_loss,
        duration_col='T', event_col='E')
    default_scores = utils.k_fold_cross_validation(
        fitter, load_regression_dataset(),
        duration_col='T', event_col='E')
    assert list(squared_scores) != list(default_scores)
def test_cross_validator_with_specific_loss_function():
    """Scores under a custom squared-error measure must differ from the default."""
    def square_loss(y_actual, y_pred):
        return ((y_actual - y_pred) ** 2).mean()

    fitter = CoxPHFitter()
    with_custom_loss = utils.k_fold_cross_validation(
        fitter, load_regression_dataset(),
        evaluation_measure=square_loss,
        duration_col='T', event_col='E')
    with_default_loss = utils.k_fold_cross_validation(
        fitter, load_regression_dataset(),
        duration_col='T', event_col='E')
    assert list(with_custom_loss) != list(with_default_loss)
def test_cross_validator_returns_fitters_k_results():
    """Passing a list of fitters should yield one list of k scores per fitter."""
    fitter = CoxPHFitter()
    fitters = [fitter, fitter]
    for n_folds in (3, 5):
        results = utils.k_fold_cross_validation(
            fitters, load_regression_dataset(),
            duration_col='T', event_col='E', k=n_folds)
        assert len(results) == 2
        assert len(results[0]) == len(results[1]) == n_folds
def test_cross_validator_returns_fitters_k_results():
    """A list of fitters produces a list of per-fitter score lists, each of length k."""
    fitter = CoxPHFitter()
    fitters = [fitter, fitter]
    for n_folds in (3, 5):
        results = utils.k_fold_cross_validation(
            fitters, load_regression_dataset(),
            duration_col="T", event_col="E", k=n_folds)
        assert len(results) == 2
        assert len(results[0]) == len(results[1]) == n_folds
def test_crossval_for_cox_ph_normalized(self, data_pred2, data_pred1):
    """Cross-validate CoxPHFitter on z-scored copies of both fixture frames.

    NOTE(review): np.std normalizes with ddof=0 (population std), and the
    in-place `-=` / `/=` mutate the copied column's buffer directly; the
    explicit re-assignment afterwards is belt-and-braces.
    """
    cf = CoxPHFitter()
    for data_pred in [data_pred1, data_pred2]:
        data_norm = data_pred.copy()
        times = data_norm['t']
        # Normalize to mean = 0 and standard deviation = 1
        times -= np.mean(times)
        times /= np.std(times)
        data_norm['t'] = times
        x1 = data_norm['x1']
        x1 -= np.mean(x1)
        x1 /= np.std(x1)
        data_norm['x1'] = x1
        # the second fixture carries an extra covariate; normalize it when present
        if 'x2' in data_norm.columns:
            x2 = data_norm['x2']
            x2 -= np.mean(x2)
            x2 /= np.std(x2)
            data_norm['x2'] = x2
        scores = k_fold_cross_validation(cf, data_norm, duration_col='t', event_col='E', k=3, predictor='predict_partial_hazard')
        mean_score = 1 - np.mean(scores)  # this is because we are using predict_partial_hazard
        expected = 0.9
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert mean_score > expected, msg.format(expected, mean_score)
def test_crossval_for_cox_ph_normalized(self, data_pred2, data_pred1):
    """Cross-validate CoxPHFitter on z-scored copies of both fixture frames.

    NOTE(review): np.std uses ddof=0; the augmented-assignment operators
    modify the extracted Series in place before it is written back.
    """
    cf = CoxPHFitter()
    for data_pred in [data_pred1, data_pred2]:
        data_norm = data_pred.copy()
        times = data_norm['t']
        # Normalize to mean = 0 and standard deviation = 1
        times -= np.mean(times)
        times /= np.std(times)
        data_norm['t'] = times
        x1 = data_norm['x1']
        x1 -= np.mean(x1)
        x1 /= np.std(x1)
        data_norm['x1'] = x1
        # the second fixture carries an extra covariate; normalize it when present
        if 'x2' in data_norm.columns:
            x2 = data_norm['x2']
            x2 -= np.mean(x2)
            x2 /= np.std(x2)
            data_norm['x2'] = x2
        scores = k_fold_cross_validation(cf, data_norm, duration_col='t', event_col='E', k=3, predictor='predict_partial_hazard')
        mean_score = 1 - np.mean(scores)  # this is because we are using predict_partial_hazard
        expected = 0.9
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert mean_score > expected, msg.format(expected, mean_score)
def test_crossval_for_cox_ph_with_normalizing_times(self, data_pred2, data_pred1):
    """Cross-validate CoxPHFitter after z-scoring only the duration column.

    Checks that rescaling/shifting durations (which makes some negative)
    does not break fitting or scoring.
    """
    cf = CoxPHFitter()
    for data_pred in [data_pred1, data_pred2]:
        data_norm = data_pred.copy()
        times = data_norm['t']
        # Normalize to mean = 0 and standard deviation = 1 (np.std => ddof=0)
        times -= np.mean(times)
        times /= np.std(times)
        data_norm['t'] = times
        scores = k_fold_cross_validation(
            cf, data_norm,
            duration_col='t',
            event_col='E', k=3,
            predictor='predict_partial_hazard')
        # predict_partial_hazard is anti-concordant with survival time, so invert
        mean_score = 1 - np.mean(scores)
        expected = 0.9
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert mean_score > expected, msg.format(expected, mean_score)
def test_cross_validator_with_specific_loss_function():
    """Smoke test: cross-validation runs with an explicit scoring_method."""
    fitter = CoxPHFitter()
    scores = utils.k_fold_cross_validation(
        fitter,
        load_regression_dataset(),
        scoring_method="concordance_index",
        duration_col="T",
        event_col="E",
    )
def test_cross_validator_with_predictor_and_kwargs():
    """predictor_kwargs must be forwarded to the predictor method.

    Fix: the result was computed but never checked, so the test could not
    fail on a wrong fold count; assert one score per fold (k=3), matching
    the sibling test that uses the same call.
    """
    cf = CoxPHFitter()
    results_06 = utils.k_fold_cross_validation(
        cf,
        load_regression_dataset(),
        duration_col='T',
        k=3,
        predictor="predict_percentile",
        predictor_kwargs={'p': 0.6})
    assert len(results_06) == 3
def test_cross_validator_with_predictor():
    """predict_expectation can be used as the CV predictor.

    Fix: the result was computed but never asserted, so the test asserted
    nothing; add the fold-count check used by the sibling version of this
    test elsewhere in the suite.
    """
    cf = CoxPHFitter()
    results = utils.k_fold_cross_validation(
        cf,
        load_regression_dataset(),
        duration_col='T',
        event_col='E',
        k=3,
        predictor="predict_expectation")
    assert len(results) == 3
def main(): """ """ # simulating a feature matrix for 100 samples with 50 features data = np.random.random((100, 50)) # simulating time of observations (days) min 10 days, max 2500 days observed_time = np.random.randint(10, 2500, (100)) # simulating event (death) 0 did not occur 1 occured observed_event = np.random.randint(0, 2, (100)) test_data = np.random.random((25, 50)) test_observed_time = np.random.randint(10, 2500, (25)) test_observed_event = np.random.randint(0, 2, (25)) for feature_id, feature_vect in enumerate(data.T): dataframe = pd.DataFrame({ 'feature nb{0}'.format(feature_id): feature_vect, 'event': observed_event, 'time': observed_time }) #building a coxph model to see the significance of each independant feature cox_model = CoxPHFitter() cox_model.fit(dataframe, duration_col='time', event_col='event') pvalue = cox_model.summary.p[0] print('pvalue: {0} for feature nb: {1}'.format(pvalue, feature_id)) if pvalue > 0.05: print('feature nb {0} not overall significant!'.format(feature_id)) continue # test the robustness: score close / higher to 0.7 is a good sign scores = k_fold_cross_validation(cox_model, dataframe, duration_col='time', event_col='event', k=3) print('score (mean) (c-index) for {0}'.format(np.mean(scores))) # validate the features on the test set test_dataframe = pd.DataFrame({ 'feature nb{0}'.format(feature_id): test_data.T[feature_id], 'event': test_observed_event, 'time': test_observed_time }) inferred_time = cox_model.predict_expectation(test_dataframe) validation_c_index = concordance_index(test_observed_time, inferred_time, test_observed_event) print('validation c-index: {0}'.format(validation_c_index))
def test_cross_validator_with_predictor_and_kwargs():
    """predictor_kwargs are forwarded to the predictor; expect one score per fold."""
    fitter = CoxPHFitter()
    fold_scores = utils.k_fold_cross_validation(
        fitter,
        load_regression_dataset(),
        duration_col="T",
        k=3,
        predictor="predict_percentile",
        predictor_kwargs={"p": 0.6},
    )
    assert len(fold_scores) == 3
def test_cross_validator_with_predictor():
    """predict_expectation as predictor should still yield one score per fold."""
    fitter = CoxPHFitter()
    fold_scores = utils.k_fold_cross_validation(
        fitter,
        load_regression_dataset(),
        duration_col="T",
        event_col="E",
        k=3,
        predictor="predict_expectation",
    )
    assert len(fold_scores) == 3
def crossValidate(name, fitter):
    """Cross-validate `fitter` on the module-level `times` frame and print the mean score."""
    from lifelines.utils import k_fold_cross_validation
    import numpy as np

    print("Cross Validating " + name)
    fold_scores = k_fold_cross_validation(
        fitter, times, duration_col='time', event_col='success')
    print(np.mean(fold_scores))
    print("End cross-validation of " + name)
def test_crossval_for_aalen_add(self, data_pred2, data_pred1):
    """Repeated 3-fold CV of AalenAdditiveFitter should average above 0.90.

    Fix: the failure message formatted `np.mean(scores)` — the scores of the
    *last* repeat only — while the assertion tests `np.mean(mean_scores)`.
    Report the quantity actually asserted.
    """
    aaf = AalenAdditiveFitter()
    for data_pred in [data_pred1, data_pred2]:
        mean_scores = []
        # average over 20 repeats to damp fold-assignment noise
        for repeat in range(20):
            scores = k_fold_cross_validation(aaf, data_pred, duration_col='t', event_col='E', k=3)
            mean_scores.append(np.mean(scores))
        expected = 0.90
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert np.mean(mean_scores) > expected, msg.format(expected, np.mean(mean_scores))
def test_crossval_for_aalen_add(self, data_pred2, data_pred1):
    """Repeated 3-fold CV of AalenAdditiveFitter should average above 0.90.

    Fix: the failure message called `scores.mean()`, but
    k_fold_cross_validation returns a plain list, so composing the message
    would itself raise AttributeError; it also reported the last repeat's
    scores rather than the asserted overall mean. Use np.mean(mean_scores).
    """
    aaf = AalenAdditiveFitter()
    for data_pred in [data_pred1, data_pred2]:
        mean_scores = []
        # average over 20 repeats to damp fold-assignment noise
        for repeat in range(20):
            scores = k_fold_cross_validation(aaf, data_pred, duration_col='t', event_col='E', k=3)
            mean_scores.append(np.mean(scores))
        expected = 0.90
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert np.mean(mean_scores) > expected, msg.format(expected, np.mean(mean_scores))
def test_crossval_for_cox_ph(self, data_pred2, data_pred1):
    """Cross-validated c-index for CoxPHFitter should exceed 0.9 on both fixtures."""
    fitter = CoxPHFitter()
    for frame in (data_pred1, data_pred2):
        fold_scores = k_fold_cross_validation(
            fitter, frame, duration_col='t', event_col='E', k=3,
            predictor='predict_partial_hazard')
        # predict_partial_hazard is anti-concordant with survival time,
        # so the reported c-index comes back inverted; flip it.
        c_index = 1 - np.mean(fold_scores)
        threshold = 0.9
        template = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert c_index > threshold, template.format(threshold, c_index)
def test_crossval_for_cox_ph(self, data_pred2, data_pred1):
    """3-fold CV c-index (via predict_partial_hazard) must beat 0.9 on both datasets."""
    fitter = CoxPHFitter()
    for frame in (data_pred1, data_pred2):
        fold_scores = k_fold_cross_validation(
            fitter, frame, duration_col='t', event_col='E', k=3,
            predictor='predict_partial_hazard')
        # invert: partial hazards rank opposite to survival times
        c_index = 1 - np.mean(fold_scores)
        threshold = 0.9
        template = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert c_index > threshold, template.format(threshold, c_index)
def test_crossval_for_cox_ph_with_normalizing_times(self, data_pred2, data_pred1):
    """Cross-validate CoxPHFitter after z-scoring only the duration column.

    Checks that shifting/rescaling durations (some become negative) does not
    break fitting or scoring.
    """
    cf = CoxPHFitter()
    for data_pred in [data_pred1, data_pred2]:
        data_norm = data_pred.copy()
        times = data_norm['t']
        # Normalize to mean = 0 and standard deviation = 1 (np.std => ddof=0)
        times -= np.mean(times)
        times /= np.std(times)
        data_norm['t'] = times
        scores = k_fold_cross_validation(cf, data_norm, duration_col='t', event_col='E', k=3, predictor='predict_partial_hazard')
        # predict_partial_hazard is anti-concordant with survival time, so invert
        mean_score = 1 - np.mean(scores)
        expected = 0.9
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"
        assert mean_score > expected, msg.format(expected, mean_score)
def cox_regression_experiment():
    """Fit a heavily penalized Cox model on visit features and 5-fold cross-validate.

    Columns are stringified indices; '0' is the duration, '93' the event flag.
    """
    dynamic_features = np.load('pick_5_visit_features_merge_1.npy')[0:2100, :, :-2]
    # NOTE(review): astype returns a copy and the result is discarded — this
    # line is a no-op as written; confirm whether an int cast was intended.
    dynamic_features.astype(np.int32)
    labels = np.load('pick_5_visit_labels_merge_1.npy')[:, :, -4].reshape(-1, dynamic_features.shape[1], 1)
    # flatten (patients, visits, features+label) into one row per visit
    data = np.concatenate((dynamic_features, labels), axis=2).reshape(-1, 94)
    data_set = pd.DataFrame(data)
    col_list = list(data_set.columns.values)
    new_col = [str(x) for x in col_list]
    data_set.columns = new_col
    np.savetxt('allPatient_now.csv', data_set, delimiter=',')
    print(list(data_set.columns.values))
    cph = CoxPHFitter(penalizer=100)
    cph.fit(data_set, duration_col='0', event_col='93', show_progress=True)
    cph.print_summary()
    # cph.plot(columns=['15','20','21','25'])
    # plt.savefig('cox model' + '.png', format='png')
    scores = k_fold_cross_validation(cph, data_set, '0', event_col='93', k=5)
    print(scores)
    print(np.mean(scores))
    print(np.std(scores))
# 10-fold concordance-index cross-validation of a penalized Cox model on
# BRCA expression data; time and status live in separate CSVs and are
# joined onto the feature frame by position.
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation
import pandas as pd
import numpy as np

data = pd.read_csv('/home/xinfy/Desktop/lasurv/project_BC_surv/coxnnet_py3mo/coxnetv1/ssBRCA/datacxph/x.csv')
ytime = pd.read_csv('/home/xinfy/Desktop/lasurv/project_BC_surv/coxnnet_py3mo/coxnetv1/ssBRCA/datacxph/ytime.csv')
ystatus = pd.read_csv('/home/xinfy/Desktop/lasurv/project_BC_surv/coxnnet_py3mo/coxnetv1/ssBRCA/datacxph/ystatus.csv')
# .values drops the index so alignment is purely positional — assumes all
# three CSVs share row order; TODO confirm.
data['time'] = ytime['time'].values
data['event'] = ystatus['status'].values
cph = CoxPHFitter(penalizer=0.05)
# #cph.fit(dataset, duration_col='time', event_col='event', show_progress=True)
scores = k_fold_cross_validation(cph, data, duration_col='time', event_col='event', k=10, scoring_method="concordance_index")
print(np.asarray(scores))
#print(np.asarray(scores).mean(),np.asarray(scores).std())
def test_cross_validator_with_stratified_cox_model():
    """Smoke test: k-fold CV should run with a strata-configured CoxPHFitter."""
    stratified_fitter = CoxPHFitter(strata=['race'])
    utils.k_fold_cross_validation(
        stratified_fitter, load_rossi(),
        duration_col='week', event_col='arrest')
# NOTE(review): this fragment references a loop variable `j` and summary
# frame `smy`/`sel` defined outside the visible chunk — presumably inside a
# loop over the two covariate groups; confirm against the full file.
smysub = smy.loc[smy['type'] == sel].copy()
# plot exp(coef) with exp(err) bars for the selected covariate group
axes[0, j].errorbar(x=np.arange(smysub.shape[0]), y=np.exp(smysub['coef']), marker='o', linestyle='', yerr=np.exp(smysub['err']))
axes[0, j].set_title('exp(beta) coefs for {}'.format(['manufacturer', 'capacity'][j]))
axes[0, j].set_xlim([-0.1, len(smysub) - 0.9])
axes[0, j].set_xticks(np.arange(smysub.shape[0]))
axes[0, j].set_xticklabels([t.split('[')[1][2:-1] for t in smysub['index']])
# normalize=False suggests an old lifelines API — verify version compatibility
cx1 = sa.CoxPHFitter(normalize=False)
scores = k_fold_cross_validation(cx1, dft, k=5, duration_col='maxhours', event_col='failed', predictor='predict_expectation')
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 2))
sns.boxplot(scores, vert=False, color='lightblue', ax=axes, showmeans=True)
axes.annotate('{:.3f}'.format(np.mean(scores)), xy=(np.mean(scores), 1), xycoords='data', xytext=(10, 10), textcoords='offset points', color='r', fontsize=12)
axes.set_xlim([0.5, 1])
def test_cross_validator_with_predictor():
    """Using predict_expectation as predictor still returns one score per fold."""
    fitter = CoxPHFitter()
    fold_scores = utils.k_fold_cross_validation(
        fitter, load_regression_dataset(),
        duration_col='T', event_col='E', k=3,
        predictor="predict_expectation")
    assert len(fold_scores) == 3
var3 0.2186 1.2443 0.0758 2.8836 0.0039 0.0700 0.3672 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Concordance = 0.580 """
# NOTE(review): the text above is the tail of a docstring (a pasted summary
# table) whose opening triple-quote is outside this view.
cph.plot()
# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')
aaf.plot()
X = regression_dataset.drop(['E', 'T'], axis=1)
aaf.predict_survival_function(X.iloc[10:12]).plot()  # get the unique survival functions of two subjects
# 10-fold CV of the Cox model fitted earlier in this script
scores = k_fold_cross_validation(cph, regression_dataset, duration_col='T', event_col='E', k=10)
print(scores)
print(np.mean(scores))
print(np.std(scores))
plt.show()
#==============================================================================
#==============================================================================
def __linear_small(self, is_death, train_data_path, basepath):
    """Univariate screening + .632 bootstrap + repeated 10-fold CV of a Cox model.

    is_death selects which of the two (time, outcome) column pairs is the
    endpoint; the other pair is dropped. Writes significant-attribute,
    bootstrap, coefficient, p-value and k-fold score files as side effects.
    NOTE(review): output filenames other than kfold_file are written to the
    current working directory, not basepath — confirm intended.
    """
    small_dataset_file = train_data_path
    small_dataset = pandas.read_csv(small_dataset_file, encoding='UTF-8', index_col=[0])
    del small_dataset['patient_id']
    del small_dataset['name']
    # dummy-variable (one-hot) handling via a patsy formula
    formular = ''
    classify_attr = {
        'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol',
        'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'HCV', 'anticoagulant',
        'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic', 'LipidD', 'CaPB', 'NCaPB',
        'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
        'hypertension', 'DM', 'cardiovasculardisease', 'cerebrovasculardisease',
        'bleeding', 'malignancy', 'ablocker', 'bblocker'
    }
    for column in small_dataset.columns:
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    formular = formular[:-1]  # drop trailing '+'
    small_dataset = patsy.dmatrix(formular + '-1', small_dataset, return_type='dataframe')
    # choose endpoint columns and output filenames by endpoint type
    if is_death:
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1', 'survivaltime2', 'outcome2')
        attr_file, p632_file, var_file, kfold_file = ('lm_significant_attrs.txt', 'lm_stats632.csv', 'lm_statvar.txt', 'lm_statskfold.csv')
        beta_file, p_file = ('lm_coef.csv', 'lm_p.csv')
    else:
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2', 'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = ('lm_significant_attrs_e.txt', 'lm_stats632_e.csv', 'lm_statvar_e.txt', 'lm_statskfold_e.csv')
        beta_file, p_file = ('lm_coef_e.csv', 'lm_p_e.csv')
    del small_dataset[T_false]
    del small_dataset[E_false]
    # univariate screen: keep covariates with p < 0.05 in a one-covariate Cox fit
    significant_attrs = list()
    for column in small_dataset.columns:
        # print('column', column)
        if column in {T_true, E_true}:
            continue
        subset = small_dataset[[column, T_true, E_true]]
        # print('subset', subset)
        try:
            cox = CoxPHFitter()
            cox.fit(subset, T_true, E_true)
            if cox.summary['p'][0] < 0.05:
                significant_attrs.append(column)
        except Exception:
            # fits on degenerate columns may fail; skip them silently
            continue
    output = open(attr_file, mode='w')
    for attr in significant_attrs:
        output.write(attr + '\n')
    output.close()
    # re-read the attribute list (round-trips through the file on purpose?)
    input = open(attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_small ## sign_attr : %d' % len(significant_attrs))
    small_dataset = small_dataset[significant_attrs]
    # 10000 times .632 bootstrap
    count = 0
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    while count < 10000:
        # one bootstrap resample: train on sampled-with-replacement rows,
        # test on the out-of-bag rows
        try:
            train_set = small_dataset.take(numpy.random.randint(0, len(small_dataset), size=len(small_dataset)))
            test_set = small_dataset.ix[set(small_dataset.index).difference(set(train_set.index))]
            train_set.index = range(len(train_set))
            test_set.index = range(len(test_set))
            cox = CoxPHFitter()
            cox.fit(train_set, T_true, E_true)
            train_cindex = concordance_index(cox.durations, -cox.predict_partial_hazard(cox.data).values.ravel(), cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            # test_set
            test_actual_T = test_set[T_true].copy()
            test_actual_E = test_set[E_true].copy()
            test_variable = test_set[test_set.columns.difference([T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # small_set
            all_actual_T = small_dataset[T_true].copy()
            all_actual_E = small_dataset[E_true].copy()
            all_variable = small_dataset[small_dataset.columns.difference([T_true, E_true])]
            all_predictT = cox.predict_expectation(all_variable)
            try:
                test_cindex = concordance_index(test_actual_T, test_predictT, test_actual_E)
                all_cindex = concordance_index(all_actual_T, all_predictT, all_actual_E)
            except Exception:
                # fall back to uncensored c-index if the censored call fails
                test_cindex = concordance_index(test_actual_T, test_predictT)
                all_cindex = concordance_index(all_actual_T, all_predictT)
            stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception:
            # NOTE(review): broad retry-forever loop; a persistent failure
            # would spin indefinitely
            continue
    stats632_df = pandas.DataFrame(stats632, columns=['train', 'test', 'all'])
    stats632_df.to_csv(p632_file, encoding='UTF-8')
    statscoef_df = pandas.DataFrame(pandas.concat(statscoef, ignore_index=True))
    statscoef_df.to_csv(beta_file, encoding='UTF-8')
    statspvalue_df = pandas.DataFrame(pandas.concat(statspvalue, ignore_index=True))
    statspvalue_df.to_csv(p_file, encoding='UTF-8')
    # 2000 repeats of 10-fold cross-validation
    count = 0
    statskfold = list()
    while count < 2000:
        try:
            cox = CoxPHFitter()
            scores = k_fold_cross_validation(cox, small_dataset, T_true, E_true, 10)
            statskfold.append(scores)
            count += 1
            print('k-fold -> %d' % count)
        except Exception:
            continue
    statskfold_df = pandas.DataFrame(statskfold)
    statskfold_df.to_csv(basepath + "/" + kfold_file, encoding='UTF-8')
# Decorrelate features, join survival outcome, fit a penalized (elastic-net)
# Cox model, then 10-fold cross-validate by concordance index.
all_features_drop_corr, de_corr_features = RandDropCorr(all_features_drop_low_var, 0.8)
all_features_drop_corr.columns = de_corr_features
all_features_reduced = pd.concat([all_features_drop_corr, survival_df_filtered], axis=1).drop('case_submitter_id', axis=1)
my_cph = CoxPHFitter(penalizer=0.005, l1_ratio=0.9)
# haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1).to_csv('truth_reg_vars.csv')
# my_cph.fit(haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1), duration_col = 'days_to_death', event_col='vital_status')
my_cph.fit(all_features_reduced, duration_col='days_to_death', event_col='vital_status')
my_cph.print_summary()
# refit after dropping two collinear radiomics columns
haha = all_features_reduced.drop(['original_glrlm_GrayLevelNonUniformityNormalized_1'], axis=1)
haha = haha.drop(['original_glcm_SumEntropy_1'], axis=1)
my_cph.fit(haha, duration_col='days_to_death', event_col='vital_status')
my_cph.print_summary()
# NOTE(review): CV runs on the full frame, not the reduced `haha` used above
scores = k_fold_cross_validation(my_cph, all_features_reduced, duration_col='days_to_death', event_col='vital_status', k=10, scoring_method="concordance_index")
# NOTE(review): the mean is computed but neither printed nor stored
np.mean(scores)
# Kaplan-Meier curves split on neoadjuvant treatment, a log-rank test, then an
# age-only Cox model with 10-fold CV.
# NOTE(review): Python 2 syntax (`print` statements) and the long-removed
# pandas `.ix` indexer / old lifelines kwargs (normalize, include_likelihood).
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)
kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.ix[tx, survival_col], event_observed=df.ix[tx, censor_col], label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True, ci_show=False)
kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.ix[~tx, survival_col], event_observed=df.ix[~tx, censor_col], label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True, ci_show=False)
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')
results = logrank_test(df.ix[tx, survival_col], df.ix[~tx, survival_col], df.ix[tx, censor_col], df.ix[~tx, censor_col], alpha=.99)
results.print_summary()
cox = CoxPHFitter(normalize=False)
# age-only model: keep rows where the age covariate is present
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col, include_likelihood=True)
cox.print_summary()
scores = k_fold_cross_validation(cox, df_age, survival_col, event_col=censor_col, k=10)
print scores
print 'Mean score', np.mean(scores)
print 'Std', np.std(scores)
# NOTE(review): the first statements reference `aft`, `name`, `times`,
# `save`, `fitters` defined outside this view — presumably the body of a
# per-model loop; confirm against the full file.
aft.fit(times, duration_col='time', event_col='success')
aft.print_summary(3)
#aft = WeibullAFTFitter().fit(times, 'time', 'success', ancillary_df=True)
save(name + 'aft', aft.plot())
fitters[name] = aft
crossValidate(name, aft)
print("END " + name)
# Self-contained lifelines example: compare mean CV scores of three models.
print('EXAMPLE DATA FOLLOWS')
from lifelines import AalenAdditiveFitter, CoxPHFitter
from lifelines.datasets import load_regression_dataset
from lifelines.utils import k_fold_cross_validation
import numpy as np

df = load_regression_dataset()
#create the three models we'd like to compare.
aaf_1 = AalenAdditiveFitter(coef_penalizer=0.5)
aaf_2 = AalenAdditiveFitter(coef_penalizer=10)
cph = CoxPHFitter()
print(np.mean(k_fold_cross_validation(cph, df, duration_col='T', event_col='E')))
print(np.mean(k_fold_cross_validation(aaf_1, df, duration_col='T', event_col='E')))
print(np.mean(k_fold_cross_validation(aaf_2, df, duration_col='T', event_col='E')))
features = genfromtxt(features_path, delimiter=',')[1:].astype(int) #features = random.sample(range(0, 15939), 23) columns = columns[features] for i, column in enumerate(columns): columns[i] = column.strip() X = dataset["data"] X = X[:,features] data = pd.DataFrame(X, columns = columns) event = dataset["cencoring"] time = dataset["labels"][:,5] data["event"] = event.ravel() data["time"] = time cf = CoxPHFitter() scores = k_fold_cross_validation(cf, data, 'time', event_col='event', k=3) print scores print np.mean(scores) print np.std(scores) le = preprocessing.LabelEncoder() subtypes = le.fit_transform(dataset["subtypes"]) data["subtype"] = subtypes T = data["time"] C = data["event"] kmf = KaplanMeierFitter() kmf.fit(T, event_observed=C) kmf.plot(title = 'Survival Day Profile of Breast Cancer Patients')
def main():
    """Cox survival analysis of word growth/decline with diffusion covariates.

    Three analyses: (1) full-timeframe regression, (2) fixed first-m-months
    regression, (3) 10-fold CV concordance comparison across covariate sets
    with Welch t-tests against the frequency-only baseline.
    """
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    out_dir = args.out_dir
    # collect data
    vocab = get_default_vocab()
    tf = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm_log.tsv'), sep='\t', index_col=0)
    D_L = pd.read_csv(os.path.join(data_dir, '2013_2016_3gram_residuals.tsv'), sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_U = pd.read_csv(os.path.join(data_dir, '2013_2016_user_diffusion_log.tsv'), sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_S = pd.read_csv(os.path.join(data_dir, '2013_2016_subreddit_diffusion_log.tsv'), sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_T = pd.read_csv(os.path.join(data_dir, '2013_2016_thread_diffusion_log.tsv'), sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    # growth_words = get_growth_words()
    # growth_decline_words, split_points = get_growth_decline_words_and_params()
    success_words, fail_words, split_points = get_success_fail_words()
    split_points = split_points.apply(lambda x: int(ceil(x)))
    # organize into survival df: fail words "die" at their split point,
    # success words are right-censored at the end of the observation window
    combined_words = fail_words + success_words
    V = len(combined_words)
    deaths = pd.Series(pd.np.zeros(V), index=combined_words)
    deaths.loc[fail_words] = 1
    N = tf.shape[1]
    split_points_combined = pd.concat([
        split_points,
        pd.Series([N, ] * len(success_words), index=success_words)
    ], axis=0)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(fail_words, success_words, split_points_combined, covariates, covariate_names)
    # NOTE(review): survival_df_nan is computed but never used — debug leftover?
    survival_df_nan = survival_df[survival_df.isnull().any(axis=1)]
    # full timeframe test
    # fit regression using all covariates and all data up to and including time of death
    scaler = StandardScaler()
    survival_df_norm = survival_df.copy()
    survival_df_norm[covariate_names] = scaler.fit_transform(
        survival_df_norm[covariate_names])
    cox_model = CoxPHFitter()
    event_var = 'death'
    time_var = 't'
    cox_model.fit(survival_df_norm, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir, 'cox_regression_all_data.txt')
    # redirect stdout so print_summary lands in the report file
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
    sys.stdout = orig_stdout
    # fixed timeframe test
    # fit regression using all covariates and only data up to first m months
    m = 3
    death_words = list(fail_words)
    right_censored_words = list(success_words)
    combined_words = death_words + right_censored_words
    fixed_death_times = pd.Series(pd.np.repeat(m, len(combined_words)), index=combined_words)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(death_words, right_censored_words, fixed_death_times, covariates, covariate_names)
    # now provide the actual death/censorship times
    N = tf.shape[1]
    death_times = pd.concat([
        split_points.loc[death_words],
        pd.Series([N, ] * len(right_censored_words), index=right_censored_words)
    ], axis=0)
    survival_df['t'] = death_times
    cox_model = CoxPHFitter()
    survival_df.loc[:, covariate_names] = scaler.fit_transform(survival_df.loc[:, covariate_names])
    cox_model.fit(survival_df, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir, 'cox_regression_first_%d.txt' % (m))
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
    sys.stdout = orig_stdout
    # concordance values
    # set up multiple models with different feature sets
    # then run 10-fold cross-validation to generate concordance scores
    # and plot distributions
    cv = 10
    feature_sets = []
    covariate_sets = [['f'], ['f', 'D_L'], ['f', 'D_U', 'D_S', 'D_T'], ['f', 'D_L', 'D_U', 'D_S', 'D_T']]
    covariate_set_names = ['f', 'f+L', 'f+S', 'f+L+S']
    covariate_set_scores = {}
    cv = 10
    for covariate_set, covariate_set_name in izip(covariate_sets, covariate_set_names):
        survival_df_relevant = survival_df.loc[:, covariate_set + [time_var, event_var]]
        cox_model = CoxPHFitter()
        scores = k_fold_cross_validation(cox_model, survival_df_relevant, time_var, event_col=event_var, k=cv)
        covariate_set_scores[covariate_set_name] = scores
    covariate_set_scores = pd.DataFrame(covariate_set_scores).transpose()
    score_names = ['score_%d' % (i) for i in range(cv)]
    covariate_set_scores.columns = score_names
    # significance test between f and f+C, f+D, f+C+D concordance scores
    pval_thresh = 0.05
    baseline_scores = covariate_set_scores.loc['f', score_names]
    covariate_test_names = ['f+L', 'f+S', 'f+L+S']
    # bonferroni correction = alpha / 3
    pval_corrected = pval_thresh / len(covariate_test_names)
    covariate_set_scores.loc[:, 'pval_thresh'] = pval_corrected
    for covariate_test_name in covariate_test_names:
        covariate_test_scores = covariate_set_scores.loc[covariate_test_name, score_names]
        # Welch's t-test (unequal variances) against the frequency-only baseline
        t_stat, pval = ttest_ind(covariate_test_scores, baseline_scores, equal_var=False)
        covariate_set_scores.loc[covariate_test_name, 't_test'] = t_stat
        covariate_set_scores.loc[covariate_test_name, 'pval'] = pval
    # write to file
    out_file = os.path.join(out_dir, 'cox_regression_concordance_%d_fold_scores.tsv' % (cv))
    covariate_set_scores.to_csv(out_file, sep='\t', index=True)
def test_cross_validator_with_stratified_cox_model():
    """Smoke test: k-fold CV accepts a CoxPHFitter configured with strata."""
    stratified_fitter = CoxPHFitter(strata=["race"])
    utils.k_fold_cross_validation(
        stratified_fitter, load_rossi(),
        duration_col="week", event_col="arrest")
def test_cross_validator_with_stratified_cox_model():
    """Smoke test: stratified CoxPHFitter runs through k-fold CV without error."""
    stratified_fitter = CoxPHFitter(strata=['race'])
    utils.k_fold_cross_validation(
        stratified_fitter, load_rossi(),
        duration_col='week', event_col='arrest')