def main():
    """Fit a Cox model on the full data set plus indicator columns for selected codes.

    Reads the preprocessed arrays and column names from the directories named
    by ``Hyperparameters``, appends one indicator column per selected index
    code (first 10 rows with TYPE == 1 and first 10 with TYPE == 0), fits a
    ``CoxPHFitter`` and prints its summary.
    """
    print('Load data...')
    hp = Hyperparameters()
    data = np.load('../' + hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')

    print('Use all data for model fitting...')
    features = data['x']
    durations = data['time']
    events = data['event']
    cols_list = load_obj('../' + hp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(features, columns=cols_list)
    df['TIME'] = durations
    df['EVENT'] = events

    print('Add additional columns...')
    code_table = feather.read_dataframe(
        '../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather')
    first_type_one = code_table[code_table['TYPE'] == 1].head(10)
    first_type_zero = code_table[code_table['TYPE'] == 0].head(10)
    code_table = pd.concat([first_type_one, first_type_zero], sort=False)

    for _, code_row in code_table.iterrows():
        description = code_row['DESCRIPTION']
        print(description)
        # True for a row when any entry of its `codes` row equals this index code.
        df[description] = (data['codes'] == code_row['INDEX_CODE']).max(axis=1)
        cols_list = cols_list + [description]

    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df,
            duration_col='TIME',
            event_col='EVENT',
            show_progress=True,
            step_size=0.5)
    cph.print_summary()
    print('done')
def main(data_df):
    """Binarise thresholded columns of ``data_df`` and fit a Cox model on a fixed covariate set.

    ``data_df`` is modified in place: each column named in the module-level
    ``th_dict`` is NaN-filled with 0 and mapped to 1/0 by comparing against
    its threshold. The fitted model's summary is printed; nothing is returned.
    """
    for column in th_dict.keys():
        # NOTE(review): str.find returns 0 when "HU" is a prefix and -1 when it
        # is absent, so this only skips names with "HU" at position >= 1.
        # Confirm whether `"HU" in column` was intended.
        if column.find("HU") > 0:
            continue
        cutoff = th_dict[column]
        data_df[column] = data_df[column].fillna(0)
        data_df[column] = data_df[column].map(
            lambda value: 1 if value >= cutoff else 0)

    add_DF = pd.DataFrame()
    # Sum of the two columns (the original comment notes values 0, 1, 2).
    add_DF["V-HU"] = (data_df['HU_of_consolidation'] +
                      data_df['Volume_of_total_pneumonia_infection'])

    model_columns = [
        "Duration",
        "Death",
        "Age",
        "Blood_Oxygen",
        "C-Reactive_protein",
        "Lymphocyte_count",
        "Cerebrovascular_Disease",
        "Sex",
        "Lactic_dehydrogenase",
    ]
    pieces = [data_df[name] for name in model_columns]
    pieces.append(add_DF["V-HU"])
    combinations_df = pd.concat(pieces, axis=1)

    cph = CoxPHFitter()
    cph.fit(combinations_df, "Duration", event_col="Death", step_size=0.01)
    cph.print_summary()
def test_proportional_hazard_test_with_weights_and_strata():
    """
    Compare the weighted, stratified proportional hazards test with R's cox.zph.

    The expected statistic comes from this R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )

    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')
    """
    columns = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
        "s": [1, 1, 0, 0, 0],
    }
    frame = pd.DataFrame(columns)
    # Every subject experienced the event.
    frame["E"] = True

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E", weights_col="w", strata="s", robust=True)

    zph = stats.proportional_hazard_test(fitter, frame, time_transform="identity")
    fitter.print_summary()

    observed = zph.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed, 0.0283, rtol=1e-3)
def coxph_coef(data, duration_col, event_col, silence=True):
    """Return the hazard ratio of the ``div`` covariate when it is significant.

    Parameters
    ----------
    data : pandas.DataFrame
        Survival data containing ``duration_col``, ``event_col`` and the
        covariates (including ``div``).
    duration_col : str
        Name of the duration column.
    event_col : str
        Name of the event indicator column.
    silence : bool
        When False, show fitting progress and print the model summary.

    Returns
    -------
    float
        ``exp(coef)`` of ``div`` when its p-value is below 0.05, otherwise
        ``-1.0`` (also when the model has no ``div`` covariate).
    """
    cph = CoxPHFitter()
    cph.fit(data,
            duration_col=duration_col,
            event_col=event_col,
            show_progress=(not silence))
    if not silence:
        cph.print_summary()

    summary = cph.summary
    # Test the p-value of `div` itself. The previous check,
    # `(summary['p'] < 0.05).any()`, fired when *any* covariate was
    # significant, which contradicts the stated intent.
    if 'div' in summary.index and summary.loc['div', 'p'] < 0.05:
        return np.exp(summary.loc['div', 'coef'])

    # Sentinel for "not significant".
    return -1.0
def do_baseline(foldnum, train, valid, exp_code, model_str):
    """Fit a Cox baseline on ``train`` and print log partial hazards for ``valid``.

    ``foldnum``, ``exp_code`` and ``model_str`` are accepted but not used.
    Missing values are replaced by the column mean of the respective frame.
    """
    train_frame = pd.DataFrame(train.x)
    print(train_frame.shape)
    train_frame['duration'] = train.y
    # An event is recorded where the `c` flag equals 0.
    train_frame['event'] = [1 if flag == 0 else 0 for flag in train.c]
    train_frame = train_frame.fillna(train_frame.mean())

    model = CoxPHFitter()
    model.fit(train_frame, 'duration', event_col="event")
    model.print_summary()

    valid_frame = pd.DataFrame(valid.x)
    valid_frame = valid_frame.fillna(valid_frame.mean())
    print(model.predict_log_partial_hazard(valid_frame))
def f(train, threshold, test):
    """Score genes with ``h``, project the selected genes onto one PCA component, fit Cox on the test set.

    NOTE(review): ``threshold`` is never read; the selection cut-off is the
    literal ``1`` below. Confirm whether the parameter was meant to be used.
    Returns the result of ``print_summary()``.
    """
    scores = pd.DataFrame(h(train), index=np.array(range(1, 21149)))
    selected = scores.index[scores.iloc[:, 0] > 1].tolist()
    # Gene columns are named "V<index>".
    candidate_genes = ['V{0}'.format(gene_id) for gene_id in selected]

    # Standardise using statistics learned on the training data only.
    scaler = preprocessing.StandardScaler()
    scaled_train = scaler.fit_transform(train.loc[:, candidate_genes])
    scaled_test = scaler.transform(test.loc[:, candidate_genes])

    # First principal component, fitted on train and applied to test.
    pca = sklearnPCA(n_components=1)
    pca.fit_transform(scaled_train)
    test_component = pca.transform(scaled_test)

    # assign() returns a new frame, so the caller's `test` is not modified.
    test = test.assign(w=pd.Series(np.ones(len(test.patient_id))))
    test['w'] = test_component
    testset_surv = test[['event_free_survival_time_days', 'death', 'w']]

    cph = CoxPHFitter()
    cph.fit(testset_surv, 'event_free_survival_time_days', event_col='death')
    return cph.print_summary()
def cox(d_male, d_female):
    """Fit a Cox model with ``sex`` as the only covariate and print its summary.

    ``d_male`` and ``d_female`` hold durations; every row is marked as an
    observed event. ``sex`` is 0 for the male rows and 1 for the female rows.
    """
    labelled_groups = [(d_male, 0), (d_female, 1)]
    frames = [
        pd.DataFrame({"time": durations, "event": 1, "sex": sex_code})
        for durations, sex_code in labelled_groups
    ]
    df = pd.concat(frames)
    print(len(d_male), len(d_female))

    model = CoxPHFitter()
    model.fit(df, duration_col="time", event_col="event")
    model.print_summary()
def test_cox(phenotype_dataset, survival_dataset, pheno_survival_integrated, filtered_list, filter_type, filter_na_by_rows): headers = phenotype_dataset[0][0:1] + phenotype_dataset[0][ pheno_start:pheno_limit] + survival_dataset[0][4:] pandas.set_option("mode.use_inf_as_na", True) df = pandas.DataFrame( columns=headers, data=[[k] + v for k, v in pheno_survival_integrated.iteritems() ]) # np.array().astype(np.float32) decode_categorical_values(df) for cur_header in headers[1:]: if filter_type == FILTER_IN and cur_header not in filtered_list: df = df.drop(cur_header, 1) print "column {} was dropped as it's not filtered in".format( cur_header) continue if filter_type == FILTER_OUT and cur_header in filtered_list: df = df.drop(cur_header, 1) print "column {} was dropped as it has low variance".format( cur_header) continue if df[[cur_header ]].isnull().values.any() and (not filter_na_by_rows or filter_type == FILTER_OUT): df = df.drop(cur_header, 1) print "column {} was dropped as it has NaN values".format( cur_header) continue try: df[[cur_header]] = df[[cur_header]].apply(pandas.to_numeric) print "column {} has numeric values".format(cur_header) except ValueError: print "{} cannot be converted to numeric. 
converting to categorical instead".format( cur_header) df[cur_header] = df[cur_header].astype('category') df[cur_header] = df[cur_header].cat.codes if filter_na_by_rows and filter_type == FILTER_IN: print "remove NaN values by row" df.dropna(inplace=True) # df[['_OS','_OS_IND', 'age_at_initial_pathologic_diagnosis']] = df[['_OS','_OS_IND', 'age_at_initial_pathologic_diagnosis']].apply(pandas.to_numeric) # df["anatomic_treatment_site"] = df["anatomic_treatment_site"].astype('category') # df['anatomic_treatment_site'] = df['anatomic_treatment_site'].cat.codes df = df.drop(headers[0], 1) # df = pandas.get_dummies(df) # print str(df['tx_on_clinical_trial']) print "shape : {}".format(df.shape) cph = CoxPHFitter() cph.fit(df, duration_col='_OS', event_col='_OS_IND', show_progress=False, step_size=0.001) return cph.print_summary() # access the results using cph.summary
def test_cox(phenotype_dataset, gene_expression_top_var_headers_columns, survival_dataset, pheno_survival_integrated, filtered_list, filter_type, filter_na_by_rows): headers = ["ids"] + list( gene_expression_top_var_headers_columns[:pheno_limit]) + list( survival_dataset[0][4:]) pandas.set_option("mode.use_inf_as_na", True) df = pandas.DataFrame( columns=headers, data=[[k] + v for k, v in pheno_survival_integrated.iteritems() ]) # np.array().astype(np.float32) for cur_header in headers[1:]: if filter_type == FILTER_IN and cur_header.split( ".")[0] not in filtered_list: df = df.drop(cur_header, 1) print "column {} was dropped as it's not filtered in".format( cur_header) continue if filter_type == FILTER_OUT and cur_header in filtered_list: df = df.drop(cur_header, 1) print "column {} was dropped as it has low variance".format( cur_header) continue if df[[cur_header ]].isnull().values.any() and (not filter_na_by_rows or filter_type == FILTER_OUT): df = df.drop(cur_header, 1) print "column {} was dropped as it has NaN values".format( cur_header) continue try: df[[cur_header]] = df[[cur_header]].apply(pandas.to_numeric) print "column {} has numeric values".format(cur_header) except ValueError: print "{} cannot be converted to numeric. converting to categorical instead".format( cur_header) df[cur_header] = df[cur_header].astype('category') df[cur_header] = df[cur_header].cat.codes if filter_na_by_rows and filter_type == FILTER_IN: print "remove NaN values by row" df.dropna(inplace=True) df = df.drop(headers[0], 1) print "shape : {}".format(df.shape) cph = CoxPHFitter() cph.fit(df, duration_col='_OS', event_col='_OS_IND', show_progress=True, step_size=0.0001) return cph.print_summary() # access the results using cph.summary
def Cox_Label_HR(self):
    """Fit a Cox model on ``self.survival_label`` and store the first covariate's hazard ratio and CI.

    On success sets ``self.cox_report_for_HR``, ``self.HR`` and ``self.CI``.
    On failure prints a message with the underlying error and leaves the
    attributes untouched.
    """
    cph = CoxPHFitter()
    try:
        cph.fit(self.survival_label,
                self.duration_column,
                event_col=self.observed_column)
        # print_summary() prints the table; its return value is stored as-is.
        self.cox_report_for_HR = cph.print_summary()
        # hazard_ratios_ is indexed by covariate name, so take the first
        # entry by position explicitly rather than through `[0]`.
        self.HR = cph.hazard_ratios_.iloc[0]
        self.CI = cph.confidence_intervals_.values[0]
    except Exception as exc:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and report the cause.
        print("The truncation has problem. ")
        print(repr(exc))
def main():
    """Merge mother and rainfall data and fit a Cox model on the result.

    File locations come from the parsed command-line arguments. The merged
    frame is indexed by ``IDHSPID``, stripped of bookkeeping columns, and its
    percentile flags are converted to 0/1 before fitting.
    """
    cmd_args = commandLineParser()

    rainfall_df = pd.read_csv(cmd_args.rainfall_data)
    mother_df = pd.read_csv(cmd_args.DHS_data)

    # Left join keeps every mother record, with or without rainfall data.
    merged = pd.merge(mother_df, rainfall_df, on=['DHSID', 'Year'], how='left')
    merged.set_index('IDHSPID', inplace=True)

    # Remove whichever of the unneeded columns are actually present.
    unneeded = ['DHSID', 'Year', r'%-ile', 'Total Rainfall (mm)']
    present = [name for name in merged.columns if name in unneeded]
    merged.drop(present, axis=1, inplace=True)

    # Boolean percentile flags become 1/0.
    for flag_column in [r'<5%-ile', r'<10%-ile', r'<15%-ile']:
        merged[flag_column] = merged[flag_column].eq(True).astype(int)

    model = CoxPHFitter()
    model.fit(merged, 'Event Time', event_col='Event Occured')
    model.print_summary()
def fitcoxmodel(classification, T, E, pid, verbose=True):
    """Fit a single-covariate Cox model and return its coefficient, CI and p-value.

    Parameters: ``classification`` is the covariate, ``T`` the durations,
    ``E`` the event indicators and ``pid`` the row index.
    Returns ``(coef, [lower, upper], p)`` read from the fitted summary.
    """
    frame = pd.DataFrame(data={'T': T, 'E': E, 'Cov': classification},
                         index=pid)

    model = CoxPHFitter()
    model.fit(frame, duration_col='T', event_col='E')
    if verbose:
        model.print_summary()

    # The summary is indexed by covariate name; 'Cov' is the only row.
    cov_row = model.summary.loc['Cov']
    coef = cov_row['coef']
    CI = [cov_row['lower 0.95'], cov_row['upper 0.95']]
    p = cov_row['p']
    return coef, CI, p
def coxph_smoke():
    """Smoke test: H2O's CoxPH on the rossi data agrees with lifelines on concordance."""
    rossi = load_rossi()

    # Reference model from lifelines.
    reference = CoxPHFitter()
    reference.fit(rossi, duration_col='week', event_col='arrest')
    reference.print_summary()

    # Same model through H2O.
    rossi_frame = h2o.H2OFrame(rossi)
    predictors = ["age", "fin", "race", "wexp", "mar", "paro", "prio"]
    estimator = H2OCoxProportionalHazardsEstimator(stop_column="week")
    estimator.train(x=predictors, y="arrest", training_frame=rossi_frame)

    assert estimator.model_id != ""
    expected_formula = "Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio"
    assert estimator.formula() == expected_formula, \
        "Expected formula to be '" + expected_formula + "' but it was " + estimator.formula()

    predictions = estimator.predict(test_data=rossi_frame)
    assert len(predictions) == len(rossi)

    h2o_metrics = estimator.model_performance(rossi_frame)
    lifelines_concordance = concordance_for_lifelines(reference)
    assert abs(lifelines_concordance - h2o_metrics.concordance()) < 0.001
def cox_regression_experiment():
    """Fit a penalised Cox model on flattened visit features and cross-validate it.

    Loads the feature and label arrays from the working directory, appends
    one label column to the features, flattens the result to 94 columns,
    writes it to ``allPatient_now.csv``, fits ``CoxPHFitter(penalizer=100)``
    with column ``'0'`` as duration and ``'93'`` as event, and prints the
    5-fold cross-validation scores with their mean and standard deviation.
    """
    dynamic_features = np.load('pick_5_visit_features_merge_1.npy')[
        0:2100, :, :-2]
    # NOTE: the original called `dynamic_features.astype(np.int32)` and threw
    # the result away, so it had no effect beyond copying the array. It is
    # removed rather than assigned, because a real cast would truncate the
    # features and change the fitted model.
    labels = np.load('pick_5_visit_labels_merge_1.npy')[:, :, -4].reshape(
        -1, dynamic_features.shape[1], 1)

    # Append the label as the last feature, then flatten to 2-D.
    data = np.concatenate((dynamic_features, labels), axis=2).reshape(-1, 94)
    data_set = pd.DataFrame(data)
    # lifelines is given column names as strings ('0' ... '93').
    data_set.columns = [str(name) for name in data_set.columns.values]
    np.savetxt('allPatient_now.csv', data_set, delimiter=',')
    print(list(data_set.columns.values))

    cph = CoxPHFitter(penalizer=100)
    cph.fit(data_set, duration_col='0', event_col='93', show_progress=True)
    cph.print_summary()

    scores = k_fold_cross_validation(cph, data_set, '0', event_col='93', k=5)
    print(scores)
    print(np.mean(scores))
    print(np.std(scores))
def main(filter_gap_days, rp_vs_radiation):
    """Assemble the biopsy-based survival cohort, fit a Cox model, and export the data.

    Parameters
    ----------
    filter_gap_days : int
        Maximum number of days allowed between biopsy and treatment (RP or
        first radiation) for a patient to count as treated.
    rp_vs_radiation : bool
        True compares RP against radiation (patients with both are excluded).
        False compares treated (RP or radiation) against untreated patients;
        the indicator column is then renamed to ``treated``.

    Side effects: reads the CSV/pickle inputs under ``../data`` and writes
    ``df_cox_data_death_causal_inference[_rp_vs_radiation].csv``.
    """
    # Biopsy date per patient.
    with open('../data/empi_to_date_oi_dic.pkl', 'rb') as handle:
        empi_to_date_oi_dic = pickle.load(handle)

    # Radiation-only patients: days from biopsy to first radiation.
    df_first_date_rads = pd.read_csv(
        '../data/processed_data/first_date_rads.csv')
    df_first_date_rads.set_index('EMPI', inplace=True)
    for empi, pr_date in zip(df_first_date_rads.index,
                             df_first_date_rads.prdate_parsed.values):
        if empi in empi_to_date_oi_dic:
            df_first_date_rads.at[empi, 'pr_date_minus_biopsy_date'] = (
                pd.to_datetime(pr_date) - empi_to_date_oi_dic[empi]).days
    df_first_date_rads_filtered = df_first_date_rads.loc[
        df_first_date_rads.pr_date_minus_biopsy_date > 0]
    df_first_date_rads_filtered = df_first_date_rads_filtered.loc[
        df_first_date_rads_filtered.pr_date_minus_biopsy_date <=
        filter_gap_days]
    radiation_empis = set(df_first_date_rads_filtered.index)

    df_multirp = pd.read_csv('../data/processed_data/multirp.csv')
    df_multirp.set_index('EMPI', inplace=True)
    multirp_empis = set(df_multirp.index)

    # Feature tables (first CSV column is the patient id).
    df_merged_comorb = pd.read_csv(
        '../data/merged/df_merged_biopsy_based_C.csv')
    df_merged_comorb.set_index(df_merged_comorb.columns[0], inplace=True)
    df_merged_comorb.index.name = 'EMPI'

    df_merged_psa_prior = pd.read_csv(
        '../data/merged/df_merged_biopsy_based_E.csv')
    df_merged_psa_prior.set_index(df_merged_psa_prior.columns[0],
                                  inplace=True)
    df_merged_psa_prior.index.name = 'EMPI'

    df_outcome_oi = pd.read_csv(
        '../data/df_outcome_final_biopsy_based_clean.csv')
    df_outcome_oi.set_index('EMPI', inplace=True)

    df_outcome_oi_filtered_rp_postive = df_outcome_oi.loc[
        df_outcome_oi.rp_date_minus_biopsy_date_in_days <= filter_gap_days]
    df_outcome_oi_rp_negative = df_outcome_oi.loc[
        df_outcome_oi.rp_date.isnull()]

    # NOTE: every set-valued row selection below is converted with list();
    # pandas no longer accepts a set as a .loc indexer.
    rp_negative_empis = set(df_outcome_oi_rp_negative.index)
    if rp_vs_radiation:
        # Exclude patients who received both RP and radiation.
        both_rp_radiation_empis = set(
            df_outcome_oi_filtered_rp_postive.index) & radiation_empis
        df_outcome_oi_filtered_rp_postive = df_outcome_oi_filtered_rp_postive.loc[
            list(set(df_outcome_oi_filtered_rp_postive.index) -
                 both_rp_radiation_empis)]
        # Radiation patients without RP, minus those listed in multirp.csv.
        # Parentheses spell out the precedence the original relied on.
        df_outcome_oi_rp_negative = df_outcome_oi_rp_negative.loc[
            list(rp_negative_empis & (radiation_empis - multirp_empis))]
        df_outcome_oi_filtered_total = pd.concat(
            [df_outcome_oi_filtered_rp_postive, df_outcome_oi_rp_negative])
    else:
        # Treated (RP or radiation) versus untreated.
        df_outcome_oi_treated_postive = pd.concat([
            df_outcome_oi_filtered_rp_postive,
            df_outcome_oi_rp_negative.loc[list(rp_negative_empis &
                                               radiation_empis)]
        ])
        df_outcome_oi_treated_negative = df_outcome_oi_rp_negative.loc[
            list(rp_negative_empis - radiation_empis)]
        df_outcome_oi_filtered_total = pd.concat(
            [df_outcome_oi_treated_postive, df_outcome_oi_treated_negative])

    # Keep only strictly positive time to death.
    df_outcome_oi_filtered_final = df_outcome_oi_filtered_total.loc[
        df_outcome_oi_filtered_total.time_to_death_in_month > 0]

    # Merge comorbidity features with the prior PSA value.
    empi_common = list(
        set(df_merged_comorb.index) & set(df_merged_psa_prior.index))
    df_merged_psa_comorb = pd.concat([
        df_merged_comorb.loc[empi_common],
        df_merged_psa_prior.loc[empi_common].psa_prior_to_rp
    ],
                                     axis=1)
    df_merged_psa_comorb.drop(columns=['wscore_agg'], inplace=True)

    # Merge features with outcomes.
    empi_common_outcome = list(
        set(df_outcome_oi_filtered_final.index) &
        set(df_merged_psa_comorb.index))
    outcome_cols_oi = ['death_ind', 'time_to_death_in_month']
    df_cox = pd.concat([
        df_merged_psa_comorb.loc[empi_common_outcome],
        df_outcome_oi_filtered_final.loc[empi_common_outcome][outcome_cols_oi]
    ],
                       axis=1)
    if not rp_vs_radiation:
        # Radiation patients count as treated too.
        df_cox.loc[df_cox.index.isin(df_outcome_oi_treated_postive.index),
                   'rp_indicator'] = 1
        df_cox.rename(columns={'rp_indicator': 'treated'}, inplace=True)

    # Remove metastatic cancer patients, then drop unused feature columns.
    drop_cols = ['benign', 'Unknown/other']
    df_cox = df_cox.loc[df_cox.metacanc_agg == 0]
    if rp_vs_radiation:
        drop_cols_non_informative = [
            'metacanc_agg', 'rheumd_agg', 'copd_agg', 'mld_agg', 'diabwc_agg',
            'hp_agg', 'rend_agg', 'aids_agg'
        ]
    else:
        drop_cols_non_informative = [
            'metacanc_agg', 'cevd_agg', 'rheumd_agg', 'mld_agg', 'diabwc_agg',
            'hp_agg', 'aids_agg'
        ]
    df_cox_final = df_cox.drop(columns=drop_cols + drop_cols_non_informative)

    # Z-score the continuous covariates.
    standardize_cols = ['Age at RP', 'auxiiliary_mci_score', 'psa_prior_to_rp']
    for col in standardize_cols:
        df_cox_final[col] = (df_cox_final[col].values - np.mean(
            df_cox_final[col])) / np.std(df_cox_final[col].values)

    print('Final cox df stats : ')
    print(df_cox_final.sum())

    cph = CoxPHFitter(penalizer=0.00, l1_ratio=0)
    cph.fit(df_cox_final,
            'time_to_death_in_month',
            'death_ind',
            show_progress=False,
            step_size=0.1)
    cph.print_summary()

    # Rename columns for downstream task compatibility.
    df_cox_final.rename(columns={
        'death_ind': 'death',
        'rp_indicator': 'rp',
        'time_to_death_in_month': 'survtime'
    },
                        inplace=True)
    if rp_vs_radiation:
        df_cox_final.to_csv(
            '../data/df_cox_data_death_causal_inference_rp_vs_radiation.csv')
    else:
        df_cox_final.to_csv('../data/df_cox_data_death_causal_inference.csv')
    return
Created on Sat Feb 6 20:47:27 2016 @author: Rahul Ahuja """ from lifelines import CoxPHFitter cf = CoxPHFitter() import pandas as pd import numpy as np import grade ProcessedData = pd.read_csv('C:/Users/abmm832/Downloads/ProcessedData.csv', index_col=False) cf.fit(ProcessedData, 'Time', event_col='loan_status') cf.print_summary() #should have made the module of this function as well. Anyways I ve done it for loan grade module. def get_non_negative_int(prompt): while True: try: value = int(input(prompt)) except ValueError: print("Invalid entry") continue if value < 0: print("Invalid entry") continue else:
import pandas as pd
from lifelines import WeibullAFTFitter, CoxPHFitter

# This is an implementation of https://uwspace.uwaterloo.ca/bitstream/handle/10012/10265/Cook_Richard-10265.pdf
# NOTE(review): `np` and `weibull_min` are used below but not imported in the
# visible part of this script -- confirm they are imported elsewhere.

# Sample size and probability of the binary covariate Z.
N = 50000
p = 0.5

# Coefficients of X and Z on the log-time scale.
bX = np.log(0.5)
bZ = np.log(4)

# Two binary covariates, plus X_, a normal variable centred at 20000 with
# standard deviation 10.
Z = np.random.binomial(1, p, size=N)
X = np.random.binomial(1, 0.5, size=N)
X_ = 20000 + 10 * np.random.randn(N)

# Log-linear model for the duration with a Weibull(shape=1) error term.
W = weibull_min.rvs(1, scale=1, loc=0, size=N)
Y = bX * X + bZ * Z + np.log(W)
T = np.exp(Y)

#######################################

# Z contributes to T but is not included in the frame given to the fitters.
df = pd.DataFrame({"T": T, "x": X, "x_": X_})

wf = WeibullAFTFitter().fit(df, "T")
wf.print_summary(4)

cph = CoxPHFitter().fit(df, "T", show_progress=True, step_size=1.0)
cph.print_summary(4)
def _assert_baseline_frames_match(frame_py, frame_h2o):
    """Print and compare a lifelines baseline frame with the matching H2O frame."""
    # H2O reports column names as strings; rename the lifelines frame in place.
    for col_name in frame_py.columns:
        frame_py.rename(columns={col_name: str(col_name)}, inplace=True)

    py_sorted = frame_py.reset_index(drop=True).sort_index(axis=1)
    # H2O carries the time axis as a 't' column; lifelines has it as the index.
    h2o_sorted = frame_h2o.drop('t', axis="columns").reset_index(
        drop=True).sort_index(axis=1)
    py_sorted = fix_py_result_for_older_lifelines(py_sorted)

    print("h2o:")
    print(frame_h2o.reset_index(drop=True))
    print("lifelines:")
    print(py_sorted.reset_index(drop=True))

    assert_frame_equal(py_sorted, h2o_sorted,
                       check_dtype=False,
                       check_index_type=False,
                       check_column_type=False)


def check_cox(rossi, x, stratify_by, formula):
    """Check that H2O's CoxPH matches lifelines on the rossi data.

    Parameters
    ----------
    rossi : pandas.DataFrame
        The rossi data set; strata columns are converted to categoricals
        in place.
    x : list of str
        Predictor columns passed to H2O.
    stratify_by : list of str or None
        Strata columns; an empty or ``None`` value means no stratification.
    formula : str
        The formula string H2O is expected to report.

    Raises ``AssertionError`` when formula, prediction count, concordance,
    baseline hazard or baseline survival disagree.
    """
    # The original looped over `stratify_by` directly, which failed on None
    # even though its un-stratified branch exists for exactly that case.
    strata_cols = stratify_by or []

    if stratify_by:
        cph_py = CoxPHFitter(strata=stratify_by)
    else:
        cph_py = CoxPHFitter()
    for col in strata_cols:
        rossi[col] = rossi[col].astype('category')

    cph_py.fit(rossi, duration_col='week', event_col='arrest')
    cph_py.print_summary()

    rossi_h2o = h2o.H2OFrame(rossi)
    for col in strata_cols:
        rossi_h2o[col] = rossi_h2o[col].asfactor()

    cph_h2o = H2OCoxProportionalHazardsEstimator(stop_column="week",
                                                 stratify_by=stratify_by)
    cph_h2o.train(x=x, y="arrest", training_frame=rossi_h2o)

    assert cph_h2o.model_id != ""
    assert cph_h2o.formula() == formula, \
        "Expected formula to be '" + formula + "' but it was " + cph_h2o.formula()

    predH2O = cph_h2o.predict(test_data=rossi_h2o)
    assert len(predH2O) == len(rossi)

    metrics_h2o = cph_h2o.model_performance(rossi_h2o)
    concordance_py = concordance_for_lifelines(cph_py)
    assert abs(concordance_py - metrics_h2o.concordance()) < 0.001

    hazard_h2o_as_pandas = cph_h2o.baseline_hazard_frame.as_data_frame(
        use_pandas=True)
    _assert_baseline_frames_match(cph_py.baseline_hazard_,
                                  hazard_h2o_as_pandas)

    survival_h2o_as_pandas = cph_h2o.baseline_survival_frame.as_data_frame(
        use_pandas=True)
    _assert_baseline_frames_match(cph_py.baseline_survival_,
                                  survival_h2o_as_pandas)
# -*- coding: utf-8 -*-
# Timing script: fit a Cox regression on the rossi data repeated 20 times.
if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np
    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    # Stack 20 copies of the data set to enlarge the problem.
    rossi_x20 = pd.concat([load_rossi()] * 20)

    fitter = CoxPHFitter()
    started = time.time()
    fitter.fit(rossi_x20,
               duration_col="week",
               event_col="arrest",
               batch_mode=True)
    elapsed = time.time() - started
    print("--- %s seconds ---" % elapsed)
    fitter.print_summary(4)
print(cv['test_score'].mean()) #Feature Importance for idx,estimator in enumerate(cv['estimator']): print("Features sorted by their score for estimator {}:".format(idx)) feature_importances = pd.DataFrame(estimator.feature_importances_, index = X.columns, columns=['importance']).sort_values('importance', ascending=False) print(feature_importances) #Fit Logistic Regression model = sm.OLS(y, X) result = model.fit() print("regression results:\n", result.summary()) #Using Cox Proportional Hazards model df=dat[['Month', 'Status_F', 'Age(year)', 'Max.rotor.speed,RPM-Avg', 'active power-Avg']] print("df=",df ) #Survival Analysis_ Cox Proportional Hazard Model cph = CoxPHFitter() ## Instantiate the class to create a cph object cph.fit(df , 'Month', 'Status_F') ## Fit the data to train the model cph.print_summary() ## HAve a look at the significance of the features cph.plot() #Survival curves for the selected Turbines tr_rows = df.iloc[19:27, 2:] cph.predict_survival_function(tr_rows).plot() plt.show()
# Boolean mask: rows with a history of neoadjuvant treatment.
# NOTE(review): this fragment is Python 2 (print statements) and uses `.ix`,
# which recent pandas releases no longer provide -- confirm the target
# environment before modernising.
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)

# Kaplan-Meier curve for the treated group.
kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.ix[tx, survival_col], event_observed=df.ix[tx, censor_col], label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True, ci_show=False)

# Kaplan-Meier curve for the untreated group, drawn on the same axes.
kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.ix[~tx, survival_col], event_observed=df.ix[~tx, censor_col], label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True, ci_show=False )

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title ('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the two groups.
results = logrank_test(df.ix[tx, survival_col], df.ix[~tx, survival_col], df.ix[tx, censor_col], df.ix[~tx, censor_col], alpha=.99 )
results.print_summary()

# Cox model with age at diagnosis as the only covariate; rows with a missing
# age are removed first.
cox = CoxPHFitter(normalize=False)
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col, include_likelihood=True)
cox.print_summary()

# 10-fold cross-validation of the same model.
scores = k_fold_cross_validation(cox, df_age, survival_col, event_col=censor_col, k=10)
print scores
print 'Mean score', np.mean(scores)
print 'Std', np.std(scores)
def main(outcome_oi):
    """Assemble the RP-positive cohort, export it, and fit an L1-penalised Cox model.

    Parameters
    ----------
    outcome_oi : str
        ``'death'`` or ``'bcr'``; selects the outcome columns, the export
        file name and the result file name.

    Raises
    ------
    ValueError
        If ``outcome_oi`` is neither ``'death'`` nor ``'bcr'``. Previously
        such a value exported the death data and then failed while fitting on
        the missing bcr columns.

    Side effects: reads inputs under ``../data``; writes
    ``../data/df_cox_data_<outcome>.csv`` and ``cox_result_<outcome>.csv``.
    """
    if outcome_oi == 'bcr':
        event_col, duration_col = 'bcr_ind', 'time_to_bcr_in_month'
    elif outcome_oi == 'death':
        event_col, duration_col = 'death_ind', 'time_to_death_in_month'
    else:
        raise ValueError(
            "outcome_oi must be 'death' or 'bcr', got {!r}".format(outcome_oi))

    # RP procedure date per patient.
    with open('../data/empi_to_rp_date_dic.pkl', 'rb') as handle:
        empi_to_rp_date_dic = pickle.load(handle)
    # Year buckets for the procedure date (original note: dates span
    # 1992-8-5 to 2020-7-29).
    rp_date_range = [
        np.arange(1992, 1998),
        np.arange(1998, 2004),
        np.arange(2004, 2009),
        np.arange(2009, 2015),
        np.arange(2015, 2021)
    ]

    data_merged = pd.read_csv(
        '../data/merged/df_merged_rp_positive_based_E.csv')
    data_merged.set_index('Unnamed: 0', inplace=True)
    data_merged.index.name = 'EMPI'

    data_merged_comorb = pd.read_csv(
        '../data/merged/df_merged_rp_positive_based_C.csv')
    data_merged_comorb.set_index('Unnamed: 0', inplace=True)
    data_merged_comorb.index.name = 'EMPI'

    # list(): pandas no longer accepts a set as a .loc indexer.
    empis_oi = list(set(data_merged.index) & set(data_merged_comorb.index))
    comorbs_oi = [
        'ami_agg', 'chf_agg', 'pvd_agg', 'cevd_agg', 'copd_agg', 'rheumd_agg',
        'pud_agg', 'mld_agg', 'diab_agg', 'diabwc_agg', 'hp_agg', 'rend_agg',
        'metacanc_agg', 'aids_agg'
    ]
    data_merged = pd.concat([
        data_merged.loc[empis_oi],
        data_merged_comorb.loc[empis_oi][comorbs_oi]
    ],
                            axis=1)

    # One-hot encode the pT stage.
    pt_stage_cols = [
        'pt1', 'pt1a', 'pt1b', 'pt1c', 'pt2', 'pt2a', 'pt2b', 'pt2c', 'pt3',
        'pt3a', 'pt3b', 'pt3c', 'pt4'
    ]
    for stage_col in pt_stage_cols:
        data_merged[stage_col] = 0
    for empi, pt_stage in zip(data_merged.index,
                              data_merged.pT_stage_combined.values):
        data_merged.at[empi, pt_stage] = 1

    # Bucket the RP procedure date by year range.
    # NOTE(review): a date outside every range appends nothing, which makes
    # the assignment below fail on length mismatch -- confirm the ranges
    # cover all dates.
    rp_date_list = []
    for empi in data_merged.index:
        rp_date = empi_to_rp_date_dic[empi]
        for yr_idx, yr_range in enumerate(rp_date_range):
            if rp_date.year in yr_range:
                rp_date_list.append(yr_idx)
                break
    data_merged['rp_date'] = rp_date_list

    # Load outcomes and align them with the features.
    df_outcome = pd.read_csv('../data/df_outcome_rp_positive_final.csv')
    df_outcome.set_index('EMPI', inplace=True)
    empis_oi = list(set(df_outcome.index) & set(data_merged.index))
    df_outcome_oi = df_outcome.loc[empis_oi]
    df_merged_data_oi = data_merged.loc[empis_oi]
    print('Feature stats in the cohort of interest : ')
    print(df_merged_data_oi.sum())

    drop_pt_stage = ['pt1', 'pt1a', 'pt1b', 'pt1c', 'pt3c', 'pt4', 'pt3']
    drop_other = ['Unknown/other', 'Asian']
    drop_cols = drop_pt_stage + drop_other
    # Reassign rather than drop in place on a frame derived from .loc.
    df_merged_data_oi = df_merged_data_oi.drop(columns=drop_cols)
    print('New featur stats in the cohort of interest : ')
    print(df_merged_data_oi.sum())

    outcome_cols_oi = [event_col, duration_col]
    df_outcome_oi = df_outcome_oi[outcome_cols_oi]
    df_cox_data = pd.concat([df_merged_data_oi, df_outcome_oi], axis=1)

    print('\n')
    print('Exporting data...')
    df_cox_data.to_csv('../data/df_cox_data_' + outcome_oi + '.csv')
    print('\n')

    run_cox = True
    if run_cox:
        cph = CoxPHFitter(l1_ratio=1, penalizer=0.01)
        # The raw stage label is not a numeric covariate; its one-hot columns
        # are already present.
        cph.fit(df_cox_data.drop(columns=['pT_stage_combined']),
                duration_col, event_col)
        cph.print_summary()
        print(cph.summary)
        cph.summary.round(3).to_csv('cox_result_' + outcome_oi + '.csv')
    return
ax = kmf.survival_function_.plot(ax=ax)
ax.set_title('Survival function')
plt.show()

fig, ax = plt.subplots()
ax = kmf.plot(ax=ax)
ax.set_title('Survival with confidence intervals')
plt.show()

from lifelines import CoxPHFitter

cph = CoxPHFitter()

# Select the columns used as covariates, plus the duration and event columns.
cph_bladder_df = bladder[['rx', 'number', 'size', 'enum', 'stop', 'event']]
cph.fit(cph_bladder_df, duration_col='stop', event_col='event')

# Print the coefficient table. print_summary() does the printing itself, so
# wrapping it in print() only added a stray "None" line.
cph.print_summary()

# Split by treatment group and fit one Kaplan-Meier estimator per group.
rx1 = bladder.loc[bladder['rx'] == 1]
rx2 = bladder.loc[bladder['rx'] == 2]

kmf1 = KaplanMeierFitter()
kmf1.fit(rx1['stop'], event_observed=rx1['event'])

kmf2 = KaplanMeierFitter()
kmf2.fit(rx2['stop'], event_observed=rx2['event'])

# Log-log plots of both groups on shared axes.
fig, axes = plt.subplots()
kmf1.plot_loglogs(ax=axes)
kmf2.plot_loglogs(ax=axes)
axes.legend(['rx1', 'rx2'])
plt.show()
# position in the sorted array of times while setting other # positions to 0 so that the cumsum operation will result # in each of the positions having the same sum of risks for i in range(time.shape[0] - 1, 0, -1): # Going from smallest survival times to largest if time[i] == time[i - 1]: # Push risk to the later time (earlier in array position) risk[i - 1] = risk[i - 1] + risk[i] risk[i] = 0 event = K.gather(y_true[:, 1], indices=sorting.indices) denom = K.cumsum(risk) terms = xbeta - K.log(denom) loglik = K.cast(event, dtype=terms.dtype) * terms return -K.sum(loglik) # Compile model model.compile(optimizer="adam", loss=neg_log_pl) # Fit model with the whole dataset as a batch, since the # partial likelihood depends on all observations model.fit(X, y, batch_size=n, epochs=3000) # Compare to Cox model cph = CoxPHFitter() # CoxPHFitter uses Efron's method for handling tied survival times, # whereas neg_log_pl uses Breslow's method, so the likelihood # functions being optimized are not exactly the same cph.fit(kidtx, duration_col="time", event_col="death") cph.print_summary(decimals=8) model.get_weights()
# Remove correlated features; 0.8 is passed as the second argument
# (presumably the correlation cut-off -- confirm against RandDropCorr).
all_features_drop_corr, de_corr_features = RandDropCorr(
    all_features_drop_low_var, 0.8)
all_features_drop_corr.columns = de_corr_features

# Join the reduced features with the survival columns and drop the case id.
all_features_reduced = pd.concat(
    [all_features_drop_corr, survival_df_filtered],
    axis=1).drop('case_submitter_id', axis=1)

# Penalised Cox model fitted on the full reduced feature set.
my_cph = CoxPHFitter(penalizer=0.005, l1_ratio=0.9)
# haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1).to_csv('truth_reg_vars.csv')
# my_cph.fit(haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1), duration_col = 'days_to_death', event_col='vital_status')
my_cph.fit(all_features_reduced,
           duration_col='days_to_death',
           event_col='vital_status')
my_cph.print_summary()

# Refit the same fitter after removing two features.
haha = all_features_reduced.drop(
    ['original_glrlm_GrayLevelNonUniformityNormalized_1'], axis=1)
haha = haha.drop(['original_glcm_SumEntropy_1'], axis=1)
my_cph.fit(haha, duration_col='days_to_death', event_col='vital_status')
my_cph.print_summary()

# NOTE(review): cross-validation runs on `all_features_reduced`, not on the
# two-feature-reduced `haha` frame fitted just above -- confirm which is
# intended.
scores = k_fold_cross_validation(my_cph,
                                 all_features_reduced,
                                 duration_col='days_to_death',
                                 event_col='vital_status',
                                 k=10,
                                 scoring_method="concordance_index")
# The mean is computed but neither printed nor stored; it only shows up when
# run interactively.
np.mean(scores)
cph_data['Husband_Race'] == 'Other Ethnic Groups', 0, 1) cph_data['Couple_Race'] = np.where(cph_data['Couple_Race'] == 'Same-Race', 0, 1) cph_data.drop(['Abbreviation', 'State'], axis=1, inplace=True) cph_data['Has_Children'] = np.where( cph_data['Has_Children'] == 'Have Children', 1, 0) cph_data['Household_Income_Range'] = np.where(cph_data['Household_Income_Range']=='42,830$ - 44,765$',0,\ (np.where(cph_data['Household_Income_Range']=='66,532$ - 70,303$',1,2))) print('---------CPH fitting starts her---------------- ') # Fitting the model and plotting the corresponding prediction cph = CoxPHFitter() cph.fit(cph_data, duration_col='Duration', event_col='Divorce', show_progress=True) cph.print_summary() sns.set() cph.plot() plt.savefig( '/home/raed/Dropbox/INSE - 6320/Final Project/CPH_Coefficients_plot.pdf') plt.show() #Calculating the correlation between covariates in order to understand the relationship in the data # calculate the correlation matrix cph_correlation = cph_data.corr() print(corr) #sns.heatmap(pd.crosstab(data.Duration, data.Poverty_Percentage)) sns.pairplot(cph_correlation) plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/CPH_Correlation.pdf') plt.show()
""" # print cancer['T'].unique() # print cancer['E'].unique() # cancer = cancer.dropna() # the '-1' term # refers to not adding an intercept column (a column of all 1s). # It can be added to the Fitter class. covMatrix = cancer.cov() cf = CoxPHFitter() cf.fit(covMatrix, "T", event_col="E") # extra paramater for categorical , strata=catVar cf.print_summary() curve = cf.predict_survival_function(cancer) curve.plot() plt.show() print "hazard coeff", cf.hazards_ print "baseline ", cf.baseline_hazard_ """ scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3) print scores print np.mean(scores) print np.std(scores) """
def multivariate(df):
    """Fit a multivariate Cox proportional-hazards model and print its summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed straight to ``CoxPHFitter.fit``. It must contain a
        ``'time'`` column (durations) and a ``'status'`` column (event
        indicator); the fitter is given the whole frame.

    Returns
    -------
    lifelines.CoxPHFitter
        The fitted model, so callers can read ``.summary`` and other results.
        (Previously nothing was returned, which made the fitted model
        unreachable after the call.)
    """
    # Imported lazily so that lifelines is only required when this is called.
    from lifelines import CoxPHFitter

    cph = CoxPHFitter()
    cph.fit(df, duration_col='time', event_col='status', show_progress=True)
    cph.print_summary()
    # access the results using cph.summary on the returned model
    return cph
def _coxph_ci_and_roc(cph, data, x_cols, y_cols, duration_col, event_col, pt):
    """Return (concordance index, ROC result) of a fitted Cox model on `data`."""
    pred_X = cph.predict_partial_hazard(data[x_cols])
    # NOTE(review): assumes the prediction is a DataFrame whose single column
    # is labelled 0 -- confirm against the installed lifelines version.
    pred_X.rename(columns={0: 'X'}, inplace=True)
    # Negated because a higher partial hazard implies a shorter survival time.
    ci = concordance_index(data[duration_col], -pred_X, data[event_col])
    # AUC at time `pt`, computed from the predicted risk score 'X'.
    scored = pd.concat([data[y_cols], pred_X], axis=1)
    roc = surv_roc(scored, 'X', duration_col, event_col, pt=pt)
    return ci, roc


def surv_coxph(data_train, x_cols, duration_col, event_col,
               data_test=None, pt=None, show_extra=True):
    """Integrate functions that include modeling using Cox Regression and evaluating

    Fits a ``lifelines.CoxPHFitter`` on the training data, then prints the
    model summary, the concordance index (CI) and the AUC at time ``pt`` for
    the training set and, when given, the test set.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Full survival data for train.
    x_cols : list of str
        Name of column indicating variables.
    duration_col : str
        Name of column indicating time.
    event_col : str
        Name of column indicating event.
    data_test : pandas.DataFrame
        Full survival data for test, default None.
    pt : float
        Predicted time for AUC. Passed unchanged to ``surv_roc``.
    show_extra : bool
        When True (default), also print the coefficients, p-values and 95%
        confidence intervals of every covariate.

    Returns
    -------
    object
        Object of cox model in `lifelines.CoxPHFitter`.

    Examples
    --------
    >>> surv_coxph(train_data, ['x1', 'x2'], 'T', 'E', test_data, pt=5*12)
    """
    # Accept any sequence of column names, not only a list.
    x_cols = list(x_cols)
    y_cols = [event_col, duration_col]

    cph = CoxPHFitter()
    cph.fit(data_train[x_cols + y_cols], duration_col=duration_col,
            event_col=event_col, show_progress=True)

    # CI and AUC on train (and on test when provided).
    ci_train, roc_train = _coxph_ci_and_roc(
        cph, data_train, x_cols, y_cols, duration_col, event_col, pt)
    has_test = data_test is not None
    if has_test:
        ci_test, roc_test = _coxph_ci_and_roc(
            cph, data_test, x_cols, y_cols, duration_col, event_col, pt)

    # Print Summary of CPH.
    # Every print below takes one pre-formatted string, so the output is the
    # same under Python 2 and Python 3.
    cph.print_summary()
    print("__________Metrics CI__________")
    print("CI of train: %.4f" % ci_train)
    if has_test:
        print("CI of test : %.4f" % ci_test)
    print("__________Metrics AUC__________")
    print("AUC of train: %.4f" % roc_train['AUC'])
    if has_test:
        print("AUC of test : %.4f" % roc_test['AUC'])

    if not show_extra:
        return cph

    # Print Coefficients
    print("__________Summary of Coefficients in CPH__________")
    cols = ['coef', 'p', 'lower 0.95', 'upper 0.95']
    # Read the summary table once instead of on every loop iteration.
    summary = cph.summary
    print("%s :" % cols[0])
    for i in summary.index:
        print("%.4f" % (summary.loc[i, cols[0]]))
    print("__________")
    print("%s :" % cols[1])
    for i in summary.index:
        print("%.4f" % (summary.loc[i, cols[1]]))
    print("__________")
    print("95% CI :")
    for i in summary.index:
        print("[%.4f, %.4f]" % (summary.loc[i, cols[2]],
                                summary.loc[i, cols[3]]))
    return cph
# Two hidden layers of 32 ReLU units, dropout, and a single linear output
# that serves as the log-risk score.
model.add(Dense(32, input_shape=(7,), init='glorot_uniform'))  # shape= length, dimension
model.add(Activation('relu'))
model.add(Dense(32, init='glorot_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation="linear", init='glorot_uniform',
                W_regularizer=l2(0.01),
                activity_regularizer=activity_l2(0.01)))

# sgd = SGD(lr=1e-5, decay=0.01, momentum=0.9, nesterov=True)
rmsprop = RMSprop(lr=1e-5, rho=0.9, epsilon=1e-8)
# Fix: the original passed `optimizer=sgd`, but the SGD definition above is
# commented out and `rmsprop` was created without being used.
model.compile(loss=negative_log_likelihood(E_train), optimizer=rmsprop)

print('Training...')
model.fit(X_train, Y_train, batch_size=324, nb_epoch=1000, shuffle=False)  # Shuffle False --> Important!!

# The network outputs log-risk; exponentiate to obtain hazard ratios and
# negate them when scoring, since higher risk means shorter survival.
hr_pred = model.predict(X_train)
hr_pred = np.exp(hr_pred)
ci = concordance_index(Y_train, -hr_pred, E_train)

hr_pred2 = model.predict(X_val)
hr_pred2 = np.exp(hr_pred2)
ci2 = concordance_index(Y_val, -hr_pred2, E_val)

# Single pre-formatted strings print identically on Python 2 and Python 3.
print('Concordance Index for training dataset: %s' % (ci,))
print('Concordance Index for test dataset: %s' % (ci2,))

# Cox Fitting
cf = CoxPHFitter()
cf.fit(rossi_dataset, 'week', event_col='arrest')
cf.print_summary()  # access the results using cf.summary
# -*- coding: utf-8 -*-
# cox regression: time a batch-mode fit on an enlarged rossi dataset
if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    # Stack 16 copies of the rossi data to get a larger problem to time.
    rossi_x16 = pd.concat([load_rossi()] * 16)
    # rossi_x16 = rossi_x16.reset_index()
    # rossi_x16['week'] = np.random.exponential(1, size=rossi_x16.shape[0])

    fitter = CoxPHFitter()

    # Only the fit itself is timed.
    began = time.time()
    fitter.fit(rossi_x16,
               duration_col="week",
               event_col="arrest",
               batch_mode=True)
    elapsed = time.time() - began
    print("--- %s seconds ---" % elapsed)

    fitter.print_summary()
# -*- coding: utf-8 -*-
# cox regression: penalized fit with a spline baseline, timed and scored
if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi, load_regression_dataset

    # Number of stacked copies of the dataset (1 = the original data).
    reps = 1
    rossi = pd.concat([load_rossi()] * reps)

    # L1-penalized model (l1_ratio=1.0) with a spline baseline hazard.
    spline_cph = CoxPHFitter(
        penalizer=0.1,
        l1_ratio=1.0,
        baseline_estimation_method="spline",
    )

    # Only the fit itself is timed.
    began = time.time()
    spline_cph.fit(rossi,
                   duration_col="week",
                   event_col="arrest",
                   show_progress=True)
    elapsed = time.time() - began
    print("--- %s seconds ---" % elapsed)

    spline_cph.print_summary(2)

    # Default score first, then the concordance index, on the training data.
    default_score = spline_cph.score(rossi)
    print(default_score)
    c_index = spline_cph.score(rossi, scoring_method="concordance_index")
    print(c_index)
# One-hot encode the same categorical columns in every data split.
to_encode = ['edema', 'stage']
one_hot_train, one_hot_val, one_hot_test = [
    to_one_hot(split, to_encode) for split in (df_train, df_val, df_test)
]

print(one_hot_val.columns.tolist())
print(f"There are {len(one_hot_val.columns)} columns")

print(one_hot_train.shape)
one_hot_train.head()

# Fit the Cox model on the encoded training split.
cph = CoxPHFitter()
cph.fit(one_hot_train,
        duration_col='time',
        event_col='status',
        step_size=0.1)
cph.print_summary()

# Compare predicted outcomes for edema_1.0 = 0 versus 1.
cph.plot_covariate_groups('edema_1.0', values=[0, 1])


def hazard_ratio(case_1, case_2, cox_params):
    """Return the hazard ratio of `case_1` relative to `case_2`.

    Computed as exp(cox_params . (case_1 - case_2)).
    """
    covariate_gap = case_1 - case_2
    return np.exp(np.dot(cox_params, covariate_gap))


# Row positions of the two cases to compare.
i = 1
j = 5
# Covariates only: the duration and event columns are dropped.
case_1 = one_hot_train.iloc[i].drop(['time', 'status'])
# -*- coding: utf-8 -*-
# cox regression: fit on rossi, then report and plot follow-up hazard ratios
if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np
    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi, load_regression_dataset

    # Number of stacked copies of the dataset (1 = the original data).
    reps = 1
    df = load_rossi()
    df = pd.concat([df] * reps)

    cph = CoxPHFitter()

    # Only the fit itself is timed.
    start_time = time.time()
    cph.fit(df, duration_col="week", event_col="arrest", show_progress=True)
    print("--- %s seconds ---" % (time.time() - start_time))

    cph.print_summary(2)

    # Compute the follow-up hazard ratios once and reuse the result for both
    # the printout and the plot; the original ran the same computation twice.
    followup_times = [15, 20, 30, 40, 50, 52]
    followup_hazard_ratios = cph.compute_followup_hazard_ratios(
        df, followup_times)
    print(followup_hazard_ratios)
    print(cph.hazard_ratios_)
    followup_hazard_ratios.plot()