def cox_analysis_old(in_df):
    """Fit a Cox model on the transposed input and return its last p-value.

    ``in_df`` is transposed before fitting, so the transposed frame must
    expose a ``'LivingDays'`` column (duration) and a ``'Dead'`` column
    (event indicator); every remaining column is used as a covariate.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Input table; it is transposed before being passed to the fitter.

    Returns
    -------
    The last entry of ``CoxPHFitter._compute_p_values()``, i.e. the p-value
    of the last covariate column of the transposed frame.

    Raises
    ------
    AssertionError
        If the transposed frame contains any missing value.
    """
    df = in_df.transpose()
    # Raised explicitly (instead of a bare ``assert``) so the check is not
    # stripped under ``python -O``; the exception type is kept unchanged.
    if df.isnull().values.any():
        raise AssertionError("cox_analysis_old: input contains missing values")

    cph = CoxPHFitter()
    cph.fit(df, duration_col='LivingDays', event_col='Dead')
    pvals = cph._compute_p_values()
    return pvals[-1]
def forward_stepwise(data, vars_list, target_var, time_var, threshold_in=0.05,
                     verbose=True, changed=True):
    """Greedy forward selection of covariates for a penalized Cox model.

    Starting with no covariates, every round fits one
    ``CoxPHFitter(penalizer=15)`` per remaining candidate (the covariates
    selected so far plus that candidate) and records the candidate's
    p-value. The candidate with the smallest p-value is added when that
    p-value is below ``threshold_in``; selection stops at the first round
    that adds nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the candidate, event and duration columns.
    vars_list : list of str
        Candidate column names. Surrounding whitespace is stripped and
        entries equal to ``target_var`` / ``time_var`` are ignored. The
        caller's list is not modified.
    target_var : str
        Event-indicator column (passed as ``event_col``).
    time_var : str
        Duration column (passed as ``duration_col``).
    threshold_in : float
        A candidate is added only if its p-value is below this value.
    verbose : bool
        Print the candidate list, each addition and the fitter's progress.
    changed : bool
        Initial value of the loop flag; a falsy value skips selection.

    Returns
    -------
    included : list of str
        Selected covariates, in order of selection.
    model : CoxPHFitter or None
        ``CoxPHFitter(penalizer=2)`` fitted on ``model_data``; ``None`` when
        no covariate was selected (there is nothing to fit).
    model_data : pandas.DataFrame
        ``data`` restricted to the selected covariates plus the event and
        duration columns, with incomplete rows dropped.
    """
    outcome_cols = [target_var, time_var]
    # De-duplicate while keeping the caller's order so that ties between
    # candidates are resolved deterministically. A column named 'const' is
    # never selectable: the previous implementation removed it right after
    # selecting it, which re-offered it on every round without end.
    candidates = [
        name
        for name in dict.fromkeys(var.strip() for var in vars_list)
        if name not in outcome_cols and name != 'const'
    ]

    included = []
    keep_going = bool(changed)
    while keep_going:
        keep_going = False
        excluded = [name for name in candidates if name not in included]
        if not excluded:
            break
        if verbose:
            print(excluded)

        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            # Keep only rows complete in every column used by this trial fit.
            X = data[included + [new_column] + outcome_cols].dropna()
            model = CoxPHFitter(penalizer=15)
            model.fit(X, duration_col=time_var, event_col=target_var,
                      show_progress=verbose, step_size=1)
            # The candidate is the last covariate column, hence the last
            # p-value.
            new_pval[new_column] = model._compute_p_values()[-1]

        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            keep_going = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(
                    best_feature, best_pval))

    # The final model must be fitted on the selected covariates, not on the
    # frame left over from the last candidate trial.
    model_data = data[included + outcome_cols].dropna()
    if not included:
        return included, None, model_data

    model = CoxPHFitter(penalizer=2)
    model.fit(model_data, duration_col=time_var, event_col=target_var,
              show_progress=verbose, step_size=1)
    return included, model, model_data
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=None):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Treatment efficacy is computed as ``TE = 1 - HR`` where ``HR`` is the
    exponentiated Cox coefficient of ``treatment_col``. A log-rank test
    between the ``treatment_col == 0`` and ``treatment_col == 1`` rows is
    reported alongside the Cox p-value.

    Parameters
    ----------
    df : pandas.DataFrame
    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list or None
        List of other columns to include in Cox model as covariates.
        ``None`` (the default) means no additional covariates.

    Returns
    -------
    pandas.Series with index
        ``['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']``:

        TE : estimate of treatment efficacy
        UB, LB : upper and lower limits of the confidence interval of TE
        pvalue : Cox model p-value of the treatment coefficient
        logrank_pvalue : p-value of the log-rank test between the arms
        model : the fitted CoxPHFitter"""
    covars = [] if covars is None else list(covars)

    coxphf = CoxPHFitter()
    coxphf.fit(df[[treatment_col, duration_col, event_col] + covars],
               duration_col=duration_col, event_col=event_col)

    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    coef_ci = coxphf.confidence_intervals_[treatment_col]
    # TE = 1 - exp(coef) decreases as coef grows, so the bounds swap: the
    # coefficient's lower bound yields TE's upper limit and vice versa.
    te_upper = 1 - np.exp(coef_ci.loc['lower-bound'])
    te_lower = 1 - np.exp(coef_ci.loc['upper-bound'])
    # The treatment column is the first covariate, hence the first p-value.
    pvalue = coxphf._compute_p_values()[0]

    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(df[duration_col].loc[ind1],
                           df[duration_col].loc[ind2],
                           event_observed_A=df[event_col].loc[ind1],
                           event_observed_B=df[event_col].loc[ind2])

    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    return pd.Series(
        [te, te_upper, te_lower, pvalue, results.p_value, coxphf],
        index=index)
def cox_analysis_mult(in_df, res_position=None, return_coeff=False):
    """Fit a Cox model and report p-values or coefficients of chosen rows.

    The model is fitted on ``in_df`` with ``'LivingDays'`` as duration and
    ``'Dead'`` as event column. The fitter's summary table is printed and
    then indexed by ``res_position``.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must not contain missing values.
    res_position : label or list of labels, optional
        Row label(s) of the summary table to report. Defaults to
        ``['pc1', 'pc2']``.
    return_coeff : bool
        If true return the ``'coef'`` column, otherwise the ``'p'`` column.

    Returns
    -------
    The selection ``summary.loc[res_position, 'coef' or 'p']``.

    Raises
    ------
    AssertionError
        If ``in_df`` contains any missing value.
    """
    if res_position is None:
        # Built per call instead of sharing one mutable default list.
        res_position = ['pc1', 'pc2']
    # Raised explicitly (instead of a bare ``assert``) so the check is not
    # stripped under ``python -O``; the exception type is kept unchanged.
    if in_df.isnull().values.any():
        raise AssertionError("cox_analysis_mult: input contains missing values")

    cph = CoxPHFitter()
    cph.fit(in_df, duration_col='LivingDays', event_col='Dead')
    summary = cph.summary
    print(summary)

    column = 'coef' if return_coeff else 'p'
    return summary.loc[res_position, column]
def cox_analysis(in_df, res_position='pathway', return_coeff=False):
    """Fit a Cox model and report one covariate's p-value or coefficient.

    The model is fitted on ``in_df`` with ``'LivingDays'`` as duration and
    ``'Dead'`` as event column. When the first fit fails, it is retried with
    explicit starting coefficients:

    * ``ValueError``  -> retry with ``[[0], [0.1]]``; if that raises
      ``ValueError`` again, retry with ``[[0], [0.3]]``.
    * ``numpy.linalg.LinAlgError`` -> retry once with ``[[0], [0.1]]``.

    The starting vectors have two entries, so the retries assume a model
    with exactly two covariates.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must not contain missing values.
    res_position : label
        Row of the fitter's summary table to report (default ``'pathway'``).
    return_coeff : bool
        If true return the ``'coef'`` entry, otherwise the ``'p'`` entry.

    Returns
    -------
    ``summary.loc[res_position, 'coef']`` or ``summary.loc[res_position, 'p']``.

    Raises
    ------
    AssertionError
        If ``in_df`` contains any missing value.
    """
    # Raised explicitly (instead of a bare ``assert``) so the check is not
    # stripped under ``python -O``; the exception type is kept unchanged.
    if in_df.isnull().values.any():
        raise AssertionError("cox_analysis: input contains missing values")

    fit_cols = dict(duration_col='LivingDays', event_col='Dead')
    cph = CoxPHFitter()
    # NOTE(review): if LinAlgError subclasses ValueError in the installed
    # NumPy, the first handler below also catches it - confirm before
    # reordering the handlers.
    try:
        cph.fit(in_df, **fit_cols)
    except ValueError:
        print('Changing starting betas')
        try:
            cph.fit(in_df, show_progress=True,
                    initial_beta=np.array([[0], [0.1]]), **fit_cols)
        except ValueError:
            print('Changing starting betas again')
            cph.fit(in_df, show_progress=True,
                    initial_beta=np.array([[0], [0.3]]), **fit_cols)
    except np.linalg.LinAlgError:
        print('Sigular matrix, changing betas')
        cph.fit(in_df, show_progress=True,
                initial_beta=np.array([[0], [0.1]]), **fit_cols)

    summary = cph.summary
    # Honour ``res_position``; it used to be ignored in favour of a
    # hard-coded 'pathway' (which is still the default).
    column = 'coef' if return_coeff else 'p'
    return summary.loc[res_position, column]
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=None):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Treatment efficacy is computed as ``TE = 1 - HR`` where ``HR`` is the
    exponentiated Cox coefficient of ``treatment_col``. A log-rank test
    between the ``treatment_col == 0`` and ``treatment_col == 1`` rows is
    reported alongside the Cox p-value.

    Parameters
    ----------
    df : pandas.DataFrame
    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list or None
        List of other columns to include in Cox model as covariates.
        ``None`` (the default) means no additional covariates.

    Returns
    -------
    pandas.Series with index
        ``['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']``:

        TE : estimate of treatment efficacy
        UB, LB : upper and lower limits of the confidence interval of TE
        pvalue : Cox model p-value of the treatment coefficient
        logrank_pvalue : p-value of the log-rank test between the arms
        model : the fitted CoxPHFitter"""
    covars = [] if covars is None else list(covars)

    coxphf = CoxPHFitter()
    coxphf.fit(df[[treatment_col, duration_col, event_col] + covars],
               duration_col=duration_col, event_col=event_col)

    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    coef_ci = coxphf.confidence_intervals_[treatment_col]
    # TE = 1 - exp(coef) decreases as coef grows, so the bounds swap: the
    # coefficient's lower bound yields TE's upper limit and vice versa.
    te_upper = 1 - np.exp(coef_ci.loc['lower-bound'])
    te_lower = 1 - np.exp(coef_ci.loc['upper-bound'])
    # The treatment column is the first covariate, hence the first p-value.
    pvalue = coxphf._compute_p_values()[0]

    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(df[duration_col].loc[ind1],
                           df[duration_col].loc[ind2],
                           event_observed_A=df[event_col].loc[ind1],
                           event_observed_B=df[event_col].loc[ind2])

    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    return pd.Series(
        [te, te_upper, te_lower, pvalue, results.p_value, coxphf],
        index=index)
def create_data_df(self):
    """Build the output table of hazard ratios and per-column summaries.

    One output row is created for every ``(names, index)`` entry of every
    config in ``self.v_stack.configs``. The table starts with the columns
    ``self.header + ["hr", "lcl", "ucl", "p_value"]`` and is filled in two
    passes:

    1. For each variable in a vertical config's ``args``, a Cox model is
       fitted on the one-hot encoded variable (first level dropped) together
       with ``self.df["_observed"]`` and ``self.df["_time_to_observation"]``.
       Coefficients and confidence bounds are exponentiated into ``hr``,
       ``lcl`` and ``ucl``; the dropped level gets a row of ones.
    2. For each row, the header cells are set from the row's names and every
       config in ``self.h_stack.configs`` either blanks its spacer column
       (``kind == "space"``) or stores a reducer result per named column.

    Returns
    -------
    pandas.DataFrame
        The populated table.
    """
    row_names_index = [
        (names, index)
        for v_conf in self.v_stack.configs
        for (names, index) in v_conf.list_index.items()
    ]
    col_names = self.header + ["hr", "lcl", "ucl", "p_value"]
    output_df = pd.DataFrame(
        columns=col_names, data=0, index=range(len(row_names_index))
    )

    # Pass 1: one Cox fit per variable, then one output row per
    # (names, index) entry, in the same order as ``row_names_index``.
    row = -1
    for v_conf in self.v_stack.configs:
        name_df_dict = {}
        for col_name in set(v_conf.args):
            one_hot = pd.get_dummies(self.df[col_name], drop_first=True)
            non_dropped = list(one_hot.columns)
            # The level removed by ``drop_first`` is the only non-null value
            # without a dummy column.
            dropped = [
                x for x in self.df[col_name].unique()
                if x not in non_dropped and not pd.isnull(x)
            ]
            assert len(dropped) == 1
            dropped = dropped[0]

            one_hot["_observed"] = self.df["_observed"]
            one_hot["_time_to_observation"] = self.df["_time_to_observation"]
            cph = CoxPHFitter()
            cph.fit(
                one_hot,
                duration_col="_time_to_observation",
                event_col="_observed",
                show_progress=False,
            )

            bounds = cph.confidence_intervals_
            # Only coefficients and their bounds are exponentiated; the
            # p-value is stored as computed (it used to be passed through
            # exp() together with the other columns).
            res = pd.DataFrame(
                {
                    "hr": np.exp(cph.params_),
                    "lcl": np.exp(bounds["95% lower-bound"]),
                    "ucl": np.exp(bounds["95% upper-bound"]),
                },
                index=bounds.index,
            )
            res["p_value"] = cph._compute_p_values()

            old_index = res.index
            # Row of ones for the dropped level, placed first. ``.loc`` is
            # used because this is a whole-row assignment, not scalar access.
            res.loc[dropped, :] = 1
            res = res.reindex(index=[dropped] + list(old_index))
            name_df_dict[col_name] = res

        for (names, index) in v_conf.list_index.items():
            row += 1
            # names[0] selects the variable's result table and names[1] the
            # row within it.
            res_df = name_df_dict[names[0]]
            line = list(names) + list(res_df.loc[names[1]])
            output_df.loc[row, :] = line

    for header in self.header:
        output_df[header] = output_df[header].astype(str)

    # Pass 2: header cells and horizontal-config columns, row by row.
    for i, (row_names, row_index) in enumerate(row_names_index):
        for header, name in zip(self.header, row_names):
            output_df.at[i, header] = name

        for h_conf in self.h_stack.configs:
            if h_conf.kind == "space":
                # Spacer column: blank in every row (created if missing).
                name = next(iter(h_conf.name_index))
                output_df[name] = ""
                continue

            for col_name, col_index in h_conf.name_index.items():
                idx = row_index & col_index
                new_df = self.df.iloc[idx]
                reducer = getattr(Reducers(), h_conf.reducer)
                try:
                    out_val = reducer(new_df[h_conf.name])
                except KeyError:
                    # Make room for the text marker; the column may not
                    # exist yet when the first row already fails.
                    if col_name in output_df.columns:
                        output_df[col_name] = output_df[col_name].astype(
                            object)
                    print(h_conf.name)
                    output_df.at[i, col_name] = "ERROR"
                else:
                    output_df.at[i, col_name] = out_val

    return output_df