def KM_estimate(self):
    """Fit a Kaplan-Meier estimator and store its summaries on ``self.KM``.

    Reads ``self.x`` (durations), ``self.δ`` (event flags), ``self.T``
    (timeline), ``self.M`` and ``self.confidence_α``. Relies on the names
    ``S_fun`` and ``σ_interval`` being defined elsewhere in the module.
    Sets ``self.KM`` (with ``kmf``, ``Sfun``, ``mean``, ``σ``, ``mean_σ``,
    ``CI``, ``median``, ``median_σ``) and ``self.current = 'KM'``.
    """
    kmf = KaplanMeierFitter()
    T = self.T
    # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` is the
    # same dtype on every NumPy version.
    kmf.fit(self.x, self.δ.astype(bool), alpha=self.confidence_α,
            timeline=T)
    Survival = np.array(kmf.predict(T))
    self.KM = S_fun(self.x, self.M, T, Survival)
    self.KM.kmf = kmf
    self.KM.Sfun = self.KM.kmf.predict
    # Left Riemann sum of the survival curve over the timeline, plus T[0].
    self.KM.mean = np.sum(
        [(T[nn + 1] - T[nn]) * Survival[nn]
         for nn in range(len(Survival) - 1)]
    ).astype(float) + T[0]
    self.KM.σ = np.array(
        self.KM.kmf.survival_function_.std())[0].astype(float)
    self.KM.mean_σ = self.KM.σ
    self.KM.CI = np.array(self.KM.kmf.confidence_interval_)
    percents = np.array(
        [self.percentile(self.KM.Sfun, T, q / 100.) for q in σ_interval])
    self.KM.median = self.KM.kmf.median_
    # Half the spread between the 2nd and 3rd requested percentiles.
    self.KM.median_σ = 0.5 * np.diff(percents[1:])[0]
    self.current = 'KM'
def plot(self):
    """Plot side-by-side Kaplan-Meier curves of the input datasets.

    One panel is drawn per entry of ``self.stats_original_`` /
    ``self.stats_synthetic_``; within a panel one curve is drawn per
    distinct value of the ``'group'`` column.
    """
    from itertools import cycle

    figsize = (10, 5)
    fig, ax = plt.subplots(1, 2, figsize=figsize, sharey=True)
    palette = ['#0d3d56', '#006887', '#0098b5', '#00cbde', '#00ffff']
    datasets = [self.stats_original_, self.stats_synthetic_]
    for data, label, ax_cur in zip(datasets, self.labels, ax):
        t = data['time']
        e = data['event']
        kmf = KaplanMeierFitter()
        groups = np.sort(data['group'].unique())
        # Cycle the palette: a plain zip() would silently skip every group
        # beyond the number of available colours.
        for g, color in zip(groups, cycle(palette)):
            mask = (data['group'] == g)
            kmf.fit(t[mask], event_observed=e[mask], label=g)
            ax_cur = kmf.plot_survival_function(ax=ax_cur, color=color)
        ax_cur.legend(title=self.group_column)
        ax_cur.set_title('Kaplan-Meier - {} data'.format(label))
        ax_cur.set_ylim(0, 1)
    plt.tight_layout()
def plot_survival_curves(df, time, event, by=None):
    '''
    Create survival curves, optionally grouped by a categorical variable.

    ex) df.pipe(plot_survival_curves, time='days', event='cancel', by='state')

    df:    DataFrame holding the columns named below.
    time:  name of the duration column.
    event: name of the event-observed column.
    by:    optional name of the grouping column; rows where it is missing
           are ignored. Group values are shown as strings in the legend.
    '''
    fig, ax = plt.subplots()
    kmf = KaplanMeierFitter()
    if by:
        present = df[by].notna()
        as_text = df[by].astype(str)
        for level in as_text[present].unique():
            # Compare against the string-cast column: the levels are
            # strings, so comparing with the raw column would match
            # nothing for numeric or categorical groups.
            rows = present & (as_text == level)
            kmf.fit(df.loc[rows, time], event_observed=df.loc[rows, event],
                    label=level)
            kmf.survival_function_.plot(ax=ax)
        plt.legend(title=by, loc=(1, 0))
    else:
        kmf.fit(df[time], event_observed=df[event])
        kmf.survival_function_.plot(ax=ax)
        plt.legend().remove()
def subsetsImpactSurvival(subsets, metadata, metacensorcol="overall_survival",
                          metaDFDcol="death_from_disease", plot=False,
                          title=None, rounding=2):
    """
    Fit and plot one Kaplan-Meier curve per subset of ``metadata``.

    subsets is a dictionary, e.g.:
    subsets={'cluster {}'.format(i):metadata.index.isin(fitrue.columns[kmeans.labels_==i]) for i in range(4)}

    Returns ``(lastvalues, ax)`` where ``lastvalues[name]`` is
    ``(subset size, last valid survival estimate)`` and ``ax`` is the shared
    axes (``None`` when ``subsets`` is empty).

    NOTE(review): ``plot`` and ``rounding`` are accepted but not used; the
    curves are always drawn. Kept unchanged for caller compatibility.
    """
    kmf = KaplanMeierFitter()
    lastvalues = {}
    ax = None
    for subset in subsets:
        mask = subsets[subset]
        kmf.fit(metadata[metacensorcol][mask], metadata[metaDFDcol][mask],
                label=subset)
        curve = kmf.survival_function_
        # .iloc[0] extracts the scalar; float() on a one-element Series is
        # deprecated in recent pandas.
        last_value = float(curve.loc[curve.last_valid_index()].iloc[0])
        lastvalues[subset] = (sum(mask), last_value)
        # First curve creates the axes, later curves reuse them.
        if ax is None:
            ax = kmf.plot()
        else:
            kmf.plot(ax=ax)
    if ax is not None:
        if title:
            ax.set_title(title)
        ax.set_ylim((0, 1))
    return lastvalues, ax
def marginal(self):
    """Compute marginal censoring weights with a reverse Kaplan-Meier fit.

    Flips ``self.data['status']`` in place, optionally fits the estimator
    on ``self.data['failure_time']`` and stores rounded predictions in
    ``self.weights``, resets ``self.times`` / ``self.subject_times``, calls
    ``self.output(...)`` and returns ``self.weights``.

    NOTE(review): the status column is flipped on every call, so calling
    this twice restores the original flags - confirm this is intended.
    NOTE(review): ``self.weights`` is only assigned here when
    ``"IPCW.times"`` is in ``self.what``.
    """
    # Reverse Kaplan-Meier: swap the 0/1 status flags.
    flipped = self.data['status'].values.astype(int) ^ 1
    self.data['status'] = flipped

    # Weights at the requested times.
    if "IPCW.times" in self.what:
        fitter = KaplanMeierFitter()
        fitter.fit(self.data['failure_time'],
                   event_observed=self.data['status'].values,
                   timeline=self.times)
        self.weights = np.round(fitter.predict(self.times), decimals=4)
        self.times = []
    else:
        self.times = None

    # Weights at subject-specific event times (not computed here).
    self.subject_times = [] if "IPCW.subject.times" in self.what else None

    summary = {
        'times': self.times,
        'subject_times': self.subject_times,
        'method': self.method
    }
    # The result of output() is not used; the call itself is retained.
    summary = self.output(summary, self.keep, self.times, self.fit, self.call)
    return self.weights
def __KM_analysis(self, duration_table, expressed_array, unexpressed_array, freq_set):
    """Log-rank compare expressed vs. unexpressed samples; plot if significant.

    ``duration_table`` rows are read as ``(id, duration, event)``; the first
    row is skipped and rows with ``"NA"`` in either value are ignored.
    Curves are shown only when the p-value is below 0.0006.
    Returns the log-rank p-value.
    """
    hit_times, hit_events = [], []
    miss_times, miss_events = [], []
    for position, record in enumerate(duration_table):
        if position == 0:
            continue
        if record[0] in unexpressed_array and record[1] != "NA" and record[2] != "NA":
            times, events = miss_times, miss_events
        elif record[0] in expressed_array and record[1] != "NA" and record[2] != "NA":
            times, events = hit_times, hit_events
        else:
            continue
        times.append(float(record[1]))
        events.append(int(record[2]))

    results = logrank_test(hit_times, miss_times, hit_events, miss_events, alpha=.95)
    significant = results.p_value < .0006
    if significant:
        axis = plt.subplot(111)
        fitter = KaplanMeierFitter()
        cohorts = (
            (hit_times, hit_events, "Satisfying"),
            (miss_times, miss_events, "None-Satisfying"),
        )
        for times, events, caption in cohorts:
            fitter.fit(times, event_observed=events, label=caption)
            fitter.plot(ax=axis, ci_force_lines=False)
        plt.ylim(0, 1)
        plt.title("Lifespans (" + str(freq_set) + ")")
        plt.show()
    return results.p_value
def plot_two_groups(data, t_col_name, e_col_name, g_name, alpha):
    '''
    Render the Kaplan-Meier curves of two groups and show the log-rank
    p-value in the plot title.

    data:       DataFrame holding the columns named below.
    t_col_name: name of the duration column.
    e_col_name: name of the event-observed column.
    g_name:     name of the grouping column; its first two distinct values
                define the two groups (an IndexError is raised if there is
                only one distinct value).
    alpha:      passed through to ``logrank_test`` and shown in the title.
    '''
    T = data[t_col_name]
    E = data[e_col_name]
    # Read the groups from the ``data`` argument; the previous code used an
    # undefined/global ``df`` here.
    groups = data[g_name]
    # Unique values give the names of the first and second group.
    uniques = groups.unique()
    ix = (groups == uniques[0])
    kmf = KaplanMeierFitter()
    # Plot every row that is not in the first group, labelled as the second.
    kmf.fit(T[~ix], E[~ix], label=uniques[1])
    ax = kmf.plot()
    # Plot the first group on the same axes.
    kmf.fit(T[ix], E[ix], label=uniques[0])
    kmf.plot(ax=ax)
    results = logrank_test(T[ix], T[~ix], E[ix], E[~ix], alpha=alpha)
    plt.title('p-value: {0:.4f}, alpha: {1:.2f}'.format(
        results.p_value, alpha))
def fun(epsilon):
    """Print the average relative error of a noisy Kaplan-Meier estimate.

    For 100 repetitions, perturbs the module-level histogram ``his`` with
    ``laplace_mechanism`` (scale ``sqrt(2) / epsilon``), rebuilds synthetic
    durations/events from the noisy bin counts, refits a Kaplan-Meier curve
    and compares it to ``true_value[:, 0]`` on ``kmf.timeline``. Prints
    ``(epsilon, average error)``; returns nothing.

    Relies on module-level names: ``his``, ``bins0``, ``bins1``,
    ``bin_edges0``, ``bin_edges1``, ``kmf``, ``true_value`` and
    ``laplace_mechanism``.
    """
    errors = []
    for _ in range(100):
        noisy = laplace_mechanism(his, np.sqrt(2.0) / epsilon)
        # Sample counts must be non-negative integers: current NumPy rejects
        # float counts in linspace/zeros/ones (older releases truncated).
        counts = [int(max(0.0, d)) for d in noisy]
        time_parts = []
        event_parts = []
        # First ``bins0`` bins are censored observations (event = 0).
        for i in range(bins0):
            time_parts.append(
                np.linspace(bin_edges0[i], bin_edges0[i + 1], counts[i]))
            event_parts.append(np.zeros(counts[i]))
        # Next ``bins1`` bins are observed events (event = 1).
        for i in range(bins1):
            time_parts.append(
                np.linspace(bin_edges1[i], bin_edges1[i + 1],
                            counts[bins0 + i]))
            event_parts.append(np.ones(counts[bins0 + i]))
        # Concatenate once instead of growing arrays with np.append.
        ntime = np.concatenate(time_parts) if time_parts else np.asarray([])
        nevent = np.concatenate(event_parts) if event_parts else np.asarray([])
        kmf1 = KaplanMeierFitter()
        kmf1.fit(ntime, event_observed=nevent)
        out = kmf1.predict(kmf.timeline)
        mre = (np.linalg.norm(out - true_value[:, 0])
               / np.linalg.norm(true_value[:, 0]))
        errors.append(mre)
    avg = np.average(errors)
    # Single-argument call form prints identically on Python 2 and 3.
    print("(%f, %f)" % (epsilon, avg))
def survival_analysis(dataframe, grouping, years = 5):
    """Plot Kaplan-Meier curves per group, truncated at ``years`` years.

    dataframe: DataFrame with columns ``_OS`` (survival time, treated as
               days), ``_EVENT`` and the column named by ``grouping``.
    grouping:  name of the column defining the groups.
    years:     follow-up limit; rows with ``_OS`` beyond ``years * 365`` are
               cut at that time and treated as censored.

    Rows with missing values in any of the three columns are dropped.
    """
    # Remove patients with null values.
    df2 = dataframe.dropna(subset=[grouping, '_OS', '_EVENT'])
    # Limit the analysis to the number of years specified. Whole-column
    # assignment replaces the former chained indexing
    # (``df2['survival'][mask] = ...``), which pandas does not guarantee to
    # write through.
    maxtime = years * 365
    within_limit = df2['_OS'] <= maxtime
    df2['survival'] = df2['_OS'].clip(upper=maxtime)
    df2['event'] = df2['_EVENT'].where(within_limit, 0)
    # Get groups in sorted order.
    grouped_data = df2.groupby(grouping)
    unique_groups = sorted(grouped_data.groups.keys())
    # Plot survival curves on one shared axes.
    kmf = KaplanMeierFitter()
    ax = plt.subplot(111)
    for group in unique_groups:
        data = grouped_data.get_group(group)
        kmf.fit(data['survival'], data['event'], label = group)
        kmf.plot(ax=ax, show_censors = True)
    plt.show()
def plotKM(genes):
    """Save one high-vs-low expression Kaplan-Meier plot per gene.

    Calls ``extractSurvivalData()``, loads ``data/survival_complete.txt``
    and, for gene indices 0..13, splits the ER-positive rows into high/low
    expression (via ``separateLabels`` / ``separateHighandLow``) and writes
    the figure to ``images/kmplot_<gene>``.
    """
    extractSurvivalData()
    header = ['id', 'ER', 'PR', 'HER2', 'TN', 'GCH1', 'CDH1', 'CDH2', 'VIM',
              'bCatenin', 'ZEB1', 'ZEB2', 'TWIST1', 'SNAI1', 'SNAI2', 'RET',
              'NGFR', 'EGFR', 'AXL', 'STATUS', 'MONTHS']
    raw = np.genfromtxt("data/survival_complete.txt", delimiter='\t', dtype=str)
    df = pd.DataFrame(raw, columns=header)
    fitter = KaplanMeierFitter()
    for gene_idx in range(14):
        # Split into ER-positive and ER-negative; only the positive part
        # is analysed below.
        er_pos, er_neg = separateLabels(df, 'ER', gene_idx, 1)
        # Within the positive rows, split into high / low expression.
        high, low = separateHighandLow(df, genes, gene_idx, er_pos.values)
        # Column 2 of each split is passed as the durations.
        fitter.fit(high[:, 2:3].astype(float), label='pos_high')
        axis = fitter.plot()
        fitter.fit(low[:, 2:3].astype(float), label='pos_low')
        fitter.plot(ax=axis)
        plt.savefig("images/kmplot_" + genes[gene_idx])
        plt.clf()
def plot_km_survf(data, t_col="t", e_col="e", save_file=''):
    """
    Plot KM survival function curves and save them as a PDF.

    Parameters
    ----------
    data: pandas.DataFrame
        Survival data to plot.
    t_col: str
        Column name in data indicating time.
    e_col: str
        Column name in data indicating events or status.
    save_file: str
        Output path without extension; ``'.pdf'`` is appended.
    """
    from lifelines import KaplanMeierFitter
    from lifelines.plotting import add_at_risk_counts
    fig, ax = plt.subplots(figsize=(6, 4))
    kmfh = KaplanMeierFitter()
    kmfh.fit(data[t_col], event_observed=data[e_col], label="KM Survival Curve")
    kmfh.survival_function_.plot(ax=ax)
    plt.ylim(0, 1.01)
    plt.xlabel("Time")
    plt.ylabel("Probalities")
    plt.legend(loc="best")
    add_at_risk_counts(kmfh, ax=ax)
    # Save the figure that was actually drawn on; previously a separate,
    # empty ``plt.figure()`` was the one written to disk.
    fig.savefig(save_file + '.pdf', bbox_inches='tight')
def plot_survival_function(scdf):
    """Plot and save the Kaplan-Meier curve of ``frameZeroUtr3LenAdj + 3``.

    A row counts as an observed event when the shifted length is below 101,
    otherwise it is treated as censored. The figure is written to
    ``<rootDir>/figures/Fig3S2B.pdf`` (``rootDir`` is a module-level name).
    ``scdf`` itself is not modified.
    """
    dfl = scdf.copy()
    dfl['sc3'] = dfl['frameZeroUtr3LenAdj'] + 3
    # Vectorised flag; replaces a row-by-row .loc loop that was slow and
    # failed on a non-unique index.
    dfl['kill'] = (dfl['sc3'] < 101).astype(int)
    kfm = KaplanMeierFitter()
    kfm.fit(dfl['sc3'], event_observed=dfl['kill'])
    kfm.survival_function_.plot()
    ax = plt.gca()
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 100)
    figout = "%s/figures/Fig3S2B.pdf" % rootDir
    plt.savefig(figout, format='pdf', bbox_inches="tight")
class KaplanMeier:
    """Estimator-style wrapper around a left-censored ``KaplanMeierFitter``."""

    def __init__(self):
        self.kmf = KaplanMeierFitter()

    def fit(self, X, y):
        """Fit on durations ``X`` and event flags ``y``; return ``self``."""
        self.kmf.fit(durations=X, event_observed=y, left_censorship=True)
        print("cumulative_density_:")
        print(self.kmf.cumulative_density_)
        return self

    def predict_proba(self, X):
        """Look up the fitted cumulative density at the times in ``X``.

        The times must be present in the fitted index (label lookup).
        """
        times = np.squeeze(X)
        density = self.kmf.cumulative_density_
        return density.loc[times, 'KM_estimate']

    def predict(self, X):
        """Return 1.0 where the cumulative density is at least 0.5, else 0.0."""
        probabilities = self.predict_proba(X)
        return np.where(probabilities >= 0.5, 1.0, 0.0)

    def evaluate(self, X, y_bin_true, sample_weights=None):
        """Return ``(log loss, concordance index, accuracy)`` for ``X``."""
        probabilities = self.predict_proba(X)
        hard_labels = np.where(probabilities >= 0.5, 1.0, 0.0)
        loss = log_loss(y_bin_true, probabilities, sample_weight=sample_weights)
        concordance = c_index(y_bin_true, probabilities, np.squeeze(X))
        accuracy = accuracy_score(y_bin_true, hard_labels,
                                  sample_weight=sample_weights)
        return loss, concordance, accuracy
def plot_survival_curves(rec_t, rec_e, antirec_t, antirec_e, experiment_name = '', output_file = None):
    """Plot recommendation vs. anti-recommendation survival curves.

    rec_t / rec_e:         durations and event flags of the first group.
    antirec_t / antirec_e: durations and event flags of the second group.
    experiment_name:       prefix for both legend labels.
    output_file:           if given, the figure is saved to this path.

    The log-rank p-value is printed via ``print_summary`` and written onto
    the plot.
    """
    # Set up the plot.
    plt.figure(figsize=(12,3))
    ax = plt.subplot(111)

    # Fit and draw both survival curves.
    kmf = KaplanMeierFitter()
    kmf.fit(rec_t, event_observed=rec_e, label=' '.join([experiment_name, "Recommendation"]))
    kmf.plot(ax=ax,linestyle="-")
    kmf.fit(antirec_t, event_observed=antirec_e, label=' '.join([experiment_name, "Anti-Recommendation"]))
    kmf.plot(ax=ax,linestyle="--")

    # Format the graph.
    plt.ylim(0,1)
    ax.set_xlabel('Timeline (months)',fontsize='large')
    ax.set_ylabel('Percentage of Population Alive',fontsize='large')

    # Calculate the p-value.
    results = logrank_test(rec_t, antirec_t, rec_e, antirec_e, alpha=.95)
    results.print_summary()

    # Place the label at the 1st out of 9 tick marks.
    xloc = max(np.max(rec_t),np.max(antirec_t)) / 9
    if results.p_value < 1e-5:
        # Raw string: "\m" is an invalid escape in a normal literal and
        # triggers a SyntaxWarning on recent Python; the text is unchanged.
        ax.text(xloc,.2,r'$p < 1\mathrm{e}{-5}$',fontsize=20)
    else:
        ax.text(xloc,.2,'$p=%f$' % results.p_value,fontsize=20)
    plt.legend(loc='best',prop={'size':15})

    if output_file:
        plt.tight_layout()
        pylab.savefig(output_file)
def dust_mass_KM():
    """Fit and save a Kaplan-Meier plot of the VT19 log dust masses.

    Reads three tables from hard-coded paths; only the VT19 ``Mass`` column
    feeds the fit. The Eisner and B3/B7 values are loaded and transformed
    but not used further in this function.
    """
    vt19_table = Table.read('/home/jotter/nrao/tables/VT19.txt', format='latex')
    eisner_table = Table.read('/home/jotter/nrao/tables/eisner_tbl.txt',
                              format='ascii')
    calc_table = Table.read(
        '/home/jotter/nrao/summer_research_2018/tables/r0.5_apr20_calc_vals.fits'
    )

    vt19_raw = vt19_table['Mass']
    eisner_raw = eisner_table['M_dust^a']

    log_b3 = np.log10(calc_table['dust_mass_B3'])
    log_b7 = np.log10(calc_table['dust_mass_B7'])
    log_b7 = log_b7[np.where(np.isnan(log_b7) == False)[0]]

    # First whitespace-separated token of each entry, minus its leading
    # character.
    tokens = [entry.split()[0][1:] for entry in vt19_raw]
    # The first parsed entry is skipped before converting to float.
    vt19_log = np.log10(np.array(tokens[1:], dtype='float'))

    fitter = KaplanMeierFitter()
    fitter.fit(vt19_log)

    fig = plt.figure()
    ax = fitter.plot()
    plt.savefig('/home/jotter/nrao/plots/VT19_KM_plot.png')
def kaplan_meier(out, t, ttype):
    """Return a ``KaplanMeierFitter`` fitted on durations ``t`` and events ``out``.

    The curve label is built from ``ttype`` and the number of observations.
    """
    fitter = KaplanMeierFitter()
    caption = "Rand%d; %d obs." % (ttype, len(out))
    fitter.fit(t, event_observed=out, label=caption)
    return fitter
def main():
    """Plot Kaplan-Meier curves for low/high risk groups from bootstrap output.

    Loads a pickled list ``[y_orig, preds_bootfull, inds_inbag]``, averages
    each sample's predictions over the bootstrap rounds where it was not in
    the bag, splits samples at the median averaged prediction and draws one
    curve per half.
    """
    args = parse_args()
    data_dir = DATA_DIR if args.data_dir is None else Path(args.data_dir)

    # NOTE(review): pickle.load can execute arbitrary code; only feed
    # trusted files.
    with open(str(data_dir.joinpath(args.file_name)), 'rb') as f:
        payload = pickle.load(f)
    y_orig = payload[0]
    preds_bootfull = payload[1]
    inds_inbag = payload[2]
    del payload

    preds_matrix = np.concatenate(preds_bootfull, axis=1)
    inbag_indices = np.array(inds_inbag).T
    # Entry [i, b] is 1 when index i occurs in column b of inbag_indices.
    inbag_mask = 1*np.array([np.any(inbag_indices==row, axis=0)
                             for row in range(inbag_indices.shape[0])])
    oob_mask = 1-inbag_mask
    oob_sum = np.sum(np.multiply(oob_mask, preds_matrix), axis=1)
    oob_count = np.sum(oob_mask, axis=1)
    preds_bootave_oob = np.divide(oob_sum, oob_count)
    risk_groups = 1*(preds_bootave_oob > np.median(preds_bootave_oob))

    stacked = np.concatenate(
        (y_orig, preds_bootave_oob[:, np.newaxis], risk_groups[:, np.newaxis]),
        axis=-1)
    # Index by the risk group as a string so .loc['0'] / .loc['1'] select
    # each group.
    wdf = pd.DataFrame(
        stacked,
        columns=['status', 'time', 'preds', 'risk_groups'],
        index=[str(group) for group in risk_groups]
    )

    fitter = KaplanMeierFitter()
    ax = plt.subplot(111)
    for key, caption in (('0', "Low Risk"), ('1', "High Risk")):
        fitter.fit(durations=wdf.loc[key,'time'],
                   event_observed=wdf.loc[key,'status'], label=caption)
        ax = fitter.plot(ax=ax)

    plt.ylim(0,1)
    plt.title("Kaplan-Meier Plots")
    plt.xlabel('Time (days)')
    plt.ylabel('Survival Probability')
def kmplot(df_high, df_low, ax):
    """Draw high/low Kaplan-Meier curves on ``ax`` and return summary values.

    Returns ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank p-value and the
    predicted survival of the high and low group at times 60 and 120.
    If fitting raises ``ValueError`` the placeholder tuple
    ``("NA", "0", "0", "0", "0")`` is returned and nothing is drawn.
    """
    high_fit = KaplanMeierFitter()
    low_fit = KaplanMeierFitter()
    try:
        high_fit.fit(durations=df_high.duration, event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        low_fit.fit(durations=df_low.duration, event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    high_fit.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    low_fit.plot(ax=ax, color="black", show_censors=True, ci_show=False)

    test_outcome = logrank_test(df_high.duration, df_low.duration,
                                event_observed_A=df_high.event,
                                event_observed_B=df_low.event)
    p_value = test_outcome.p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95, 0.02, 'logrank P = ' + str('%.4f' % p_value),
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=11)
    plt.legend(loc=3)

    hm5, hm10 = high_fit.predict(60), high_fit.predict(120)
    lm5, lm10 = low_fit.predict(60), low_fit.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
def plot_kaplan_meier(self, column, value):
    """Plot Kaplan-Meier curves for rows matching ``value`` vs. all other rows.

    Args:
        column (str): column of ``self.data`` holding a patient attribute,
            such as the her2 receptor status.
        value (str or int): value of ``column`` used as the point of
            comparison, e.g. column: her2_receptor, value: 'negative'.

    Rows with ``column == value`` are plotted against rows with
    ``column != value``, using ``overall_survival_months`` as duration and
    ``death_from_cancer`` as the event flag.
    """
    frame = self.data
    matching = frame[frame[column] == value]
    others = frame[frame[column] != value]

    fit_matching = KaplanMeierFitter()
    fit_matching.fit(matching.overall_survival_months,
                     event_observed=matching['death_from_cancer'],
                     label=value)
    ax = fit_matching.plot()

    fit_others = KaplanMeierFitter()
    fit_others.fit(others.overall_survival_months,
                   event_observed=others['death_from_cancer'],
                   label=f'not {value}')
    ax = fit_others.plot(ax=ax)

    add_at_risk_counts(fit_matching, fit_others, ax=ax)
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('Timeline (Months)')
    ax.set_title(f'Kaplan Meier plot in months of {column} variable')
    plt.tight_layout()
    plt.show()
def survival_for_two(df, treat, ctrl, legends, title, figname):
    """Compare ``treat`` rows against all other rows and save the KM plot.

    df:      DataFrame with ``group``, ``time`` and ``status`` columns.
    treat:   value of ``group`` selecting the first cohort; every other row
             forms the second cohort.
    ctrl:    not used by this function (kept for caller compatibility).
    legends: legend labels for the first and second curve.
    title:   plot title.
    figname: output path passed to ``plt.savefig``.

    Returns the ``logrank_test`` result object.
    """
    # Select the time and status info for both cohorts.
    ix = df['group'] == treat
    t1 = df.loc[ix, 'time']
    print(t1.shape)
    e1 = df.loc[ix, 'status']
    t2 = df.loc[~ix, 'time']
    print(t2.shape)
    e2 = df.loc[~ix, 'status']
    results = logrank_test(t1, t2, event_observed_A=e1, event_observed_B=e2)
    pvalue = results.p_value
    print('pvalue:\t{}'.format(pvalue))

    # Survival curves.
    plt.figure(figsize=(3., 3.))
    ax = plt.subplot(111)
    kmf_first = KaplanMeierFitter()
    kmf_first.fit(t1, e1).plot(ax=ax, show_censors=True,
                               censor_styles={'ms': 12, 'marker': '+'},
                               ci_show=False, c='red', ls='-')
    kmf_second = KaplanMeierFitter()
    kmf_second.fit(t2, e2).plot(ax=ax, show_censors=True,
                                censor_styles={'ms': 12, 'marker': '+'},
                                ci_show=False, c='k', ls='--')
    handles, labels = ax.get_legend_handles_labels()
    print(labels)
    # Every second handle is used for the legend entries.
    ax.legend(handles[1::2], legends, loc='lower left', borderaxespad=-.15,
              handletextpad=.2, labelspacing=.3, handlelength=1,
              frameon=False)
    if pvalue < 1:
        # Annotate the existing axes. ``plt.axes()`` without arguments
        # creates a new full-window Axes on current Matplotlib, which would
        # put the text (and the later ylim/title/labels) on the wrong axes.
        ax.text(df['time'].max() * 0.45, 0.45, 'p={:.2f}'.format(pvalue),
                fontsize=16, ha='center')
    plt.ylim([-0.02, 1.05])
    plt.title(title, fontsize=22)
    plt.xlabel('Days', fontsize=22)
    plt.ylabel('Survival probability', fontsize=22)
    plt.savefig(figname, bbox_inches='tight', pad_inches=.1, dpi=600,
                transparent=True)
    plt.close()
    return results
def KM_estimator(relapsed_data, censored_data):
    """Plot one Kaplan-Meier curve from relapse and censoring times.

    relapsed_data: durations with an observed event.
    censored_data: durations that were censored.

    Any iterable is accepted. The figure is saved to ``km.pdf`` and shown.
    """
    # Materialise as lists first: ``+`` on NumPy arrays would add the two
    # inputs element-wise instead of concatenating them.
    relapsed = list(relapsed_data)
    censored = list(censored_data)
    durations = relapsed + censored
    event_observed = [1.0] * len(relapsed) + [0.0] * len(censored)

    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()
    kmf.fit(durations, event_observed, label='kaplan-meier curve')

    ax.set_ylim([0, 1])
    ax.set_xlim([0, 86])
    ax.set_position([0.16, 0.175, 0.81, 0.8])
    kmf.plot(ax=ax, show_censors=False,
             censor_styles={'ms': 3, 'marker': 's'}, ci_show=True,
             at_risk_counts=False)

    plt.xlabel('Time in Months', labelpad=10, fontsize=20)
    plt.ylabel('Survival Probability', labelpad=10, fontsize=20)
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_fontsize(15)

    plt.savefig('km.pdf')
    plt.show()
def meetingTimeHelper(df):
    """
    Input:
        df: data frame with columns "tau", "hasMet", "nXstep", "timeTaken"
            tau: int, meeting time (-1 is replaced by the row's nXstep)
            hasMet: boolean, whether the meeting event was observed
            nXstep: int, number of steps at the end (either due to meeting
                or censoring)
            timeTaken: duration used for the second fit

    Outputs:
        (tauFitted, timeFitted): KM fits of the meeting time, one on the
        step counts and one on "timeTaken"
    """
    # Where tau is -1, fall back to the number of steps taken.
    step_durations = [
        steps if tau == -1 else tau
        for tau, steps in zip(df["tau"], df["nXstep"])
    ]
    observed = df["hasMet"]
    tauFitted = KaplanMeierFitter().fit(step_durations, observed)
    timeFitted = KaplanMeierFitter().fit(df["timeTaken"], df["hasMet"])
    return tauFitted, timeFitted
def survival_plot_and_cox(self, df_arr, label=[], filename=''):
    """Overlay one Kaplan-Meier curve per data frame and save PNG and PDF.

    df_arr:   sequence of DataFrames with ``bcrmonth`` (duration) and
              ``bcrstatus`` (event) columns.
    label:    curve labels, one per data frame; a missing label falls back
              to the frame's position. The default list is never mutated.
    filename: output path without extension.

    Nothing is saved when ``df_arr`` is empty.
    """
    plt.clf()
    color = ['red', 'green', 'blue', 'cyan', 'orange', 'black']
    kmf = KaplanMeierFitter()
    ax = None
    for position, df_el in enumerate(df_arr):
        name = label[position] if position < len(label) else str(position)
        kmf.fit(df_el['bcrmonth'], df_el['bcrstatus'], label=name)
        plot_options = dict(
            show_censors=True,
            ci_show=False,
            # Wrap around so more than six curves do not raise IndexError.
            color=color[position % len(color)],
            ylim=(0, 1),
        )
        # The first curve creates the axes, later curves reuse them.
        if ax is None:
            ax = kmf.plot(**plot_options)
        else:
            kmf.plot(ax=ax, **plot_options)
    if ax is None:
        return
    fig = ax.get_figure()
    fig.savefig(filename + '.png')
    fig.savefig(filename + '.pdf', format='PDF')
def KaplanMeier_dash(T, C):
    """Fit a Kaplan-Meier curve on ``T``/``C`` and hand the figure to ``pyplot``.

    The curve is drawn twice on the current figure (the second time with
    ``ci_force_lines=True``) before the figure is passed on.
    """
    chart_title = 'Kaplan Meier fitter'
    fitter = KaplanMeierFitter()
    fitter.fit(T, event_observed=C)
    fitter.plot(title=chart_title)
    fitter.plot(ci_force_lines=True, title=chart_title)
    current_figure = plt.gcf()
    pyplot(current_figure, legend=False)
def km_analysis(survivalDf, durationCol, statusCol, saveFile=None):
    """Attach a Kaplan-Meier survival estimate to every row.

    Fits on ``durationCol`` / ``statusCol``, sorts the frame by duration and
    adds a ``kmEstimate`` column holding the fitted survival probability at
    each row's duration. A duration that is not on the fitted timeline gets
    the mean of the two neighbouring estimates. Optionally writes the result
    to ``saveFile`` as CSV. Returns the augmented, sorted DataFrame.
    """
    fitter = KaplanMeierFitter()
    fitter.fit(survivalDf.loc[:, durationCol], survivalDf.loc[:, statusCol])
    curve = fitter.survival_function_
    # The regression result is not used below; the call is retained as-is.
    m, b, r, p, e = stats.linregress(list(curve.index), curve.iloc[:, 0])

    survivalDf = survivalDf.sort_values(by=durationCol)
    durations = numpy.array(survivalDf.loc[:, durationCol])
    timeline = numpy.array(curve.index)

    estimates = []
    for moment in durations:
        if moment in timeline:
            estimates.append(curve.loc[moment, "KM_estimate"])
        else:
            # Average the estimates just below and just above the moment.
            below = numpy.where(numpy.array(curve.index) < moment)[0][-1]
            midpoint = 0.5 * (curve.iloc[below, 0] + curve.iloc[below + 1, 0])
            estimates.append(midpoint)

    kmEstimate = pandas.DataFrame(estimates)
    kmEstimate.columns = ["kmEstimate"]
    kmEstimate.index = survivalDf.index
    pfsDf = pandas.concat([survivalDf, kmEstimate], axis=1)

    if saveFile is not None:
        pfsDf.to_csv(saveFile)
    return pfsDf
def km_curve(labels_ids, survival_dataset, tested_gene_expression_headers_columns, gene_group, k=None, label_index=None):
    """Plot one Kaplan-Meier curve per cluster and return log-rank p-values.

    labels_ids: list of clusters, each a collection of sample ids.
    survival_dataset: 2-D array; column 0 holds sample ids, column 3 the
        duration and column 4 the event flag.
    tested_gene_expression_headers_columns: sample ids that were tested;
        only those are used.
    gene_group, k, label_index: only used to build the output file name.

    Each non-empty cluster is compared against all other clustered samples;
    the p-values are collected in the returned list. The figure is saved
    under ``constants.BASE_PROFILE/output``.

    NOTE(review): the source was collapsed onto one line, so it is assumed
    that the printing, the ``results.append`` and the pairwise comparison
    all sit inside the ``if len(label_duration) != 0`` branch - confirm.
    """
    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()
    all_labels = np.array([y for x in labels_ids for y in x])
    sample_ids = survival_dataset[:, 0]
    # Loop-invariant mask of rows that belong to tested samples.
    tested_rows = np.in1d(sample_ids, tested_gene_expression_headers_columns)
    label_event_list = []
    label_duration_list = []
    results = []
    for i, cur_labels in enumerate(labels_ids):
        in_cluster = np.in1d(sample_ids, cur_labels) & tested_rows
        label_event = survival_dataset[in_cluster, 4].astype(np.int32)
        label_duration = survival_dataset[in_cluster, 3].astype(np.int32)
        label_event_list.append(label_event)
        label_duration_list.append(label_duration)

        # Complement: tested samples from every other cluster.
        labels_c = all_labels[
            ~np.in1d(all_labels, cur_labels)
            & np.in1d(all_labels, tested_gene_expression_headers_columns)]
        in_complement = np.in1d(sample_ids, labels_c)
        label_event_c = survival_dataset[in_complement, 4].astype(np.int32)
        label_duration_c = survival_dataset[in_complement, 3].astype(np.int32)

        lr_results = logrank_test(label_duration, label_duration_c,
                                  label_event, label_event_c, alpha=.95)
        if len(label_duration) != 0:
            kmf.fit(list(label_duration), event_observed=list(label_event),
                    label="cluster {} n={}, logrank pval = {}".format(
                        i, len(label_duration),
                        '{0:1.3e}'.format(lr_results.p_value)))
            kmf.plot(ax=ax, show_censors=True)
            # Single-argument print calls behave the same on Python 2 and 3.
            print("lrank cluster {} vs all: {}".format(i, lr_results.p_value))
            results.append(lr_results.p_value)
            # Pairwise comparison against every earlier cluster.
            for j, cur_duration in enumerate(label_duration_list[:-1]):
                lr_results = logrank_test(label_duration,
                                          label_duration_list[j],
                                          label_event, label_event_list[j],
                                          alpha=.95)
                print("lrank cluster {} vs cluster {}: {}".format(
                    i, j, lr_results.p_value))
    plt.ylim(0, 1)
    plt.title("clustering survival analysis")
    plt.savefig(os.path.join(
        constants.BASE_PROFILE, "output",
        "cluster_by_p_{}_{}_k={}_label_i={}_{}.png".format(
            constants.CANCER_TYPE, gene_group.split("/")[-1], k, label_index,
            time.time())))
    plt.cla()
    return results
def get_km_results(df, group_col, time_col, event_col):
    """Fit one Kaplan-Meier model per group and run a multivariate log-rank test.

    df:        DataFrame holding the three columns named below; rows with a
               missing value in any of them are dropped.
    group_col: name of the grouping column.
    time_col:  name of the duration column (cast to float).
    event_col: name of the event column (converted to category codes).

    Returns ``(models, summary_result)``: the fitted ``KaplanMeierFitter``
    objects in group order and a one-line description of the test, or
    ``([], None)`` when no rows remain.
    """
    models = []
    summary_ = None
    summary_result = None
    df = df[[event_col, time_col, group_col]].dropna()
    df[event_col] = df[event_col].astype('category').cat.codes
    df[time_col] = df[time_col].astype('float')
    if not df.empty:
        for name, grouped_df in df.groupby(group_col):
            kmf = KaplanMeierFitter()
            t = grouped_df[time_col]
            e = grouped_df[event_col]
            # format() accepts any key type; ``name + "..."`` raised
            # TypeError for non-string group keys.
            kmf.fit(t, event_observed=e,
                    label="{} (N={})".format(name, len(t)))
            models.append(kmf)
        summary_ = multivariate_logrank_test(df[time_col].tolist(),
                                             df[group_col].tolist(),
                                             df[event_col].tolist(),
                                             alpha=99)
    if summary_ is not None:
        # Prefer the attribute used so far; fall back to the public name if
        # the installed lifelines no longer exposes the private one.
        statistic = getattr(summary_, '_test_statistic',
                            getattr(summary_, 'test_statistic', None))
        summary_result = "Multivariate logrank test: pval={}, t_statistic={}".format(
            summary_.p_value, statistic)
    return models, summary_result
def plot_km_estimates(self, index):
    """Plot and save Kaplan-Meier estimates for the subgroup and its complement.

    Draws both curves on figure ``index + 1`` and saves it under
    ``<self.string_repr[0]>_model``.
    """
    rcParams['figure.figsize'] = 15, 6
    plt.figure(index + 1)
    ax = plt.subplot(111)

    subgroup_fit = KaplanMeierFitter()
    complement_fit = KaplanMeierFitter()

    subgroup = self.sub_group
    subgroup_fit.fit(subgroup['survival_times'], subgroup['events'],
                     label='KM estimates for subgroup',
                     alpha=UserInputs.kmf_alpha)
    subgroup_fit.plot(ax=ax)

    complement = self.sub_group_complement
    complement_fit.fit(complement['survival_times'], complement['events'],
                       label='KM estimates for complement',
                       alpha=UserInputs.kmf_alpha)
    complement_fit.plot(ax=ax)

    plt.title(self.string_repr[0] + ': ' + self.string_repr[1])
    plt.xlabel('Time')
    plt.ylabel('Survival probability')
    plt.savefig(self.string_repr[0] + '_model')
    return
def single_submit(form):
    """Handle the single-gene form: plot low vs. high curves and render.

    On a valid submission the lowest ``Low`` percent and the highest
    ``High`` percent of the rows returned by ``ReadData`` are compared and
    the plot is written to ``static/test.png``. Otherwise the form is
    re-rendered with its errors.
    """
    if not form.validate_on_submit():
        return render_template("single.html", form=form, err=form.errors)

    database = form.DataBase.data
    Gene = form.GeneName.data
    low = int(form.Low.data)
    high = int(form.High.data)

    static = {}
    data, survival_rows, static['mean'], static['std'] = ReadData(database, Gene)
    total = len(survival_rows)
    # Turn the percentages into row counts, using at least one row each.
    low = max(int(total * low / 100), 1)
    high = max(int(total * high / 100), 1)

    durations = data[:, 1]
    events = data[:, 2]

    fitter = KaplanMeierFitter()
    fitter.fit(durations[0:low], events[0:low], label=Gene + '/low')
    axis = fitter.plot()
    fitter.fit(durations[-high:], events[-high:], label=Gene + '/high')
    fitter.plot(ax=axis)
    plt.savefig("static/test.png", bbox_inches='tight')
    plt.close()

    # ``refresh`` is a fresh random value passed to the template on each
    # render.
    return render_template("single.html", form=form, image="test.png",
                           refresh=np.random.randn(), static=static)
def plot_km_survf(data, t_col="t", e_col="e",datatype='train_data'):
    """
    Plot KM survival function curves and save them as a PNG.

    Parameters
    ----------
    data: pandas.DataFrame
        Survival data to plot.
    t_col: str
        Column name in data indicating time.
    e_col: str
        Column name in data indicating events or status.
    datatype: str
        ``'train_data'`` or ``'test_data'`` selects the matching output
        file; any other value writes ``predict_data.png``.
    """
    from lifelines import KaplanMeierFitter
    from lifelines.plotting import add_at_risk_counts
    fig, ax = plt.subplots(figsize=(6, 4))
    kmfh = KaplanMeierFitter()
    kmfh.fit(data[t_col], event_observed=data[e_col], label="KM Survival Curve")
    kmfh.survival_function_.plot(ax=ax)
    plt.ylim(0, 1.01)
    plt.xlabel("Time")
    plt.ylabel("Probalities")
    plt.legend(loc="best")
    add_at_risk_counts(kmfh, ax=ax)
    # ``elif`` replaces the former ``else if``, which is a syntax error in
    # Python and prevented the module from being imported.
    if datatype=='train_data':
        plt.savefig('/home/kyro_zhang/ZQX/train_data.png')
    elif datatype=='test_data':
        plt.savefig('/home/kyro_zhang/ZQX/test_data.png')
    else:
        plt.savefig('/home/kyro_zhang/ZQX/predict_data.png')
def kaplan_meier_curve(
    data_df: Union[pd.DataFrame, str],
    task: str = "liver",
    threshold: Union[float, List] = 0.5,
    process_dir: str = None,
):
    """Plot Kaplan-Meier curves for score bands of the ``task`` column.

    data_df:     DataFrame, or path to a CSV file, with ``duration``,
                 ``event`` and ``task`` columns.
    task:        name of the score column used to band the rows.
    threshold:   a single cut-off or a list of increasing cut-offs; an upper
                 bound of 1 is always added. The caller's list is not
                 modified.
    process_dir: if given, the figure is saved there as
                 ``<task>_kaplan_meier.pdf``.
    """
    if isinstance(data_df, str):
        data_df = pd.read_csv(data_df)

    # Accept ints as well as floats, and copy list input: the previous code
    # appended to the caller's list and crashed on an int threshold.
    if isinstance(threshold, (int, float)):
        thresholds = [threshold, 1]
    else:
        thresholds = list(threshold) + [1]

    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()

    prev_threshold = -1
    for upper in thresholds:
        name = f"{task}: {prev_threshold} < y <= {upper}"
        in_band = (data_df[task] > prev_threshold) & (data_df[task] <= upper)
        grouped_df = data_df[in_band]
        kmf.fit(grouped_df["duration"], grouped_df["event"], label=name)
        kmf.plot(ax=ax)
        prev_threshold = upper

    plt.xlabel("Follow-up time (days)")
    plt.ylabel("Probability of survival")

    if process_dir is not None:
        plt.tight_layout()
        plt.savefig(os.path.join(process_dir, f"{task}_kaplan_meier.pdf"))
def graph(months, survival_status, has_mutation, name):
    """Show Kaplan-Meier curves for samples with and without a mutation.

    months:          survival durations.
    survival_status: 0 if living, 1 if dead.
    has_mutation:    per-sample flag, 1/True with the mutation and 0/False
                     without; must be in the same order as ``months``.
    name:            used in the plot title.
    """
    survival_data = pd.DataFrame({
        'OS_MONTHS': months,
        'OS_STATUS': survival_status
    })
    # Coerce to a positional boolean mask. With 0/1 integers, indexing by
    # ``has_mutation`` selects by label and ``~has_mutation`` yields -1/-2
    # instead of the complement.
    mask = pd.Series(has_mutation).astype(bool).values

    kmf = KaplanMeierFitter()
    # Fit and draw each group on the same axes.
    kmf.fit(survival_data.OS_MONTHS[mask], survival_data.OS_STATUS[mask],
            label="have mutation")
    layer1 = kmf.plot(ci_show=True)
    kmf.fit(survival_data.OS_MONTHS[~mask], survival_data.OS_STATUS[~mask],
            label="no mutation")
    kmf.plot(ax=layer1, ci_show=True)
    plt.title('{} survival plot'.format(name))
    plt.show()
def KM_median(array, upper_lim_flags, left_censor=True, return_type='percentile'):
    """Return the Kaplan-Meier median of ``array`` plus optional spread.

    array:           values to fit.
    upper_lim_flags: event flags, or ``None`` to fit without flags.
    left_censor:     fit with left censoring when flags are given and this
                     equals ``True``; otherwise use a right-censored fit.
    return_type:     ``'percentile'`` -> ``(median, percentile(0.25),
                     percentile(0.75))``; ``'ci'`` -> median plus the two
                     bounds of its confidence interval; ``'median'`` -> the
                     median only. Any other value returns ``None``.
    """
    kmf = KaplanMeierFitter()
    if upper_lim_flags is None:
        kmf.fit(array, upper_lim_flags)
    elif left_censor == True:  # noqa: E712 - equality check kept as-is
        kmf.fit_left_censoring(array, upper_lim_flags)
    else:
        # Right censoring.
        kmf.fit(array, event_observed=upper_lim_flags)

    median = median_survival_times(kmf.survival_function_)

    if return_type == 'median':
        return median
    if return_type == 'ci':
        median_ci = median_survival_times(kmf.confidence_interval_).values
        print(f'median and CI: {median}, {median_ci}')
        return median, median_ci[0][0], median_ci[0][1]
    if return_type == 'percentile':
        upper_perc = kmf.percentile(0.25)
        lower_perc = kmf.percentile(0.75)
        print(
            f'median and 1st/3rd quartiles: {median}, {lower_perc}, {upper_perc}'
        )
        return median, upper_perc, lower_perc
def plot_Kaplan_Meier_feature(donor_dataset):
    '''Plot one pair of Kaplan-Meier curves per feature column.

    For every column other than 'Total_years', 'censored' and 'Baseline',
    the rows are split by whether the feature exceeds its mean over the
    rows where 'censored' == 0, and one curve is drawn per half.

    Parameters:
    donor_dataset: pandas DataFrame containing at least the columns
        'Total_years', 'censored' and 'Baseline'. 'Total_years' is used as
        the duration and 'censored' is passed as the event-observed flag.

    NOTE(review): the original documentation described 'censored' as
    "True = active donor"; if so, passing it as event_observed may be
    inverted - confirm against the data.

    Output:
    Kaplan-Meier plot(s). This function does not return anything.
    '''
    durations = donor_dataset['Total_years']
    flags = donor_dataset['censored']
    feature_names = list(donor_dataset.columns)
    for excluded in ('Total_years', 'censored', 'Baseline'):
        feature_names.remove(excluded)

    for feature in feature_names:
        reference_mean = donor_dataset[donor_dataset['censored'] == 0][feature].mean()
        above = donor_dataset[feature] > reference_mean

        figure = plt.figure(figsize=(5, 5))
        ax = figure.add_subplot(111)
        fitter = KaplanMeierFitter()
        fitter.fit(durations[above], flags[above],
                   label = feature + ': Yes or > mean')
        fitter.plot(ax=ax, linewidth = 2)
        fitter.fit(durations[~above], flags[~above],
                   label = feature + ': No or < mean')
        fitter.plot(ax=ax, linewidth = 2)

        ax.set_xlabel('Years', size = 10)
        ax.set_ylabel('Surviving donor population', size = 10)
        ax.set_xlim(0,40)
        ax.set_ylim(0, 1)
        ax.grid()
        ax.legend(loc = 'upper right', fontsize = 10)
        plt.show()
def plot_riskGroups(data_groups, event_col, duration_col, labels=[], plot_join=False,
                    xlabel="Survival time (Month)", ylabel="Survival Rate",
                    legend="Risk Groups", title="Survival function of Risk groups",
                    save_fig_as=""):
    """Plot survival curve for different risk groups.

    Parameters
    ----------
    data_groups : list(`pandas.DataFame`)
        list of DataFame[['E', 'T']], risk groups from lowest to highest.
    event_col : str
        column in DataFame indicating events.
    duration_col : str
        column in DataFame indicating durations.
    labels : list(str), default []
        One text label for one group. When empty, groups are numbered
        "1", "2", ... The passed list is never modified.
    plot_join : bool, default False
        Also plot each pair of adjacent risk groups combined.
    xlabel, ylabel, legend, title : str
        Axis labels, legend title and plot title.
    save_fig_as : str
        Name of file for saving in local; nothing is saved when empty.

    Returns
    -------
    None
        Plot figure of each risk-groups.

    Examples
    --------
    >>> plot_riskGroups(df_list, "E", "T", labels=["Low", "Mid", "High"])
    """
    N_groups = len(data_groups)
    # Work on a private copy. Appending to the default list made the
    # generated labels persist across calls, so a later call with more
    # groups ran out of labels.
    labels = list(labels)
    if len(labels) == 0:
        labels = [str(i + 1) for i in range(N_groups)]

    fig, ax = plt.subplots(figsize=(8, 6))
    kmfit_groups = []
    for i in range(N_groups):
        kmfh = KaplanMeierFitter()
        sub_group = data_groups[i]
        kmfh.fit(sub_group[duration_col], event_observed=sub_group[event_col],
                 label=labels[i])
        kmfh.survival_function_.plot(ax=ax)
        kmfit_groups.append(kmfh)
    # Plot the union of every two adjacent groups (i, i + 1).
    if plot_join:
        for i in range(N_groups - 1):
            kmfh = KaplanMeierFitter()
            sub_group = pd.concat([data_groups[i], data_groups[i+1]], axis=0)
            kmfh.fit(sub_group[duration_col],
                     event_observed=sub_group[event_col],
                     label=labels[i]+'&'+labels[i+1])
            kmfh.survival_function_.plot(ax=ax)
            kmfit_groups.append(kmfh)
    plt.ylim(0, 1.01)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc="best", title=legend)
    add_at_risk_counts(*kmfit_groups, ax=ax)
    plt.show()
    if save_fig_as != "":
        fig.savefig(save_fig_as, format='png', dpi=600, bbox_inches='tight')
def kmplot(df_high, df_low):
    """Compare the survival of a "high" and a "low" group.

    Both frames must expose ``duration`` and ``event`` columns.

    Returns the tuple ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank
    p-value followed by the Kaplan-Meier predictions at t=60 and t=120 for
    the high group and then for the low group.  When either fit raises
    ``ValueError`` the placeholder ``("NA", "0", "0", "0", "0")`` is returned.
    """
    high_fit = KaplanMeierFitter()
    low_fit = KaplanMeierFitter()
    try:
        high_fit.fit(durations=df_high.duration,
                     event_observed=df_high.event,
                     label='High: n = {}'.format(len(df_high)))
        low_fit.fit(durations=df_low.duration,
                    event_observed=df_low.event,
                    label="Low: n = {}".format(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    comparison = logrank_test(df_high.duration, df_low.duration,
                              event_observed_A=df_high.event,
                              event_observed_B=df_low.event)
    p_value = comparison.p_value

    # Predictions in the order: high@60, high@120, low@60, low@120.
    predictions = tuple(fitter.predict(t)
                        for fitter in (high_fit, low_fit)
                        for t in (60, 120))
    return (p_value,) + predictions
def surAnalysis(storeId):
    """Fit and save a Kaplan-Meier plot for one store.

    Reads every record of the ``survival`` collection whose ``store_id``
    equals *storeId*, converts ``duration`` from seconds to days, fits a
    Kaplan-Meier estimator and saves the plot under the static images
    directory using *storeId* as the file name.  Nothing is done when the
    store has no records.
    """
    duration = []
    observed = []
    for elem in survival.find({'store_id': storeId}):
        duration.append(elem['duration'] / 86400)  # seconds -> days
        observed.append(elem['observed'])

    if not duration:
        return

    kmf = KaplanMeierFitter()
    kmf.fit(array(duration), array(observed))
    ax = kmf.plot()
    # ax.set_xlim(0,1)
    # ax.set_ylim(0.85,1.0)
    fig = ax.get_figure()
    try:
        # Backslashes are escaped explicitly; the resulting path is unchanged,
        # but "\w", "\l", "\s" and "\i" are invalid escape sequences.
        fig.savefig('F:\\workshop\\lbs_lyf\\static\\images\\' + storeId)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)
def survival(time, status, pGroups=None):
    """Plot Kaplan-Meier curve(s) from parallel ``time`` / ``status`` sequences.

    Entries whose time or status is the empty string are ignored.

    Parameters
    ----------
    time, status : sequences indexable by position; values are converted
        with ``float`` and ``int`` respectively.
    pGroups : optional sequence of ``(label, color, indices)`` entries.
        When omitted, a single red curve is drawn from positions 2 onward
        (the first two positions are skipped).  When given, one curve is
        drawn per non-empty group and the multivariate log-rank p-value is
        added to the plot as a frameless legend title.

    Returns
    -------
    The matplotlib axes that were drawn on, or ``None`` if every group was
    empty.
    """
    kmf = KaplanMeierFitter()

    if pGroups is None:
        order = [i for i in range(2, len(time)) if time[i] != "" and status[i] != ""]
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s)
        return kmf.plot(color='red')

    ax = None
    # groups[i] holds the index of the (last) group that position i belongs to.
    groups = ["" for _ in time]
    for k, group in enumerate(pGroups):
        order = [i for i in group[2] if time[i] != "" and status[i] != ""]
        if len(order) <= 0:
            continue
        for i in order:
            groups[i] = k
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s, label=group[0])
        if ax is None:
            ax = kmf.plot(color=group[1], ci_show=False, show_censors=True)
        else:
            ax = kmf.plot(ax=ax, color=group[1], ci_show=False, show_censors=True)

    order = [i for i in range(len(groups)) if groups[i] != ""]
    if len(order) > 0:
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        g = [int(groups[i]) for i in order]
        from lifelines.statistics import multivariate_logrank_test
        from matplotlib.legend import Legend
        res = multivariate_logrank_test(t, g, s)
        # An empty legend is used purely as a text box for the p-value.
        leg = Legend(ax, [], [], title="p = %.2g" % res.p_value,
                     loc='lower left', frameon=False)
        ax.add_artist(leg)
    return ax
def kmplot(df_high, df_low, ax):
    """Draw the Kaplan-Meier curves of a "high" and a "low" group on *ax*.

    Both frames must expose ``duration`` and ``event`` columns.  The high
    group is drawn in red, the low group in black, and the log-rank p-value
    is written in the lower right corner of the axes.

    Returns the tuple ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank
    p-value followed by the Kaplan-Meier predictions at t=60 and t=120 for
    the high group and then for the low group.  When either fit raises
    ``ValueError`` nothing is drawn and the placeholder
    ``("NA", "0", "0", "0", "0")`` is returned.
    """
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)

    statistics_result = logrank_test(df_high.duration, df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95, 0.02, 'logrank P = ' + '%.4f' % p_value,
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=11)
    # Put the legend on the axes we were given; plt.legend would target
    # whichever axes happens to be current (wrong inside a subplot grid).
    ax.legend(loc=3)

    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
def plot_Kaplan_Meier_overall(donor_dataset):
    '''Plot the overall Kaplan-Meier curve of donor lifetimes.

    Parameters:
    donor_dataset: Pandas dataframe which contains at least the columns
        'Total_years' and 'censored'.  'Total_years' represents how many
        years the donors have been active.  'censored' indicates whether
        a donor is still active (True = active donor).

    Output:
    A Kaplan-Meier plot.
    This function does not return anything.
    '''
    # 'Total_years' is the observed duration of every donor.  Donors who are
    # still active are right-censored: their final lifetime is unknown, but
    # the estimator still uses the time they have been observed so far.
    T = donor_dataset['Total_years']
    # lifelines expects "event observed" (True = the donor churned), which is
    # the inverse of the 'censored' flag (True = still active).
    churned = donor_dataset['censored'] == 0

    # Create the Kaplan-Meier estimator and fit it.
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=churned, label='Overall')

    # Plot the KM survival function.
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)
    kmf.plot(ax=ax)
    ax.set_xlabel('Years', size=20)
    ax.set_ylabel('Surviving donor population', size=20)
    ax.set_xlim(0, 40)
    ax.set_ylim(0, 1)
    ax.grid()
    ax.legend(loc='best', fontsize=20)
    plt.show()
def get_sa(request):
    """Render ``sa_test.html`` with a Kaplan-Meier and a Nelson-Aalen plot.

    The two images are generated from the Waltons example dataset and stored
    under ``<project dir>/images``.  They are (re)generated whenever at least
    one of them is missing, so the template never points at an absent file.
    """
    dirname = os.path.dirname(os.path.dirname(__file__)).replace('\\', '/')
    kmffile = '/images/test1.jpg'
    naffile = '/images/test2.jpg'
    context = {'kmf': kmffile, 'naf': naffile}

    both_present = os.path.exists(dirname + kmffile) and os.path.exists(dirname + naffile)
    if not both_present:
        df = load_waltons()
        T = df['T']  # an array of durations
        # Either a boolean or a binary array representing whether the 'death'
        # was observed (alternatively an individual can be censored).
        E = df['E']

        kmf = KaplanMeierFitter(alpha=0.95)
        kmf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='KM_estimate', alpha=None, left_censorship=False,
                ci_labels=None)

        naf = NelsonAalenFitter(alpha=0.95, nelson_aalen_smoothing=True)
        naf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='NA_estimate', alpha=None, ci_labels=None)

        # Give every estimate its own figure so the second image cannot
        # inherit the first curve, and close each figure afterwards so a
        # long-running server process does not accumulate open figures.
        for fitter, target in ((kmf, kmffile), (naf, naffile)):
            fig, ax = plt.subplots()
            try:
                fitter.plot(ax=ax)
                fig.savefig(dirname + target)
            finally:
                plt.close(fig)

    # return render_to_response(template_name='sa_test.html', context=context, context_instance=RequestContext(request=request))
    return render(request=request, template_name='sa_test.html', context=context)
def __init__(self, db, male=False, female=False, other=False, both=True):
    """Load survival records from *db* and save a Kaplan-Meier plot.

    Every document returned by ``db.find()`` must provide ``duration``
    (seconds, converted to days here), ``observed`` and ``gender``
    (1 = male, 2 = female, 0 = other).  The curve of the whole population
    is always drawn; the per-gender curves are added only for the flags
    that are exactly ``True``.  The figure is saved as ``maleAndFemale``.

    NOTE(review): ``both`` is stored on the instance but does not gate the
    overall curve - confirm whether that is intended.
    """
    self.db = db
    self.male = male
    self.female = female
    self.other = other
    self.both = both

    duration = []
    observed = []
    group = []
    for elem in self.db.find():
        duration.append(elem['duration'] / 86400)  # seconds -> days
        observed.append(elem['observed'])
        group.append(elem['gender'])
    dura_obj = array(duration)
    obs_obj = array(observed)
    group_obj = array(group)

    kmf = KaplanMeierFitter()
    kmf.fit(dura_obj, obs_obj, label='both')
    ax = kmf.plot()

    # (flag, gender code, curve label) for each optional sub-population.
    subgroups = (
        (self.male, 1, 'male'),
        (self.female, 2, 'female'),
        (self.other, 0, 'other'),
    )
    for enabled, code, label in subgroups:
        if enabled is True:
            mask = group_obj == code
            kmf.fit(dura_obj[mask], obs_obj[mask], label=label)
            kmf.plot(ax=ax)

    # ax.set_xlim(19,22)
    # ax.set_ylim(1,2)
    ax.get_figure().savefig('maleAndFemale')
def generate_plot():  # Perhaps `regenerate_plot`?
    """Refit the Kaplan-Meier curve for the current widget selection.

    Filters the module-level ``df`` by the selected categories, the size and
    age ranges and the selected sexes, fits a Kaplan-Meier estimator on the
    remaining cases and pushes the survival function into the renderer's
    data source.  ``status.text`` reports how many cases matched.
    """
    selected = df.copy()

    # Drop every category whose checkbox is not active.
    for index in range(len(categories)):
        if index not in category_select.active:
            selected = selected[selected.category != category_select.labels[index]]

    # Inclusive size range, then inclusive age range.
    selected = selected[min_size_select.value <= selected['size']]
    selected = selected[selected['size'] <= max_size_select.value]
    selected = selected[min_age_select.value <= selected.age]
    selected = selected[selected.age <= max_age_select.value]

    # Checkbox position 0 is male (sex code 1), position 1 is female (code 2).
    for position, sex_code in ((0, 1), (1, 2)):
        if position not in sex_select.active:
            selected = selected[selected.sex != sex_code]

    if len(selected) == 0:  # Bad constraints
        status.text = 'No cases found. Try different constraints.'
        return

    # The observed event is "did not survive".
    doa = [not survived for survived in selected.survived]

    fit = KaplanMeierFitter().fit(selected.days, event_observed=doa,
                                  label='prob_of_surv')

    # Here, we are using the smoothed version of the Kaplan-Meier curve
    # The stepwise version would work just as well
    data = renderer.data_source.data
    surv_func = fit.survival_function_
    data.update(x=surv_func.index, y=surv_func.prob_of_surv)

    start = 0
    end = max(selected.days)
    # bounds='auto' doesn't work?
    plot.x_range.update(start=start, end=end, bounds=(start, end))

    status.text = '{} cases found.'.format(len(selected))
def plot_kmf(df, condition_col, censor_col, survival_col, threshold=None,
             title=None, xlabel=None, ax=None, print_as_title=False):
    """
    Plot two Kaplan-Meier curves, one per group defined by condition_col,
    and return the log-rank test comparing them.

    Without a threshold, condition_col itself is used as the group mask.
    With a threshold, the groups are "condition_col > threshold" and the rest;
    threshold == 'median' uses the median of condition_col.

    Parameters
    ----------
    df: dataframe
    condition_col: string, column which contains the condition to split on
    censor_col: string, column converted to bool and passed to the fitter as
        the event indicator
    survival_col: string, column which contains the survival time
    threshold: int or string, if int, condition_col is thresholded,
        if 'median', condition_col thresholded at its median
    title: Title for the plot, default None
    xlabel: label for the x axis, default None
    ax: an existing matplotlib ax, optional, default None
    print_as_title: bool, optional, whether or not to print the group sizes
        within the plot's title vs. stdout, default False
    """
    if threshold is None:
        condition = df[condition_col]
        label = '{}'.format(condition_col)
    else:
        if threshold == 'median':
            threshold = df[condition_col].median()
        condition = df[condition_col] > threshold
        label = '{} > {}'.format(condition_col, threshold)

    # Split the frame once, then pull the series needed for each group.
    with_cond_df = df[condition]
    no_cond_df = df[~condition]
    survival_no_condition = no_cond_df[survival_col]
    survival_with_condition = with_cond_df[survival_col]
    event_no_condition = no_cond_df[censor_col].astype(bool)
    event_with_condition = with_cond_df[censor_col].astype(bool)

    kmf = KaplanMeierFitter()

    # Group without the condition (unlabelled); creates the axes if needed.
    kmf.fit(survival_no_condition, event_no_condition, label="")
    if not ax:
        ax = kmf.plot(show_censors=True, ci_show=False)
    else:
        kmf.plot(ax=ax, show_censors=True, ci_show=False)

    # Group with the condition.
    kmf.fit(survival_with_condition, event_with_condition, label=(label))
    kmf.plot(ax=ax, show_censors=True, ci_show=False)

    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)

    no_cond_str = "# no condition {}".format(len(survival_no_condition))
    cond_str = "# with condition {}".format(len(survival_with_condition))
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title("%s | %s" % (no_cond_str, cond_str))
    else:
        for line in (no_cond_str, cond_str):
            print(line)

    if xlabel:
        ax.set_xlabel(xlabel)

    return logrank_test(survival_no_condition,
                        survival_with_condition,
                        event_observed_A=event_no_condition,
                        event_observed_B=event_with_condition)
# Boolean masks per gender, and the matching sub-frames.
females_ = df["gender"] == "Female"
males_ = df["gender"] == "Male"
males = df[males_]
females = df[females_]

T = df["lifetime"]  # measured in days
C = df["dead"]

# Activity frequency: medians, means and a two-sided Mann-Whitney p-value
# (the reported one-sided value is doubled).
women_freq = females['activity_freq']
men_freq = males['activity_freq']
women_freq_median, men_freq_median = women_freq.median(), men_freq.median()
women_freq_mean, men_freq_mean = women_freq.mean(), men_freq.mean()
frequency_pvalue = 2 * stats.mannwhitneyu(women_freq, men_freq)[1]

# Lifetime: Kaplan-Meier medians per gender (the same fitter is refitted,
# so each median is read right after its fit) and a log-rank p-value.
women_lifetime_median = kmf.fit(T[females_], event_observed=C[females_], label="Female").median_
men_lifetime_median = kmf.fit(T[males_], event_observed=C[males_], label="Male").median_
lifetime_pvalue = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95).p_value

community_stats = {
    'community': community,
    'size': females.count()[0] + males.count()[0],
    'women_frequency_median': women_freq_median,
    'men_frequency_median': men_freq_median,
    'frequency_difference_median': women_freq_median - men_freq_median,
    'women_frequency_mean': women_freq_mean,
    'men_frequency_mean': men_freq_mean,
    'frequency_difference_mean': women_freq_mean - men_freq_mean,
    'frequency_pvalue': frequency_pvalue,
    'women_lifetime_median': women_lifetime_median,
    'men_lifetime_median': men_lifetime_median,
    'lifetime_pvalue': lifetime_pvalue,
}
community_stats['lifetime_difference_median'] = women_lifetime_median - men_lifetime_median

results_db.insert(community_stats)
# Censor every observation at a fixed length of 30.
cutoff = 30
cutoff = np.repeat(cutoff, N)

# Observed duration is the event time capped at the cutoff.
duration = np.minimum(event_t, cutoff)

# 1 when the event happened within the observation window, 0 when censored.
not_censor = (event_t <= duration).astype(int)

# Convert to data frame
data = pd.DataFrame({
    'duration': duration,
    'event': not_censor,
    'age': age,
    'college': college,
})

# Plot observations with censoring
# plot_lifetimes(duration, event_observed = not_censor)

# Kaplan Meier Summary for Simulated Data
from lifelines import KaplanMeierFitter

kmf = KaplanMeierFitter()
kmf.fit(duration, event_observed=not_censor)
kmf.survival_function_.plot()

# Cox-PH Model Regression
from lifelines import CoxPHFitter

cf = CoxPHFitter()
cf.fit(data, 'duration', event_col='event')
cf.print_summary()

## Get Predictions from Model ##
# 24 year old college grad
#college_24 = pd.DataFrame({'age':[24], 'college':[1]})
#cf.predict_survival_function(college_24).plot()
# 65 year old high school grad
def get_kmf_fit(qs):
    """Return a Kaplan-Meier fitter fitted on the rows of queryset *qs*.

    Durations come from ``days_since_complaint`` and the event indicator
    from ``is_closed``.
    """
    durations, closed = (
        qs.values_list(field, flat=True)
        for field in ('days_since_complaint', 'is_closed')
    )
    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=closed)
    return fitter
def _plot_kmf_single(df, condition_col, survival_col, censor_col, threshold,
                     title, xlabel, ylabel, ax,
                     with_condition_color, no_condition_color,
                     with_condition_label, no_condition_label,
                     color_map, label_map, color_palette, ci_show,
                     print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations
    in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by
    `plot_kmf`.

    Groups are formed in one of three ways:
      - `threshold` given: "condition_col > threshold" versus the rest
        (`threshold == "median"` uses the column median);
      - object / categorical column: one group per distinct value;
      - boolean column: True versus False.
    Any other dtype raises ValueError.

    Returns the analytical result for the plotted groups: a log-rank test for
    exactly 2 groups, a `NullSurvivalResults` for fewer than 2 groups, and a
    fitted (and printed) `CoxPHFitter` otherwise.  The attributes
    `survival_data_series`, `event_data_series` and `desc` are attached to
    the returned object.
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)

    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    column = df[condition_col]
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = column.median()
        label_suffix = float_str(threshold)
        condition = column > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    elif column.dtype == 'O' or column.dtype.name == "category":
        condition = column.astype("category")
        if not label_map:
            label_map = {
                condition_value: '{} = {}'.format(condition_col, condition_value)
                for condition_value in condition.unique()
            }
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif column.dtype == 'bool':
        condition = column
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    else:
        raise ValueError(
            "Don't know how to plot data of type {}".format(column.dtype))

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    grp_desc = []
    grp_survival_data = {}
    grp_event_data = {}
    grp_names = list(condition.unique())
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = grp_df[censor_col].astype(bool)
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)
        grp_desc.append("# {}: {}".format(grp_label, len(grp_survival)))
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        # The first plot creates the axes when none was supplied.
        if ax:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show, color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show, color=grp_color)

    ## format the plot
    # Set the y-axis to range 0 to 1, labelled as whole percentages
    ax.set_ylim(0, 1)
    y_tick_vals = ax.get_yticks()
    ax.set_yticklabels(["%d" % int(y_tick_val * 100) for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        first, second = grp_names
        results = logrank_test(grp_survival_data[first],
                               grp_survival_data[second],
                               event_observed_A=grp_event_data[first],
                               event_observed_B=grp_event_data[second])
    elif len(grp_names) <= 1:
        # no analytical result for 1 or 0 groups
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        cox_df = patsy.dmatrix('+'.join([condition_col, survival_col, censor_col]),
                               df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()

    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
from lifelines import KaplanMeierFitter
from lifelines.datasets import load_waltons

# Load the Waltons example data and show its first rows.
df = load_waltons()
print(df.head())

# Durations and event indicators as separate series.
T, E = df['T'], df['E']

# Overall Kaplan-Meier estimate.
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # more succinctly, kmf.fit(T, E)
kmf.survival_function_
kmf.median_
kmf.plot()

# Multiple groups: everything that is not 'miR-137' is the control group.
groups = df['group']
ix = groups == 'miR-137'
kmf.fit(T[~ix], E[~ix], label='control')
ax = kmf.plot()
# Collect duration (seconds -> days), event flag and gender of every user.
duration = []
observed = []
group = []
for elem in after_users.find():
    #if elem['duration'] >=1500000:
    duration.append(elem['duration'] / 86400)
    observed.append(elem['observed'])
    group.append(elem['gender'])
dura_obj = array(duration)
obs_obj = array(observed)
group_obj = array(group)

# Boolean masks per gender code (1 = male, 2 = female, 0 = other).
male = group_obj == 1
female = group_obj == 2
other = group_obj == 0

# Draw the male, female and overall curves on one set of axes.
kmf = KaplanMeierFitter()
kmf.fit(dura_obj[male], obs_obj[male], label='male')
ax = kmf.plot()
kmf.fit(dura_obj[female], obs_obj[female], label='female')
kmf.plot(ax=ax)
kmf.fit(dura_obj, obs_obj, label='both')
kmf.plot(ax=ax)
#kmf.fit(dura_obj[other],obs_obj[other], label = 'other')
#kmf.plot(ax=ax)
#ax.set_xlim(19,22)
#ax.set_ylim(1,2)
ax.get_figure().savefig('maleAndFemale_both_17day')
print(df.head())
'''
    T  E    group
0   6  1  miR-137
1  13  1  miR-137
2  13  1  miR-137
3  13  1  miR-137
4  19  1  miR-137
'''

T, E = df['T'], df['E']
groups = df['group']
ix = groups == 'miR-137'

# Control curve first (it creates the axes), then the miR-137 curve on top.
kmf = KaplanMeierFitter()
kmf.fit(T[~ix], E[~ix], label='control')
ax = kmf.plot()
kmf.fit(T[ix], E[ix], label='miR-137')
kmf.plot(ax=ax)

plt.ylabel('Survival Probability')
plt.show()

# Compare the two curves with a log-rank test.
results = logrank_test(T[ix], T[~ix],
                       event_observed_A=E[ix],
                       event_observed_B=E[~ix])
results.print_summary()
def plot_survival(unique_groups, grouped_data, analysis_type, censors, ci,
                  showplot, stat_results, time='Months'):
    """Plot one Kaplan-Meier curve per group and write a percentile table.

    Parameters
    ----------
    unique_groups : sequence of group keys, in plotting order.
    grouped_data : object with ``get_group(key)`` returning a frame that has
        'survival' (in days) and 'event' columns.
    analysis_type : str, used in the output file names; 'survival' selects
        the "Overall Survival" y-label, anything else "Relapse-Free Survival".
    censors, ci : passed to the fitter's plot as ``show_censors`` / ``ci_show``.
    showplot : the figure is shown only when this equals ``True``.
    stat_results : object with a ``p_value`` attribute; printed on the plot
        when there are exactly two groups.
    time : 'Months' or 'Years' (case-insensitive) rescales the durations from
        days; any other value leaves them unchanged.

    Side effects: appends a table to ``Kaplan_<analysis_type>.txt`` and saves
    ``Kaplan_<analysis_type>.png`` and ``.eps``.
    """
    kmf = KaplanMeierFitter()
    fig, ax = plt.subplots()
    n_in_groups = []
    percents = range(95, -1, -5)
    time_unit = time.lower()

    # The report is appended to; the context manager guarantees the handle is
    # closed (and the data flushed) even if a fit or plot fails.
    with open('Kaplan_%s.txt' % (analysis_type), 'a') as f:
        f.write("\nPercent %s\n" % analysis_type)
        headers = "Group\t" + "".join(str(x) + "%\t" for x in percents)
        f.write("%s\n" % headers)

        for group in unique_groups:
            data = grouped_data.get_group(group)
            n_in_groups.append(len(data))

            # Adjust survival data from days to whatever form wanted
            if time_unit == 'months':
                survival_time = data['survival'] / (365 / 12)
            elif time_unit == 'years':
                survival_time = data['survival'] / 365
            else:
                survival_time = data['survival']

            kmf.fit(survival_time, data['event'], label=group)

            # One row per group: time at which each survival fraction is reached.
            f.write("%s\t" % group)
            for x in percents:
                f.write(str(qth_survival_times(x / 100, kmf.survival_function_)) + "\t")
            f.write("\n")

            kmf.plot(ax=ax, show_censors=censors, ci_show=ci, linewidth=2.5)

    # Make the graph pretty!
    labels = dict(horizontalalignment='center', verticalalignment='center',
                  fontname='Arial', fontsize=28)

    ax.grid(False)
    ax.set_ylim(0, 1.05)
    for side in ('left', 'right', 'top', 'bottom'):
        ax.spines[side].set_linewidth(2.5)
    ax.yaxis.set_tick_params(width=2.5)
    ax.xaxis.set_tick_params(width=2.5)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    # plt.title('%s' % (analysis_type), labels, y = 1.05)
    plt.xlabel('%s Post-Diagnosis' % time, labels, labelpad=20)
    if analysis_type == 'survival':
        plt.ylabel('Overall Survival', labels, labelpad=20)
    else:
        plt.ylabel('Relapse-Free Survival', labels, labelpad=20)
    plt.xticks(fontname='Arial', fontsize=24)
    plt.yticks(fontname='Arial', fontsize=24)
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)

    # Relabel the legend entries with the group sizes.
    legend = ax.legend(frameon=False, loc=3)
    for text, group, count in zip(legend.get_texts(), unique_groups, n_in_groups):
        text.set_fontsize(20)
        text.set_text('%s n=%d' % (group, count))

    if len(unique_groups) == 2:
        plt.text(0.95, 0.05, 'p = %.2g' % (stat_results.p_value),
                 fontname='Arial', fontsize=20, ha='right',
                 transform=ax.transAxes)

    plt.tight_layout()
    fig.savefig('Kaplan_%s.png' % analysis_type, transparent=True)
    fig.savefig('Kaplan_%s.eps' % analysis_type, transparent=True)
    if showplot == True:  # noqa: E712 - keep the original strict comparison
        plt.show()
    plt.close(fig)
def data_fit(self):
    """Build per-user subscription durations and save a Kaplan-Meier plot.

    For every user that has a 'subscribe' event, the duration runs from the
    user's earliest event to either the user's latest event (when an
    'unsubscribe' event exists, observed = 1) or to the current time
    (observed = 0).  Durations are converted from seconds to days.  The
    user's gender is read from ``hyd_users`` ('sex'), defaulting to 0
    (unknown) when the user record is missing.  The overall curve is saved
    as ``maleAndFemale``.
    """
    self.hyd_events.create_index('FromUserName')
    self.hyd_events.create_index('Event')
    self.hyd_users.create_index('openid')

    # Distinct users that subscribed at least once.
    user_list = list({elem['FromUserName']
                      for elem in self.hyd_events.find({'Event': 'subscribe'})})
    print(len(user_list))
    now_time = time.time()

    duration = []
    observed = []
    group = []
    for elem in user_list:
        # Timestamps of THIS user's events only; the list must be rebuilt for
        # every user, otherwise earliest/latest mix in all previous users.
        time_block = [item['CreateTime']
                      for item in self.hyd_events.find({'FromUserName': elem})]
        sub_time = int(min(time_block))

        curt = self.hyd_events.find_one(
            {'$and': [{'FromUserName': elem}, {'Event': 'unsubscribe'}]})
        if curt is None:
            # Still subscribed: censored at the current time.
            unsub_time = int(now_time)
            was_observed = 0
        else:
            unsub_time = int(max(time_block))
            was_observed = 1

        check = self.hyd_users.find_one({'openid': elem})
        # if gender exists, set it; if not, set gender=0, which means unknown
        try:
            gender = check['sex']
        except TypeError:
            gender = 0

        duration.append(abs(unsub_time - sub_time) / 86400)  # seconds -> days
        observed.append(was_observed)
        group.append(gender)

    dura_obj = array(duration)
    obs_obj = array(observed)

    kmf = KaplanMeierFitter()
    kmf.fit(dura_obj, obs_obj, label='both')
    ax = kmf.plot()
    ax.get_figure().savefig('maleAndFemale')
return t elif is_number(c['year_of_birth']) == True and is_number(c['age_at_diagnosis']) == True and is_number(c['days_to_death']) == False: t = 2018 - float(c['year_of_birth']) - (float(c['age_at_diagnosis'])*4/(365*3 + 366)) return t else: return "NotApplicable" matrix['duration'] = matrix.apply(duration, axis = 1) matrix['event'] = matrix.apply(event, axis = 1) matrix = matrix[['bcr_sample_barcode', 'duration', 'event']] #new_header = matrix.iloc[0] #grab the first row for the header #matrix = matrix[1:] #take the data less the header row #matrix.columns = new_header matrix = matrix[matrix['duration']!="NotApplicable"] kmf = KaplanMeierFitter() kmf.fit(durations = matrix.duration, event_observed = matrix.event) kmf.survival_function_ # plot the KM estimate kmf.plot() # Add title and y-axis label plt.title("The Kaplan-Meier Estimate for BRCA (total)") plt.ylabel("Probability a patient is still active") plt.show()
EPS_LIST = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
bins0 = config.BIN0
bins1 = config.BIN1

df = pd.read_stata("wichert.dta")

# (normalised time, event flag) pairs.  Materialised as a list because the
# pairs are both iterated and counted below; a bare zip object has no len()
# on Python 3 and can only be consumed once.
data_ = list(zip(df.time / max(df.time), df.event.astype(int)))
# Keep only the observations whose normalised time is at least GAMMA.
data = [(a, b) for (a, b) in data_ if a >= config.GAMMA]
print("[*] Remove #%d outliers" % (len(data_) - len(data)))

N = len(df)  # number of data points

kmf = KaplanMeierFitter()
(T, E) = zip(*data)
kmf.fit(T, event_observed=E)
#naf = NelsonAalenFitter()
#naf.fit(T, event_observed=E)
#ax = pyplot.subplot(121)
#naf.plot(ax=ax)
#ax = pyplot.subplot(122)
#kmf.plot(ax=ax)
true_value = kmf.survival_function_.values
#naf.cumulative_hazard_.to_csv("naf.csv")
#pyplot.show()

# Normalised times split by event flag.
data0 = [a for (a, b) in data if b == 0]
data1 = [a for (a, b) in data if b == 1]
def execute():
    """Plot Kaplan-Meier survival curves for the most common categories.

    Queries incident duration, survival flag, group category and group size,
    then draws a 2x2 grid with one panel per category (one curve for groups
    of more than one person, one for single persons) and a combined figure.
    Both figures are saved as SVG under ``../doc/figures/kaplan-meier``.
    """
    matplotlib.rc("font", size=20)

    engine, session = database.initialize("sqlite:///../data/isrid-master.db")

    # Query with Group.size may take awhile, at least for Charles
    # Not sure why
    query = session.query(Incident.total_hours, Subject.survived,
                          Group.category, Group.size).join(Group, Subject)
    print("Tabulating query... may take awhile for unknown reasons.")
    df = tabulate(query)
    print("Done tabulating.")
    print(df.describe())

    database.terminate(engine, session)

    df = df.assign(
        days=[total_hours.total_seconds() / 3600 / 24 for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))
    categories = Counter(df.category)
    kmfs = []
    options = {"show_censors": True,
               "censor_styles": {"marker": "|", "ms": 6},
               "censor_ci_force_lines": False}

    top_categories = categories.most_common()[: rows * columns]
    for plot, (category, count) in enumerate(top_categories):
        print("Category:", category)
        ax = axes[plot // columns, plot % columns]
        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100 * (1 - Ndoa / N)

        # Use item access for the 'size' column: ``df_.size`` is the
        # DataFrame attribute (total number of cells), not the column.
        grp = df_[df_["size"] > 1]
        sng = df_[df_["size"] == 1]

        kmf = KaplanMeierFitter()
        # kmf.fit(df_.days, event_observed=df_.doa, label=category)
        # kmf.plot(ax=ax, ci_force_lines=True)
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        # NOTE(review): the stored fitter holds only the last ("Singles") fit,
        # so the combined figure shows single-person curves - confirm intent.
        kmfs.append(kmf)

        ax.set_xlim(0, min(30, 1.05 * ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")
        # ax.legend_.remove()
        # ax.grid(True)

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg", transparent=True)

    # All stored curves on a single set of axes.
    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs[: rows * columns]:
        kmf.plot(ci_show=False, show_censors=True,
                 censor_styles={"marker": "|", "ms": 6}, ax=ax)
    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg", transparent=True)

    plt.show()
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy
import lifelines

# NOTE(review): `figsize` is a pylab helper; this notebook export presumably
# relies on an interactive pylab session being active already - confirm.
figsize(12.5, 5)
np.set_printoptions(precision=2, suppress=True)

from lifelines import KaplanMeierFitter

# Five subjects; False marks a censored observation.
survival_times = np.array([0., 3., 4.5, 10., 1.])
events = np.array([False, True, True, False, True])

kmf = KaplanMeierFitter()
kmf.fit(survival_times, event_observed=events)
# print() with a single argument behaves the same on Python 2 and 3.
print(kmf.survival_function_)
print(kmf.median_)
kmf.plot()

## example 2
import matplotlib.pylab as plt
# Plain-Python spelling of the `%pylab` magic, which is not valid syntax in a
# .py file.
get_ipython().run_line_magic('pylab', '')
figsize(12.5, 6)
from lifelines.plotting import plot_lifetimes
from numpy.random import uniform, exponential

N = 25
current_time = 10
#Griffin Calme
#Group 15, week 8 activity
#Kaplan Meier survival curve
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()

# DataFrame.from_csv was removed from pandas; this is its documented
# equivalent (first column as index, dates parsed).
df = pd.read_csv('wk8gp15KapMeier.csv', index_col=0, parse_dates=True)
print(df)

groups = df['Group']
ix = (groups == 2)

T = df['SERIAL TIME (years)']
E = df['STATUS']

# Group 1 (everything that is not group 2) creates the axes; group 2 is
# drawn on top of it.
kmf.fit(T[~ix], E[~ix], label='1')
ax = kmf.plot()
kmf.fit(T[ix], E[ix], label='2')
kmf.plot(ax=ax, ci_force_lines=False)

plt.show()
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

df = pd.read_csv('joined.csv.bz2', sep=',', compression='bz2', low_memory=False)

# Turn 'term' (e.g. " 36 months") into an integer number of months.
df['term'] = df['term'].map(lambda raw_term: int(raw_term.strip(' months')))

# Duration 'T': fraction of the term elapsed at the first missed payment;
# loans that were fully paid get the full term (1).
df['T'] = df['firstMissed'] / df['term']
fully_paid = df['loan_status'] == 'Fully Paid'
df.loc[fully_paid, 'T'] = 1

# The event indicator 'E' is the inverse of column 'censored'.
T = df['T']
E = ~df['censored']

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # more succinctly, kmf.fit(T, E)
kmf.survival_function_
kmf.median_
kmf.plot()
plt.show()