def test_kmf_with_risk_counts(self, block):
    """Fit one exponential sample and plot it with ``at_risk_counts=True``."""
    durations = np.random.exponential(10, size=100)

    fitter = KaplanMeierFitter()
    fitter.fit(durations)
    fitter.plot(at_risk_counts=True)

    # Title the figure after the test so it can be identified when shown.
    self.plt.title("test_kmf_with_risk_counts")
    self.plt.show(block=block)
def plot_KM(stime, censor, g1, pval, figname):
    """Draw Kaplan-Meier curves for two groups and save the figure as EPS.

    Parameters
    ----------
    stime : object indexable by a boolean mask
        Durations handed to ``KaplanMeierFitter.fit``.
    censor : object indexable by a boolean mask
        Passed to ``fit`` as ``event_observed``.
    g1 : boolean mask
        Selects the "high-risk group"; its negation ``~g1`` selects the
        "low-risk group".
    pval : float
        Printed on the axes in ``%.2e`` format.
    figname : str or file-like
        Destination handed to ``plt.savefig``; output is always EPS.
    """
    sns.set_style('white')
    # NOTE: this changes NumPy's process-wide print options, as the
    # original code did.
    np.set_printoptions(precision=2, suppress=False)

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(figsize=(3, 3))
    try:
        # ``label`` is a plain string: a one-element list ends up rendered
        # as a list/tuple in the legend.
        for mask, name in ((g1, "high-risk group"), (~g1, "low-risk group")):
            kmf.fit(stime[mask], event_observed=censor[mask], label=name)
            kmf.plot(ax=ax, ci_show=False, show_censors=True)

        # Positional form works on every matplotlib release; the ``b=``
        # keyword was renamed to ``visible=`` and later removed.
        ax.grid(False)
        sns.despine()

        plt.ylim(0, 1)
        plt.xlabel("time", fontsize=14)
        plt.ylabel("survival", fontsize=14)
        # Coordinates are axes fractions because of ``transform=ax.transAxes``.
        plt.text(0.7, 0.85, 'pval = %.2e' % (pval), fontdict={'size': 12},
                 horizontalalignment='center', verticalalignment='center',
                 transform=ax.transAxes)
        plt.xticks(rotation=45)
        for item in (ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(10)

        plt.tight_layout()
        plt.savefig(figname, format='eps')
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(f)
def test_kmf_minimum_observation_bias():
    """Plot a KM fit where each subject's entry time is 1% of its duration."""
    n_subjects = 250
    fitter = KaplanMeierFitter()

    durations, observed = exponential_survival_data(n_subjects, 0.1, scale=10)
    entry_times = 0.01 * durations

    fitter.fit(durations, observed, entry=entry_times)
    fitter.plot()
    plt.title("Should have larger variances in the tails")
def test_flat_style_and_marker(self, block):
    """Overlay two flat-styled KM curves that use different censor markers."""
    n_obs = 200
    # Draw order is kept (durations first, then event flags) so a seeded
    # run consumes the random stream exactly as before.
    durations_a = np.random.exponential(10, size=n_obs)
    durations_b = np.random.exponential(2, size=n_obs)
    observed_a = np.random.binomial(1, 0.9, size=n_obs)
    observed_b = np.random.binomial(1, 0.95, size=n_obs)

    plus_markers = {'marker': '+', 'mew': 2, 'ms': 7}
    circle_markers = {'marker': 'o', 'ms': 7}

    fitter = KaplanMeierFitter()
    fitter.fit(durations_a, observed_a, label='test label 1')
    axes = fitter.plot(flat=True, censor_styles=plus_markers)

    fitter.fit(durations_b, observed_b, label='test label 2')
    fitter.plot(ax=axes, censor_styles=circle_markers, flat=True)

    self.plt.title("testing kmf flat styling + marker")
    self.plt.show(block=block)
def test_kmf_left_censorship_plots(self, block):
    """Plot left-censored KM fits for two groups of the LCD dataset."""
    lcd = load_lcd()
    fan_rows = lcd.loc[lcd['group'] == 'alluvial_fan']
    trough_rows = lcd.loc[lcd['group'] == 'basin_trough']

    fitter = KaplanMeierFitter()

    fitter.fit(fan_rows['T'], fan_rows['C'],
               left_censorship=True, label='alluvial_fan')
    axes = fitter.plot()

    # Refit the same estimator on the second group and draw on the same axes.
    fitter.fit(trough_rows['T'], trough_rows['C'],
               left_censorship=True, label='basin_trough')
    fitter.plot(ax=axes)

    self.plt.title("test_kmf_left_censorship_plots")
    self.plt.show(block=block)
def test_kmf_with_inverted_axis(self, block, kmf):
    """Overlay two KM curves drawn with ``invert_y_axis=True``.

    The ``kmf`` argument is not used; fresh estimators are created here,
    as in the original test.
    """
    first_sample = np.random.exponential(size=100)
    first_fitter = KaplanMeierFitter()
    first_fitter.fit(first_sample, label="t2")
    axes = first_fitter.plot(invert_y_axis=True, at_risk_counts=True)

    second_sample = np.random.exponential(3, size=100)
    second_fitter = KaplanMeierFitter()
    second_fitter.fit(second_sample, label="t1")
    second_fitter.plot(invert_y_axis=True, ax=axes, ci_force_lines=False)

    self.plt.title("test_kmf_with_inverted_axis")
    self.plt.show(block=block)
def test_kmf_plotting(self, block):
    """Overlay three KM curves fitted to samples of different shapes."""
    # All samples are drawn up front, in the original order.
    flat_sample = np.random.exponential(10, size=100)
    column_sample_a = np.random.exponential(2, size=(200, 1))
    column_sample_b = np.random.exponential(4, size=(500, 1))

    fitter = KaplanMeierFitter()
    fitter.fit(flat_sample, label='test label 1')
    axes = fitter.plot()

    # The remaining curves reuse the axes created by the first plot.
    for sample, name in ((column_sample_a, 'test label 2'),
                         (column_sample_b, 'test label 3')):
        fitter.fit(sample, label=name)
        fitter.plot(ax=axes)

    self.plt.title("test_kmf_plotting")
    self.plt.show(block=block)
def test_kmf_left_censorship_plots(self, block):
    """Plot left-censored KM fits for two groups of the LCD dataset.

    Skipped when matplotlib is not installed. ``block`` is forwarded to
    ``plt.show``.
    """
    # Skip (rather than fail) when matplotlib is unavailable.
    pytest.importorskip("matplotlib")
    from matplotlib import pyplot as plt

    kmf = KaplanMeierFitter()
    lcd_dataset = load_lcd()
    # ``.loc`` replaces the ``.ix`` indexer, which pandas 1.0 removed;
    # for a boolean row mask the two select the same rows.
    alluvial_fan = lcd_dataset.loc[lcd_dataset['group'] == 'alluvial_fan']
    basin_trough = lcd_dataset.loc[lcd_dataset['group'] == 'basin_trough']

    kmf.fit(alluvial_fan['T'], alluvial_fan['C'],
            left_censorship=True, label='alluvial_fan')
    ax = kmf.plot()

    kmf.fit(basin_trough['T'], basin_trough['C'],
            left_censorship=True, label='basin_trough')
    kmf.plot(ax=ax)

    plt.show(block=block)
def test_kmf_left_censorship_plots(self):
    """Plot left-censored KM fits for two groups of the LCD dataset.

    Skipped when matplotlib is not installed.
    """
    # Skip (rather than fail) when matplotlib is unavailable.
    pytest.importorskip("matplotlib")
    from matplotlib import pyplot as plt

    kmf = KaplanMeierFitter()
    lcd_dataset = load_lcd()
    # ``.loc`` replaces the ``.ix`` indexer, which pandas 1.0 removed;
    # for a boolean row mask the two select the same rows.
    alluvial_fan = lcd_dataset.loc[lcd_dataset['group'] == 'alluvial_fan']
    basin_trough = lcd_dataset.loc[lcd_dataset['group'] == 'basin_trough']

    kmf.fit(alluvial_fan['T'], alluvial_fan['C'],
            left_censorship=True, label='alluvial_fan')
    ax = kmf.plot()

    kmf.fit(basin_trough['T'], basin_trough['C'],
            left_censorship=True, label='basin_trough')
    kmf.plot(ax=ax)

    plt.show()
def test_seaborn_doesnt_cause_kmf_plot_error(self, block, kmf, capsys):
    """Plot a KM fit with seaborn imported and assert stderr stays empty.

    The ``kmf`` argument is not used; a fresh estimator is created here,
    as in the original test.
    """
    # Imported only for its side effects; the name itself is never used.
    import seaborn as sns  # noqa: F401

    waltons = load_waltons()
    durations = waltons['T']
    observed = waltons['E']

    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=observed)
    fitter.plot()

    self.plt.title('test_seaborn_doesnt_cause_kmf_plot_error')
    self.plt.show(block=block)

    captured = capsys.readouterr()
    stderr_text = captured[1]
    assert stderr_text == ""
def test_seaborn_doesnt_cause_kmf_plot_error(self, block, kmf, capsys):
    """Plot a KM fit with seaborn imported and assert stderr stays empty.

    The ``kmf`` argument is not used; a fresh estimator is created here,
    as in the original test.
    """
    # Imported only for its side effects; the name itself is never used.
    import seaborn as sns  # noqa: F401

    waltons = load_waltons()
    durations = waltons["T"]
    observed = waltons["E"]

    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=observed)
    fitter.plot()

    self.plt.title("test_seaborn_doesnt_cause_kmf_plot_error")
    self.plt.show(block=block)

    captured = capsys.readouterr()
    stderr_text = captured[1]
    assert stderr_text == ""
def test_flat_style_no_censor(self, block):
    """Plot a flat-styled KM curve for a sample fitted without event flags."""
    durations = np.random.exponential(10, size=200)
    marker_style = {'marker': '+', 'mew': 2, 'ms': 7}

    fitter = KaplanMeierFitter()
    fitter.fit(durations, label='test label 1')
    fitter.plot(flat=True, censor_styles=marker_style)

    self.plt.title('test_flat_style_no_censor')
    self.plt.show(block=block)
def test_kmf_left_censorship_plots(self, block):
    """Plot left-censored KM fits for two groups of the LCD dataset."""
    lcd = load_lcd()
    fan_rows = lcd.loc[lcd["group"] == "alluvial_fan"]
    trough_rows = lcd.loc[lcd["group"] == "basin_trough"]

    fitter = KaplanMeierFitter()

    fitter.fit(fan_rows["T"], fan_rows["C"],
               left_censorship=True, label="alluvial_fan")
    axes = fitter.plot()

    # Refit the same estimator on the second group and draw on the same axes.
    fitter.fit(trough_rows["T"], trough_rows["C"],
               left_censorship=True, label="basin_trough")
    fitter.plot(ax=axes)

    self.plt.title("test_kmf_left_censorship_plots")
    self.plt.show(block=block)
def test_negative_times_still_plots(self, block):
    """Fit and plot a KM curve on a timeline that starts below zero."""
    n_obs = 40
    durations = np.linspace(-2, 3, n_obs)
    # Random 0/1 event indicators, one per duration.
    observed = np.random.randint(2, size=n_obs)

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed)
    fitter.plot()

    self.plt.title('test_negative_times_still_plots')
    self.plt.show(block=block)
def plot_KM(stime, censor, g1, pval, figname):
    """Draw Kaplan-Meier curves for two groups and save the figure as EPS.

    Parameters
    ----------
    stime : object indexable by a boolean mask
        Durations handed to ``KaplanMeierFitter.fit``.
    censor : object indexable by a boolean mask
        Passed to ``fit`` as ``event_observed``.
    g1 : boolean mask
        Selects the "high-risk group"; its negation ``~g1`` selects the
        "low-risk group".
    pval : float
        Printed on the axes in ``%.2e`` format.
    figname : str or file-like
        Destination handed to ``plt.savefig``; output is always EPS.
    """
    sns.set_style('white')
    # NOTE: this changes NumPy's process-wide print options, as the
    # original code did.
    np.set_printoptions(precision=2, suppress=False)

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(figsize=(3, 3))
    try:
        # ``label`` is a plain string: a one-element list ends up rendered
        # as a list/tuple in the legend.
        for mask, name in ((g1, "high-risk group"), (~g1, "low-risk group")):
            kmf.fit(stime[mask], event_observed=censor[mask], label=name)
            kmf.plot(ax=ax, ci_show=False, show_censors=True)

        # Positional form works on every matplotlib release; the ``b=``
        # keyword was renamed to ``visible=`` and later removed.
        ax.grid(False)
        sns.despine()

        plt.ylim(0, 1)
        plt.xlabel("time", fontsize=14)
        plt.ylabel("survival", fontsize=14)
        # Coordinates are axes fractions because of ``transform=ax.transAxes``.
        plt.text(0.7, 0.85, 'pval = %.2e' % (pval), fontdict={'size': 12},
                 horizontalalignment='center', verticalalignment='center',
                 transform=ax.transAxes)
        plt.xticks(rotation=45)
        for item in (ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(10)

        plt.tight_layout()
        plt.savefig(figname, format='eps')
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(f)
def _finish_survival_figure(directory, stem):
    """Label the current axes, then show the figure or save it as PNG and PDF."""
    plt.ylabel('Survival probability')
    plt.xlabel('Time in years')
    plt.ylim(0, 1)
    plt.grid()
    if directory is None:
        # Interactive mode: display the figure instead of writing files.
        plt.ion()
        plt.show()
    else:
        plt.savefig('{0}/{1}.png'.format(directory, stem))
        plt.savefig('{0}/{1}.pdf'.format(directory, stem))
        plt.close()


def survival_estimation(directory=tmp_dir):
    """
    Use the Kaplan-Meier Estimate to estimate the survival function

    see: https://github.com/CamDavidsonPilon/lifelines

    Two figures are produced: one curve for all developers, and one pair of
    curves split on ``df['top'] == 1``. The median of each fit is printed.

    Parameters
    ----------
    directory : str or None
        When ``None`` the figures are shown interactively; otherwise they
        are saved as ``survival_all`` and ``survival_top`` (``.png`` and
        ``.pdf``) inside this directory.
    """
    # Imported from the package root; ``lifelines.estimation`` is a legacy
    # module path that newer lifelines releases no longer ship.
    from lifelines import KaplanMeierFitter

    df = get_lifetime_data_frame(recompute=False)

    # Estimate the survival function for all developers
    T = df['duration']
    # NOTE(review): the 'censored' column is passed as ``event_observed``;
    # confirm it is 1 for observed events and not the inverse.
    C = df['censored']
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=C, label='All developers')
    # NOTE(review): ``median_`` may be named ``median_survival_time_`` in
    # newer lifelines releases - confirm against the installed version.
    print("Median survival time for all developers: {} years".format(
        kmf.median_))
    plt.figure(figsize=(10, 8))
    ax = plt.subplot(111)
    kmf.plot(ax=ax, color=color_map(2))
    _finish_survival_figure(directory, 'survival_all')

    # Estimate the survival function by connectivity level
    mtop = df['top'] == 1
    kmf = KaplanMeierFitter()
    plt.figure(figsize=(10, 8))
    ax = plt.subplot(111)
    kmf.fit(T[mtop], event_observed=C[mtop], label="Top connectivity level")
    print("Median survival time for top developers: {} years".format(
        kmf.median_))
    kmf.plot(ax=ax, color=color_map(0))
    kmf.fit(T[~mtop], event_observed=C[~mtop], label="Not in the top")
    print("Median survival time for not top developers: {} years".format(
        kmf.median_))
    kmf.plot(ax=ax, color=color_map(1))
    _finish_survival_figure(directory, 'survival_top')
# The method takes the same parameters as it's R counterpart, a time vector and a vector indicating which observations are observed or censored. The model fitting sequence is similar to the [scikit-learn](http://scikit-learn.org/stable/) api. # In[16]: f = tongue.type==1 T = tongue[f]['time'] C = tongue[f]['delta'] kmf.fit(T, event_observed=C) # To get a plot with the confidence intervals, we simply can call `plot()` on our `kmf` object. # In[17]: kmf.plot(title='Tumor DNA Profile 1') # Now we can convert this plot to an interactive [Plotly](https://plot.ly) object. However, we will have to augment the legend and filled area manually. Once we create a helper function, the process is simple. # # Please see the Plotly Python [user guide](https://plot.ly/python/overview/#in-%5B37%5D) for more insight on how to update plot parameters. # # > Don't forget you can also easily edit the chart properties using the Plotly GUI interface by clicking the "Play with this data!" link below the chart. # In[19]: p = kmf.plot(ci_force_lines=True, title='Tumor DNA Profile 1 (95% CI)') # Collect the plot object kmf1 = plt.gcf()
#df2= pd.read_table('genomicMatrix.tab',sep='\t')
#print list(df.columns.values)
survival_col = '_OS'
censor_col = '_OS_IND'
clinical_predictors = ['age_at_initial_pathologic_diagnosis']

# Keep only rows that have a survival value.
df = df[pd.notnull(df[survival_col])]

# Boolean row mask: True where neoadjuvant treatment history is 'Yes'.
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)

# ``.loc`` replaces the ``.ix`` indexer (removed in pandas 1.0); with a
# boolean row mask and a column label the selection is identical.
# Labels are plain strings rather than one-element lists.
# NOTE(review): the meaning of ``alpha`` (confidence level vs. 1 - level)
# differs between lifelines releases - confirm for the installed version.
kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.loc[tx, survival_col],
         event_observed=df.loc[tx, censor_col],
         label='Tx==Yes')
kmf1.plot(ax=ax, show_censors=True, ci_show=False)

kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.loc[~tx, survival_col],
         event_observed=df.loc[~tx, censor_col],
         label='Tx==No')
kmf2.plot(ax=ax, show_censors=True, ci_show=False)

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the treated and untreated groups.
results = logrank_test(df.loc[tx, survival_col], df.loc[~tx, survival_col],
                       df.loc[tx, censor_col], df.loc[~tx, censor_col],
                       alpha=.99)
results.print_summary()

cox = CoxPHFitter(normalize=False)
def uw_tier_histplots():
    """Plot Kaplan-Meier curves of IPO duration for each lead-underwriter tier.

    Reads the module-level ``sample`` frame (and adds two display-name
    columns to it), fits one curve per tier, annotates quartiles for the
    "-1" and "9" tiers, adds the Kruskal-Wallis statistic computed across
    all tiers, and saves the figure to ``IPO_tiers_KP_survival.pdf``.
    """
    # Imported from the package root; ``lifelines.estimation`` is a legacy
    # module path that newer lifelines releases no longer ship.
    from lifelines import KaplanMeierFitter
    import matplotlib.pyplot as plt

    # Display-name copies of the two columns (kept: they mutate ``sample``,
    # which other code may rely on).
    sample['Underwriter Tier'] = sample['lead_underwriter_tier']
    sample['IPO Duration'] = sample['IPO_duration']

    ranks = ["-1", "0+", "7+", "9"]
    ranklabels = ['No Underwriter', 'Low Rank', 'Mid Rank', 'Rank 9 (elite)']

    def uw_tier_duration(x):
        return sample[sample.lead_underwriter_tier==x]['IPO_duration']

    kwstat = kruskalwallis(*[uw_tier_duration(x) for x in ranks])

    # (text template, y position) per annotation. ``quartiles`` below is
    # ordered 75th, 50th, 25th percentile, so entries pair up by position.
    levels = (("75%: {} days", 0.25),
              ("50%: {} days", 0.50),
              ("25%: {} days", 0.75))
    # Only these tiers are annotated: rank -> (x text offsets, y text offset).
    text_offsets = {
        "-1": ((145, 145, 145), .04),
        "9": ((415, 290, 165), .1),
    }

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(1,1,figsize=(12, 4), sharex=True)
    T = 1 # annotation line thickness

    for rank, rlabel, color in zip(ranks, ranklabels, cp_four("cool_r")):
        uw = sample[sample.lead_underwriter_tier==rank]
        # NOTE(review): the meaning of ``alpha`` differs between lifelines
        # releases - confirm for the installed version.
        kmf.fit(uw['IPO_duration'],
                label='{} N={}'.format(rlabel, len(uw)),
                alpha=0.9)
        kmf.plot(ax=ax, c=color, alpha=0.7)

        quartiles = [int(np.percentile(kmf.durations, x)) for x in [25, 50, 75]][::-1]
        aprops = dict(facecolor=color, width=T, headwidth=T)

        if rank in text_offsets:
            x_shifts, y_shift = text_offsets[rank]
            for (template, level), q, x_shift in zip(levels, quartiles, x_shifts):
                plt.annotate(template.format(q),
                             (q, level),
                             xytext=(q+x_shift, level+y_shift),
                             arrowprops=aprops)

    # NOTE(review): placed after the loop; the collapsed source does not
    # show whether this annotation originally sat inside it.
    plt.annotate("Kruskal-Wallis\nH: {:.3f}\nprob: {:.3f}".format(*kwstat),
                 (960, 0.1))
    plt.ylim(0,1)
    plt.xlim(0,1095)
    plt.title("Kaplan-Meier survival times by bank tier")
    plt.xlabel("IPO Duration (days)")
    plt.ylabel(r"$S(t)=Pr(T>t)$")
    plt.savefig("IPO_tiers_KP_survival.pdf", format='pdf', dpi=200)
def plot_kaplan_function(duration_key="days_to_first_price_change"):
    """Plot Kaplan-Meier curves of ``duration_key`` for upward vs. downward
    price amendments.

    Reads the module-level ``above`` and ``under`` frames and the ``colors``
    palette, fits one curve per frame, and annotates each curve's quartiles.

    Parameters
    ----------
    duration_key : str
        Column to use as the duration. The original code overwrote this
        argument with "days_to_first_price_change", so the caller's choice
        was ignored; that value is now only the default. Other columns the
        original listed: "days_from_priced_to_listing",
        "days_to_final_price_revision", "days_from_s1_to_listing".
    """
    # Imported from the package root; ``lifelines.estimation`` is a legacy
    # module path that newer lifelines releases no longer ship.
    from lifelines import KaplanMeierFitter
    import matplotlib.pyplot as plt

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(1,1,figsize=(12, 4), sharex=True)
    T = 1 # annotation line thickness
    xoffset = 0.4 # annotation offset (x-axis)
    yoffset = 0.04

    # (text template, y position) per annotation. ``quartiles`` below is
    # ordered 75th, 50th, 25th percentile, so entries pair up by position.
    levels = (("75%: {} days", 0.25),
              ("50%: {} days", 0.50),
              ("25%: {} days", 0.75))

    def annotate_quartiles(color, extra_y):
        """Annotate the quartiles of the most recent fit held by ``kmf``."""
        quartiles = [int(np.percentile(kmf.durations, x)) for x in [25, 50, 75]][::-1]
        aprops = dict(facecolor=color, width=T, headwidth=T)
        for (template, level), q in zip(levels, quartiles):
            plt.annotate(template.format(q),
                         (q, level),
                         xytext=(q+xoffset, level+yoffset+extra_y),
                         arrowprops=aprops)

    # Above filing price range
    # NOTE(review): only this fit passes ``alpha``; the second fit uses the
    # estimator's default.
    kmf.fit(above[duration_key],
            label='Upward Price Amendment: N={}'.format(len(above)),
            alpha=0.9)
    kmf.plot(ax=ax, c=colors[5], alpha=0.7)
    annotate_quartiles(colors[5], 0)

    # Under filing price range
    kmf.fit(under[duration_key],
            label='Downward Price Amendment: N={}'.format(len(under)),)
    kmf.plot(ax=ax, c=colors[2], alpha=0.7)
    # Extra vertical shift keeps these labels from overlapping the first set.
    annotate_quartiles(colors[2], 0.05)

    # General graph labels. (The original held a commented-out log-rank
    # test annotation here; it was disabled and has been dropped.)
    plt.ylim(0,1)
    plt.xlim(0, max(np.percentile(above[duration_key], 90),
                    np.percentile(under[duration_key],90)))
    plt.title("Kaplan-Meier Survival Functions")
    plt.xlabel("Delay (days) in {}".format(duration_key))
    plt.ylabel(r"$S(t)=Pr(T>t)$")
dead.hist(bins=20, column='lenfol')
plt.show()

# Plot the normalized cumulative histogram (an empirical CDF) of 'lenfol'.
# ``density=True`` replaces ``normed=1``, which matplotlib removed from hist.
dead.hist(bins=100, column='lenfol', cumulative=True, density=True)
plt.show()

# Plot the survival curve for all rows.
kaplen_meier = KaplanMeierFitter()
time_of_event = df['lenfol']
event = df['fstat']
# Common evaluation grid shared by every fit below.
time = np.linspace(0, 2500, 100)

kaplen_meier.fit(time_of_event, timeline=time, event_observed=event, label='All patients')
kaplen_meier.plot()
plt.show()

# Stratify by the 'chf' flag (congestive heart complications).
history = df['chf'] == 1

kaplen_meier = KaplanMeierFitter()
kaplen_meier.fit(time_of_event[history], timeline=time, event_observed=event[history], label='Congestive heart complications')
ax = kaplen_meier.plot()

kaplen_meier.fit(time_of_event[~history], timeline=time, event_observed=event[~history], label='No congestive heart complications')
kaplen_meier.plot(ax=ax, c="b")

plt.show()

#Cox proportional hazard
cph.fit(df = df5, duration_col = 'duration', event_col = 'event')
cph.predict_survival_function(X = df5).plot()

# Kaplan-Meier plots
# Imported from the package root; ``lifelines.estimation`` is a legacy
# module path that newer lifelines releases no longer ship.
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
df6 = df3[['duration', 'event']]
kmf.fit(df6['duration'],df6['event'])
kmf.plot()

# Survival curve for rows with race_factor == 'African-American' and
# score_factor == 'Low'.
df6a = df3[df3['race_factor'] == 'African-American']
df6a = df6a[df6a['score_factor'] == 'Low']
df6b = df6a[['duration', 'event']]
kmf.fit(df6b['duration'],df6b['event'])
kmf.plot()

# Survival curve for rows with race_factor == 'Caucasian' and
# score_factor == 'Low'.
df6c = df3[df3['race_factor'] == 'Caucasian']
df6c = df6c[df6c['score_factor'] == 'Low']
df6d = df6c[['duration', 'event']]
kmf.fit(df6d['duration'],df6d['event'])
# 3-fold cross-validation of a Cox proportional hazards model.
cf = CoxPHFitter()
scores = k_fold_cross_validation(cf, data, 'time', event_col='event', k=3)
# ``print(x)`` with a single argument behaves the same on Python 2 and 3;
# the bare ``print x`` statement is a syntax error on Python 3.
print(scores)
print(np.mean(scores))
print(np.std(scores))

# Encode the subtype names as integer codes and attach them to ``data``.
le = preprocessing.LabelEncoder()
subtypes = le.fit_transform(dataset["subtypes"])
data["subtype"] = subtypes

T = data["time"]
C = data["event"]

# Overall Kaplan-Meier curve for every row.
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=C)
kmf.plot(title = 'Survival Day Profile of Breast Cancer Patients')

# Basal
f1 = data.subtype == 0
T1 = data[f1]['time']
C1 = data[f1]['event']

# Her2
f2 = data.subtype == 1
T2 = data[f2]['time']
C2 = data[f2]['event']

# LumA
f3 = data.subtype == 2
T3 = data[f3]['time']
C3 = data[f3]['event']