def test_marginal_kaplan_meier_curves(self):
    """Check that MarginalSurvival reproduces lifelines' Kaplan-Meier curves.

    Three results are compared: ``MarginalSurvival`` built with
    ``survival_model=None``, ``MarginalSurvival`` wrapping a lifelines
    ``KaplanMeierFitter``, and lifelines fitted directly on each treatment
    arm (``a == 0`` and ``a == 1``).
    """
    # MarginalSurvival constructed without an explicit survival model.
    plain_model = MarginalSurvival(survival_model=None)
    plain_model.fit(self.X, self.a)
    plain_curves = plain_model.estimate_population_outcome(
        self.X, self.a, self.t, self.y)

    # MarginalSurvival delegating to a lifelines fitter.
    wrapped_model = MarginalSurvival(
        survival_model=lifelines.KaplanMeierFitter())
    wrapped_model.fit(self.X, self.a)
    wrapped_curves = wrapped_model.estimate_population_outcome(
        self.X, self.a, self.t, self.y)

    # Reference: one lifelines fit per arm, evaluated on the sorted unique
    # durations.
    per_arm = {}
    for arm in (0, 1):
        in_arm = self.a == arm
        arm_fitter = lifelines.KaplanMeierFitter()
        arm_fitter.fit(durations=self.t[in_arm],
                       event_observed=self.y[in_arm])
        per_arm[arm] = arm_fitter.predict(sorted(self.t.unique()))
    reference_curves = pd.DataFrame(per_arm)
    reference_curves.columns.name = 'a'
    reference_curves.index.name = 't'

    pd.testing.assert_frame_equal(plain_curves, wrapped_curves)
    pd.testing.assert_frame_equal(plain_curves, reference_curves)
def test_kmf_add_at_risk_counts_with_custom_subplot(self, block, kmf):
    """Draw three stacked KM panels in a GridSpec, each with at-risk counts.

    See https://github.com/CamDavidsonPilon/lifelines/issues/991#issuecomment-614427882
    """
    import lifelines
    import matplotlib as mpl
    from lifelines.datasets import load_waltons

    plt = self.plt
    waltons = load_waltons()
    is_control = waltons["group"] == "control"
    control_rows = waltons.loc[is_control]
    exp_rows = waltons.loc[~is_control]

    panel_count = 3
    fig_height = 4 * panel_count
    half_inch = 0.5 / fig_height  # in percent height
    _fig = plt.figure(figsize=(6, fig_height), dpi=100)
    grid = mpl.gridspec.GridSpec(panel_count, 1)
    # plt.subplots_adjust(left=0.08, right=0.98, bottom=half_inch, top=1 - half_inch)

    for row in range(panel_count):
        panel = plt.subplot(grid[row, 0])

        kmf_control = lifelines.KaplanMeierFitter()
        kmf_control.fit(control_rows["T"], control_rows["E"], label="control")
        panel = kmf_control.plot(ax=panel)

        kmf_exp = lifelines.KaplanMeierFitter()
        kmf_exp.fit(exp_rows["T"], exp_rows["E"], label="exp")
        panel = kmf_exp.plot(ax=panel)

        panel = lifelines.plotting.add_at_risk_counts(
            kmf_exp, kmf_control, ax=panel)

    plt.subplots_adjust(hspace=0.6)
    plt.title("test_kmf_add_at_risk_counts_with_custom_subplot")
    plt.show(block=block)
def censored_roc(data, pred_var, time_var, orig_var, dur_var, time_val):
    """Build a censoring-aware ROC curve at one time horizon from KM fits.

    Only rows with ``data[time_var] == time_val`` are used.  Candidate
    thresholds are the unique values of ``pred_var`` rounded to 3 decimals,
    in descending order.  For every threshold except the first and last, a
    Kaplan-Meier survival probability at ``time_val`` is estimated for the
    rows above and the rows at-or-below the threshold, and combined with the
    full-sample estimate into sensitivity and specificity.

    Parameters
    ----------
    data : pd.DataFrame
    pred_var : column holding the predicted score
    time_var : column compared against ``time_val`` to select rows
    orig_var : column passed to lifelines as ``event_observed``
    dur_var : column passed to lifelines as the durations
    time_val : horizon at which the survival functions are evaluated

    Returns
    -------
    (roc_new, auc_stat) : the ROC table (columns ``FPR``, ``TPR``,
        ``THRESH``, ``Time``) after non-monotonic rows flagged by
        ``check_min`` were removed, and its AUC, or ``-1`` when
        ``metrics.auc`` fails.
    """
    subset = data[data[time_var] == time_val]

    # KM for full sample
    km_full = lifelines.KaplanMeierFitter()
    km_full.fit(subset[dur_var], subset[orig_var])
    sf_full = list(km_full.survival_function_at_times(times=[time_val]))[0]

    # Getting reduced set of potential thresholds (descending order)
    thresh = pd.unique(subset[pred_var].round(3))
    thresh.sort()
    thresh = np.flip(thresh)

    # Estimating curves; the (0, 0) and (1, 1) end points are fixed.
    tpr = [0.0]
    fpr = [0.0]
    km_above = lifelines.KaplanMeierFitter()
    km_below = lifelines.KaplanMeierFitter()
    for tv in thresh[1:-1]:
        above_test = (subset[pred_var] > tv)

        # KM for sample above
        sub_above = subset[above_test]
        km_above.fit(sub_above[dur_var], sub_above[orig_var])
        sf_above = list(
            km_above.survival_function_at_times(times=[time_val]))[0]

        # KM for sample below
        sub_below = subset[~above_test]
        km_below.fit(sub_below[dur_var], sub_below[orig_var])
        sf_below = list(
            km_below.survival_function_at_times(times=[time_val]))[0]

        # Now calculating sens/spec
        prop_above = above_test.mean()
        sens = ((1 - sf_above) * prop_above) / (1 - sf_full)
        spec = (sf_below * (1 - prop_above)) / (sf_full)
        tpr.append(sens)
        fpr.append(1 - spec)
    tpr.append(1.0)
    fpr.append(1.0)
    roc_dat = pd.DataFrame(zip(fpr, tpr, thresh),
                           columns=['FPR', 'TPR', 'THRESH'])

    # Now fudging out places that are non-monotonic
    roc_new = roc_dat
    roc_new['FPR'] = roc_new['FPR'].round(3)
    roc_new['TPR'] = roc_new['TPR'].round(3)
    nonN, any_min = check_min(roc_new)
    while nonN > 0:
        roc_new = roc_new[~any_min].copy()
        nonN, any_min = check_min(roc_new)
    roc_new['Time'] = time_val

    try:
        auc_stat = metrics.auc(roc_new['FPR'], roc_new['TPR'])
    except Exception:
        # Best-effort: keep the -1 sentinel for a curve that cannot be
        # integrated, but no longer swallow KeyboardInterrupt/SystemExit
        # as the previous bare ``except:`` did.
        auc_stat = -1
    return roc_new, auc_stat
def SurvivalPlot(surv_list, event_list, duration_list, name_list, legend_list,
                 fig=None, is_show_KM=False, store_folder=None):
    """Plot the mean predicted survival curve for each entry on one axes.

    For every ``surv_df`` in ``surv_list`` the row-wise mean of its values is
    drawn as a step curve against its index.  When ``is_show_KM`` is true, a
    dashed Kaplan-Meier curve fitted on the matching durations/events (on the
    same timeline) is drawn as well.  ``fig`` is cleared before drawing; a
    new figure is created when it is ``None``.  ``store_folder`` is accepted
    but not used here.
    """
    assert (len(surv_list) == len(event_list)
            and len(surv_list) == len(duration_list))
    if fig is None:
        fig = plt.figure()
    fitter = lifelines.KaplanMeierFitter()
    fig.clear()
    axes = fig.add_subplot(1, 1, 1)

    entries = zip(surv_list, event_list, duration_list, name_list, legend_list)
    for position, entry in enumerate(entries):
        surv_df, event, duration, name, legend = entry
        colour = color_list[position]
        if is_show_KM:
            fitter.fit(duration, event, timeline=surv_df.index)
            fitter.plot_survival_function(color=colour, ax=axes,
                                          ci_show=False, linestyle='--',
                                          label='{}-KM'.format(name))
        mean_curve = surv_df.values.mean(axis=1)
        axes.step(surv_df.index, mean_curve, color=colour, label=legend)

    axes.legend()
    axes.set_ylabel('Survival Function')
    axes.set_xlabel('Time')
def all_source_plot(self, **kwargs):
    """
    Fit and plot Kaplan-Meier curves of ``avg_visit_time`` for new and old
    visitors, using ``self.data_frame`` (baidutongji all_source) as input.

    Rows whose ``avg_visit_time`` string is not all digits are dropped.

    :param kwargs: must contain ``title`` (plot title) and ``path``
        (file the figure is saved to)
    :return: None
    """
    frame = self.data_frame
    title = kwargs['title']
    path = kwargs['path']

    def digit_rows(visitor_kind):
        """Rows of one visitor kind whose avg_visit_time is all digits."""
        rows = frame[frame['visitor'] == visitor_kind]
        is_numeric = rows.loc[:, 'avg_visit_time'].str.isdigit()
        return rows[is_numeric].copy()

    old_cleaned = digit_rows('old')
    new_cleaned = digit_rows('new')

    kmf = lifelines.KaplanMeierFitter()
    fig, ax = plt.subplots()
    for rows, label in ((new_cleaned, "New Visitors"),
                        (old_cleaned, "Old Visitors")):
        kmf.fit(rows['avg_visit_time'], label=label)
        kmf.plot(ax=ax, show_censors=True)

    plt.ylim(0, 1)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path)
    plt.close('all')
def plot_detect(filename, name, event_id, md):
    """
    Plot the distribution of times at which infection is first detected.

    Detection times come from ``dataformat.first_of_event(filename,
    event_id)``.  Runs without a detection are treated as censored at one
    time unit past the largest observed detection time.  The Kaplan-Meier
    curve is saved as ``"<name>_survival.pdf"`` through ``SaveFig``.

    Exits the process with status 0 when the event never happened.
    """
    detection_times, none_detected = dataformat.first_of_event(
        filename, event_id)
    logger.info("Detected {0} times out of {1}".format(
        len(detection_times), len(detection_times) + none_detected))
    # Compare by value: ``len(...) is 0`` relied on int identity, which is
    # implementation-specific and a SyntaxWarning on recent Pythons.
    if len(detection_times) == 0:
        logger.info("The event {0} did not happen.".format(event_id))
        sys.exit(0)

    kmf = lifelines.KaplanMeierFitter()
    # Undetected runs are placed just past the last detection and flagged
    # as unobserved (0) below.
    last = max(detection_times) + 1
    detection = np.hstack([
        np.array(detection_times),
        last * np.ones((none_detected, ), dtype=np.double)
    ])
    P = [1] * len(detection_times) + [0] * none_detected
    kmf.fit(detection, P, label=name)
    ax = kmf.plot()
    ax.set_title(name)
    ax.set_xlabel("Days")
    ax.set_ylabel("Survival")
    SaveFig("{0}_survival.pdf".format(name), md)
    plt.clf()
    plt.close()
def estimate_kaplan_meier(y, survival, duration_column='duration',
                          observed_column='observed'):
    """Estimate one Kaplan-Meier survival curve per group in ``y``.

    Parameters
    ----------
    y: pd.Series, groups (clusters, subtypes). The index is the sample names.
    survival: pd.DataFrame sharing sample names with ``y``; only samples
        present in both indexes are used. It holds the duration (survival
        time) of each patient and whether the death was observed. For
        unobserved (censored) deaths the duration is the time of the last
        follow-up.
    duration_column: name of the column in ``survival`` with the duration
    observed_column: name of the column in ``survival`` with True/False
        values for whether death was observed

    Returns
    -------
    km_estimates: pd.DataFrame, index is the timeline, columns are the
        Kaplan-Meier survival functions of the classes in ``y`` (sorted),
        with gaps filled by ``interpolate()``.
    """
    try:
        import lifelines
    except ImportError:
        raise ImportError(
            'The module ``lifelines`` was not found. It is required for '
            'this functionality. You may install it using '
            '`pip install lifelines`.')

    classes = y.unique()
    fitter = lifelines.KaplanMeierFitter()
    curves = {}
    for cl in classes:
        # Samples of this class that also have survival records.
        members = list(set(y[y == cl].index) & set(survival.index))
        records = survival.loc[members]
        fitter.fit(records[duration_column], records[observed_column],
                   label=cl)
        curves[cl] = fitter.survival_function_

    ordered = [curves[k] for k in sorted(classes)]
    return pd.concat(ordered, axis=1).interpolate()
def run_survival(data, gene_name):
    """Plot KM curves per (gender, expression group) and save the figure.

    ``data`` is grouped by ``gender`` and ``<gene_name>_expression``; one
    Kaplan-Meier curve is drawn for every combination that exists.  A
    multivariate log-rank test over the combined expression+gender label is
    run, and the figure is written to ``<CD>/<cohort>/results`` as
    ``<gene_name>_<is_significant>.png``.

    Side effects: adds a ``stat_col`` column to ``data`` and changes the
    process working directory to the results folder.
    """
    ay = plt.subplot(111)
    ay.set_title(gene_name)
    gene = gene_name + '_expression'
    genders = ['male', 'female']
    group_by = ['gender', gene]
    gene_groups = ['Underexpressed', 'Overexpressed', 'Normal_expression']

    kmf = lifelines.KaplanMeierFitter()
    grouped_data = data.groupby(group_by)
    for gene_group in gene_groups:
        for gender in genders:
            try:
                pre_tuple_list = [gender, gene_group]
                d = grouped_data.get_group(tuple(pre_tuple_list))
                kaplan_meier_time = pd.to_numeric(d['time'])
                kaplan_meier_event = d['death_status']
                # TODO: Change label to display N
                pre_tuple_list.append([len(d)])
                label = str(pre_tuple_list)
                kmf.fit(kaplan_meier_time, kaplan_meier_event, label=label)
                kmf.plot(ax=ay, show_censors=True, ci_show=False)
            except KeyError:
                # This gender/expression combination has no rows.
                pass

    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the column
    # list and taking .values yields the same (n, 1) arrays.
    event_durations = data[['time']].values
    data['stat_col'] = data[gene] + data['gender']
    group_labels = data[['stat_col']].values
    event = numpy.array(data[['death_status']].values)
    result = multivariate_logrank_test(event_durations, group_labels, event,
                                       0.85)

    cohort_dir = str(CD + '/' + cohort)
    os.chdir(cohort_dir)
    if not os.path.exists(cohort_dir + '/results'):
        os.makedirs('results')
    os.chdir(str(cohort_dir + '/results'))
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
    plt.savefig(str(gene_name + '_' + str(result.is_significant) + '.png'),
                bbox_inches='tight')
    plt.close()
def usage_plot(cohorts):
    """
    Plot survey usage of the participants in each cohort.

    For every cohort, the index labels of all non-null answers in the
    normalised columns are pooled as durations (each counted as an observed
    event), a Kaplan-Meier curve is fitted, and all curves are drawn on
    shared axes.  The figure is saved to ``test_kmf.png``.
    """
    kmf = lifelines.KaplanMeierFitter()
    norm_column = ['Anxiety', 'Mood', 'Psychosis', 'Sleep', 'Social',
                   'Medication']
    ax = None  # axes are created by the first plot call and then reused
    for cohort in cohorts:
        time_vals = []
        est_vals = []
        for _day, day_frame in cohort.groupby(cohort.index):
            for col in norm_column:
                answered = day_frame[day_frame[col].notnull()][col]
                time_vals += answered.index.tolist()
                est_vals += len(answered.index) * [1]
        kmf.fit(time_vals, est_vals)
        ax = kmf.plot() if ax is None else kmf.plot(ax=ax)
    plt.xlabel('Day')
    plt.ylabel('Percentage of surveys remaining')
    plt.savefig('test_kmf.png')
def plot_km_curve(df_tune, df_test):
    """Plot one KM curve per risk group of `df_test` and return the figure.

    Risk groups come from `discretize`, called with the risk scores of
    `df_tune` and `df_test`; the result is stored in
    `df_test['risk_group']` (the frame is modified in place).  Empty groups
    are skipped.

    Args:
      df_tune: a pd.DataFrame of tune set data.
      df_test: a pd.DataFrame of test set data.

    Returns:
      The matplotlib figure holding the curves and at-risk counts.
    """
    # Compute risk groups
    df_test['risk_group'] = discretize(df_tune[RISK_SCORE],
                                       df_test[RISK_SCORE])

    # Plot KM curves per risk group
    fig, ax = plt.subplots()
    fitted = []
    for group in ('Low Risk', 'Medium Risk', 'High Risk'):
        members = df_test.query(f"risk_group=='{group}'")
        if members.empty:
            continue
        group_fitter = lifelines.KaplanMeierFitter()
        group_fitter.fit(members[TIME], event_observed=members[OBSERVED],
                         label=group)
        group_fitter.plot(ax=ax)
        fitted.append(group_fitter)
    lifelines.plotting.add_at_risk_counts(*fitted, ax=ax)
    return fig
def event_table(player_name, style):
    """Create an event table of the batsman's innings.

    Fits a Kaplan-Meier model with ``Runs`` as the duration and ``Out`` as
    the event indicator, and returns its event table with an added ``Name``
    column holding ``player_name``.
    """
    innings = runs_df(player_name, style)
    runs, dismissed = innings.Runs, innings.Out
    fitter = lifelines.KaplanMeierFitter()
    fitter.fit(runs, dismissed)
    table = fitter.event_table
    table['Name'] = player_name
    return table
def __init__(self, sdataMatrix):
    """Fit a Kaplan-Meier curve and cache it as an (n, 2) array.

    Column 2 of ``sdataMatrix`` is used as the durations and column 1 as the
    event-observed flags.  ``self._kmf[:, 0]`` holds the timeline and
    ``self._kmf[:, 1]`` the survival estimate at each time.
    """
    fitter = ll.KaplanMeierFitter()
    survival = fitter.fit(sdataMatrix[:, 2],
                          event_observed=sdataMatrix[:, 1]).survival_function_
    self._kmf = np.zeros((np.shape(survival)[0], 2))
    self._kmf[:, 0] = np.asarray(survival.index)
    # survival_function_ is a one-column frame; take that column as a flat
    # vector instead of assigning an (n, 1)-shaped list of length-1 arrays
    # into a 1-D slice.
    self._kmf[:, 1] = np.asarray(survival.iloc[:, 0])
    self._predict_event = None
def fit(self, X, B, T):
    """Fit a Kaplan-Meier model on durations ``T`` with event flags ``B``.

    Stores the timeline in ``self.ts`` and one minus the survival estimate
    in ``self.ps``, with ``self.ps_hi`` / ``self.ps_lo`` derived the same way
    from the lower / upper 0.95 confidence bounds.  ``X`` is not used.
    """
    kmf = lifelines.KaplanMeierFitter()
    kmf.fit(T, event_observed=B)
    survival = kmf.survival_function_
    bounds = kmf.confidence_interval_
    self.ts = survival.index.values
    self.ps = 1.0 - survival['KM_estimate'].values
    # The lower survival bound becomes the upper bound after 1 - x.
    self.ps_hi = 1.0 - bounds['KM_estimate_lower_0.95'].values
    self.ps_lo = 1.0 - bounds['KM_estimate_upper_0.95'].values
def compare_unit_survival(infect0, infect1, unit, traj_cnt0, traj_cnt1,
                          when_max, md):
    """Overlay the unit survival curves of two runs and save the figure.

    The first data set is labelled "Continuous" and creates the axes; the
    second, labelled "NAADSM", is drawn on the same axes.  The result is
    saved as ``unit_survival<unit>.pdf`` through ``SaveFig``.
    """
    fitter = lifelines.KaplanMeierFitter()
    shared_axes = plot_unit_survival(fitter, None, infect0, traj_cnt0,
                                     when_max, "Continuous")
    plot_unit_survival(fitter, shared_axes, infect1, traj_cnt1, when_max,
                       "NAADSM")
    SaveFig("unit_survival{0}.pdf".format(unit), md)
    plt.clf()
    plt.close()
def categorical_km_curves(feature, t='hour', event='survive', df=data,
                          ax=None):
    """Plot one Kaplan-Meier curve per category of ``feature``.

    Parameters
    ----------
    feature : column of ``df`` whose unique values define the groups
        (plotted in descending sorted order)
    t : column holding the durations
    event : column where a value of 0 marks an observed event
    df : DataFrame to read from; defaults to the module-level ``data``
    ax : axes to draw on, passed through to ``kmf.plot``
    """
    # Read from the ``df`` argument; the body previously ignored it and
    # always used the global ``data``.
    for cat in sorted(df[feature].unique(), reverse=True):
        idx = df[feature] == cat
        kmf = lifelines.KaplanMeierFitter()
        kmf.fit(df[idx][t], event_observed=df[idx][event] == 0, label=cat)
        kmf.plot(ax=ax, label=cat, ci_show=False, c=colours[cat])
def km_median(values, censored, censorship='upper'):
    """Return the Kaplan-Meier median of ``values``.

    ``censorship='upper'`` fits with ``fit_left_censoring``; ``'lower'`` fits
    with the regular ``fit``.  ``censored`` is passed as the second
    positional argument of the fit call.  Any other ``censorship`` value
    prints ``error`` and returns ``None``.
    """
    # NOTE(review): confirm that 'upper' is meant to map to left-censoring
    # and that ``censored`` follows lifelines' event_observed convention.
    kmf = lifelines.KaplanMeierFitter()
    if censorship not in ('upper', 'lower'):
        print('error')
        return None
    fit_method = kmf.fit_left_censoring if censorship == 'upper' else kmf.fit
    fit_method(values, censored)
    return kmf.median_survival_time_
def test_kaplan_meier_against_lifelines():
    """Compare surpyval's Kaplan-Meier estimate with lifelines on random data.

    Runs 100 trials.  Each draws random Weibull parameters and a random
    sample size, gives every observation the same random integer count, and
    checks that both libraries agree at 100 random evaluation points.
    """
    kmf = lifelines.KaplanMeierFitter()
    param_bounds = ((1, 100), (0.5, 20))
    for _ in range(100):
        test_params = np.array(
            [np.random.uniform(*b) for b in param_bounds])
        # Draw a scalar: int() on a 1-element array (size=1) is deprecated
        # since NumPy 1.25.
        sample_size = int(np.random.uniform(2, 1000))
        x = surpyval.Weibull.random(sample_size, *test_params)
        n = np.ones_like(x) * int(np.random.uniform(1, 5))
        x_test = np.random.uniform(x.min()/2, x.max()*2, 100)
        ll_est = kmf.fit(x, weights=n).predict(x_test).values
        surp_est = surpyval.KaplanMeier.fit(x, n=n).sf(x_test)
        if not np.allclose(ll_est, surp_est, 1e-15):
            raise AssertionError('Kaplan-Meier different to lifelines?!')
def kaplan_plot(dataframe, group_col=None, event_col='TTE',
                observed_col='OBS', xlim=None, ax=None):
    """
    Creates a Kaplan-Meier plot, one curve per group in `group_col`

    Parameters
    ----------
    dataframe : DataFrame
        Data to use for plots
    group_col : str, optional
        Column defining the groups to plot. If None, a single curve is
        fitted on all rows
    event_col : str, optional
        Name of the time to event column
    observed_col : str, optional
        Name of the event censoring column. 1 = event observed, 0 otherwise
    xlim : int, optional
        Right limit of the x-axis (the left limit is set to 0)
    ax : axis, optional
        If adding to an existing plot, set this to the existing ax value

    Returns
    -------
    None
        The Kaplan-Meier estimated survival curve(s) are drawn on `ax`, or
        on the axes created by the first plot call when `ax` is None
    """
    kmf = lifelines.KaplanMeierFitter()
    if group_col is not None:
        add = ' by ' + group_col
        for group in dataframe[group_col].unique():
            grp = (dataframe[group_col] == group)
            kmf.fit(dataframe.loc[grp, event_col],
                    event_observed=dataframe.loc[grp, observed_col],
                    label=group)
            ax = kmf.plot() if ax is None else kmf.plot(ax=ax)
    else:
        add = ''
        kmf.fit(dataframe[event_col], event_observed=dataframe[observed_col])
        # Honour a caller-supplied axes here too; it used to be ignored in
        # the ungrouped case.
        ax = kmf.plot() if ax is None else kmf.plot(ax=ax)

    if xlim is not None:
        ax.set_xlim(left=0, right=xlim)

    title = 'Estimated Survival Curve' + add
    if ax is not None:
        # Title the axes that were drawn on, which need not be the
        # current pyplot axes.
        ax.set_title(title)
    else:
        plt.title(title)
def basic_survival(
        df,
        save_path='/Users/ian/Documents/exploratory/bridges/reports/figures/basic_survival.png'):
    """Plot and save the pooled Kaplan-Meier survival curve.

    Parameters
    ----------
    df : DataFrame with a ``duration`` column (durations) and a
        ``degraded_obs`` column (event-observed flags)
    save_path : str, optional
        File the figure is written to.  Defaults to the location that was
        previously hard-coded, so existing callers are unaffected.
    """
    T = df["duration"]
    E = df["degraded_obs"]
    kmf = ll.KaplanMeierFitter()
    model = kmf.fit(durations=T, event_observed=E)
    model.plot(figsize=(9, 8))
    plt.title(
        'Survival Function of Bridges over Time: Pooled Data Across all Bridges',
        fontsize=18)
    plt.savefig(save_path)
    plt.show()
    plt.clf()
    plt.close()
def sf_KM(self, t_point):
    # Purpose: refit a Kaplan-Meier curve on self._sortedMatrix (column 2 as
    # durations, column 1 as event-observed flags), cache it in self._kmf as
    # (time, survival) rows, and look up the survival value for t_point.
    #
    # NOTE(review): the triple quote that follows this method suggests it
    # may sit inside a triple-quoted (disabled) region whose opening is not
    # visible here -- confirm before relying on it.
    KM = ll.KaplanMeierFitter()
    kmf = KM.fit(self._sortedMatrix[:,2], event_observed=self._sortedMatrix[:,1]).survival_function_
    # Column 0: timeline; column 1: survival estimate.
    self._kmf = np.zeros((np.shape(kmf)[0] ,2))
    self._kmf[:,0] = np.asarray(list(kmf.index))
    self._kmf[:,1] = list(np.asarray(kmf))
    # Scan for the first time strictly greater than t_point and take the
    # survival value of the row before it.
    # NOTE(review): when the very first row already exceeds t_point,
    # index i_t-1 is -1 and reads the LAST row; when no row exceeds
    # t_point the loop ends with bl_sur == 1.0; with an empty table
    # bl_sur is never assigned.
    for i_t in range(len(self._kmf)):
        bl_sur = 1.0
        if self._kmf[i_t, 0] > t_point:
            bl_sur = self._kmf[i_t-1, 1]
            break
    return bl_sur
'''
def test_weighted_kaplan_meier_curves(self):
    """Check WeightedSurvival gives the same curves with and without lifelines.

    Both estimators use a stabilized IPW weight model built on a logistic
    regression.  One is constructed with ``survival_model=None``, the other
    wraps a lifelines ``KaplanMeierFitter``.
    """
    def estimate_with(survival_model):
        """Fit a WeightedSurvival with this survival model; return curves."""
        estimator = WeightedSurvival(
            weight_model=IPW(LogisticRegression(max_iter=10000, C=10),
                             use_stabilized=True),
            survival_model=survival_model)
        estimator.fit(self.X, self.a)
        return estimator.estimate_population_outcome(
            self.X, self.a, self.t, self.y)

    plain_curves = estimate_with(None)
    lifelines_curves = estimate_with(lifelines.KaplanMeierFitter())

    np.testing.assert_array_almost_equal(plain_curves, lifelines_curves,
                                         decimal=8)
def disease_comparison(times0, times1, name, md):
    """Overlay Kaplan-Meier curves of two sets of event times and save them.

    ``times0`` (labelled "Continuous") is capped at 50 before fitting;
    ``times1`` (labelled "NAADSM") is fitted as given.  Every time is
    treated as an observed event.  The figure is saved as
    ``disease_comparison<name>.pdf`` through ``SaveFig``.
    """
    logger.debug("times0 len {0} times1 len {1}".format(
        len(times0), len(times1)))
    plt.clf()
    fig = plt.figure(1, figsize=(4, 3))
    ax = fig.add_subplot(111)
    kmf = lifelines.KaplanMeierFitter()

    logger.info("Truncating times at 50.")
    # Cap into a new list so the caller's sequence is not modified.
    capped0 = [50 if value > 50 else value for value in times0]
    kmf.fit(capped0, [1] * len(capped0), label="Continuous")
    ax = kmf.plot(ax=ax)
    ax.set_title(name)

    kmf.fit(times1, [1] * len(times1), label="NAADSM")
    kmf.plot(ax=ax)
    plt.tight_layout()
    SaveFig("disease_comparison{0}.pdf".format(name), md)
def stratifiedSurvival(t, eventTime, eventIndicator=None, followupTime=None,
                       group=None):
    """Plot Kaplan-Meier curves of ``t``, stratified by ``group``.

    Parameters
    ----------
    t : pd.DataFrame holding the columns named below
    eventTime : column with the time of the event (null when no event)
    eventIndicator : optional column flagging whether the event was
        observed.  When omitted, an event counts as observed wherever
        ``eventTime`` is not null.
    followupTime : column with the follow-up time, used as the duration
        where ``eventTime`` is null.  Required when ``eventIndicator`` is
        omitted.
    group : optional column defining the strata; without it all rows form
        a single 'Population' stratum.
    """
    import matplotlib.pyplot as plt
    import lifelines as lf
    from lifelines.plotting import add_at_risk_counts
    import pandas as pd

    tm = t[eventTime].copy()
    if group is None:
        grp = pd.Series('Population', index=t.index)
    else:
        grp = t[group]

    missing = tm.isnull()
    if eventIndicator is None:
        ev = ~missing
        tm[missing] = t.loc[missing, followupTime]
    else:
        # ``ev`` used to be left undefined on this path, so passing an
        # indicator column raised NameError at the first fit.
        ev = t[eventIndicator]
        if followupTime is not None:
            tm[missing] = t.loc[missing, followupTime]

    # Kaplan-Meier curves, one per stratum.  unique() keeps first-appearance
    # order so curve order and colours are reproducible.
    fitted = []
    fig, ax = plt.subplots()
    for g in grp.unique():
        in_group = grp == g
        kmf = lf.KaplanMeierFitter()
        kmf.fit(tm[in_group], ev[in_group], label=g)
        kmf.plot(ax=ax)
        fitted.append(kmf)
    add_at_risk_counts(*fitted, ax=ax)
    plt.legend(loc='lower left')
    plt.ylim([0, 1])
    plt.xlabel('Time (years)')
    plt.ylabel('Survival')
    plt.title('Kaplan-Meier survival curve')
def make_km(tv_data, label='Untitled', endpoint=700):
    """Build a fitted Kaplan-Meier model from tumour volume measurements.

    Arguments:
        tv_data  - a pandas data frame of volume measurements with
                   individuals in columns and timepoints as rows.
                   Individuals are removed from study at the first NaN
                   timepoint
        label    - a title for this grouping
        endpoint - the volume at which the endpoint is reached
                   Default: 700

    Returns:
        a lifelines KaplanMeierFitter object, fitted on the ``Time`` and
        ``Observed`` columns produced by ``volume_to_survival``
    """
    outcome = volume_to_survival(tv_data, endpoint=endpoint)
    fitter = lifelines.KaplanMeierFitter()
    fitter.fit(outcome['Time'], event_observed=outcome['Observed'],
               label=label)
    return fitter
# Add the time-until-refactor column.  For each row, find the earliest later
# refactor of the same file; rows without one are measured up to the latest
# timestamp in the data instead.
# NOTE(review): 'observed' is only ever set to True here -- confirm it is
# initialised (e.g. to False) before this point.
fch['time_until_refactor'] = 0
for idx, row in fch.iterrows():
    later_refactors = fch[(fch['timestamp'] > row.timestamp)
                          & (fch['refactor'] == 1)
                          & (fch['filename'] == row.filename)]
    if later_refactors.shape[0] > 0:
        ts = later_refactors['timestamp'].min()
        # DataFrame.set_value() was removed in pandas 1.0; .at is the
        # scalar setter that replaces it.
        fch.at[idx, 'observed'] = True
    else:
        ts = fch['timestamp'].max()
    fch.at[idx, 'time_until_refactor'] = ts - row.timestamp

# Plot a survival curve for every file owner with more than 20 rows.
fig = plt.figure()
ax = plt.subplot(111)
for owner in set(fch['file_owner'].values):
    sample = fch[fch['file_owner'] == owner]
    if sample.shape[0] > 20:
        print('Evaluating %s' % (owner, ))
        kmf = lifelines.KaplanMeierFitter()
        kmf.fit(sample['time_until_refactor'].values,
                event_observed=sample['observed'],
                timeline=list(range(365)),
                label=owner)
        ax = kmf.survival_function_.plot(ax=ax)
plt.title('Survival function of file owners (thres=%s)' % (threshold, ))
plt.xlabel('Lifetime (days)')
plt.show()
def hazard2KMCurve(data, subtype):
    """Plot KM curves by true grade and by hazard-derived grade.

    The 33rd and 66th percentiles of ``data['Hazard']`` are passed to
    ``hazard2grade`` to assign each row a predicted grade, which is inserted
    into ``data`` as the first column ``grade_pred`` (``data`` is modified in
    place).  For each grade, a dashed curve for the recorded ``Grade`` and a
    solid curve for the predicted grade are drawn in the same colour.  The
    third tier (grade 2) is skipped when ``subtype == 'ODG'``, and the
    legend is removed unless ``subtype == 'idhwt_ATC'``.

    Note: ``model`` is not a parameter; it is resolved from an enclosing or
    module scope when the labels are built.

    Returns the matplotlib figure.
    """
    p = np.percentile(data['Hazard'], [33, 66])
    if p[0] == p[1]:
        p[0] = 2.99997
    predicted = [hazard2grade(hazard, p) for hazard in data['Hazard']]
    data.insert(0, 'grade_pred', predicted)
    kmf_pred = lifelines.KaplanMeierFitter()
    kmf_gt = lifelines.KaplanMeierFitter()

    def get_name(model):
        """Map a model identifier to its display name by substring match."""
        mode2name = {
            'pathgraphomic': 'Pathomic F.',
            'pathomic': 'Pathomic F.',
            'graphomic': 'Pathomic F.',
            'path': 'Histology CNN',
            'graph': 'Histology GCN',
            'omic': 'Genomic SNN'
        }
        for mode, display in mode2name.items():
            if mode in model:
                return display
        return 'N/A'

    fig = plt.figure(figsize=(10, 10), dpi=600)
    ax = plt.subplot()
    censor_style = {'ms': 20, 'marker': '+'}

    # (grade, colour, ground-truth label, predicted-label format, extras).
    # Only the first tier sets a marker face colour.
    tiers = [
        (0, 'g', "Grade II", "%s (Low)", {'markerfacecolor': 'black'}),
        (1, 'b', "Grade III", "%s (Mid)", {}),
    ]
    if subtype != 'ODG':
        tiers.append((2, 'r', "Grade IV", "%s (High)", {}))

    for grade, colour, gt_label, pred_format, extras in tiers:
        temp = data[data['Grade'] == grade]
        kmf_gt.fit(temp['Survival months'] / 365, temp['censored'],
                   label=gt_label)
        kmf_gt.plot(ax=ax, show_censors=True, ci_show=False, c=colour,
                    linewidth=3, ls='--', censor_styles=censor_style,
                    **extras)

        temp = data[data['grade_pred'] == grade]
        kmf_pred.fit(temp['Survival months'] / 365, temp['censored'],
                     label=pred_format % get_name(model))
        kmf_pred.plot(ax=ax, show_censors=True, ci_show=False, c=colour,
                      linewidth=4, ls='-', censor_styles=censor_style,
                      **extras)

    ax.set_xlabel('')
    ax.set_ylim(0, 1)
    ax.set_yticks(np.arange(0, 1.001, 0.5))
    ax.tick_params(axis='both', which='major', labelsize=40)
    legend_font = font_manager.FontProperties(family='Arial', style='normal',
                                              size=32)
    plt.legend(fontsize=32, prop=legend_font)
    if subtype != 'idhwt_ATC':
        ax.get_legend().remove()
    return fig
def compare_interior_kaplan(obs, var_pair, rescale_kaplan=False,
                            rescale_interior=False):
    """
    Interior vs Kaplan-Meier estimate for
    `multi_locus_analysis.finite_window.ab_window`.

    For each variable in `var_pair`, plot on its own axes the Kaplan-Meier
    cumulative density (with its confidence band), the variable's actual
    CDF, and the empirical CDF (eCDF) of the "interior" times, for data
    generated using `multi_locus_analysis.finite_window.ab_window` or
    `multi_locus_analysis.finite_window.ab_window_fast`.

    `rescale_kaplan` divides the Kaplan-Meier curve by the ratio of its last
    value to the actual CDF at the largest window size; `rescale_interior`
    divides the interior eCDF by the reciprocal of the actual CDF at its
    last x value.

    Returns the figure produced by `_get_axes`.
    """
    # One fitted model per state; "interior" waits are the uncensored ones.
    kmfs = {
        name: lifelines.KaplanMeierFitter().fit(
            state['wait_time'].values,
            event_observed=(state['wait_type'] == 'interior').values,
            label=r'Meier-Kaplan Estimator, $\pm$95% conf int')
        for name, state in obs.groupby('state')
    }
    fig, axs = _get_axes(
        var_pair, name='two-by-half column, four legend entries above')
    T = obs.window_size.max()
    for var in var_pair:
        ax = axs[var.name]
        fitter = kmfs[var.name]

        # extract KM CDF fit and its confidence intervals
        density = fitter.cumulative_density_
        tk = density.index.values
        km_cdf = density.values
        low, high = fitter.confidence_interval_cumulative_density_.values.T
        if rescale_kaplan:
            Z = km_cdf[-1] / var.cdf(T)
        else:
            Z = 1
        km_l = ax.plot(tk, km_cdf / Z, color=km_color,
                       label='Kaplan-Meier')[0]
        ax.fill_between(tk, low / Z, high / Z, color=km_color, alpha=0.4)

        # plot actual distribution
        t = np.linspace(0, T, 101)
        analytical_l = ax.plot(t, var.cdf(t), color='k',
                               label='Actual CDF')[0]

        # now compute the empirical distribution of the "interior" times
        interior, _ = _int_win_from_obs(obs, var.name)
        x, cdf = fw.ecdf(interior, pad_left_at_x=0)
        if rescale_interior:
            Z = 1 / var.cdf(x[-1])
        else:
            Z = 1
        interior_l = ax.plot(x, cdf / Z, c=var.color, ls=interior_linestyle,
                             label='"Interior" eCDF')[0]

        # prettify the plot
        ax.set_xlim([0, T])
        ax.set_ylim([0, 1])
        ax.set_xlabel('time')
        ax.set_ylabel(r'Cumulative probability')
        ax.legend(
            title=var.pretty_name,
            handles=[interior_l, km_l, analytical_l],
            # align bottom of legend 2% ax height above axis, filling full
            # axis width
            bbox_to_anchor=(0., 1.02, 1., .102),
            loc='lower left',
            ncol=1,
            mode="expand",
            borderaxespad=0.)
    return fig
def plot_km_recs_antirecs(T, E, recommendation_idx, fig=None, ax=None,
                          xlim=None, ylim=None, show_risk=False):
    """
    Plot KM curves for (anti)recommendation patients.

    Parameters
    ----------
    T: pandas DataFrame
        It needs to have column 'T'
    E: pandas DataFrame
        It needs to have column 'E'
    recommendation_idx: boolean array
        Array as given by get_recs_antirecs_index. It is True for
        recommendation patients.
    fig: figure handle (optional)
    ax: axes handle (optional)
    xlim: list (two elements, optional)
        x-axis boundaries.
    ylim: list (two elements, optional)
        y-axis boundaries. If left as None, defaults to [0, 1]
    show_risk: boolean (optional)
        Indicate if the number of patients at risk should be included below
        the axis (True) or not (False, default).

    Returns
    -------
    tuple
        The first element corresponds to the figure handle.
        The second element corresponds to the axes handle.
    """
    # Create figure (if necessary).
    if fig is None and ax is None:
        fig, ax = plt.subplots(1, 1, figsize=[12, 6])
    elif fig is None:
        fig = ax.get_figure()
    elif ax is None:
        ax = fig.gca()

    # For each cohort, select its rows, fit a KM model and plot it.
    cohorts = (
        ('recommendation', recommendation_idx),
        ('anti-recommendation', ~recommendation_idx),
    )
    kmf_list = []
    T_list = []
    C_list = []
    for label, mask in cohorts:
        T_curr = T.loc[mask, :]
        E_curr = E.loc[mask, :]

        kmf = lifelines.KaplanMeierFitter()
        kmf.fit(T_curr, E_curr, label=label.capitalize())

        ax = kmf.plot(ax=ax, linewidth=5, legend=True)
        ax.legend(loc='best', frameon=False, fontsize='small')

        kmf_list.append(kmf)
        T_list.append(T_curr)
        C_list.append(E_curr)

    # Perform statistical analysis (log-rank test).
    results = lifelines.statistics.logrank_test(T_list[0], T_list[1],
                                                C_list[0], C_list[1],
                                                alpha=0.95)
    results.print_summary(style='ascii', decimals=4)

    # Calculate p-value text position and display.
    # ``is None`` rather than ``== None``: the latter is element-wise for
    # array-like limits and cannot be used as a condition.
    if ylim is None:
        y_pos = 0.1
    else:
        y_pos = 0.1 + min(ylim) + ((max(ylim) - min(ylim))*0.1)
    if results.p_value < 0.001:
        p_value_text = "$p$ < 0.001"
    else:
        p_value_text = f"$p$ = {results.p_value:.4f}"
    ax.text(T['T'].min()*10, y_pos, p_value_text, fontsize='small')

    if xlim is not None:
        ax.set_xlim(np.array(xlim))
    if ylim is not None:
        ax.set_ylim(ylim)
    else:
        ax.set_ylim([0, 1])
    ax.set_ylabel("Survival probability", weight='bold')

    # Add risk counts.
    if show_risk:
        lifelines.plotting.add_at_risk_counts(kmf_list[0], kmf_list[1],
                                              ax=ax)
    # X-axis label is set here to be sure it is shown correctly even if
    # patients at risk will be shown.
    ax.set_xlabel("Time", weight='bold')
    return fig, ax
def _example_pareto_alpha(V_T_N):
    """Run one windowed Pareto simulation and estimate alpha several ways.

    ``V_T_N`` unpacks as ``((betas, xmin), T, N_traj)``: the Pareto shape
    parameters (one per state) with their common scale, the window size, and
    the number of replicates.

    Returns a DataFrame indexed by variable name with columns ``true``,
    ``mle-interior``, ``mle-uncensored``, ``fit-interior``,
    ``fit-corrected``, ``fit-kaplan`` and ``fit-uncensored``.  An estimate
    that fails for any reason is reported as NaN.
    """
    import multi_locus_analysis.finite_window as fw
    import multi_locus_analysis.plotting.finite_window as fplt
    # unpack parameters first
    (betas, xmin), T, N_traj = V_T_N

    var_pair = [
        fplt.Variable(scipy.stats.pareto(beta, scale=xmin),
                      name=f'Pareto({beta:0.3g})') for beta in betas
    ]
    # run one simulation
    sim = fw.ab_window([var.rvs for var in var_pair],
                       offset=-100 * np.sum([var.mean() for var in var_pair]),
                       window_size=T,
                       num_replicates=N_traj,
                       states=[var.name for var in var_pair])
    obs = fw.sim_to_obs(sim)

    # now extract alpha several different ways
    true_alpha = {var.name: var.args[0] + 1 for var in var_pair}
    mle_interior_est = {}
    mle_uncensored_baseline = {}
    fit_interior = {}
    fit_corrected = {}
    fit_kaplan = {}
    fit_uncensored_baseline = {}
    # Every estimator below is best-effort.  ``except Exception`` keeps that
    # (including failures caused by an earlier step not having produced its
    # inputs) without swallowing KeyboardInterrupt/SystemExit as the former
    # bare ``except:`` did.
    # NOTE(review): if an early step fails for a later variable, values left
    # over from the previous variable may be reused -- confirm intended.
    for var in var_pair:
        # mle, interior
        try:
            interior, windows = fplt._int_win_from_obs(obs, var.name)
            num_obs = len(interior)
            mle_interior_est[var.name] = _mla_stats.power_law_slope_mle(
                interior, xmin, num_obs)
        except Exception:
            mle_interior_est[var.name] = np.nan
        # fit, interior
        try:
            x_int, cdf_int = fw.ecdf_windowed(interior, windows)
            fit_interior[var.name] = _alpha_from_cdf(x_int, cdf_int, xmin)
        except Exception:
            fit_interior[var.name] = np.nan
        # fit, corrected
        try:
            exterior = fplt._ext_from_obs(obs, var.name)
            bin_centers, final_cdf = fw.ecdf_combined(exterior, interior, T)
            fit_corrected[var.name] = _alpha_from_cdf(bin_centers, final_cdf,
                                                      xmin)
        except Exception:
            fit_corrected[var.name] = np.nan
        # fit, kaplan
        try:
            times = np.concatenate([interior, exterior])
            is_interior = np.concatenate(
                [np.ones_like(interior),
                 np.zeros_like(exterior)]).astype(bool)
            kmf = lifelines.KaplanMeierFitter() \
                .fit(times, event_observed=is_interior)
            x_kap = kmf.cumulative_density_.index.values
            cdf_kap = kmf.cumulative_density_.values.flatten()
            fit_kaplan[var.name] = _alpha_from_cdf(x_kap, cdf_kap, xmin)
        except Exception:
            fit_kaplan[var.name] = np.nan
        # mle, uncensored baseline
        try:
            uncensored_obs = var.rvs(size=(num_obs, ))
            mle_uncensored_baseline[var.name] = \
                _mla_stats.power_law_slope_mle(uncensored_obs, xmin, num_obs)
        except Exception:
            mle_uncensored_baseline[var.name] = np.nan
        # fit, uncensored baseline
        try:
            x_unc, cdf_unc = _mla_stats.ecdf(uncensored_obs, pad_left_at_x=0)
            fit_uncensored_baseline[var.name] = \
                _alpha_from_cdf(x_unc, cdf_unc, xmin)
        except Exception:
            fit_uncensored_baseline[var.name] = np.nan

    df = pd.concat(map(pd.Series, [
        true_alpha, mle_interior_est, mle_uncensored_baseline, fit_interior,
        fit_corrected, fit_kaplan, fit_uncensored_baseline
    ]), axis=1)
    df.columns = [
        'true', 'mle-interior', 'mle-uncensored', 'fit-interior',
        'fit-corrected', 'fit-kaplan', 'fit-uncensored'
    ]
    return df
def _example_lambda_fit(V_T_N):
    """Run one windowed exponential simulation and estimate the mean wait.

    ``V_T_N`` unpacks as ``(lambdas, T, N_traj)``: the exponential scales
    (one per state), the window size, and the number of replicates.

    Returns a DataFrame indexed by variable name with columns ``true``,
    ``corrected``, ``naive``, ``count-based``, ``kaplan`` and
    ``uncensored``.  A CDF-based estimate that fails for any reason is
    reported as NaN.
    """
    import multi_locus_analysis.finite_window as fw
    import multi_locus_analysis.plotting.finite_window as fplt
    lambdas, T, N_traj = V_T_N

    var_pair = [
        fplt.Variable(expon(scale=lam), name=f"Exp({lam})")
        for lam in lambdas
    ]
    sim = fw.ab_window([var.rvs for var in var_pair],
                       offset=-100 * np.sum([var.mean() for var in var_pair]),
                       window_size=T,
                       num_replicates=N_traj,
                       states=[var.name for var in var_pair])
    obs = fw.sim_to_obs(sim)

    mean_est = fw.average_lifetime(obs)
    true_mean = {var.name: var.mean() for var in var_pair}
    naive_slope_est = {}
    correct_slope_est = {}
    kaplan_slope_est = {}
    uncensored_baseline = {}
    # Each estimator is best-effort.  ``except Exception`` keeps the NaN
    # fallback without swallowing KeyboardInterrupt/SystemExit as the
    # former bare ``except:`` did.
    for var in var_pair:
        # naive
        interior, windows = fplt._int_win_from_obs(obs, var.name)
        try:
            x_int, cdf_int = fw.ecdf_windowed(interior, windows)
            naive_slope_est[var.name] = _mean_from_exp_cdf(x_int, cdf_int)
        except Exception:
            naive_slope_est[var.name] = np.nan
        # corrected
        exterior = fplt._ext_from_obs(obs, var.name)
        try:
            bin_centers, final_cdf = fw.ecdf_combined(exterior, interior, T)
            correct_slope_est[var.name] = _mean_from_exp_cdf(
                bin_centers, final_cdf)
        except Exception:
            correct_slope_est[var.name] = np.nan
        # kaplan
        times = np.concatenate([interior, exterior])
        is_interior = np.concatenate(
            [np.ones_like(interior),
             np.zeros_like(exterior)]).astype(bool)
        try:
            kmf = lifelines.KaplanMeierFitter() \
                .fit(times, event_observed=is_interior)
            x_kap = kmf.cumulative_density_.index.values
            cdf_kap = kmf.cumulative_density_.values.flatten()
            kaplan_slope_est[var.name] = _mean_from_exp_cdf(x_kap, cdf_kap)
        except Exception:
            kaplan_slope_est[var.name] = np.nan
        # uncensored baseline
        num_obs = len(interior)
        try:
            x_unc, cdf_unc = _mla_stats.ecdf(var.rvs(size=(num_obs, )),
                                             pad_left_at_x=0)
            uncensored_baseline[var.name] = _mean_from_exp_cdf(x_unc, cdf_unc)
        except Exception:
            uncensored_baseline[var.name] = np.nan

    df = pd.concat(map(pd.Series, [
        true_mean, correct_slope_est, naive_slope_est, mean_est,
        kaplan_slope_est, uncensored_baseline
    ]), axis=1)
    df.columns = [
        'true', 'corrected', 'naive', 'count-based', 'kaplan', 'uncensored'
    ]
    return df