def _ll_null(self):
    """
    Return the log-likelihood of the intercept-only ("null") model.

    A new instance of this class (built with all but the last of
    ``self.breakpoints`` and the same penalizer) is fitted to the same
    observations, with every fitted parameter regressed on a constant
    ``"_intercept"`` column only. The result is cached on ``self._ll_null_``
    so the refit happens at most once.

    Returns
    -------
    the ``log_likelihood_`` of the refitted null model.

    Raises
    ------
    NotImplementedError
        if the model was fitted with left-censored data.
    """
    if hasattr(self, "_ll_null_"):
        return self._ll_null_

    parameter_names = self._fitted_parameter_names
    start = np.zeros(len(parameter_names))
    null_model = self.__class__(breakpoints=self.breakpoints[:-1], penalizer=self.penalizer)

    # Options shared by both supported fitting routines.
    fit_options = {
        "initial_point": start,
        "entry_col": "entry",
        "regressors": {name: ["_intercept"] for name in parameter_names},
    }

    if CensoringType.is_right_censoring(self):
        frame = pd.DataFrame(
            {
                "T": self.durations,
                "E": self.event_observed,
                "entry": self.entry,
                "_intercept": 1.0,
            }
        )
        null_model.fit_right_censoring(frame, "T", "E", **fit_options)
    elif CensoringType.is_interval_censoring(self):
        frame = pd.DataFrame(
            {
                "lb": self.lower_bound,
                "ub": self.upper_bound,
                "E": self.event_observed,
                "entry": self.entry,
                "_intercept": 1.0,
            }
        )
        null_model.fit_interval_censoring(frame, "lb", "ub", "E", **fit_options)

    # Checked after the dispatch above, exactly as before: no null model is
    # available for left-censored data.
    if CensoringType.is_left_censoring(self):
        raise NotImplementedError()

    self._ll_null_ = null_model.log_likelihood_
    return self._ll_null_
def cdf_plot(model, timeline=None, **plot_kwargs):
    """
    Draw the Kaplan-Meier cumulative density next to the fitted model's CDF.

    Parameters
    ------------
    model:
        a fitted lifelines univariate parametric model; its ``durations``,
        ``event_observed`` and (by default) ``timeline`` are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    plot_kwargs:
        forwarded to both plotting calls. The axes is taken from
        ``plot_kwargs["ax"]`` after ``set_kwargs_ax`` has processed the dict
        (presumably that helper fills in a default axes -- confirm).

    Returns
    --------
    ax:
        the axes that was drawn on.

    Raises
    ------
    NotImplementedError
        for interval-censored models.
    """
    from lifelines import KaplanMeierFitter

    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    if timeline is None:
        timeline = model.timeline

    empirical_label = "empirical quantiles"

    # Choose the fitting routine that matches how the model was censored.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError()

    empirical = fit_empirical(
        model.durations,
        model.event_observed,
        label=empirical_label,
        timeline=timeline,
    )
    empirical.plot_cumulative_density(ax=ax, **plot_kwargs)

    distribution_name = get_distribution_name_of_lifelines_model(model)
    fitted_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = fitted_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Compare the empirical CDF (Kaplan-Meier) with the fitted model's CDF.

    Parameters
    ------------
    model:
        a fitted lifelines univariate parametric model; its ``durations``,
        ``event_observed`` and (by default) ``timeline`` are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        forwarded to both plotting calls.

    Returns
    --------
    ax:
        the axes that was drawn on.

    Raises
    ------
    NotImplementedError
        for interval-censored models.
    """
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()
    if timeline is None:
        timeline = model.timeline

    empirical_label = "empirical CDF"

    # Choose the fitting routine that matches how the model was censored.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    empirical = fit_empirical(
        model.durations,
        model.event_observed,
        label=empirical_label,
        timeline=timeline,
    )
    empirical.plot_cumulative_density(ax=ax, **plot_kwargs)

    distribution_name = get_distribution_name_of_lifelines_model(model)
    fitted_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = fitted_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
def plot_survival_function(self, **kwargs):
    """
    Alias of ``plot``: draw the estimated survival function.

    Parameters
    -----------
    kwargs:
        plotting options. For models that are not interval-censored they are
        handed unchanged to ``_plot_estimate``. For interval-censored models
        they are handed to ``self.survival_function_.plot``; ``c`` / ``color``
        select the line color (``"k"`` when neither is given).

    Returns
    -------
    ax:
        whatever the underlying plotting call returns.
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="survival_function_", **kwargs)

    # hack for now.
    # Pop (rather than get) the color options: leaving "color" in kwargs while
    # also passing color= explicitly raises "got multiple values for keyword
    # argument 'color'".
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    # Return the plot result so both branches hand something back to the caller.
    return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting point: ``[median duration, 1.0]``.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used for right censoring, ``Ts[1]`` for left
        censoring and ``Ts[1] - Ts[0]`` for interval censoring.
    E:
        unused here.

    Returns
    -------
    np.ndarray with two entries.
    """
    if CensoringType.is_right_censoring(self):
        durations = Ts[0]
    elif CensoringType.is_left_censoring(self):
        durations = Ts[1]
    elif CensoringType.is_interval_censoring(self):
        # interval width stands in for the duration
        durations = Ts[1] - Ts[0]

    center = np.median(durations)
    return np.array([center, 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting point from the log of the observed times.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used for right censoring, ``Ts[1]`` for left
        censoring and the width ``Ts[1] - Ts[0]`` for interval censoring.
    E:
        unused here.

    Returns
    -------
    np.ndarray
        ``[mean of the log data, log of its (fudged) std, 0.1]``.
    """
    # Small offset that keeps both logarithms below finite in degenerate cases.
    fudge = 0.01

    if CensoringType.is_right_censoring(self):
        log_data = log(Ts[0])
    elif CensoringType.is_left_censoring(self):
        log_data = log(Ts[1])
    elif CensoringType.is_interval_censoring(self):
        # A zero-width interval (Ts[1] == Ts[0]) would give log(0) = -inf and
        # poison the mean/std, so nudge the widths away from zero.
        log_data = log(Ts[1] - Ts[0] + fudge)

    # Likewise, a zero spread would make log(std) = -inf.
    return np.array([log_data.mean(), log(log_data.std() + fudge), 0.1])
def _get_initial_values(self, Ts, E, *args):
    """
    Build the optimizer's starting values from the log of the observed times.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used for right censoring, ``Ts[1]`` for left
        censoring and the width ``Ts[1] - Ts[0]`` for interval censoring.
    E:
        unused here.

    Returns
    -------
    np.ndarray
        ``[mean of the log data, log of its (fudged) std, 1.0]``.
    """
    # Small offset that keeps both logarithms below finite in degenerate cases.
    fudge = 0.01

    if CensoringType.is_right_censoring(self):
        log_data = np.log(Ts[0])
    elif CensoringType.is_left_censoring(self):
        log_data = np.log(Ts[1])
    elif CensoringType.is_interval_censoring(self):
        # A zero-width interval (Ts[1] == Ts[0]) would give log(0) = -inf and
        # poison the mean/std, so nudge the widths away from zero.
        log_data = np.log(Ts[1] - Ts[0] + fudge)

    # Likewise, a zero spread would make log(std) = -inf.
    return np.array([log_data.mean(), np.log(log_data.std() + fudge), 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting point: ``[median of log-times, 1.0]``.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used for right censoring, ``Ts[1]`` for both
        left and interval censoring.
    E:
        unused here.

    Returns
    -------
    np.ndarray with two entries.
    """
    if CensoringType.is_right_censoring(self):
        times = Ts[0]
    elif CensoringType.is_left_censoring(self) or CensoringType.is_interval_censoring(self):
        # left and interval censoring both start from the second element
        times = Ts[1]

    log_times = np.log(times)
    return np.array([np.median(log_times), 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting point from the log of the observed times.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used for right censoring, ``Ts[1]`` for left
        censoring and the width ``Ts[1] - Ts[0]`` (plus a small offset) for
        interval censoring.
    E:
        unused here.

    Returns
    -------
    np.ndarray
        ``[mean of the log data, log of (its std + offset), 0.1]``.
    """
    fudge = 0.01

    if CensoringType.is_right_censoring(self):
        log_data = log(Ts[0])
    elif CensoringType.is_left_censoring(self):
        log_data = log(Ts[1])
    elif CensoringType.is_interval_censoring(self):
        # Zero-width intervals (Ts[1] == Ts[0]) would break the log, hence the
        # offset.
        widths = Ts[1] - Ts[0] + fudge
        log_data = log(widths)

    location = log_data.mean()
    log_spread = log(log_data.std() + fudge)
    return np.array([location, log_spread, 0.1])
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Plot the empirical CDF (derived by Kaplan-Meier) against the model CDF.

    Parameters
    ------------
    model: lifelines univariate model
        a fitted parametric model; its observations, ``weights``, ``entry`` and
        (by default) ``timeline`` are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        axes to draw on. Defaults to the current axes.
    plot_kwargs:
        forwarded to both plotting calls.

    Returns
    --------
    ax:
        the axes that was drawn on.
    """
    from lifelines import KaplanMeierFitter
    from matplotlib import pyplot as plt

    if ax is None:
        ax = plt.gca()
    if timeline is None:
        timeline = model.timeline

    empirical_label = "empirical CDF"

    # Choose the fitting routine and the two observation arrays that match how
    # the model was censored.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
        observations = (model.durations, model.event_observed)
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
        observations = (model.durations, model.event_observed)
    elif CensoringType.is_interval_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_interval_censoring
        observations = (model.lower_bound, model.upper_bound)

    empirical = fit_empirical(
        *observations,
        label=empirical_label,
        timeline=timeline,
        weights=model.weights,
        entry=model.entry,
    )
    empirical.plot_cumulative_density(ax=ax, **plot_kwargs)

    distribution_name = get_distribution_name_of_lifelines_model(model)
    fitted_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = fitted_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
def plot_cumulative_density(self, **kwargs):
    """
    Plots a pretty figure of the cumulative density function.

    Matplotlib plot arguments can be passed in inside the kwargs. For
    interval-censored models the kwargs are handed to
    ``self.cumulative_density_.plot`` instead of ``_plot_estimate``; ``c`` /
    ``color`` select the line color (``"k"`` when neither is given).

    Parameters
    -----------
    show_censors: bool
        place markers at censorship events. Default: False
    censor_styles: dict
        If show_censors, this dictionary will be passed into the plot call.
    ci_alpha: float
        the transparency level of the confidence interval. Default: 0.3
    ci_force_lines: bool
        force the confidence intervals to be line plots (versus default shaded areas). Default: False
    ci_show: bool
        show confidence intervals. Default: True
    ci_legend: bool
        if ci_force_lines is True, this is a boolean flag to add the lines' labels to the legend. Default: False
    at_risk_counts: bool
        show group sizes at time points. See function ``add_at_risk_counts`` for details. Default: False
    loc: slice
        specify a time-based subsection of the curves to plot, ex:

        >>> model.plot(loc=slice(0.,10.))

        will plot the time values between t=0. and t=10.
    iloc: slice
        specify a location-based subsection of the curves to plot, ex:

        >>> model.plot(iloc=slice(0,10))

        will plot the first 10 time points.

    Returns
    -------
    ax:
        a pyplot axis object
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="cumulative_density_", **kwargs)

    # hack for now.
    # Pop (rather than get) the color options: leaving "color" in kwargs while
    # also passing color= explicitly raises "got multiple values for keyword
    # argument 'color'".
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    # Return the plot result so this branch honors the documented return value.
    return self.cumulative_density_.plot(drawstyle="steps", color=color, **kwargs)
def plot_survival_function(self, **kwargs):
    """
    Alias of ``plot``: draw the estimated survival function.

    Parameters
    -----------
    kwargs:
        plotting options. For models that are not interval-censored they are
        handed unchanged to ``_plot_estimate``. For interval-censored models
        they are handed to ``self.survival_function_.plot``; ``c`` / ``color``
        are removed from them and select the line color (``"k"`` when neither
        is given).

    Returns
    -------
    ax:
        whatever the underlying plotting call returns.
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="survival_function_", **kwargs)

    # hack for now.
    # dict.pop with a default replaces the former hand-written helper (which
    # also shadowed the builtin ``dict``). The options must be removed from
    # kwargs so color= is not supplied twice below.
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    # Return the plot result so both branches hand something back to the caller.
    return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting point: ``[median duration, 1.0]``.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used for right censoring and ``Ts[1]`` for
        left and interval censoring.
    E:
        event indicators; only consulted for interval censoring, where
        ``E.sum() > 0`` decides whether ``Ts[1]`` is used at all.

    Returns
    -------
    np.ndarray with two entries.
    """
    if CensoringType.is_right_censoring(self):
        T = Ts[0]
    elif CensoringType.is_left_censoring(self):
        # np.clip takes (array, min, max). The previous argument order
        # (0.0001, np.inf, Ts[1]) clipped the scalar 0.0001 instead of the data
        # and so never enforced the lower bound.
        T = np.clip(Ts[1], 0.0001, np.inf)
    elif CensoringType.is_interval_censoring(self):
        if E.sum() > 0:
            # Ts[1] can contain infs, so ignore this data
            finite_upper = Ts[1] < 1e10
            T = Ts[1][finite_upper]
        else:
            T = np.array([1.0])

    return np.array([np.median(T), 1.0])
def _ll_null(self):
    """
    Return the log-likelihood of the intercept-only ("null") model.

    A default-constructed instance of this class is fitted to the same
    observations, with every fitted parameter regressed on a constant
    ``"intercept"`` column only. Warnings raised while fitting are suppressed.
    The result is cached on ``self._ll_null_`` so the refit happens at most
    once.

    Returns
    -------
    the ``_log_likelihood`` of the refitted null model.

    Raises
    ------
    NotImplementedError
        if the model was fitted with left-censored data.
    """
    if hasattr(self, "_ll_null_"):
        return self._ll_null_

    parameter_names = self._fitted_parameter_names
    start = np.zeros(len(parameter_names))
    intercept_only = {name: ["intercept"] for name in parameter_names}
    null_model = self.__class__()

    # Options shared by both supported fitting routines.
    fit_options = {
        "initial_point": start,
        "entry_col": "entry",
        "regressors": intercept_only,
    }

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if CensoringType.is_right_censoring(self):
            frame = pd.DataFrame(
                {
                    "T": self.durations,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                }
            )
            null_model.fit_right_censoring(frame, "T", "E", **fit_options)
        elif CensoringType.is_interval_censoring(self):
            frame = pd.DataFrame(
                {
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                }
            )
            null_model.fit_interval_censoring(frame, "lb", "ub", "E", **fit_options)

        # Checked after the dispatch above, exactly as before: no null model
        # is available for left-censored data.
        if CensoringType.is_left_censoring(self):
            raise NotImplementedError()

    self._ll_null_ = null_model._log_likelihood
    return self._ll_null_
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Plot the empirical CDF (derived by Kaplan-Meier) against the model CDF.

    Parameters
    ------------
    model: lifelines univariate model
        a fitted parametric model; its observations, ``weights``, ``entry`` and
        (by default) ``timeline`` are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        forwarded to both plotting calls.

    Returns
    --------
    ax:
        the axes that was drawn on.
    """
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    if timeline is None:
        timeline = model.timeline

    CDL_EMP = "empirical CDF"

    # The censoring types are alternatives, so dispatch with elif rather than
    # testing each one independently.
    if CensoringType.is_left_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    elif CensoringType.is_right_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    elif CensoringType.is_interval_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )

    # Previously the fitted estimator was discarded and nothing was drawn or
    # returned. Draw both curves and hand back the axes.
    emp_kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, dist_object.cdf(timeline), label="fitted %s" % dist, **plot_kwargs)
    ax.legend()
    return ax
def survival_probability_calibration(model: RegressionFitter, training_df: pd.DataFrame, t0: float, ax=None):
    r"""
    Smoothed calibration curve for a time-to-event regression model.

    This is the survival-analysis analogue of a classifier's calibration curve,
    extended to handle survival probabilities and censoring. The model's
    predicted :math:`P(T < \text{t0})` is compared against observed frequencies,
    which are smoothed by refitting a flexible spline model on the
    complementary log-log of the predictions. Produces a matplotlib figure and
    prints two summary metrics.

    Parameters
    -------------
    model:
        a fitted lifelines regression model to be evaluated
    training_df: DataFrame
        the DataFrame used to train the model
    t0: float
        the time at which the probability of the event having occurred is
        evaluated. Column names and labels format it with ``%d``.
    ax: matplotlib axis, optional
        axes to draw on. Defaults to ``plt.gca()``.

    Returns
    ----------
    ax:
        mpl axes
    ICI:
        mean absolute difference between predicted and observed
    E50:
        median absolute difference between predicted and observed

    https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8570
    """

    def ccl(prob):
        # complementary log-log transform
        return np.log(-np.log(1 - prob))

    if ax is None:
        ax = plt.gca()

    T = model.duration_col
    E = model.event_col
    ccl_column = "ccl_at_%d" % t0

    # Predicted probability of the event by t0, kept strictly inside (0, 1) so
    # the ccl transform stays finite.
    survival_at_t0 = model.predict_survival_function(training_df, times=[t0])
    predictions_at_t0 = np.clip(1 - survival_at_t0.T.squeeze(), 1e-10, 1 - 1e-10)

    # create new dataset with the predictions
    prediction_df = pd.DataFrame(
        {
            ccl_column: ccl(predictions_at_t0),
            "constant": 1,
            T: model.durations,
            E: model.event_observed,
        }
    )

    # Fit the new dataset to a flexible spline model. This model connects the
    # predicted probabilities and actual survival; it should be very flexible,
    # almost to the point of overfitting, since its only job is to smooth.
    knots = 3
    regressors = {
        "beta_": [ccl_column],
        "gamma0_": ["constant"],
        "gamma1_": ["constant"],
        "gamma2_": ["constant"],
    }

    # this model is from examples/royson_crowther_clements_splines.py
    crc = CRCSplineFitter(knots, penalizer=0)
    if CensoringType.is_right_censoring(model):
        crc.fit_right_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_left_censoring(model):
        crc.fit_left_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_interval_censoring(model):
        crc.fit_interval_censoring(prediction_df, T, E, regressors=regressors)

    # Evaluate the smoother on a grid of probabilities (ccl-transformed first).
    grid_low = np.clip(predictions_at_t0.min() - 0.01, 0, 1)
    grid_high = np.clip(predictions_at_t0.max() + 0.01, 0, 1)
    x = np.linspace(grid_low, grid_high, 100)
    grid_df = pd.DataFrame({ccl_column: ccl(x), "constant": 1})
    y = 1 - crc.predict_survival_function(grid_df, times=[t0]).T.squeeze()

    # plot our results
    ax.set_title("Smoothed calibration curve of \npredicted vs observed probabilities of t ≤ %d mortality" % t0)

    color = "tab:red"
    ax.plot(x, y, label="smoothed calibration curve", color=color)
    ax.set_xlabel("Predicted probability of \nt ≤ %d mortality" % t0)
    ax.set_ylabel("Observed probability of \nt ≤ %d mortality" % t0, color=color)
    ax.tick_params(axis="y", labelcolor=color)

    # plot x=y line
    ax.plot(x, x, c="k", ls="--")
    ax.legend()

    # plot histogram of our original predictions on a second y-axis
    color = "tab:blue"
    twin_ax = ax.twinx()
    # the x-label was already handled on the primary axes
    twin_ax.set_ylabel("Count of \npredicted probabilities", color=color)
    twin_ax.tick_params(axis="y", labelcolor=color)
    twin_ax.hist(predictions_at_t0, alpha=0.3, bins="sqrt", color=color)

    plt.tight_layout()

    # Absolute gap between the smoothed ("observed") and predicted probabilities.
    smoothed_at_t0 = (1 - crc.predict_survival_function(prediction_df, times=[t0])).T.squeeze()
    deltas = (smoothed_at_t0 - predictions_at_t0).abs()
    ICI = deltas.mean()
    E50 = np.percentile(deltas, 50)
    print("ICI = ", ICI)
    print("E50 = ", E50)

    return ax, ICI, E50
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axis, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        kwargs for the scatter plot. They override the default marker styling
        (``c="none"``, ``edgecolor="k"``, ``lw=0.5``).

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    ------
    NotImplementedError
        for interval-censored models.

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    q = np.unique(kmf.cumulative_density_.values[:, 0])
    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    quantiles = qth_survival_times(1 - q, kmf.survival_function_)
    quantiles[COL_THEO] = dist_object.ppf(q)
    # drop degenerate quantiles (zero or infinite) before plotting
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    # plot_kwargs used to be accepted but silently ignored. Forward them now,
    # letting them override the default styling. A default is skipped when the
    # caller supplies the same option under an alias, so both spellings are
    # never handed to the plotting backend together.
    style_defaults = {
        "c": ("none", ("color",)),
        "edgecolor": ("k", ("edgecolors", "ec")),
        "lw": (0.5, ("linewidth", "linewidths")),
    }
    scatter_kwargs = dict(plot_kwargs)
    for key, (value, aliases) in style_defaults.items():
        if key not in scatter_kwargs and not any(alias in scatter_kwargs for alias in aliases):
            scatter_kwargs[key] = value

    quantiles.plot.scatter(COL_THEO, COL_EMP, ax=ax, **scatter_kwargs)
    # reference line y = x
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axis, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        kwargs for the scatter plot. They override the default marker styling
        (``c="none"``, ``edgecolor="k"``, ``lw=0.5``).

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    The interval censoring case uses the mean between the upper and lower bounds.
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_[COL_EMP], kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_[COL_EMP], kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_interval_censoring(model):
        kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound, model.upper_bound, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        # A row-wise mean yields an unnamed Series. Name it explicitly so that,
        # like the other branches, the survival curve carries the COL_EMP
        # label that the lookups below rely on.
        sf = kmf.survival_function_.mean(1).rename(COL_EMP)
        cdf = kmf.cumulative_density_[COL_EMP + "_lower"]

    q = np.unique(cdf.values)

    quantiles = qth_survival_times(1 - q, sf)
    quantiles[COL_THEO] = dist_object.ppf(q)
    # drop degenerate quantiles (zero or infinite) before plotting
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    # plot_kwargs used to be accepted but silently ignored. Forward them now,
    # letting them override the default styling. A default is skipped when the
    # caller supplies the same option under an alias, so both spellings are
    # never handed to the plotting backend together.
    style_defaults = {
        "c": ("none", ("color",)),
        "edgecolor": ("k", ("edgecolors", "ec")),
        "lw": (0.5, ("linewidth", "linewidths")),
    }
    scatter_kwargs = dict(plot_kwargs)
    for key, (value, aliases) in style_defaults.items():
        if key not in scatter_kwargs and not any(alias in scatter_kwargs for alias in aliases):
            scatter_kwargs[key] = value

    quantiles.plot.scatter(COL_THEO, COL_EMP, ax=ax, **scatter_kwargs)
    # reference line y = x
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax