示例#1
0
def cdf_plot(model, timeline=None, **plot_kwargs):
    """
    Plot the empirical CDF (from a Kaplan-Meier fit) together with the CDF of
    a fitted parametric model on one axes.

    Parameters
    ------------
    model: lifelines univariate model
        Fitted parametric model. Its ``durations`` and ``event_observed`` are
        refit with ``KaplanMeierFitter`` to obtain the empirical curve.
    timeline: iterable, optional
        Times at which both curves are evaluated. Defaults to ``model.timeline``.
    plot_kwargs:
        Passed to both plotting calls. The target axes is taken from the
        ``"ax"`` entry after ``set_kwargs_ax`` has processed the kwargs.

    Returns
    --------
    ax:
        The axes that was drawn on.

    Raises
    -------
    NotImplementedError
        If the model was fitted with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    # NOTE(review): presumably set_kwargs_ax guarantees an "ax" key -- the pop
    # below has no default and would raise KeyError otherwise.
    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    if timeline is None:
        timeline = model.timeline

    # Legend label of the empirical curve. This is a CDF plot, so the label
    # says "CDF" (it previously read "empirical quantiles").
    COL_EMP = "empirical CDF"

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(model.durations,
                                                     model.event_observed,
                                                     label=COL_EMP,
                                                     timeline=timeline)
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(model.durations,
                                                      model.event_observed,
                                                      label=COL_EMP,
                                                      timeline=timeline)
    elif CensoringType.is_interval_censoring(model):
        # Same exception type as before; the message tells the caller why.
        raise NotImplementedError(
            "lifelines does not have a non-parametric interval model yet.")

    # Empirical curve first, then the parametric CDF on the same axes.
    kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline,
            dist_object.cdf(timeline),
            label="fitted %s" % dist,
            **plot_kwargs)
    ax.legend()
    return ax
    def _ll_null(self):
        if hasattr(self, "_ll_null_"):
            return self._ll_null_

        initial_point = np.zeros(len(self._fitted_parameter_names))

        model = self.__class__(breakpoints=self.breakpoints[:-1], penalizer=self.penalizer)
        regressors = {param_name: ["_intercept"] for param_name in self._fitted_parameter_names}
        if CensoringType.is_right_censoring(self):
            df = pd.DataFrame({"T": self.durations, "E": self.event_observed, "entry": self.entry, "_intercept": 1.0})
            model.fit_right_censoring(
                df, "T", "E", initial_point=initial_point, entry_col="entry", regressors=regressors
            )
        elif CensoringType.is_interval_censoring(self):
            df = pd.DataFrame(
                {
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "_intercept": 1.0,
                }
            )
            model.fit_interval_censoring(
                df, "lb", "ub", "E", initial_point=initial_point, entry_col="entry", regressors=regressors
            )
        if CensoringType.is_left_censoring(self):
            raise NotImplementedError()

        self._ll_null_ = model.log_likelihood_
        return self._ll_null_
示例#3
0
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Overlay the Kaplan-Meier based empirical CDF and the CDF of a fitted
    parametric model.

    Parameters
    ------------
    model: lifelines univariate model
        Fitted model; ``durations`` and ``event_observed`` are read from it.
    timeline: iterable, optional
        Evaluation times for both curves. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        Axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        Passed to both plotting calls.

    Returns
    --------
    ax:
        The axes that was drawn on.

    Raises
    -------
    NotImplementedError
        If the model was fitted with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    # Options shared by the left- and right-censoring fits.
    km_options = {"label": "empirical CDF", "timeline": timeline}

    if CensoringType.is_left_censoring(model):
        km_estimate = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, **km_options
        )
    elif CensoringType.is_right_censoring(model):
        km_estimate = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, **km_options
        )
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    # Empirical curve first, parametric curve second.
    km_estimate.plot_cumulative_density(ax=ax, **plot_kwargs)

    fitted_name = get_distribution_name_of_lifelines_model(model)
    scipy_dist = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = scipy_dist.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % fitted_name, **plot_kwargs)

    ax.legend()
    return ax
示例#4
0
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the starting parameter vector ``[median, 1.0]``.

     The median is taken over ``Ts[0]`` (right censoring), ``Ts[1]`` (left
     censoring) or ``Ts[1] - Ts[0]`` (interval censoring). ``E`` and any
     extra positional arguments are not used.
     """
     if CensoringType.is_right_censoring(self):
         observed = Ts[0]
     elif CensoringType.is_left_censoring(self):
         observed = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         observed = Ts[1] - Ts[0]

     typical_duration = np.median(observed)
     return np.array([typical_duration, 1.0])
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the starting parameter vector from log-transformed times.

     The times are ``Ts[0]`` (right censoring), ``Ts[1]`` (left censoring)
     or ``Ts[1] - Ts[0]`` (interval censoring). Returns
     ``[mean(log t), log(std(log t)), 0.1]``. ``E`` and any extra positional
     arguments are not used.
     """
     if CensoringType.is_right_censoring(self):
         times = Ts[0]
     elif CensoringType.is_left_censoring(self):
         times = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         times = Ts[1] - Ts[0]

     log_times = log(times)
     location = log_times.mean()
     log_spread = log(log_times.std())
     return np.array([location, log_spread, 0.1])
示例#6
0
 def _get_initial_values(self, Ts, E, *args):
     """
     Build the starting parameter vector from log-transformed times.

     The times are ``Ts[0]`` (right censoring), ``Ts[1]`` (left censoring)
     or ``Ts[1] - Ts[0]`` (interval censoring). Returns
     ``[mean(log t), log(std(log t)), 1.0]``. ``E`` and any extra positional
     arguments are not used.
     """
     if CensoringType.is_right_censoring(self):
         times = Ts[0]
     elif CensoringType.is_left_censoring(self):
         times = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         times = Ts[1] - Ts[0]

     log_times = np.log(times)
     location = log_times.mean()
     log_spread = np.log(log_times.std())
     return np.array([location, log_spread, 1.0])
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the starting parameter vector ``[median(log t), 1.0]``.

     ``t`` is ``Ts[0]`` under right censoring and ``Ts[1]`` under left or
     interval censoring. ``E`` and any extra positional arguments are not
     used.
     """
     if CensoringType.is_right_censoring(self):
         times = Ts[0]
     elif CensoringType.is_left_censoring(self) or CensoringType.is_interval_censoring(self):
         # Both of these censoring types start from the second element of Ts.
         times = Ts[1]

     center = np.median(np.log(times))
     return np.array([center, 1.0])
示例#8
0
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Compare the empirical CDF (derived by Kaplan-Meier) with the CDF of the
    fitted model by drawing both on the same axes.

    Parameters
    ------------
    model: lifelines univariate model
        Fitted model. Its observations, ``weights`` and ``entry`` are passed
        to ``KaplanMeierFitter`` to build the empirical curve.
    timeline: iterable, optional
        Evaluation times for both curves. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        Axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        Passed to both plotting calls.

    Returns
    --------
    ax:
        The axes that was drawn on.
    """
    from lifelines import KaplanMeierFitter
    from matplotlib import pyplot as plt

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    def fit_empirical(fit_method, *observations):
        # Every censoring type shares these keyword options.
        return fit_method(
            *observations,
            label="empirical CDF",
            timeline=timeline,
            weights=model.weights,
            entry=model.entry
        )

    if CensoringType.is_left_censoring(model):
        km_estimate = fit_empirical(
            KaplanMeierFitter().fit_left_censoring, model.durations, model.event_observed
        )
    elif CensoringType.is_right_censoring(model):
        km_estimate = fit_empirical(
            KaplanMeierFitter().fit_right_censoring, model.durations, model.event_observed
        )
    elif CensoringType.is_interval_censoring(model):
        km_estimate = fit_empirical(
            KaplanMeierFitter().fit_interval_censoring, model.lower_bound, model.upper_bound
        )

    # Empirical curve first, parametric curve second.
    km_estimate.plot_cumulative_density(ax=ax, **plot_kwargs)

    fitted_name = get_distribution_name_of_lifelines_model(model)
    scipy_dist = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = scipy_dist.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % fitted_name, **plot_kwargs)

    ax.legend()
    return ax
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the starting parameter vector from log-transformed times.

     The times are ``Ts[0]`` (right censoring), ``Ts[1]`` (left censoring)
     or ``Ts[1] - Ts[0] + 0.01`` (interval censoring). Returns
     ``[mean(log t), log(std(log t) + 0.01), 0.1]``. ``E`` and any extra
     positional arguments are not used.
     """
     if CensoringType.is_right_censoring(self):
         times = Ts[0]
     elif CensoringType.is_left_censoring(self):
         times = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         # The log fails when Ts[1] == Ts[0], hence the small fudge factor.
         times = Ts[1] - Ts[0] + 0.01

     log_times = log(times)
     location = log_times.mean()
     log_spread = log(log_times.std() + 0.01)
     return np.array([location, log_spread, 0.1])
示例#10
0
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the starting parameter vector ``[median, 1.0]``.

     The median is taken over:

     - ``Ts[0]`` under right censoring;
     - ``Ts[1]`` floored at 0.0001 under left censoring;
     - under interval censoring, the entries of ``Ts[1]`` below 1e10 when
       ``E.sum() > 0``, otherwise the single value 1.0.

     Extra positional arguments are not used.
     """
     if CensoringType.is_right_censoring(self):
         T = Ts[0]
     elif CensoringType.is_left_censoring(self):
         # np.clip takes (a, a_min, a_max). The previous call passed
         # (0.0001, np.inf, Ts[1]), which returns Ts[1] unchanged, so the
         # floor was never applied.
         T = np.clip(Ts[1], 0.0001, np.inf)
     elif CensoringType.is_interval_censoring(self):
         if E.sum() > 0:
             # Ts[1] can contain infs, so ignore this data
             upper = Ts[1]
             T = upper[upper < 1e10]
         else:
             T = np.array([1.0])
     return np.array([np.median(T), 1.0])
示例#11
0
    def _ll_null(self):
        if hasattr(self, "_ll_null_"):
            return self._ll_null_

        initial_point = np.zeros(len(self._fitted_parameter_names))
        regressors = {
            name: ["intercept"]
            for name in self._fitted_parameter_names
        }

        model = self.__class__()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            if CensoringType.is_right_censoring(self):
                df = pd.DataFrame({
                    "T": self.durations,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1
                })
                model.fit_right_censoring(df,
                                          "T",
                                          "E",
                                          initial_point=initial_point,
                                          entry_col="entry",
                                          regressors=regressors)
            elif CensoringType.is_interval_censoring(self):
                df = pd.DataFrame({
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                })
                model.fit_interval_censoring(df,
                                             "lb",
                                             "ub",
                                             "E",
                                             initial_point=initial_point,
                                             entry_col="entry",
                                             regressors=regressors)
            if CensoringType.is_left_censoring(self):
                raise NotImplementedError()

        self._ll_null_ = model._log_likelihood
        return self._ll_null_
示例#12
0
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Fit the Kaplan-Meier based empirical estimator used for an
    empirical-vs-fitted cumulative distribution function comparison.

    Parameters
    ------------
    model: lifelines univariate model
        Fitted model. Its observations, ``weights`` and ``entry`` are passed
        to ``KaplanMeierFitter``.
    timeline: iterable, optional
        Evaluation times. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        Defaults to ``plt.gca()``.
    plot_kwargs:
        Accepted but not used by the code shown here.

    NOTE(review): as written, this function fits the empirical estimator and
    then ends -- nothing is plotted and it implicitly returns None. The
    snippet looks truncated; confirm against the full source.
    """
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    if timeline is None:
        timeline = model.timeline

    # Label given to the empirical estimate.
    CDL_EMP = "empirical CDF"
    # These are three independent `if` statements (not an elif chain), so
    # every censoring predicate is evaluated.
    if CensoringType.is_left_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_left_censoring(model.durations,
                                                         model.event_observed,
                                                         label=CDL_EMP,
                                                         timeline=timeline,
                                                         weights=model.weights,
                                                         entry=model.entry)
    if CensoringType.is_right_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry)
    if CensoringType.is_interval_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry)
示例#13
0
def survival_probability_calibration(model: RegressionFitter,
                                     training_df: pd.DataFrame,
                                     t0: float,
                                     ax=None):
    r"""
    Smoothed calibration curves for time-to-event models. This is analogous to
    calibration curves for classification models, extended to handle survival probabilities
    and censoring. Produces a matplotlib figure and some metrics.

    We want to calibrate our model's prediction of :math:`P(T < \text{t0})` against the observed frequencies.

    The predicted probabilities are transformed with the complementary
    log-log function and used as the single covariate of a spline model
    (``CRCSplineFitter``); the spline model's predictions at ``t0`` serve as
    the smoothed "observed" probabilities.

    Parameters
    -------------

    model:
        a fitted lifelines regression model to be evaluated
    training_df: DataFrame
        the DataFrame used to train the model
    t0: float
        the time to evaluate the probability of event occurring prior at.
        Column names and plot labels format it with ``%d``, so a fractional
        ``t0`` appears truncated to an integer there.
    ax: optional
        mpl axes to draw on. Defaults to ``plt.gca()``.

    Returns
    ----------
    ax:
        mpl axes
    ICI:
        mean absolute difference between predicted and observed
    E50:
        median absolute difference between predicted and observed

    ICI and E50 are also printed to stdout.

    https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8570

    """
    def ccl(p):
        # complementary log-log transform: log(-log(1 - p))
        return np.log(-np.log(1 - p))

    if ax is None:
        ax = plt.gca()

    # names of the duration and event columns the model was fitted with
    T = model.duration_col
    E = model.event_col

    # predicted P(event before t0) = 1 - S(t0); clipped away from 0 and 1
    # because ccl is infinite at those endpoints
    predictions_at_t0 = np.clip(
        1 -
        model.predict_survival_function(training_df, times=[t0]).T.squeeze(),
        1e-10, 1 - 1e-10)

    # create new dataset with the predictions
    prediction_df = pd.DataFrame({
        "ccl_at_%d" % t0: ccl(predictions_at_t0),
        "constant": 1,
        T: model.durations,
        E: model.event_observed
    })

    # fit new dataset to flexible spline model
    # this new model connects prediction probabilities and actual survival. It should be very flexible, almost to the point of overfitting. Its goal is just to smooth out the data!
    knots = 3
    # the transformed prediction drives beta_; each gamma term is intercept-only
    regressors = {
        "beta_": ["ccl_at_%d" % t0],
        "gamma0_": ["constant"],
        "gamma1_": ["constant"],
        "gamma2_": ["constant"]
    }

    # this model is from examples/royson_crowther_clements_splines.py
    crc = CRCSplineFitter(knots, penalizer=0)
    if CensoringType.is_right_censoring(model):
        crc.fit_right_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_left_censoring(model):
        crc.fit_left_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_interval_censoring(model):
        # NOTE(review): this passes the duration and event column names, the
        # same arguments as the other branches -- confirm this matches what
        # fit_interval_censoring expects.
        crc.fit_interval_censoring(prediction_df, T, E, regressors=regressors)

    # predict new model at values 0 to 1, but remember to ccl it!
    # the grid covers the range of the predictions, padded by 0.01 and kept within [0, 1]
    x = np.linspace(np.clip(predictions_at_t0.min() - 0.01, 0, 1),
                    np.clip(predictions_at_t0.max() + 0.01, 0, 1), 100)
    y = 1 - crc.predict_survival_function(pd.DataFrame({
        "ccl_at_%d" % t0: ccl(x),
        "constant": 1
    }),
                                          times=[t0]).T.squeeze()

    # plot our results
    ax.set_title(
        "Smoothed calibration curve of \npredicted vs observed probabilities of t ≤ %d mortality"
        % t0)

    color = "tab:red"
    ax.plot(x, y, label="smoothed calibration curve", color=color)
    ax.set_xlabel("Predicted probability of \nt ≤ %d mortality" % t0)
    ax.set_ylabel("Observed probability of \nt ≤ %d mortality" % t0,
                  color=color)
    ax.tick_params(axis="y", labelcolor=color)

    # plot x=y line
    ax.plot(x, x, c="k", ls="--")
    ax.legend()

    # plot histogram of our original predictions
    color = "tab:blue"
    twin_ax = ax.twinx()
    twin_ax.set_ylabel("Count of \npredicted probabilities",
                       color=color)  # we already handled the x-label with ax1
    twin_ax.tick_params(axis="y", labelcolor=color)
    twin_ax.hist(predictions_at_t0, alpha=0.3, bins="sqrt", color=color)

    # NOTE(review): called through pyplot rather than on ax's figure --
    # confirm this targets the right figure when a custom ax is passed.
    plt.tight_layout()

    # absolute gap between the smoothed (spline) and the original predicted
    # probabilities at t0, per row of prediction_df
    deltas = ((1 - crc.predict_survival_function(
        prediction_df, times=[t0])).T.squeeze() - predictions_at_t0).abs()
    ICI = deltas.mean()
    E50 = np.percentile(deltas, 50)
    print("ICI = ", ICI)
    print("E50 = ", E50)

    return ax, ICI, E50
示例#14
0
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: optional
        The matplotlib axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        kwargs for the plot.
        NOTE(review): these are accepted but never forwarded to any plotting
        call in this function.

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    -------
    NotImplementedError
        If the model was fitted with interval censoring.

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)


    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    # column names of the quantile table; COL_EMP is also the KM fit's label
    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    # the distinct values taken by the empirical CDF are used as probability levels
    q = np.unique(kmf.cumulative_density_.values[:, 0])
    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    quantiles = qth_survival_times(1 - q, kmf.survival_function_)
    # theoretical quantiles of the fitted distribution at the same levels
    quantiles[COL_THEO] = dist_object.ppf(q)
    # drop rows where either quantile is 0 or infinite
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    # both axes are limited to the range of the empirical quantiles
    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    # y = x reference line
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax
    def _fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="KM_estimate",
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Compute the estimate and its confidence intervals and store them on
        ``self``. Whether ``survival_function_`` or ``cumulative_density_`` is
        the primary estimate depends on whether ``self`` is flagged as
        left-censoring.

        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
            Any falsy value (None or 0) falls back to ``self.alpha``.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_``

        Raises
        ------
        StatError
          if ``entry`` is given and the net population (entrances minus
          removals, cumulated) reaches zero within the first half of the
          event table.

        """
        self._check_values(durations)
        if event_observed is not None:
            self._check_values(event_observed)

        self._label = label

        if weights is not None:
            weights = np.asarray(weights)
            # warn when any weight is not integer-valued
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        # under left-censoring the primary estimate is cumulative_density_ and
        # survival_function_ is derived from it; otherwise the roles are swapped
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = _preprocess_inputs(
            durations, event_observed, timeline, entry, weights
        )

        # a falsy alpha (None or 0) falls back to the instance-level alpha
        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring
        )

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
            # this is detected here and reported as a StatError that recommends the
            # Breslow-Fleming-Harrington estimator instead
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
            # only the first half of the event table's rows is inspected
            if net_population.iloc[: int(n / 2)].min() == 0:
                ix = net_population.iloc[: int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix
                )

        # estimation: the primary estimate is exp(log_estimate), the secondary is its complement
        setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        # double-underscore attribute: name-mangled by Python when this method lives in a class body
        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate, left_censorship=is_left_censoring)
        self._cumulative_sq_ = cumulative_sq_

        # expose the interval under both estimate names; the secondary one is the complement
        setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name
        self._update_docstrings()

        return self
示例#16
0
    def _fit(self,
             durations,
             event_observed=None,
             timeline=None,
             entry=None,
             label=None,
             alpha=None,
             ci_labels=None,
             weights=None):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Compute the estimate and its confidence intervals and store them on
        ``self``. Whether ``survival_function_`` or ``cumulative_density_`` is
        the primary estimate depends on whether ``self`` is flagged as
        left-censoring.

        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
            Any falsy value (None or 0) falls back to ``self.alpha``.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently. Defaults to a weight of 1.0 per duration.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

        Raises
        ------
        StatError
          if ``entry`` is given and the net population (entrances minus
          removals, cumulated) reaches zero within the first half of the
          event table.

        """
        durations = np.asarray(durations)
        self._check_values(durations)

        if event_observed is not None:
            event_observed = np.asarray(event_observed)
            self._check_values(event_observed)

        # presumably picks the first non-None of: the argument, a label already
        # set on the instance, the default -- confirm against `coalesce`
        self._label = coalesce(label, self._label, "KM_estimate")

        if weights is not None:
            weights = np.asarray(weights)
            # warn when any weight is not integer-valued
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )
        else:
            # unweighted data: one float weight of 1.0 per duration
            weights = np.ones_like(durations, dtype=float)

        # under left-censoring the primary estimate is cumulative_density_ and
        # survival_function_ is derived from it; otherwise the roles are swapped
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        (self.durations, self.event_observed, self.timeline, self.entry,
         self.event_table,
         self.weights) = _preprocess_inputs(durations, event_observed,
                                            timeline, entry, weights)

        # a falsy alpha (None or 0) falls back to the instance-level alpha
        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, is_left_censoring)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
            # this is detected here and reported as a StatError that recommends the
            # Breslow-Fleming-Harrington estimator instead
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            # only the first half of the event table's rows is inspected
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation: the primary estimate is exp(log_estimate), the secondary is its complement
        setattr(self, primary_estimate_name,
                pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name,
                pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        # double-underscore attribute: name-mangled by Python when this method lives in a class body
        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(
            cumulative_sq_.values[:, None], alpha, ci_labels)
        # the median is always computed from survival_function_, whichever estimate is primary
        self._median = median_survival_times(self.survival_function_)
        self._cumulative_sq_ = cumulative_sq_

        # expose the interval under both estimate names; the secondary one is the complement
        setattr(self, "confidence_interval_" + primary_estimate_name,
                self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name,
                1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name

        return self
示例#17
0
文件: plotting.py 项目: vck/lifelines
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: optional
        The matplotlib axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        kwargs for the plot.
        NOTE(review): these are accepted but never forwarded to any plotting
        call in this function.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    The interval censoring case uses the mean between the upper and lower bounds.

    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    # column names of the quantile table; COL_EMP is also the KM fit's label
    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    # each branch fits the empirical estimator with the model's weights and
    # entry, then selects the survival function (sf) and CDF used below
    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_[COL_EMP], kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_[COL_EMP], kmf.cumulative_density_[COL_EMP]

    elif CensoringType.is_interval_censoring(model):
        kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound, model.upper_bound, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        # sf is the row-wise mean of the survival function's columns; the
        # probability levels come from the "<label>_lower" CDF column
        # NOTE(review): the row-wise mean is not named COL_EMP here, yet
        # COL_EMP is looked up on the quantile table below -- confirm how
        # qth_survival_times names its output in this case.
        sf, cdf = kmf.survival_function_.mean(1), kmf.cumulative_density_[COL_EMP + "_lower"]

    # the distinct values taken by the empirical CDF are used as probability levels
    q = np.unique(cdf.values)

    quantiles = qth_survival_times(1 - q, sf)
    # theoretical quantiles of the fitted distribution at the same levels
    quantiles[COL_THEO] = dist_object.ppf(q)
    # drop rows where either quantile is 0 or infinite
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    # both axes are limited to the range of the empirical quantiles
    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    # y = x reference line
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax