def predict_cumulative_hazard(self, X, times=None):
    """
    X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
        can be in any order. If a numpy array, columns must be in the
        same order as the training data.
    times: an iterable of increasing times to predict the cumulative hazard at.
        Default is the set of all durations (observed and unobserved). Uses a linear
        interpolation if points in time are not in the index.

    Returns the cumulative hazard of individuals.
    """
    if self.strata:
        cumulative_hazard_ = pd.DataFrame()
        for stratum, stratified_X in X.groupby(self.strata):
            try:
                c_0 = self.baseline_cumulative_hazard_[[stratum]]
            except KeyError:
                raise StatError("""The stratum %s was not found in the original training data. For example,
try the following on the original dataset, df: `df.groupby(%s).size()`. %s is likely missing from that output.
""" % (stratum, self.strata, stratum))
            col = _get_index(stratified_X)
            v = self.predict_partial_hazard(stratified_X)
            cumulative_hazard_ = cumulative_hazard_.merge(
                pd.DataFrame(np.dot(c_0, v.T), index=c_0.index, columns=col),
                how='outer', right_index=True, left_index=True)
    else:
        c_0 = self.baseline_cumulative_hazard_
        col = _get_index(X)
        v = self.predict_partial_hazard(X)
        cumulative_hazard_ = pd.DataFrame(np.dot(c_0, v.T), columns=col, index=c_0.index)

    if times is not None:
        # non-linear interpolations can push the survival curves above 1 and below 0.
        return cumulative_hazard_.reindex(cumulative_hazard_.index.union(times)).interpolate("index").loc[times]
    return cumulative_hazard_
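# A minimal usage sketch for predict_cumulative_hazard above, assuming a
# lifelines-style CoxPHFitter and the bundled rossi dataset; the `times`
# values are arbitrary illustration points, not anything from the source.
from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
cph = CoxPHFitter().fit(rossi, duration_col="week", event_col="arrest")

# Predict at the model's own timeline, then at user-supplied times
# (linearly interpolated when they fall between observed durations).
H = cph.predict_cumulative_hazard(rossi.head(3))
H_at_times = cph.predict_cumulative_hazard(rossi.head(3), times=[10, 25, 52])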
def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM-estimate',
        alpha=None, left_censorship=False, ci_labels=None):
    """
    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      timeline: return the best estimate at the values in timeline (positively increasing)
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the
         event was lost (right-censored). Defaults to all True if event_observed==None
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
         useful for left-truncated observations, i.e. the birth event was not observed.
         If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      left_censorship: True if durations and event_observed refer to left-censorship events. Default False
      ci_labels: add custom column names to the generated confidence intervals
            as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like 'survival_function_'.
    """
    # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
    estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'

    v = preprocess_inputs(durations, event_observed, timeline, entry)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

    log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                               self._additive_f, self._additive_var,
                                                               left_censorship)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, the number of patients at risk may equal the number of deaths.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f.
Recommend BFH estimator.""" % ix)

    # estimation
    setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate)

    # estimation methods
    self.predict = _predict(self, estimate_name, label)
    self.subtract = _subtract(self, estimate_name)
    self.divide = _divide(self, estimate_name)

    # plotting functions
    self.plot = plot_estimate(self, estimate_name)
    setattr(self, "plot_" + estimate_name, self.plot)
    return self
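# A small sketch of calling the fit above with left-truncated data, assuming
# a lifelines-style KaplanMeierFitter; the durations and entry times are
# made-up illustration values.
from lifelines import KaplanMeierFitter

T = [5, 6, 6, 8, 10, 12]     # observed durations
E = [1, 0, 1, 1, 0, 1]       # 1 if the death was observed, 0 if right-censored
entry = [0, 0, 2, 2, 3, 4]   # late-entry (left-truncation) times

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E, entry=entry, label="KM with late entry")
# If too many subjects enter late relative to early events, the guard in fit
# raises StatError and recommends the Breslow-Fleming-Harrington estimator.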
def _fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label=None,
    alpha=None,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Parameters
    ----------
    durations: an array, list, pd.DataFrame or pd.Series
        length n -- duration subject was observed for
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored).
        Defaults to all True if event_observed==None
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timeline (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated
        (not left-censored) observations. If None, all members of the population entered
        the study when they were "born".
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha
        for this call to fit only.
    ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subjects
        differently.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
    """
    self._check_values(durations)
    if event_observed is not None:
        self._check_values(event_observed)

    self._label = coalesce(label, self._label, "KM_estimate")

    if weights is not None:
        weights = np.asarray(weights)
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers -- are they propensity scores?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW)
with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment
weighting for survival data."
""",
                StatisticalWarning,
            )
    else:
        weights = np.ones_like(durations, dtype=float)

    # if the user is interested in left-censoring, we return the cumulative_density_, not the survival_function_
    is_left_censoring = CensoringType.is_left_censoring(self)
    primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
    secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

    (
        self.durations,
        self.event_observed,
        self.timeline,
        self.entry,
        self.event_table,
        self.weights,
    ) = _preprocess_inputs(durations, event_observed, timeline, entry, weights)

    alpha = alpha if alpha else self.alpha
    log_estimate, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring
    )

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, the number of patients at risk may equal the number of deaths.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[: int(n / 2)].min() == 0:
            ix = net_population.iloc[: int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%g.
Recommend BreslowFlemingHarringtonFitter.""" % ix
            )

    # estimation
    setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
    setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

    self.__estimate = getattr(self, primary_estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self._median = median_survival_times(self.survival_function_)
    self.percentile = functools.partial(qth_survival_time, model_or_survival_function=self.survival_function_)
    self._cumulative_sq_ = cumulative_sq_

    setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
    setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

    # estimation methods
    self._estimation_method = primary_estimate_name
    self._estimate_name = primary_estimate_name

    self._update_docstrings()
    return self
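# A sketch of the `weights` path in _fit, via the public fit(): assuming a
# lifelines-style KaplanMeierFitter, an integer-weighted call should match
# repeating each subject weight-many times, since both produce the same event
# table; non-integer weights (e.g. IPTW propensity scores) trigger the
# StatisticalWarning above about biased naive variance estimates.
from lifelines import KaplanMeierFitter

kmf_weighted = KaplanMeierFitter().fit([5, 10, 15], event_observed=[1, 0, 1], weights=[3, 2, 1])
kmf_repeated = KaplanMeierFitter().fit([5, 5, 5, 10, 10, 15], event_observed=[1, 1, 1, 0, 0, 1])
# kmf_weighted.survival_function_ and kmf_repeated.survival_function_ agree row for row.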
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="KM_estimate",
    alpha=None,
    left_censorship=False,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Parameters
    ----------
    durations: an array, or pd.Series, of length n -- duration subject was observed for
    timeline: return the best estimate at the values in timeline (positively increasing)
    event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if
        the event was lost (right-censored). Defaults to all True if event_observed==None
    entry: an array, or pd.Series, of length n -- relative time when a subject entered the study.
        This is useful for left-truncated (not left-censored) observations. If None, all members
        of the population were born at time 0.
    label: a string to name the column of the estimate.
    alpha: the alpha value in the confidence intervals. Overrides the initializing alpha for this
        call to fit only.
    left_censorship: True if durations and event_observed refer to left-censorship events. Default False
    ci_labels: add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example,
        instead of providing every subject as a single element of `durations` and `event_observed`,
        one could weigh subjects differently.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like 'survival_function_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    if weights is not None:
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers -- are they propensity scores?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW)
with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment
weighting for survival data."
""",
                StatisticalWarning,
            )

    # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
    estimate_name = "survival_function_" if not left_censorship else "cumulative_density_"

    v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

    self._label = label
    alpha = alpha if alpha else self.alpha
    log_survival_function, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, left_censorship
    )

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, the number of patients at risk may equal the number of deaths.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[: int(n / 2)].min() == 0:
            ix = net_population.iloc[: int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%.1f.
Recommend BreslowFlemingHarringtonFitter.""" % ix
            )

    # estimation
    setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate, left_censorship=left_censorship)

    # estimation methods
    self._estimation_method = estimate_name
    self._estimate_name = estimate_name
    self._predict_label = label
    self._update_docstrings()

    # plotting functions
    setattr(self, "plot_" + estimate_name, self.plot)
    return self
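# A sketch of the left-censorship branch of this fit, assuming a
# lifelines-style KaplanMeierFitter of this vintage (newer releases replace
# the left_censorship flag with a separate fit_left_censoring method); the
# data are made-up illustration values.
from lifelines import KaplanMeierFitter

T = [3, 4, 4, 7, 9]   # durations
E = [1, 1, 0, 1, 0]   # 0 marks a left-censored observation

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E, left_censorship=True)
kmf.cumulative_density_   # populated instead of survival_function_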