def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='NA_estimate', alpha=None, ci_labels=None, weights=None):
    """
    Fit the Nelson-Aalen estimate of the cumulative hazard.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      event_observed: an array, or pd.Series, of length n -- True if the death was
         observed, False if the event was lost (right-censored). Defaults all True if
         event_observed==None
      timeline: return the best estimate at the values in timeline (positively increasing)
      entry: an array, or pd.Series, of length n -- relative time when a subject entered
         the study. This is useful for left-truncated observations, i.e the birth event
         was not observed. If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals as a
         length-2 list: [<lower-bound name>, <upper-bound name>].
         Default: <label>_lower_<alpha>
      weights: an array, or pd.Series, of length n, if providing a weighted dataset.
         For example, instead of providing every subject as a single element of
         `durations` and `event_observed`, one could weigh subjects differently.

    Returns:
      self, with new properties like 'cumulative_hazard_'.
    """
    check_nans(durations)
    if event_observed is not None:
        check_nans(event_observed)

    v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

    cumulative_hazard_, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                            self._additive_f, self._variance_f,
                                                            False)

    # estimates
    self._label = label
    self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
    # BUGFIX: compare against None explicitly so a falsy (e.g. 0) caller-supplied alpha
    # is not silently replaced by self.alpha; consistent with WeibullFitter.fit.
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None],
                                             alpha if alpha is not None else self.alpha,
                                             ci_labels)
    self._cumulative_sq = cumulative_sq_

    # estimation functions
    self.predict = self._predict("cumulative_hazard_", self._label)
    self.subtract = self._subtract("cumulative_hazard_")
    self.divide = self._divide("cumulative_hazard_")

    # plotting
    self.plot = self._plot_estimate("cumulative_hazard_")
    self.plot_cumulative_hazard = self.plot
    self.plot_hazard = self._plot_estimate('hazard_')
    return self
def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='Exponential_estimate', alpha=None, ci_labels=None):
    """
    Fit the exponential model via maximum likelihood.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      event_observed: an array, or pd.Series, of length n -- True if the death was
         observed, False if the event was lost (right-censored). Defaults all True if
         event_observed==None
      timeline: return the best estimate at the values in timeline (positively increasing)
      entry: an array, or pd.Series, of length n -- relative time when a subject entered
         the study. This is useful for left-truncated observations, i.e the birth event
         was not observed. If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals as a
         length-2 list: [<lower-bound name>, <upper-bound name>].
         Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like 'survival_function_' and 'lambda_'.
    """
    check_nans(durations)
    if event_observed is not None:
        check_nans(event_observed)

    self.durations = np.asarray(durations, dtype=float)
    self.event_observed = (np.asarray(event_observed, dtype=int)
                           if event_observed is not None
                           else np.ones_like(self.durations))
    self.timeline = (np.sort(np.asarray(timeline))
                     if timeline is not None
                     else np.arange(int(self.durations.min()), int(self.durations.max()) + 1))
    self._label = label

    # estimation: the MLE of the exponential rate is
    # (number of observed events) / (total observed time at risk)
    D = self.event_observed.sum()
    T = self.durations.sum()
    self.lambda_ = D / T
    self._lambda_variance_ = self.lambda_ / T
    self.survival_function_ = pd.DataFrame(np.exp(-self.lambda_ * self.timeline),
                                           columns=[self._label], index=self.timeline)
    # BUGFIX: compare against None explicitly so a falsy (e.g. 0) caller-supplied alpha
    # is not silently replaced by self.alpha; consistent with WeibullFitter.fit.
    self.confidence_interval_ = self._bounds(alpha if alpha is not None else self.alpha,
                                             ci_labels)
    # median of an exponential distribution is ln(2) / lambda
    self.median_ = 1. / self.lambda_ * (np.log(2))

    # estimation functions
    self.predict = self._predict(lambda t: np.exp(-self.lambda_ * t), self._label)
    self.subtract = self._subtract("survival_function_")
    self.divide = self._divide("survival_function_")

    # plotting
    self.plot = self._plot_estimate("survival_function_")
    self.plot_survival_function_ = self.plot
    return self
def _check_values(df, T, E):
    # Run pre-fit sanity checks on a regression dataset: a numeric-dtype covariate
    # matrix `df`, durations `T`, and event indicators `E`. Each helper raises on
    # failure, so the order below determines which problem is reported first.
    pass_for_numeric_dtypes_or_raise(df)
    check_nans(T)
    check_nans(E)
    check_low_var(df)
    # NOTE(review): arguments are passed as (df, E, T) while this function's own
    # signature orders them (df, T, E) -- confirm against the signature of
    # check_complete_separation, which is not visible in this file chunk.
    check_complete_separation(df, E, T)
def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='KM_estimate', alpha=None, left_censorship=False, ci_labels=None,
        weights=None):
    """
    Fit the Kaplan-Meier estimate of the survival function.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      event_observed: an array, or pd.Series, of length n -- True if the death was
         observed, False if the event was lost (right-censored). Defaults all True if
         event_observed==None
      timeline: return the best estimate at the values in timeline (positively increasing)
      entry: an array, or pd.Series, of length n -- relative time when a subject entered
         the study. This is useful for left-truncated (not left-censored) observations.
         If None, all members of the population were born at time 0.
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      left_censorship: True if durations and event_observed refer to left censorship
         events. Default False
      ci_labels: add custom column names to the generated confidence intervals as a
         length-2 list: [<lower-bound name>, <upper-bound name>].
         Default: <label>_lower_<alpha>
      weights: an array, or pd.Series, of length n, if providing a weighted dataset.
         For example, instead of providing every subject as a single element of
         `durations` and `event_observed`, one could weigh subjects differently.

    Returns:
      self, with new properties like 'survival_function_'.

    Raises:
      StatError: if left-truncated data would drive the at-risk population to zero
         early in the timeline (see inline comment below).
    """
    check_nans(durations)
    if event_observed is not None:
        check_nans(event_observed)

    # if the user is interested in left-censorship, we return the cumulative_density_,
    # not the survival_function_
    estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'

    v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

    self._label = label
    # BUGFIX: compare against None explicitly so a falsy (e.g. 0) caller-supplied alpha
    # is not silently replaced by self.alpha; consistent with WeibullFitter.fit.
    alpha = alpha if alpha is not None else self.alpha
    log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table,
                                                               self.timeline,
                                                               self._additive_f,
                                                               self._additive_var,
                                                               left_censorship)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are
        # too few early truncation times, it may happen that the number of patients at
        # risk and the number of deaths is the same, making S(t) identically 0 from
        # that point on. We detect this and raise, recommending the
        # Breslow-Fleming-Harrington estimator instead.
        n = self.event_table.shape[0]
        net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

    # estimation
    setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function),
                                              columns=[self._label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate, left_censorship=left_censorship)

    # estimation methods
    self.predict = self._predict(estimate_name, label)
    self.subtract = self._subtract(estimate_name)
    self.divide = self._divide(estimate_name)

    # plotting functions
    self.plot = self._plot_estimate(estimate_name)
    setattr(self, "plot_" + estimate_name, self.plot)
    self.plot_loglogs = plot_loglogs(self)
    return self
def _check_values(self, df, T, E):
    # Run pre-fit sanity checks: the covariate matrix `df` must contain only
    # numeric dtypes, and neither durations `T` nor event indicators `E` may
    # contain NaNs. Each helper raises on failure, so the order below determines
    # which problem is reported first.
    pass_for_numeric_dtypes_or_raise(df)
    check_nans(T)
    check_nans(E)
def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='Weibull_estimate', alpha=None, ci_labels=None):
    """
    Fit a two-parameter Weibull model to (possibly right-censored) durations.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      event_observed: an array, or pd.Series, of length n -- True if the death was
         observed, False if the event was lost (right-censored). Defaults all True if
         event_observed==None
      timeline: return the estimate at the values in timeline (positively increasing)
      entry: an array, or pd.Series, of length n -- relative time when a subject entered
         the study. This is useful for left-truncated observations, i.e the birth event
         was not observed. If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals as a
         length-2 list: [<lower-bound name>, <upper-bound name>].
         Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like `cumulative_hazard_', 'survival_function_',
      'lambda_' and 'rho_'.

    Raises:
      ValueError: if any duration is zero or negative (the Weibull model is
         undefined there).
    """
    check_nans(durations)
    if event_observed is not None:
        check_nans(event_observed)

    observed_times = np.asarray(durations, dtype=float)
    # the Weibull likelihood is undefined for non-positive durations
    if np.any(observed_times <= 0):
        raise ValueError(
            'This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements.'
        )
    self.durations = observed_times

    if event_observed is None:
        self.event_observed = np.ones_like(self.durations)
    else:
        self.event_observed = np.asarray(event_observed, dtype=int)

    if timeline is None:
        lo, hi = int(self.durations.min()), int(self.durations.max())
        self.timeline = np.arange(lo, hi + 1)
    else:
        self.timeline = np.sort(np.asarray(timeline))

    self._label = label
    alpha = self.alpha if alpha is None else alpha

    # estimation: maximum likelihood via Newton-Raphson
    self.lambda_, self.rho_ = self._newton_rhaphson(self.durations, self.event_observed)
    self.survival_function_ = pd.DataFrame(
        self.survival_function_at_times(self.timeline),
        columns=[self._label], index=self.timeline)
    self.hazard_ = pd.DataFrame(
        self.hazard_at_times(self.timeline),
        columns=[self._label], index=self.timeline)
    self.cumulative_hazard_ = pd.DataFrame(
        self.cumulative_hazard_at_times(self.timeline),
        columns=[self._label], index=self.timeline)
    self.confidence_interval_ = self._bounds(alpha, ci_labels)
    # closed-form Weibull median: (ln 2)^(1/rho) / lambda
    self.median_ = 1. / self.lambda_ * (np.log(2)) ** (1. / self.rho_)

    # estimation functions - Cumulative hazard takes priority.
    self.predict = self._predict(
        lambda t: np.exp(-(self.lambda_ * t) ** self.rho_), self._label)
    self.subtract = self._subtract("cumulative_hazard_")
    self.divide = self._divide("cumulative_hazard_")

    # plotting - Cumulative hazard takes priority.
    self.plot = self._plot_estimate("cumulative_hazard_")
    self.plot_cumulative_hazard = self.plot
    return self