def __init__(self, p_value, test_statistic, name=None, test_name=None, **kwargs):
    """
    Store the outcome of one or more statistical tests.

    Parameters
    ----------
    p_value:
        the p-value(s); kept as given on ``p_value`` and, coerced with
        ``utils._to_1d_array``, on ``_p_value``.
    test_statistic:
        the test statistic(s); kept as given on ``test_statistic`` and, coerced
        with ``utils._to_1d_array``, on ``_test_statistic``. Must coerce to the
        same length as ``p_value``.
    name: optional
        label(s) for the tests. When given it is coerced with ``utils._to_list``
        and must contain one entry per test statistic.
    test_name: optional
        stored on ``test_name`` and added to the saved keyword arguments.
    kwargs:
        every extra keyword is set as an attribute of the same name and kept in
        ``_kwargs``.
    """
    # Raw, user-supplied values.
    self.p_value = p_value
    self.test_statistic = test_statistic
    self.test_name = test_name

    # Normalised copies; both must describe the same number of tests.
    self._p_value = utils._to_1d_array(p_value)
    self._test_statistic = utils._to_1d_array(test_statistic)
    assert len(self._p_value) == len(self._test_statistic)

    if name is None:
        self.name = None
    else:
        self.name = utils._to_list(name)
        assert len(self.name) == len(self._test_statistic)

    # Expose each extra keyword as an attribute ...
    for attribute, attribute_value in kwargs.items():
        setattr(self, attribute, attribute_value)

    # ... and remember them, together with the test name.
    kwargs["test_name"] = test_name
    self._kwargs = kwargs
def preprocess_df(self, df, event_col, start_col, stop_col, weights_col, id_col):
    """
    Validate a long-format DataFrame and split it into its model inputs.

    Works on a copy of ``df``; the caller's frame is not modified. Also sets
    ``self.weights_col``.

    Parameters
    ----------
    df: DataFrame
        must contain ``event_col``, ``start_col`` and ``stop_col``.
    event_col, start_col, stop_col: string
        renamed internally to ``"event"``, ``"start"`` and ``"stop"``.
    weights_col: string or None
        column of strictly positive weights. If None, a constant weight of 1.0
        is used (``df`` must then not already contain a ``"__weights"`` column).
    id_col: string or None
        column used as (part of) the index.

    Returns
    -------
    tuple
        ``(df, events, start, stop, weights)`` where ``df`` holds the remaining
        columns cast to float, ``events`` is boolean and ``weights`` is float.

    Raises
    ------
    KeyError
        if the event, start or stop column is missing.
    ValueError
        if any supplied weight is not positive.
    """
    df = df.copy()

    required_columns = (event_col, start_col, stop_col)
    if not all(column in df for column in required_columns):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    self.weights_col = weights_col
    if weights_col is None:
        assert "__weights" not in df.columns, "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    elif (df[weights_col] <= 0).any():
        raise ValueError("values in weights_col must be positive.")

    internal_names = {
        event_col: "event",
        start_col: "start",
        stop_col: "stop",
        weights_col: "__weights",
    }
    df = df.rename(columns=internal_names)

    # NOTE(review): the branches test ``self.id_col`` but index by the ``id_col``
    # argument - presumably the caller keeps the two in sync; confirm.
    has_strata = self.strata is not None
    has_id = self.id_col is not None
    if has_strata and has_id:
        # Only the strata + id index is sorted.
        df = df.set_index(_to_list(self.strata) + [id_col]).sort_index()
    elif has_strata:
        df = df.set_index(_to_list(self.strata))
    elif has_id:
        df = df.set_index([id_col])

    # Pop the bookkeeping columns so that only covariates remain in ``df``.
    events = pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool)
    start = df.pop("start")
    stop = df.pop("stop")
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)
    return df, events, start, stop, weights
def plot(self, columns=None, loc=None, iloc=None, ax=None, **kwargs):
    """
    A wrapper around plotting. Matplotlib plot arguments can be passed in, plus:

    Parameters
    -----------
    columns: string or list-like, optional
      If not empty, plot a subset of columns from the ``cumulative_hazards_``. Default all.
    loc: slice, optional
      specify a label-based subsection of the curves to plot (applied with ``.loc``).
    iloc: slice, optional
      specify a location-based subsection of the curves to plot, ex:
      ``.plot(iloc=slice(0,10))`` will plot the first 10 time points.
    ax: matplotlib axis, optional
      the axis to draw on. Defaults to the current axis (``plt.gca()``).
    kwargs:
      forwarded to ``ax.plot`` for every curve.

    Returns
    -------
    ax: the matplotlib axis that was drawn on.
    """
    from matplotlib import pyplot as plt

    assert loc is None or iloc is None, "Cannot set both loc and iloc in call to .plot"

    def shaded_plot(ax, x, y, y_upper, y_lower, **kwargs):
        # Step curve plus a translucent confidence band in the same colour.
        (base_line,) = ax.plot(x, y, drawstyle="steps-post", **kwargs)
        ax.fill_between(x, y_lower, y2=y_upper, alpha=0.25, color=base_line.get_color(), linewidth=1.0, step="post")

    def create_df_slicer(loc, iloc):
        # Build a function that applies the requested row selection; with
        # neither ``loc`` nor ``iloc`` given, every row is kept.
        get_method = "loc" if loc is not None else "iloc"

        if iloc is None and loc is None:
            user_submitted_ix = slice(0, None)
        else:
            user_submitted_ix = loc if loc is not None else iloc

        return lambda df: getattr(df, get_method)[user_submitted_ix]

    subset_df = create_df_slicer(loc, iloc)

    if not columns:
        columns = self.cumulative_hazards_.columns
    else:
        columns = _to_list(columns)

    if ax is None:
        ax = plt.gca()

    index = subset_df(self.cumulative_hazards_).index
    x = index.values.astype(float)

    # The confidence level does not depend on the column, so compute the row labels once.
    ci = (1 - self.alpha) * 100
    upper_label = "%g%% upper-bound" % ci
    lower_label = "%g%% lower-bound" % ci

    for column in columns:
        y = subset_df(self.cumulative_hazards_[column]).values
        y_upper = subset_df(self.confidence_intervals_[column].loc[upper_label]).values
        y_lower = subset_df(self.confidence_intervals_[column].loc[lower_label]).values
        shaded_plot(ax, x, y, y_upper, y_lower, label=column, **kwargs)

    # Draw the zero reference line on the axis being plotted on. ``plt.hlines``
    # would target the *current* axis, which is the wrong one whenever the
    # caller supplies an ``ax`` that is not current.
    ax.hlines(0, index.min() - 1, index.max(), color="k", linestyles="--", alpha=0.5)

    ax.legend()
    return ax
def plot(self, columns=None, loc=None, iloc=None, **kwargs):
    """
    A wrapper around plotting. Matplotlib plot arguments can be passed in, plus:

    Parameters
    -----------
    columns: string or list-like, optional
      If not empty, plot a subset of columns from the ``cumulative_hazards_``. Default all.
    loc: slice, optional
      specify a label-based subsection of the curves to plot (applied with ``.loc``).
    iloc: slice, optional
      specify a location-based subsection of the curves to plot, ex:
      ``.plot(iloc=slice(0,10))`` will plot the first 10 time points.
    kwargs:
      ``set_kwargs_ax`` is applied to these and the ``"ax"`` entry is then taken
      as the axis to draw on; everything else is forwarded to ``ax.plot``.

    Returns
    -------
    ax: the matplotlib axis that was drawn on.
    """
    from matplotlib import pyplot as plt

    assert loc is None or iloc is None, "Cannot set both loc and iloc in call to .plot"

    def shaded_plot(ax, x, y, y_upper, y_lower, **kwargs):
        # Step curve plus a translucent confidence band in the same colour.
        (base_line,) = ax.plot(x, y, drawstyle="steps-post", **kwargs)
        ax.fill_between(x, y_lower, y2=y_upper, alpha=0.25, color=base_line.get_color(), linewidth=1.0, step="post")

    def create_df_slicer(loc, iloc):
        # Build a function that applies the requested row selection; with
        # neither ``loc`` nor ``iloc`` given, every row is kept.
        get_method = "loc" if loc is not None else "iloc"

        if iloc is None and loc is None:
            user_submitted_ix = slice(0, None)
        else:
            user_submitted_ix = loc if loc is not None else iloc

        return lambda df: getattr(df, get_method)[user_submitted_ix]

    subset_df = create_df_slicer(loc, iloc)

    if not columns:
        columns = self.cumulative_hazards_.columns
    else:
        columns = _to_list(columns)

    # presumably ``set_kwargs_ax`` guarantees an "ax" entry in ``kwargs`` - the
    # unconditional ``pop`` below relies on it; confirm against the helper.
    set_kwargs_ax(kwargs)
    ax = kwargs.pop("ax")

    index = subset_df(self.cumulative_hazards_).index
    x = index.values.astype(float)

    for column in columns:
        y = subset_df(self.cumulative_hazards_[column]).values
        y_upper = subset_df(self.confidence_intervals_[column].loc["upper-bound"]).values
        y_lower = subset_df(self.confidence_intervals_[column].loc["lower-bound"]).values
        shaded_plot(ax, x, y, y_upper, y_lower, label=column, **kwargs)

    # Draw the zero reference line on the axis being plotted on. ``plt.hlines``
    # would target the *current* axis, which is the wrong one whenever the
    # caller supplies an ``ax`` that is not current.
    ax.hlines(0, index.min() - 1, index.max(), color="k", linestyles="--", alpha=0.5)

    ax.legend()
    return ax
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame containing the columns named by ``id_col``, ``event_col``,
        ``start_col`` and ``stop_col``. After the id, event, start, stop, weight and
        strata columns are removed, every remaining column is cast to float and used
        as a covariate. The frame is copied; the caller's object is not modified.
    id_col: string
        A subject could have multiple rows in the DataFrame. This column contains
        the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death observation
        (cast to boolean).
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each subject-period
        row. Values must be strictly positive. If omitted, every row gets weight 1.0.
    show_progress: boolean, optional
        since the fitter is iterative, show convergence diagnostics.
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    robust: boolean, optional (default: False)
        Not available yet: a truthy value raises ``NotImplementedError``.
    strata: list or string, optional
        specify a column or list of columns to use in stratification. This is useful if a
        categorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default is the zero vector.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties such as ``params_``, ``hazard_ratios_``,
        ``variance_matrix_``, ``standard_errors_``, ``confidence_intervals_``,
        ``baseline_cumulative_hazard_`` and ``baseline_survival_``.

    Raises
    ------
    NotImplementedError
        if ``robust`` is truthy.
    KeyError
        if the id, event, start or stop column is missing from ``df``.
    ValueError
        if any value in ``weights_col`` is not positive.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.id_col = id_col
    self.event_col = event_col
    self.start_col = start_col
    self.stop_col = stop_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    df = df.copy()

    required_columns = (id_col, event_col, start_col, stop_col)
    if not all(column in df for column in required_columns):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    self.weights_col = weights_col
    if weights_col is None:
        assert (
            "__weights" not in df.columns
        ), "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    elif (df[weights_col] <= 0).any():
        raise ValueError("values in weights_col must be positive.")

    # Switch to fixed internal column names.
    internal_names = {
        id_col: "id",
        event_col: "event",
        start_col: "start",
        stop_col: "stop",
        weights_col: "__weights",
    }
    df = df.rename(columns=internal_names)

    if self.strata is None:
        df = df.set_index("id")
    else:
        # TODO: needs to be a list
        df = df.set_index(_to_list(self.strata) + ["id"]).sort_index()

    # Pop the bookkeeping columns so that only covariates remain in ``df``.
    events = pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool)
    start = df.pop("start")
    stop = df.pop("stop")
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    # The optimiser works on normalised covariates ...
    fitted_coefs = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    # ... so coefficients and variances are divided by the standard deviations
    # to express them on the original covariate scale.
    covariate_names = df.columns
    self.params_ = pd.Series(fitted_coefs, index=covariate_names, name="coef") / self._norm_std
    self.hazard_ratios_ = pd.Series(np.exp(self.params_), index=covariate_names, name="exp(coef)")
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()

    # Keep the inputs around for later inspection.
    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights
    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame containing the columns named by ``id_col``, ``event_col``,
        ``start_col`` and ``stop_col``. After the id, event, start, stop, weight and
        strata columns are removed, every remaining column is cast to float and used
        as a covariate. The frame is copied; the caller's object is not modified.
    id_col: string
        A subject could have multiple rows in the DataFrame. This column contains
        the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death observation
        (cast to boolean).
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each subject-period
        row. Values must be strictly positive. If omitted, every row gets weight 1.0.
    show_progress: boolean, optional
        since the fitter is iterative, show convergence diagnostics.
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    robust: boolean, optional (default: False)
        Not available yet: a truthy value raises ``NotImplementedError``.
    strata: list or string, optional
        specify a column or list of columns to use in stratification. This is useful if a
        categorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default is the zero vector.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties such as ``hazards_``, ``variance_matrix_``,
        ``standard_errors_``, ``confidence_intervals_``,
        ``baseline_cumulative_hazard_`` and ``baseline_survival_``.

    Raises
    ------
    NotImplementedError
        if ``robust`` is truthy.
    KeyError
        if the id, event, start or stop column is missing from ``df``.
    ValueError
        if any value in ``weights_col`` is not positive.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.id_col = id_col
    self.event_col = event_col
    self.start_col = start_col
    self.stop_col = stop_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    df = df.copy()

    required_columns = (id_col, event_col, start_col, stop_col)
    if not all(column in df for column in required_columns):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    self.weights_col = weights_col
    if weights_col is None:
        assert (
            "__weights" not in df.columns
        ), "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    elif (df[weights_col] <= 0).any():
        raise ValueError("values in weights_col must be positive.")

    # Switch to fixed internal column names.
    internal_names = {
        id_col: "id",
        event_col: "event",
        start_col: "start",
        stop_col: "stop",
        weights_col: "__weights",
    }
    df = df.rename(columns=internal_names)

    if self.strata is None:
        df = df.set_index("id")
    else:
        # TODO: needs to be a list
        df = df.set_index(_to_list(self.strata) + ["id"]).sort_index()

    # Pop the bookkeeping columns so that only covariates remain in ``df``.
    events = pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool)
    start = df.pop("start")
    stop = df.pop("stop")
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    # The optimiser works on normalised covariates ...
    fitted_coefs = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    # ... so coefficients and variances are divided by the standard deviations
    # to express them on the original covariate scale.
    self.hazards_ = pd.Series(fitted_coefs, index=df.columns, name="coef") / self._norm_std
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()

    # Keep the inputs around for later inspection.
    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights
    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def plot_covariate_groups(self, covariates, values, plot_baseline=True, **kwargs):
    """
    Plot predicted survival curves as one or more covariates are varied over a set
    of values while every other covariate is held at its stored mean. Optionally
    the baseline curve (all covariates at their means) is overlaid for comparison.

    Parameters
    ----------
    covariates: string or list
        a string (or list of strings) of the covariate in the original dataset that we wish to vary.
    values: 1d or 2d iterable
        an iterable of the values we wish the covariate to take on. A 1d input is
        treated as a single column; otherwise there must be one column per covariate.
    plot_baseline: bool
        also display the baseline survival, defined as the survival at the mean of the original dataset.
    kwargs:
        pass in additional plotting commands. An ``ax`` entry, if present and truthy,
        is used as the axis to draw on; otherwise a new figure is created.

    Returns
    -------
    ax: matplotlib axis
        the matplotlib axis that was drawn on.

    Raises
    ------
    ValueError
        if the number of covariates differs from the number of columns in ``values``.
    KeyError
        if a covariate is not among the fitted parameters.

    Examples
    ---------

    >>> from lifelines import datasets, WeibullAFTFitter
    >>> rossi = datasets.load_rossi()
    >>> wf = WeibullAFTFitter().fit(rossi, 'week', 'arrest')
    >>> wf.plot_covariate_groups('prio', values=np.arange(0, 15), cmap='coolwarm')

    >>> # multiple variables at once
    >>> wf.plot_covariate_groups(['prio', 'paro'], values=[[0, 0], [5, 0], [10, 0], [0, 1], [5, 1], [10, 1]], cmap='coolwarm')

    >>> # if you have categorical variables, you can simplify things:
    >>> wf.plot_covariate_groups(['dummy1', 'dummy2', 'dummy3'], values=np.eye(3))
    """
    from matplotlib import pyplot as plt

    covariates = _to_list(covariates)
    values = _to_array(values)
    if len(values.shape) == 1:
        # A flat list of values becomes a single column.
        values = values[None, :].T

    if len(covariates) != values.shape[1]:
        raise ValueError("The number of covariates must equal to second dimension of the values array.")

    known_covariates = self.params_.index.get_level_values(1)
    for name in covariates:
        if name not in known_covariates:
            raise KeyError("covariate `%s` is not present in the original dataset" % name)

    ax = kwargs.pop("ax", None)
    if not ax:
        ax = plt.figure().add_subplot(111)

    n_curves = values.shape[0]

    # One row per requested curve, each starting from the covariate means.
    x_bar = self._norm_mean.to_frame().T
    X = pd.concat([x_bar] * n_curves)

    if np.array_equal(np.eye(len(covariates)), values):
        # Identity matrix: one dummy switched on per curve.
        curve_labels = ["%s=1" % name for name in covariates]
    else:
        curve_labels = [
            ", ".join("%s=%g" % (name, setting) for (name, setting) in zip(covariates, row))
            for row in values
        ]
    X.index = curve_labels

    # Same construction for the ancillary design matrix.
    x_bar_anc = self._norm_mean_ancillary.to_frame().T
    ancillary_X = pd.concat([x_bar_anc] * n_curves)

    # Overwrite the varied covariates in both design matrices.
    for name, column_values in zip(covariates, values.T):
        X[name] = column_values
        ancillary_X[name] = column_values

    if self.fit_intercept:
        X["_intercept"] = 1.0
        ancillary_X["_intercept"] = 1.0

    varied_curves = self.predict_survival_function(X, ancillary_X=ancillary_X)
    varied_curves.plot(ax=ax, **kwargs)

    if plot_baseline:
        baseline_curve = self.predict_survival_function(x_bar, ancillary_X=x_bar_anc)
        baseline_curve = baseline_curve.rename(columns={0: "baseline survival"})
        baseline_curve.plot(ax=ax, ls=":", color="k")

    return ax