def print_summary(self, decimals=2, **kwargs):
    """
    Print a plain-text report of the fitted model: a header describing the
    data, the coefficient table, and the log-likelihood ratio test.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places used when rendering floats.
    kwargs:
        extra ``name = value`` pairs echoed in the header (useful to tag the
        output with a model or dataset name when comparing several outputs).
    """
    pad = string_justify(18)

    def show(label, value, template="{} = {}"):
        # One header line: justified label, then the value.
        print(template.format(pad(label), value))

    # --- header: information about the data and the fit ---
    print(self)
    show("event col", self.event_col, "{} = '{}'")
    if self.weights_col:
        show("weights col", self.weights_col, "{} = '{}'")
    if self.strata:
        show("strata", self.strata)
    if self.penalizer > 0:
        show("penalizer", self.penalizer)
    show("number of subjects", self._n_unique)
    show("number of periods", self._n_examples)
    show("number of events", self.event_observed.sum())
    print("{} = {:.{prec}f}".format(pad("log-likelihood"), self._log_likelihood, prec=decimals))
    show("time fit was run", self._time_fit_was_called, "{} = {} UTC")

    # Caller-supplied metadata; each entry is followed by an empty line.
    for key, value in kwargs.items():
        show(key, value, "{} = {}\n")

    print()
    print("---")

    # --- coefficient table ---
    table = self.summary
    rendered = table.to_string(
        float_format=format_floats(decimals),
        formatters=dict(
            [("p", format_p_value(decimals)), ("exp(coef)", format_exp_floats(decimals))]
        ),
    )
    print(rendered)

    # --- footer ---
    print("---")
    # Silence numpy warnings raised while evaluating the test / -log2(p).
    with np.errstate(invalid="ignore", divide="ignore"):
        ratio_test = self.log_likelihood_ratio_test()
        footer = "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}".format(
            ratio_test.test_statistic,
            ratio_test.degrees_freedom,
            -np.log2(ratio_test.p_value),
            prec=decimals,
        )
        print(footer)
def print_summary(self, decimals=2, **kwargs):
    """
    Print a plain-text report of the fitted model: a header describing the
    data, the coefficient table, and the concordance score.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places used when rendering floats.
    kwargs:
        extra ``name = value`` pairs echoed in the header (useful to tag the
        output with a model or dataset name when comparing several outputs).
    """
    pad = string_justify(18)
    quoted = "{} = '{}'"
    plain = "{} = {}"

    def show(template, label, value):
        # One header line: justified label, then the value.
        print(template.format(pad(label), value))

    # --- header: information about the data and the fit ---
    print(self)
    show(quoted, "duration col", self.duration_col)
    show(quoted, "event col", self.event_col)
    if self.weights_col:
        show(quoted, "weights col", self.weights_col)
    if self.coef_penalizer > 0:
        show(quoted, "coef penalizer", self.coef_penalizer)
    if self.smoothing_penalizer > 0:
        show(quoted, "smoothing penalizer", self.smoothing_penalizer)
    show(plain, "number of subjects", self._n_examples)
    show(plain, "number of events", self.event_observed.sum())
    show(plain, "time fit was run", self._time_fit_was_called)

    # Caller-supplied metadata; each entry is followed by an empty line.
    for key, value in kwargs.items():
        show("{} = {}\n", key, value)

    print()
    print("---")

    # --- coefficient table ---
    table = self.summary
    rendered = table.to_string(
        float_format=format_floats(decimals),
        formatters=dict(
            [("p", format_p_value(decimals)), ("exp(coef)", format_exp_floats(decimals))]
        ),
    )
    print(rendered)

    # --- footer ---
    print("---")
    print("Concordance = {:.{prec}f}".format(self.score_, prec=decimals))
def check_assumptions(
    self,
    training_df: DataFrame,
    advice: bool = True,
    show_plots: bool = False,
    p_value_threshold: float = 0.01,
    plot_n_bootstraps: int = 10,
    columns: Optional[List[str]] = None,
) -> None:
    """
    Use this function to test the proportional hazards assumption. See usage example at
    https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html

    Scaled Schoenfeld residuals are computed for ``training_df`` and passed to
    ``proportional_hazard_test`` with the "rank" and "km" time transforms. A variable is
    reported when its smallest p-value over those transforms, rounded to two decimals, is
    not greater than ``p_value_threshold``. Results are printed; nothing is returned.

    Parameters
    -----------

    training_df: DataFrame
        the original DataFrame used in the call to ``fit(...)`` or a sub-sampled version.
        Its index must be unique.
    advice: bool, optional
        display advice as output to the user's screen
    show_plots: bool, optional
        display plots of the scaled schoenfeld residuals and loess curves. This is an eyeball test for violations.
        This will slow down the function significantly.
    p_value_threshold: float, optional
        the threshold to use to alert the user of violations. See note below.
    plot_n_bootstraps: int, optional
        in the plots displayed, also display plot_n_bootstraps bootstrapped loess curves. This will slow down
        the function significantly.
    columns: list, optional
        specify a subset of columns to test. When omitted (or empty) every fitted parameter is tested.

    Raises
    ------
    IndexError
        if the index of ``training_df`` is not unique.

    Examples
    ----------

    >>> from lifelines.datasets import load_rossi
    >>> from lifelines import CoxPHFitter
    >>>
    >>> rossi = load_rossi()
    >>> cph = CoxPHFitter().fit(rossi, 'week', 'arrest')
    >>>
    >>> cph.check_assumptions(rossi)

    Notes
    -------
    The ``p_value_threshold`` is arbitrarily set at 0.01. Under the null, some covariates
    will be below the threshold (i.e. by chance). This is compounded when there are many covariates.

    Similarly, when there are lots of observations, even minor deviances from the proportional hazard
    assumption will be flagged.

    With that in mind, it's best to use a combination of statistical tests and eyeball tests to
    determine the most serious violations.

    References
    -----------
    section 5 in https://socialsciences.mcmaster.ca/jfox/Books/Companion/appendices/Appendix-Cox-Regression.pdf,
    http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf,
    http://eprints.lse.ac.uk/84988/1/06_ParkHendry2015-ReassessingSchoenfeldTests_Final.pdf
    """
    # The residuals are joined back onto training_df by index below, so
    # duplicate index labels are rejected up front.
    if not training_df.index.is_unique:
        raise IndexError(
            "`training_df` index should be unique for this exercise. Please make it unique or use `.reset_index(drop=True)` to force a unique index"
        )

    # Compute the residuals once and hand them to the test so they are not recomputed.
    residuals = self.compute_residuals(training_df, kind="scaled_schoenfeld")
    test_results = proportional_hazard_test(
        self, training_df, time_transform=["rank", "km"], precomputed_residuals=residuals)

    residuals_and_duration = residuals.join(training_df[self.duration_col])

    # counter: number of variables flagged so far (also used to number the messages).
    # n: number of rows of residuals, used as the bootstrap sample size.
    counter = 0
    n = residuals_and_duration.shape[0]

    # Only test variables that are both fitted parameters and requested by the caller.
    for variable in self.params_.index.intersection(columns or self.params_.index):
        # Smallest p-value for this variable across the tested time transforms.
        minumum_observed_p_value = test_results.summary.loc[variable, "p"].min()
        # The p-value is rounded to two decimals before being compared to the threshold.
        if np.round(minumum_observed_p_value, 2) > p_value_threshold:
            continue

        counter += 1

        # On the first violation only: print the general caveats (when advice
        # is enabled) and the full table of test results.
        if counter == 1:
            if advice:
                print(
                    fill(
                        """The ``p_value_threshold`` is set at %g. Even under the null hypothesis of no violations, some covariates will be below the threshold by chance. This is compounded when there are many covariates. Similarly, when there are lots of observations, even minor deviances from the proportional hazard assumption will be flagged."""
                        % p_value_threshold,
                        width=100,
                    ))
                print()
                print(
                    fill(
                        """With that in mind, it's best to use a combination of statistical tests and visual tests to determine the most serious violations. Produce visual plots using ``check_assumptions(..., show_plots=True)`` and looking for non-constant lines. See link [A] below for a full example.""",
                        width=100,
                    ))
                print()
            test_results.print_summary()
            print()

        print()
        print(
            "%d. Variable '%s' failed the non-proportional test: p-value is %s."
            % (counter, variable, format_p_value(4)(minumum_observed_p_value)),
            end="\n\n",
        )

        if advice:
            values = training_df[variable]
            value_counts = values.value_counts()
            n_uniques = value_counts.shape[0]

            # Arbitrarily chosen cut-offs (at most 10 distinct values, each seen at
            # least 5 times) to check whether the variable could be used as a strata column.
            # This should capture dichotomous / low cardinality values.
            if n_uniques <= 10 and value_counts.min() >= 5:
                print(
                    fill(
                        " Advice: with so few unique values (only {0}), you can include `strata=['{1}', ...]` in the call in `.fit`. See documentation in link [E] below."
                        .format(n_uniques, variable),
                        width=100,
                    ))
            else:
                print(
                    fill(
                        """ Advice 1: the functional form of the variable '{var}' might be incorrect. That is, there may be non-linear terms missing. The proportional hazard test used is very sensitive to incorrect functional forms. See documentation in link [D] below on how to specify a functional form."""
                        .format(var=variable),
                        width=100,
                    ),
                    end="\n\n",
                )
                print(
                    fill(
                        """ Advice 2: try binning the variable '{var}' using pd.cut, and then specify it in `strata=['{var}', ...]` in the call in `.fit`. See documentation in link [B] below."""
                        .format(var=variable),
                        width=100,
                    ),
                    end="\n\n",
                )
                print(
                    fill(
                        """ Advice 3: try adding an interaction term with your time variable. See documentation in link [C] below.""",
                        width=100,
                    ),
                    end="\n\n",
                )

        if show_plots:

            # Imported here so that this function does not reference matplotlib unless plots are requested.
            from matplotlib import pyplot as plt

            # One figure per flagged variable, one subplot per time transform.
            fig = plt.figure()

            # plot variable against all time transformations.
            for i, (transform_name, transformer) in enumerate(
                    TimeTransformers().iter(["rank", "km"]), start=1):
                p_value = test_results.summary.loc[(variable, transform_name), "p"]

                ax = fig.add_subplot(1, 2, i)

                y = residuals_and_duration[variable]
                # Transformed times, restricted to rows where the event was observed.
                tt = transformer(self.durations, self.event_observed,
                                 self.weights)[self.event_observed.values]

                ax.scatter(tt, y, alpha=0.75)

                # Smoothed trend of the residuals over transformed time.
                y_lowess = lowess(tt.values, y.values)
                ax.plot(tt, y_lowess, color="k", alpha=1.0, linewidth=2)

                # bootstrap some possible other lowess lines. This is an approximation of the 100% confidence intervals
                for _ in range(plot_n_bootstraps):
                    # Resample positions with replacement; sorted so the line is drawn left to right.
                    ix = sorted(np.random.choice(n, n))
                    tt_ = tt.values[ix]
                    y_lowess = lowess(tt_, y.values[ix])
                    ax.plot(tt_, y_lowess, color="k", alpha=0.30)

                # Draw the zero reference line without letting it change the x-limits.
                best_xlim = ax.get_xlim()
                ax.hlines(0, 0, tt.max(), linestyles="dashed", linewidths=1)
                ax.set_xlim(best_xlim)

                ax.set_xlabel("%s-transformed time\n(p=%.4f)" % (transform_name, p_value), fontsize=10)

            fig.suptitle("Scaled Schoenfeld residuals of '%s'" % variable, fontsize=14)
            plt.tight_layout()
            plt.subplots_adjust(top=0.90)

    # Legend for the [A]-[E] links referenced by the advice messages above.
    # NOTE(review): as written here the legend is a single-line literal, so dedent()
    # has nothing to strip and all links print on one line - confirm whether one
    # link per line was intended.
    if advice and counter > 0:
        print(
            dedent(r""" --- [A] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html [B] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it [C] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates [D] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form [E] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification """))

    if counter == 0:
        print("Proportional hazard assumption looks okay.")
def to_ascii(self, decimals=2, **kwargs):
    """
    Build the plain-text rendering of this result.

    The text consists of the object's repr, the stringified metadata, a
    ``---`` separator and the summary table with an added ``-log2(p)`` column.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places used when rendering floats.
    kwargs:
        extra metadata; merged over the metadata stored on the instance
        (a keyword given here overrides a stored one of the same name).

    Returns
    --------
    str
    """
    # Stored metadata first, call-time metadata layered on top.
    merged = dict(self._kwargs)
    merged.update(kwargs)
    header = self._stringify_meta_data(merged)

    table = self.summary
    # Silence numpy warnings (e.g. for p == 0) while deriving -log2(p).
    with np.errstate(invalid="ignore", divide="ignore"):
        table["-log2(p)"] = -np.log2(table["p"])

    pieces = [
        self.__repr__(),
        "\n",
        header,
        "\n",
        "---\n",
        table.to_string(
            float_format=format_floats(decimals),
            # The index column is shown only when a name is set.
            index=self.name is not None,
            formatters={"p": format_p_value(decimals)},
        ),
    ]
    return "".join(pieces)
def to_html(self, decimals=2, **kwargs):
    """
    Build the HTML rendering of this result.

    The output is a small metadata table (one row per metadata entry)
    followed by the summary table.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places used when rendering floats.
    kwargs:
        extra metadata; merged over the metadata stored on the instance
        (a keyword given here overrides a stored one of the same name).

    Returns
    --------
    str
        ``header_html + summary_html``; the header part is empty when there
        is no metadata at all.
    """
    extra_kwargs = {**self._kwargs, **kwargs}
    summary_df = self.summary

    headers = list(extra_kwargs.items())
    if headers:
        header_df = pd.DataFrame.from_records(headers).set_index(0)
        header_html = header_df.to_html(header=False, notebook=True, index_names=False)
    else:
        # With no metadata, from_records([]) yields a frame without a column 0
        # and set_index(0) would raise KeyError; emit no header table instead.
        header_html = ""

    summary_html = summary_df.to_html(
        float_format=format_floats(decimals),
        formatters={"p": format_p_value(decimals)},
    )

    return header_html + summary_html
def ascii_print(self):
    """
    Print the plain-text summary of ``self.model``.

    Output, in order: the model's repr, one ``label = value`` line per entry
    of ``self.headers``, the coefficient table (split in two rows of columns
    when the summary has more than 7 columns), and - when the model provides
    them - the concordance and the log-likelihood ratio test.

    The confidence-interval column labels are derived from the model's
    ``alpha`` (falling back to 0.05 when the model has no such attribute),
    so the interval columns are still shown for confidence levels other
    than 95%.
    """
    decimals = self.decimals
    df = self.model.summary
    justify = self.justify
    # %g rather than %d so that float noise or a fractional level is not truncated.
    ci = 100 * (1 - getattr(self.model, "alpha", 0.05))

    print(self.model)
    for string, value in self.headers:
        print("{} = {}".format(justify(string), value))

    print(end="\n")
    print("---")

    df.columns = utils.map_leading_space(df.columns)
    columns = df.columns

    estimate_set = [
        "coef",
        "exp(coef)",
        "se(coef)",
        "coef lower %g%%" % ci,
        "coef upper %g%%" % ci,
        "exp(coef) lower %g%%" % ci,
        "exp(coef) upper %g%%" % ci,
    ]
    significance_set = ["z", "p", "-log2(p)"]

    if len(columns) <= 7:
        # only need one row of display
        first_row_set = estimate_set + significance_set
        second_row_set = []
    else:
        first_row_set = estimate_set
        second_row_set = significance_set

    # The same formatting rules apply to both rows of the table.
    float_format = utils.format_floats(decimals)
    formatters = {
        **{c: utils.format_exp_floats(decimals) for c in columns if "exp(" in c},
        utils.leading_space("p"): utils.format_p_value(decimals),
    }

    print(
        df.to_string(
            float_format=float_format,
            formatters=formatters,
            # Only columns actually present in the summary are displayed.
            columns=[c for c in utils.map_leading_space(first_row_set) if c in columns],
        )
    )

    if second_row_set:
        print()
        print(
            df.to_string(
                float_format=float_format,
                formatters=formatters,
                columns=utils.map_leading_space(second_row_set),
            )
        )

    print("---")
    # Silence numpy warnings raised while evaluating the footer statistics.
    with np.errstate(invalid="ignore", divide="ignore"):
        # Both footer lines are best-effort: a model that lacks the needed
        # attribute or method simply does not get that line.
        try:
            if utils.CensoringType.is_right_censoring(self.model) and self.model._KNOWN_MODEL:
                print("Concordance = {:.{prec}f}".format(self.model.score_, prec=decimals))
        except AttributeError:
            pass

        try:
            sr = self.model.log_likelihood_ratio_test()
            print(
                "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}".format(
                    sr.test_statistic, sr.degrees_freedom, -np.log2(sr.p_value), prec=decimals
                )
            )
        except AttributeError:
            pass
    print()
def to_ascii(self):
    """
    Build the plain-text summary of ``self.model``.

    The text consists of the model's repr, one ``label = value`` line per
    entry of ``self.headers``, the coefficient table, and one
    ``label = value`` line per entry of ``self.footers``.

    When ``self.columns`` is given, only those summary columns are shown.
    The table is split in two rows of columns when more than 7 columns are
    displayed; columns missing from the selection are skipped in both rows
    (the second row is omitted entirely if none of its columns is selected).

    Returns
    --------
    str
    """
    df = self.model.summary
    justify = self.justify
    ci = 100 * (1 - self.model.alpha)
    decimals = self.decimals

    parts = [repr(self.model) + "\n"]
    parts.extend("{} = {}".format(justify(string), value) + "\n" for string, value in self.headers)
    parts.append("\n" + "---" + "\n")

    df.columns = utils.map_leading_space(df.columns)

    if self.columns is not None:
        columns = df.columns.intersection(utils.map_leading_space(self.columns))
    else:
        columns = df.columns

    # %g rather than %d: %d would truncate a float such as 99.9 (or a value
    # carrying floating-point noise) and the label would then never match.
    # NOTE(review): assumes the summary's interval labels are %g-formatted - confirm.
    estimate_set = [
        "coef",
        "exp(coef)",
        "se(coef)",
        "coef lower %g%%" % ci,
        "coef upper %g%%" % ci,
        "exp(coef) lower %g%%" % ci,
        "exp(coef) upper %g%%" % ci,
    ]
    significance_set = ["z", "p", "-log2(p)"]

    if len(columns) <= 7:
        # only need one row of display
        first_row_set = estimate_set + significance_set
        second_row_set = []
    else:
        first_row_set = estimate_set
        second_row_set = significance_set

    selected = df[columns]
    float_format = utils.format_floats(decimals)
    formatters = {
        **{c: utils.format_exp_floats(decimals) for c in columns if "exp(coef)" in c},
        utils.leading_space("p"): utils.format_p_value(decimals),
    }

    def render(names):
        # Render the given columns with the formatting shared by both rows.
        return selected.to_string(float_format=float_format, formatters=formatters, columns=names)

    parts.append(render([c for c in utils.map_leading_space(first_row_set) if c in columns]))

    if second_row_set:
        # Filter like the first row: a user-selected subset may exclude some of
        # these columns, and asking to_string for a missing column raises KeyError.
        second_row = [c for c in utils.map_leading_space(second_row_set) if c in columns]
        if second_row:
            parts.append("\n\n")
            parts.append(render(second_row))

    with np.errstate(invalid="ignore", divide="ignore"):
        parts.append("\n" + "---" + "\n")
        for string, value in self.footers:
            parts.append("{} = {}".format(string, value) + "\n")

    return "".join(parts)
def ascii_print(self):
    """
    Print the plain-text summary of ``self.model``.

    Output, in order: the model's repr, one ``label = value`` line per entry
    of ``self.headers``, the coefficient table (split in two rows of columns
    when the summary has more than 7 columns), and one ``label = value``
    line per entry of ``self.footers``.

    The confidence-interval column labels are derived from the model's
    ``alpha`` (falling back to 0.05 when the model has no such attribute),
    so the interval columns are still shown for confidence levels other
    than 95%.
    """
    decimals = self.decimals
    df = self.model.summary
    justify = self.justify
    # %g rather than %d so that float noise or a fractional level is not truncated.
    ci = 100 * (1 - getattr(self.model, "alpha", 0.05))

    print(self.model)
    for string, value in self.headers:
        print("{} = {}".format(justify(string), value))

    print(end="\n")
    print("---")

    df.columns = utils.map_leading_space(df.columns)
    columns = df.columns

    estimate_set = [
        "coef",
        "exp(coef)",
        "se(coef)",
        "coef lower %g%%" % ci,
        "coef upper %g%%" % ci,
        "exp(coef) lower %g%%" % ci,
        "exp(coef) upper %g%%" % ci,
    ]
    significance_set = ["z", "p", "-log2(p)"]

    if len(columns) <= 7:
        # only need one row of display
        first_row_set = estimate_set + significance_set
        second_row_set = []
    else:
        first_row_set = estimate_set
        second_row_set = significance_set

    # The same formatting rules apply to both rows of the table (the second
    # row holds no "exp(...)" column, so one formatter mapping serves both).
    float_format = utils.format_floats(decimals)
    formatters = {
        **{c: utils.format_exp_floats(decimals) for c in columns if "exp(coef)" in c},
        utils.leading_space("p"): utils.format_p_value(decimals),
    }

    print(
        df.to_string(
            float_format=float_format,
            formatters=formatters,
            # Only columns actually present in the summary are displayed.
            columns=[c for c in utils.map_leading_space(first_row_set) if c in columns],
        )
    )

    if second_row_set:
        print()
        print(
            df.to_string(
                float_format=float_format,
                formatters=formatters,
                columns=utils.map_leading_space(second_row_set),
            )
        )

    with np.errstate(invalid="ignore", divide="ignore"):
        print("---")
        for string, value in self.footers:
            print("{} = {}".format(string, value))
def print_summary(self, decimals=2, **kwargs):
    """
    Print a plain-text report of the fitted model: a header describing the
    data, the coefficient table, the concordance and the log-likelihood
    ratio test.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places used when rendering floats.
    kwargs:
        extra ``name = value`` pairs echoed in the header (useful to tag the
        output with a model or dataset name when comparing several outputs).
    """
    pad = string_justify(18)

    def show(label, value, template="{} = {}"):
        # One header line: justified label, then the value.
        print(template.format(pad(label), value))

    # --- header: information about the data and the fit ---
    print(self)
    show("duration col", self.duration_col, "{} = '{}'")
    if self.event_col:
        show("event col", self.event_col, "{} = '{}'")
    if self.weights_col:
        show("weights col", self.weights_col, "{} = '{}'")
    if self.penalizer > 0:
        show("penalizer", self.penalizer)
    if self.robust:
        show("robust variance", True)
    show("number of subjects", self._n_examples)
    show("number of events", self.event_observed.sum())
    print("{} = {:.{prec}f}".format(pad("log-likelihood"), self._log_likelihood, prec=decimals))
    show("time fit was run", self._time_fit_was_called)

    # Caller-supplied metadata; each entry is followed by an empty line.
    for key, value in kwargs.items():
        show(key, value, "{} = {}\n")

    print()
    print("---")

    # --- coefficient table ---
    table = self.summary
    rendered = table.to_string(
        float_format=format_floats(decimals),
        formatters=dict(
            [("p", format_p_value(decimals)), ("exp(coef)", format_exp_floats(decimals))]
        ),
    )
    print(rendered)

    # --- footer ---
    print("---")
    print("Concordance = {:.{prec}f}".format(self.score_, prec=decimals))
    # The helper's result is unpacked into the three positional slots of the template.
    ratio_line = "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}"
    print(ratio_line.format(*self._compute_likelihood_ratio_test(), prec=decimals))
def print_summary(self, decimals=2, **kwargs):
    """
    Print a plain-text report of the fitted model: a header describing the
    data, the coefficient table (shown as two rows of columns) and the
    log-likelihood ratio test.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places used when rendering floats.
    kwargs:
        extra ``name = value`` pairs echoed in the header (useful to tag the
        output with a model or dataset name when comparing several outputs).
    """
    pad = string_justify(18)

    def show(label, value, template="{} = {}"):
        # One header line: justified label, then the value.
        print(template.format(pad(label), value))

    # --- header: information about the data and the fit ---
    print(self)
    show("event col", self.event_col, "{} = '{}'")
    if self.weights_col:
        show("weights col", self.weights_col, "{} = '{}'")
    if self.strata:
        show("strata", self.strata)
    if self.penalizer > 0:
        show("penalizer", self.penalizer)
    show("number of subjects", self._n_unique)
    show("number of periods", self._n_examples)
    show("number of events", self.event_observed.sum())
    print("{} = {:.{prec}f}".format(pad("log-likelihood"), self.log_likelihood_, prec=decimals))
    show("time fit was run", self._time_fit_was_called, "{} = {} UTC")

    # Caller-supplied metadata; each entry is followed by an empty line.
    for key, value in kwargs.items():
        show(key, value, "{} = {}\n")

    print()
    print("---")

    # --- coefficient table ---
    table = self.summary
    table.columns = map_leading_space(table.columns)

    exp_columns = ("exp(coef)", "exp(coef) lower 95%", "exp(coef) upper 95%")
    estimate_columns = [
        "coef",
        "exp(coef)",
        "se(coef)",
        "coef lower 95%",
        "coef upper 95%",
        "exp(coef) lower 95%",
        "exp(coef) upper 95%",
    ]

    # First row of columns: estimates and their confidence bounds.
    estimates = table.to_string(
        float_format=format_floats(decimals),
        formatters={leading_space(name): format_exp_floats(decimals) for name in exp_columns},
        columns=map_leading_space(estimate_columns),
    )
    print(estimates)
    print()

    # Second row of columns: test statistic and p-value information.
    significance = table.to_string(
        float_format=format_floats(decimals),
        formatters={leading_space("p"): format_p_value(decimals)},
        columns=map_leading_space(["z", "p", "-log2(p)"]),
    )
    print(significance)

    # --- footer ---
    print("---")
    # Silence numpy warnings raised while evaluating the test / -log2(p).
    with np.errstate(invalid="ignore", divide="ignore"):
        ratio_test = self.log_likelihood_ratio_test()
        footer = "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}".format(
            ratio_test.test_statistic,
            ratio_test.degrees_freedom,
            -np.log2(ratio_test.p_value),
            prec=decimals,
        )
        print(footer)
def to_html(self):
    """
    Build the HTML summary of ``self.model``.

    The output is the concatenation of three HTML tables: the headers (with
    a leading ``model`` row), the coefficient summary, and - when any is
    available - the footer statistics.

    Footer statistics are best-effort: a model lacking the needed attribute
    or method simply does not get that row. "Concordance" is reported at
    most once: ``score_`` is used for right-censored known models, otherwise
    ``concordance_index_`` is used when the model offers a log-likelihood
    ratio test. A missing ``concordance_index_`` no longer suppresses the
    likelihood-ratio rows.

    Returns
    --------
    str
    """
    decimals = self.decimals
    summary_df = self.model.summary
    columns = summary_df.columns

    # Work on a copy so that self.headers is not modified.
    headers = self.headers.copy()
    headers.insert(0, ("model", "lifelines." + self.model._class_name))

    header_df = pd.DataFrame.from_records(headers).set_index(0)
    header_html = header_df.to_html(header=False, notebook=True, index_names=False)

    summary_html = summary_df.to_html(
        float_format=utils.format_floats(decimals),
        formatters={
            **{c: utils.format_exp_floats(decimals) for c in columns if "exp(" in c},
            "p": utils.format_p_value(decimals),
        },
    )

    footers = []
    # Silence numpy warnings raised while evaluating the footer statistics.
    with np.errstate(invalid="ignore", divide="ignore"):
        concordance = None
        try:
            if utils.CensoringType.is_right_censoring(self.model) and self.model._KNOWN_MODEL:
                concordance = self.model.score_
        except AttributeError:
            pass

        ratio_rows = []
        try:
            sr = self.model.log_likelihood_ratio_test()
            ratio_rows = [
                (
                    "Log-likelihood ratio test",
                    "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals),
                ),
                ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)),
            ]
        except AttributeError:
            pass
        else:
            # Fall back to concordance_index_ only if no concordance was found
            # above; getattr keeps a missing attribute from discarding ratio_rows.
            if concordance is None:
                concordance = getattr(self.model, "concordance_index_", None)

        if concordance is not None:
            footers.append(("Concordance", "{:.{prec}f}".format(concordance, prec=decimals)))
        footers.extend(ratio_rows)

    if footers:
        footer_df = pd.DataFrame.from_records(footers).set_index(0)
        footer_html = footer_df.to_html(header=False, notebook=True, index_names=False)
    else:
        footer_html = ""

    return header_html + summary_html + footer_html