def z_test(Z, alpha):
    """
    Two-sided test of the statistic ``Z`` against the quantiles
    ``inv_normal_cdf((1 - alpha) / 2)`` and ``inv_normal_cdf((1 + alpha) / 2)``.

    Returns ``(True, p_value)`` when ``Z`` falls outside those two quantiles
    (the null is rejected) and, pedantically, ``(None, p_value)`` when the
    test is inconclusive.
    """
    p_value = p_value_normal(Z)
    below_lower_quantile = Z < inv_normal_cdf((1.0 - alpha) / 2.0)
    if below_lower_quantile or Z > inv_normal_cdf((1 + alpha) / 2):
        # Reject the null hypothesis.
        return True, p_value
    # Cannot reject the null hypothesis.
    return None, p_value
def _bounds(self, cumulative_sq_):
    """
    Build upper/lower bound columns for the survival function.

    Each bound is ``S ** exp(+/- z * cumulative_sq_ / log(S))`` where ``S`` is
    ``self.survival_function_.values`` and ``z`` comes from
    ``inv_normal_cdf(1 - (1 - self.alpha) / 2)``. The result is indexed by
    ``self.timeline``.
    """
    z = inv_normal_cdf(1 - (1 - self.alpha) / 2)
    bounds = pd.DataFrame(index=self.timeline)
    label = self.survival_function_.columns[0]
    survival = self.survival_function_.values
    exponent = z * cumulative_sq_ / np.log(survival)
    bounds["%s_upper_%.2f" % (label, self.alpha)] = survival ** np.exp(exponent)
    bounds["%s_lower_%.2f" % (label, self.alpha)] = survival ** np.exp(-exponent)
    return bounds
def _compute_confidence_intervals(self):
    """
    Return symmetric intervals ``hazards_ -/+ z * standard_errors_``.

    The frame has columns ``lower-bound`` and ``upper-bound`` and shares the
    index of ``self.hazards_``.
    """
    z = inv_normal_cdf(1 - self.alpha / 2)
    half_width = z * self.standard_errors_
    coefficients = self.hazards_.values
    lower = coefficients - half_width
    upper = coefficients + half_width
    return pd.DataFrame(
        np.c_[lower, upper],
        columns=["lower-bound", "upper-bound"],
        index=self.hazards_.index,
    )
def _bounds(self, cumulative_sq_):
    """
    Build upper/lower bound columns for the cumulative hazard.

    Each bound is ``H * exp(+/- z * sqrt(cumulative_sq_) / H)`` where ``H`` is
    ``self.cumulative_hazard_.values``. The result is indexed by
    ``self.timeline``.
    """
    z = inv_normal_cdf(1 - (1 - self.alpha) / 2)
    bounds = pd.DataFrame(index=self.timeline)
    label = self.cumulative_hazard_.columns[0]
    cum_hazard = self.cumulative_hazard_.values
    log_half_width = z * np.sqrt(cumulative_sq_) / cum_hazard
    bounds["%s_upper_%.2f" % (label, self.alpha)] = cum_hazard * np.exp(log_half_width)
    bounds["%s_lower_%.2f" % (label, self.alpha)] = cum_hazard * np.exp(-log_half_width)
    return bounds
def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
    """
    Confidence intervals for the kernel-smoothed hazard.

    Parameters
    ----------
    bandwidth:
        the bandwidth to use in the Epanechnikov kernel.
    hazard_:
        a computed (n,) numpy array of estimated hazard rates. If None, the
        first column of ``self.smoothed_hazard_(bandwidth)`` is used.

    Returns
    -------
    DataFrame indexed by ``self.timeline`` with one ``..._upper_...`` and one
    ``..._lower_...`` column, computed as ``hazard_ * exp(+/- z * std / hazard_)``.
    """
    # Identity check, not `== None`: comparing an ndarray with `==` is
    # elementwise and the resulting array has no single truth value.
    if hazard_ is None:
        hazard_ = self.smoothed_hazard_(bandwidth).values[:, 0]

    timeline = self.timeline
    alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
    name = "smoothed-" + self.cumulative_hazard_.columns[0]

    # NOTE: this overwrites the first entry of the stored `_cumulative_sq` in place.
    self._cumulative_sq.iloc[0] = 0
    var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0])
    C = var_hazard_.values != 0.0  # only consider the points with jumps
    kernel_sq = epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth) ** 2
    std_hazard_ = np.sqrt(1.0 / (2 * bandwidth ** 2) * np.dot(kernel_sq, var_hazard_.values[C]))

    values = {
        "%s_upper_%.2f" % (name, self.alpha): hazard_ * np.exp(alpha2 * std_hazard_ / hazard_),
        "%s_lower_%.2f" % (name, self.alpha): hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_),
    }
    return pd.DataFrame(values, index=timeline)
def _compute_confidence_bounds_of_parameters(self):
    """
    Return symmetric bounds for ``lambda_`` and ``rho_``.

    The frame has rows ``upper-bound`` / ``lower-bound`` and columns
    ``lambda_`` / ``rho_``; each bound is the estimate +/- ``z * se`` where
    ``se`` is the ``'se'`` row of ``self._compute_standard_errors()``.
    """
    # `.loc` replaces the removed `.ix` indexer; 'se' is a row label.
    se = self._compute_standard_errors().loc['se']
    alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
    estimates = np.array([self.lambda_, self.rho_])
    return pd.DataFrame(
        [estimates + alpha2 * se, estimates - alpha2 * se],
        columns=['lambda_', 'rho_'],
        index=['upper-bound', 'lower-bound'],
    )
def _compute_confidence_intervals(self):
    """
    Return ``hazards_ -/+ z * se`` stacked row-wise.

    Rows are labelled ``lower-bound`` and ``upper-bound``; columns are those
    of ``self.hazards_``.
    """
    z = inv_normal_cdf((1.0 + self.alpha) / 2.0)
    half_width = z * self._compute_standard_errors()
    coefficients = self.hazards_.values
    stacked = np.r_[coefficients - half_width, coefficients + half_width]
    return pd.DataFrame(
        stacked,
        index=["lower-bound", "upper-bound"],
        columns=self.hazards_.columns,
    )
def _bounds(self, lagged_survival, alpha, ci_labels):
    """Bounds are based on pg 411 of "Modelling Survival Data in Medical Research" David Collett 3rd Edition, which
    is derived from Greenwood's variance estimator. Confidence intervals are obtained using the delta method
    transformation of SE(log(-log(F_j))). This ensures that the confidence intervals all lie between 0 and 1.

    Formula for the variance follows:

    .. math::

        Var(F_j) = sum((F_j(t) - F_j(t_i))**2 * d/(n*(n-d) + S(t_i-1)**2 * ((d*(n-d))/n**3) +
                    -2 * sum((F_j(t) - F_j(t_i)) * S(t_i-1) * (d/n**2)

    Delta method transformation:

    .. math::

        SE(log(-log(F_j) = SE(F_j) / (F_j * |log(F_j)|)

    More information can be found at: https://support.sas.com/documentation/onlinedoc/stat/141/lifetest.pdf
    There is also an alternative method (Aalen) but this is not currently implemented

    Parameters
    ----------
    lagged_survival:
        values stored as the ``lagS`` column; missing entries are filled with 1.
    alpha:
        the quantile used is ``inv_normal_cdf(1 - alpha / 2)``; default labels carry ``1 - alpha``.
    ci_labels:
        length-2 sequence ``[upper_label, lower_label]``, or None for the default labels.

    Returns
    -------
    (variance Series, DataFrame holding the two bound columns in ``ci_labels`` order)
    """
    # Preparing environment
    ci = 1 - alpha
    df = self.event_table.copy()
    df["Ft"] = self.cumulative_density_
    df["lagS"] = lagged_survival.fillna(1)
    if ci_labels is None:
        ci_labels = ["%s_upper_%g" % (self._label, ci), "%s_lower_%g" % (self._label, ci)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    # Have to loop through each time independently: the variance at time t
    # depends on F(t) itself as well as on every earlier row.
    all_vars = []
    for _, r in df.iterrows():
        sf = df.loc[df.index <= r.name].copy()
        F_t = float(r["Ft"])
        first_term = np.sum(
            (F_t - sf["Ft"]) ** 2 * sf["observed"] / sf["at_risk"] / (sf["at_risk"] - sf["observed"])
        )
        second_term = np.sum(
            sf["lagS"] ** 2
            / sf["at_risk"]
            * sf[self.label_cmprisk]
            / sf["at_risk"]
            * (sf["at_risk"] - sf[self.label_cmprisk])
            / sf["at_risk"]
        )
        third_term = np.sum((F_t - sf["Ft"]) / sf["at_risk"] * sf["lagS"] * sf[self.label_cmprisk] / sf["at_risk"])
        variance = first_term + second_term - 2 * third_term
        all_vars.append(variance)
    df["variance"] = all_vars

    # Calculating Confidence Intervals
    df["F_transformed"] = np.log(-np.log(df["Ft"]))
    df["se_transformed"] = np.sqrt(df["variance"]) / (df["Ft"] * np.absolute(np.log(df["Ft"])))
    zalpha = inv_normal_cdf(1 - alpha / 2)
    # F -> log(-log(F)) is decreasing, so *subtracting* the margin on the
    # transformed scale gives the upper bound of F and adding it gives the
    # lower bound.
    df[ci_labels[0]] = np.exp(-np.exp(df["F_transformed"] - zalpha * df["se_transformed"]))
    df[ci_labels[1]] = np.exp(-np.exp(df["F_transformed"] + zalpha * df["se_transformed"]))
    return df["variance"], df[ci_labels]
def _compute_confidence_bounds_of_parameters(self):
    """
    Return symmetric bounds for ``lambda_``.

    The single-column frame (``lambda_``) has rows ``upper-bound`` and
    ``lower-bound``, each the estimate +/- ``z * se``.
    """
    std_errors = self._compute_standard_errors().loc["se"]
    z = inv_normal_cdf((1.0 + self.alpha) / 2.0)
    upper = np.array([self.lambda_]) + z * std_errors
    lower = np.array([self.lambda_]) - z * std_errors
    return pd.DataFrame(
        [upper, lower],
        columns=["lambda_"],
        index=["upper-bound", "lower-bound"],
    )
def _compute_confidence_intervals(self):
    """
    Populate ``self.confidence_intervals_`` with symmetric bounds around
    ``self.cumulative_hazards_``.

    The frame is indexed by a two-level index: ``'upper'`` / ``'lower'`` on
    the first level and the timeline on the second. Bounds are
    ``cumulative_hazards_ +/- z * sqrt(_variance)``. Nothing is returned.
    """
    alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
    n = self.timeline.shape[0]
    d = self.cumulative_hazards_.shape[1]
    index = [['upper'] * n + ['lower'] * n, np.concatenate([self.timeline, self.timeline])]
    margin = alpha2 * np.sqrt(self._variance.values)

    self.confidence_intervals_ = pd.DataFrame(np.zeros((2 * n, d)), index=index)
    # `.loc` replaces the removed `.ix` indexer; 'upper'/'lower' select on
    # the first index level.
    self.confidence_intervals_.loc['upper'] = self.cumulative_hazards_.values + margin
    self.confidence_intervals_.loc['lower'] = self.cumulative_hazards_.values - margin
    return
def _compute_confidence_intervals(self):
    """
    Return ``cumulative_hazards_ -/+ z * sqrt(cumulative_variance_)``
    concatenated under the keys ``lower-bound`` and ``upper-bound``.
    """
    z = inv_normal_cdf(1 - self.alpha / 2)
    half_width = z * np.sqrt(self.cumulative_variance_)
    bounds = {}
    bounds["lower-bound"] = self.cumulative_hazards_ - half_width
    bounds["upper-bound"] = self.cumulative_hazards_ + half_width
    return pd.concat(bounds)
def _bounds(self, cumulative_sq_, alpha):
    """
    Build upper/lower bound columns for the survival function on the
    log(-log) scale.

    Parameters
    ----------
    cumulative_sq_:
        values whose square root enters the margin.
    alpha:
        the quantile used is ``inv_normal_cdf((1 + alpha) / 2)``; the same
        value is written into the column labels.

    Returns
    -------
    DataFrame indexed by ``self.timeline`` with ``<name>_upper_<alpha>`` and
    ``<name>_lower_<alpha>`` columns.
    """
    # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
    alpha2 = inv_normal_cdf((1.0 + alpha) / 2.0)
    df = pd.DataFrame(index=self.timeline)
    name = self.survival_function_.columns[0]
    v = np.log(self.survival_function_.values)
    # Label with the `alpha` actually used for the quantile (previously
    # `self.alpha`, which mislabels the columns whenever the two differ).
    df["%s_upper_%.2f" % (name, alpha)] = np.exp(-np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
    df["%s_lower_%.2f" % (name, alpha)] = np.exp(-np.exp(np.log(-v) - alpha2 * np.sqrt(cumulative_sq_) / v))
    return df
def _bounds(self, lagged_survival, alpha, ci_labels):
    """Bounds are based on pg411 of "Modelling Survival Data in Medical Research" David Collett 3rd Edition, which
    is derived from Greenwood's variance estimator. Confidence intervals are obtained using the delta method
    transformation of SE(log(-log(F_j))). This ensures that the confidence intervals all lie between 0 and 1.

    Formula for the variance follows:

        Var(F_j) = sum((F_j(t) - F_j(t_i))**2 * d/(n*(n-d) + S(t_i-1)**2 * ((d*(n-d))/n**3) +
                    -2 * sum((F_j(t) - F_j(t_i)) * S(t_i-1) * (d/n**2)

    Delta method transformation:

        SE(log(-log(F_j) = SE(F_j) / (F_j * absolute(log(F_j)))

    More information can be found at: https://support.sas.com/documentation/onlinedoc/stat/141/lifetest.pdf
    There is also an alternative method (Aalen) but this is not currently implemented

    Parameters
    ----------
    lagged_survival:
        values stored as the ``lagS`` column; missing entries are filled with 1.
    alpha:
        the quantile used is ``inv_normal_cdf((1 + alpha) / 2)``; default labels carry ``alpha``.
    ci_labels:
        length-2 sequence ``[upper_label, lower_label]``, or None for the default labels.

    Returns
    -------
    (variance Series, DataFrame holding the two bound columns in ``ci_labels`` order)
    """
    # Preparing environment
    df = self.event_table.copy()
    df["Ft"] = self.cumulative_density_
    df["lagS"] = lagged_survival.fillna(1)
    if ci_labels is None:
        ci_labels = [
            "%s_upper_%.2f" % (self._predict_label, alpha),
            "%s_lower_%.2f" % (self._predict_label, alpha),
        ]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    # Have to loop through each time independently: the variance at time t
    # depends on F(t) itself as well as on every earlier row.
    all_vars = []
    for _, r in df.iterrows():
        sf = df.loc[df.index <= r.name].copy()
        F_t = float(r["Ft"])
        sf["part1"] = ((F_t - sf["Ft"]) ** 2) * (
            sf["observed"] / (sf["at_risk"] * (sf["at_risk"] - sf["observed"]))
        )
        sf["part2"] = (
            ((sf["lagS"]) ** 2)
            * sf[self.label_cmprisk]
            * ((sf["at_risk"] - sf[self.label_cmprisk]))
            / (sf["at_risk"] ** 3)
        )
        sf["part3"] = (F_t - sf["Ft"]) * sf["lagS"] * (sf[self.label_cmprisk] / (sf["at_risk"] ** 2))
        variance = (np.sum(sf["part1"])) + (np.sum(sf["part2"])) - 2 * (np.sum(sf["part3"]))
        all_vars.append(variance)
    df["variance"] = all_vars

    # Calculating Confidence Intervals
    df["F_transformed"] = np.log(-np.log(df["Ft"]))
    df["se_transformed"] = np.sqrt(df["variance"]) / (df["Ft"] * np.absolute(np.log(df["Ft"])))
    zalpha = inv_normal_cdf((1.0 + alpha) / 2.0)
    # F -> log(-log(F)) is decreasing, so *subtracting* the margin on the
    # transformed scale gives the upper bound of F and adding it gives the
    # lower bound.
    df[ci_labels[0]] = np.exp(-np.exp(df["F_transformed"] - zalpha * df["se_transformed"]))
    df[ci_labels[1]] = np.exp(-np.exp(df["F_transformed"] + zalpha * df["se_transformed"]))
    return df["variance"], df[ci_labels]
def plot(self, columns=None, display_significance_code=True, **errorbar_kwargs):
    """
    Produces a visual representation of the coefficients, including their standard errors and magnitudes.

    Parameters
    ----------
    columns : list, optional
        specify a subset of the columns to plot
    display_significance_code: bool, optional (default: True)
        display asterisks beside statistically significant variables
    errorbar_kwargs:
        pass in additional plotting commands to matplotlib errorbar command.
        An ``ax`` entry, if present, is used as the target axis and is not
        forwarded to ``errorbar``.

    Returns
    -------
    ax: matplotlib axis
        the matplotlib axis that be edited.

    """
    from matplotlib import pyplot as plt

    # pop (not get): "ax" is not an errorbar keyword and must not be
    # forwarded through **errorbar_kwargs below.
    ax = errorbar_kwargs.pop("ax", None) or plt.figure().add_subplot(111)

    errorbar_kwargs.setdefault("c", "k")
    errorbar_kwargs.setdefault("fmt", "s")
    errorbar_kwargs.setdefault("markerfacecolor", "white")
    errorbar_kwargs.setdefault("markeredgewidth", 1.25)
    errorbar_kwargs.setdefault("elinewidth", 1.25)
    errorbar_kwargs.setdefault("capsize", 3)

    alpha2 = inv_normal_cdf((1.0 + self.alpha) / 2.0)

    if columns is None:
        columns = self.hazards_.columns

    yaxis_locations = list(range(len(columns)))
    summary = self.summary.loc[columns]
    # atleast_1d keeps a single selected column as a length-1 array
    # instead of collapsing it to a scalar.
    symmetric_errors = alpha2 * np.atleast_1d(np.squeeze(self.standard_errors_[columns].values)).copy()
    hazards = self.hazards_[columns].values[0].copy()

    order = np.argsort(hazards)
    ax.errorbar(hazards[order], yaxis_locations, xerr=symmetric_errors[order], **errorbar_kwargs)

    # Draw the zero reference line without letting it change the y-limits.
    best_ylim = ax.get_ylim()
    ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
    ax.set_ylim(best_ylim)

    if display_significance_code:
        # Positional reordering via .iloc; .items() replaces the removed .iteritems().
        tick_labels = [c + significance_code(p).strip() for (c, p) in summary["p"].iloc[order].items()]
    else:
        # Works for both a pandas Index and a plain list of column names.
        tick_labels = [columns[i] for i in order]

    # Set ticks/labels on the axis we drew on, not on pyplot's current axis.
    ax.set_yticks(yaxis_locations)
    ax.set_yticklabels(tick_labels)
    ax.set_xlabel("log(HR) (%g%% CI)" % (self.alpha * 100))

    return ax
def _compute_confidence_intervals(self):
    """
    Return ``cumulative_hazards_ -/+ z * sqrt(cumulative_variance_)``
    concatenated under keys of the form ``"<ci>% lower-bound"`` and
    ``"<ci>% upper-bound"``, with ``ci = 100 * (1 - self.alpha)``.
    """
    ci = 100 * (1 - self.alpha)
    z = inv_normal_cdf(1 - self.alpha / 2)
    half_width = z * np.sqrt(self.cumulative_variance_)
    bounds = {}
    bounds["%g%% lower-bound" % ci] = self.cumulative_hazards_ - half_width
    bounds["%g%% upper-bound" % ci] = self.cumulative_hazards_ + half_width
    return pd.concat(bounds)
def _compute_confidence_intervals(self):
    """
    Return symmetric intervals ``params_ -/+ z * standard_errors_``.

    Columns are ``"<ci>% lower-bound"`` and ``"<ci>% upper-bound"`` with
    ``ci = 100 * (1 - self.alpha)``; the index is that of ``self.params_``.
    """
    ci = 100 * (1 - self.alpha)
    z = inv_normal_cdf(1 - self.alpha / 2)
    half_width = z * self.standard_errors_
    coefficients = self.params_.values
    column_names = ["%g%% lower-bound" % ci, "%g%% upper-bound" % ci]
    return pd.DataFrame(
        np.c_[coefficients - half_width, coefficients + half_width],
        columns=column_names,
        index=self.params_.index,
    )
def plot(self, columns=None, ax=None, **errorbar_kwargs):
    """
    Draw the coefficients with symmetric error bars of ``z * standard_errors_``.

    Parameters
    ----------
    columns : list, optional
        subset of the columns to plot. When given, rows are drawn in the
        reverse of the supplied order; otherwise they are sorted by
        coefficient value.
    ax: matplotlib axis, optional
        axis to draw on; defaults to ``plt.gca()``.
    errorbar_kwargs:
        extra keyword arguments for matplotlib's ``errorbar``.

    Returns
    -------
    ax: matplotlib axis
        the axis that was drawn on.
    """
    from matplotlib import pyplot as plt

    if ax is None:
        ax = plt.gca()

    style_defaults = (
        ("c", "k"),
        ("fmt", "s"),
        ("markerfacecolor", "white"),
        ("markeredgewidth", 1.25),
        ("elinewidth", 1.25),
        ("capsize", 3),
    )
    for key, value in style_defaults:
        errorbar_kwargs.setdefault(key, value)

    z = inv_normal_cdf(1 - self.alpha / 2)

    user_supplied_columns = columns is not None
    if not user_supplied_columns:
        columns = self.params_.index

    n_rows = len(columns)
    yaxis_locations = list(range(n_rows))
    symmetric_errors = z * self.standard_errors_[columns].values.copy()
    coefficients = self.params_[columns].values.copy()

    if user_supplied_columns:
        order = list(range(n_rows - 1, -1, -1))
    else:
        order = np.argsort(coefficients)

    ax.errorbar(coefficients[order], yaxis_locations, xerr=symmetric_errors[order], **errorbar_kwargs)

    # Keep the y-limits chosen by errorbar when adding the zero line.
    ylim_before = ax.get_ylim()
    ax.vlines(0, -2, n_rows + 1, linestyles="dashed", linewidths=1, alpha=0.65)
    ax.set_ylim(ylim_before)

    ax.set_yticks(yaxis_locations)
    ax.set_yticklabels([columns[i] for i in order])
    ax.set_xlabel("log(HR) (%g%% CI)" % ((1 - self.alpha) * 100))

    return ax
def _compute_confidence_bounds_of_parameters(self):
    """
    Return symmetric bounds for ``lambda_`` and ``rho_``.

    Rows are ``upper-bound`` / ``lower-bound``; each bound is the estimate
    +/- ``z * se`` with ``se`` the ``"se"`` row of the standard errors.
    """
    std_errors = self._compute_standard_errors().loc["se"]
    z = inv_normal_cdf((1.0 + self.alpha) / 2.0)
    upper = np.array([self.lambda_, self.rho_]) + z * std_errors
    lower = np.array([self.lambda_, self.rho_]) - z * std_errors
    return pd.DataFrame(
        [upper, lower],
        columns=["lambda_", "rho_"],
        index=["upper-bound", "lower-bound"],
    )
def _compute_confidence_bounds_of_parameters(self):
    """
    Return symmetric bounds for every fitted parameter.

    Rows are ``upper-bound`` / ``lower-bound``; columns are
    ``self._fitted_parameter_names``. Each bound is
    ``self._fitted_parameters_ +/- z * se``.
    """
    std_errors = self._compute_standard_errors().loc["se"]
    z = inv_normal_cdf((1.0 + self.alpha) / 2.0)
    upper = self._fitted_parameters_ + z * std_errors
    lower = self._fitted_parameters_ - z * std_errors
    return pd.DataFrame(
        [upper, lower],
        columns=self._fitted_parameter_names,
        index=["upper-bound", "lower-bound"],
    )
def _bounds(self, cumulative_sq_, alpha):
    """
    Build upper/lower bound columns for the stored estimate on the
    log(-log) scale.

    Parameters
    ----------
    cumulative_sq_:
        values whose square root enters the margin.
    alpha:
        the quantile used is ``inv_normal_cdf((1 + alpha) / 2)``; the same
        value is written into the column labels.

    Returns
    -------
    DataFrame indexed by ``self.timeline`` with ``<name>_upper_<alpha>`` and
    ``<name>_lower_<alpha>`` columns.
    """
    # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
    alpha2 = inv_normal_cdf((1. + alpha) / 2.)
    df = pd.DataFrame(index=self.timeline)
    name = self.__estimate.columns[0]
    v = np.log(self.__estimate.values)
    # Label with the `alpha` actually used for the quantile (previously
    # `self.alpha`, which mislabels the columns whenever the two differ).
    df["%s_upper_%.2f" % (name, alpha)] = np.exp(
        -np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
    df["%s_lower_%.2f" % (name, alpha)] = np.exp(
        -np.exp(np.log(-v) - alpha2 * np.sqrt(cumulative_sq_) / v))
    return df
def _compute_confidence_intervals(self):
    """
    Populate ``self.confidence_intervals_`` with symmetric bounds around
    ``self.cumulative_hazards_``.

    The frame is indexed by a two-level index (``'upper'`` / ``'lower'``,
    then the timeline) and has the columns of ``cumulative_hazards_``.
    Bounds are ``cumulative_hazards_ +/- z * sqrt(cumsum(_variance))``.
    Nothing is returned.
    """
    alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
    n = self.timeline.shape[0]
    d = self.cumulative_hazards_.shape[1]
    index = [['upper'] * n + ['lower'] * n, np.concatenate([self.timeline, self.timeline])]

    self.confidence_intervals_ = pd.DataFrame(
        np.zeros((2 * n, d)), index=index, columns=self.cumulative_hazards_.columns)

    # `.loc` replaces the removed `.ix` indexer; 'upper'/'lower' select on
    # the first index level.
    self.confidence_intervals_.loc['upper'] = self.cumulative_hazards_.values + \
        alpha2 * np.sqrt(self._variance.cumsum().values)
    self.confidence_intervals_.loc['lower'] = self.cumulative_hazards_.values - \
        alpha2 * np.sqrt(self._variance.cumsum().values)
    return
def _bounds(self, cumulative_sq_, alpha, ci_labels):
    """
    Build two bound columns for the stored estimate on the log(-log) scale.

    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]``; when
    None, labels are derived from ``self._label`` and ``alpha``. The result
    is indexed by ``self.timeline``.
    """
    # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
    z = inv_normal_cdf((1.0 + alpha) / 2.0)
    bounds = pd.DataFrame(index=self.timeline)
    log_estimate = np.log(self.__estimate.values)

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    centre = np.log(-log_estimate)
    shift = z * np.sqrt(cumulative_sq_) / log_estimate
    bounds[ci_labels[0]] = np.exp(-np.exp(centre + shift))
    bounds[ci_labels[1]] = np.exp(-np.exp(centre - shift))
    return bounds
def _bounds(self, cumulative_sq_, ci_labels):
    """
    Build two bound columns for the stored estimate on the log(-log) scale,
    using ``self.alpha`` for both the quantile and the default labels.

    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]`` or
    None. The result is indexed by ``self.timeline``.
    """
    # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
    z = inv_normal_cdf((1.0 + self.alpha) / 2.0)
    bounds = pd.DataFrame(index=self.timeline)
    log_estimate = np.log(self.__estimate.values)

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, self.alpha), "%s_lower_%.2f" % (self._label, self.alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    centre = np.log(-log_estimate)
    shift = z * np.sqrt(cumulative_sq_) / log_estimate
    bounds[ci_labels[0]] = np.exp(-np.exp(centre + shift))
    bounds[ci_labels[1]] = np.exp(-np.exp(centre - shift))
    return bounds
def _bounds(self, cumulative_sq_, alpha, ci_labels):
    """
    Build lower/upper bound columns for the cumulative hazard.

    ``ci_labels`` is a length-2 sequence ``[lower_label, upper_label]``; when
    None, labels are derived from ``self._label`` and ``1 - alpha``. The
    labels are also stored on ``self.ci_labels``. Zero cumulative hazards are
    replaced by 1 in the denominator only.
    """
    z = inv_normal_cdf(1 - alpha / 2)
    bounds = pd.DataFrame(index=self.timeline)

    if ci_labels is None:
        ci_labels = ["%s_lower_%g" % (self._label, 1 - alpha), "%s_upper_%g" % (self._label, 1 - alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
    self.ci_labels = ci_labels

    cum_hazard = self.cumulative_hazard_.values
    for position, sign in enumerate((-z, z)):
        denominator = np.where(cum_hazard == 0, 1, cum_hazard)
        bounds[ci_labels[position]] = cum_hazard * np.exp(sign * np.sqrt(cumulative_sq_) / denominator)
    return bounds
def plot(self, columns=None, **errorbar_kwargs):
    """
    Draw the coefficients, sorted by value, with symmetric error bars of
    ``z * standard_errors_``.

    Parameters
    ----------
    columns : list, optional
        subset of the columns to plot
    errorbar_kwargs:
        extra keyword arguments for matplotlib's ``errorbar``. An ``ax``
        entry, if present, is removed and used as the target axis.

    Returns
    -------
    ax: matplotlib axis
        the axis that was drawn on.
    """
    from matplotlib import pyplot as plt

    ax = errorbar_kwargs.pop("ax", None)
    if not ax:
        ax = plt.figure().add_subplot(111)

    style_defaults = (
        ("c", "k"),
        ("fmt", "s"),
        ("markerfacecolor", "white"),
        ("markeredgewidth", 1.25),
        ("elinewidth", 1.25),
        ("capsize", 3),
    )
    for key, value in style_defaults:
        errorbar_kwargs.setdefault(key, value)

    z = inv_normal_cdf(1 - self.alpha / 2)

    if columns is None:
        columns = self.hazards_.index

    n_rows = len(columns)
    yaxis_locations = list(range(n_rows))
    symmetric_errors = z * self.standard_errors_[columns].to_frame().squeeze(axis=1).values.copy()
    coefficients = self.hazards_[columns].values.copy()

    order = np.argsort(coefficients)
    ax.errorbar(coefficients[order], yaxis_locations, xerr=symmetric_errors[order], **errorbar_kwargs)

    # Keep the y-limits chosen by errorbar when adding the zero line.
    ylim_before = ax.get_ylim()
    ax.vlines(0, -2, n_rows + 1, linestyles="dashed", linewidths=1, alpha=0.65)
    ax.set_ylim(ylim_before)

    ax.set_yticks(yaxis_locations)
    ax.set_yticklabels([columns[i] for i in order])
    ax.set_xlabel("log(HR) (%g%% CI)" % ((1 - self.alpha) * 100))

    return ax
def _bounds(self, alpha, ci_labels):
    """
    Build symmetric bound columns around ``self.cumulative_hazard_``.

    The margin is ``z * sqrt(self._lambda_variance_) * timeline``.
    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]`` or
    None for labels derived from ``self._label`` and ``alpha``.
    """
    z = inv_normal_cdf((1.0 + alpha) / 2.0)
    bounds = pd.DataFrame(index=self.timeline)

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    lambda_std = np.sqrt(self._lambda_variance_)
    centre = self.cumulative_hazard_
    margin = z * (lambda_std * self.timeline[:, None])
    bounds[ci_labels[0]] = centre + margin
    bounds[ci_labels[1]] = centre - margin
    return bounds
def _bounds(self, cumulative_sq_, alpha, ci_labels):
    """
    Build upper/lower bound columns for the cumulative hazard as
    ``H * exp(+/- z * sqrt(cumulative_sq_) / H)``.

    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]`` or
    None for labels derived from ``self._label`` and ``alpha``. The labels
    are also stored on ``self.ci_labels``.
    """
    z = inv_normal_cdf(1 - (1 - alpha) / 2)
    bounds = pd.DataFrame(index=self.timeline)

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
    self.ci_labels = ci_labels

    for position, signed_z in enumerate((z, -z)):
        log_shift = signed_z * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values
        bounds[ci_labels[position]] = self.cumulative_hazard_.values * np.exp(log_shift)
    return bounds
def _bounds(self, cumulative_sq_, alpha, ci_labels):
    """
    Build two bound columns for the stored estimate using the exponential
    Greenwood formula.

    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]``; when
    None, labels are derived from ``self._label`` and ``1 - alpha``. The
    result is indexed by ``self.timeline``.
    """
    # See https://www.math.wustl.edu/%7Esawyer/handouts/greenwood.pdf
    z = inv_normal_cdf(1 - alpha / 2)
    bounds = pd.DataFrame(index=self.timeline)
    log_estimate = np.log(self.__estimate.values)

    if ci_labels is None:
        ci_labels = ["%s_upper_%g" % (self._label, 1 - alpha), "%s_lower_%g" % (self._label, 1 - alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    centre = np.log(-log_estimate)
    shift = z * np.sqrt(cumulative_sq_) / log_estimate
    bounds[ci_labels[0]] = np.exp(-np.exp(centre + shift))
    bounds[ci_labels[1]] = np.exp(-np.exp(centre - shift))
    return bounds
def _bounds(self, cumulative_sq_, alpha, ci_labels):
    """
    Build two bound columns for the stored estimate using the exponential
    Greenwood formula, with missing values replaced by 1.0.

    ``ci_labels`` is a length-2 sequence ``[lower_label, upper_label]``; when
    None, labels are derived from ``self._label`` and ``1 - alpha``. The
    result is indexed by ``self.timeline``.
    """
    # See https://www.math.wustl.edu/%7Esawyer/handouts/greenwood.pdf
    z = inv_normal_cdf(1 - alpha / 2)
    bounds = pd.DataFrame(index=self.timeline)
    log_estimate = np.log(self.__estimate.values)

    if ci_labels is None:
        ci_labels = ["%s_lower_%g" % (self._label, 1 - alpha), "%s_upper_%g" % (self._label, 1 - alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    centre = np.log(-log_estimate)
    shift = z * np.sqrt(cumulative_sq_) / log_estimate
    bounds[ci_labels[0]] = np.exp(-np.exp(centre - shift))
    bounds[ci_labels[1]] = np.exp(-np.exp(centre + shift))
    return bounds.fillna(1.0)
def _bounds(self, cumulative_sq_, alpha, ci_labels):
    """
    Build upper/lower bound columns for the cumulative hazard as
    ``H * exp(+/- z * sqrt(cumulative_sq_) / H)``.

    Parameters
    ----------
    cumulative_sq_:
        values whose square root enters the margin.
    alpha:
        the quantile used is ``inv_normal_cdf(1 - (1 - alpha) / 2)``; the
        same value is written into the default labels.
    ci_labels:
        length-2 sequence ``[upper_label, lower_label]``, or None for the
        default labels. The labels are also stored on ``self.ci_labels``.

    Returns
    -------
    DataFrame indexed by ``self.timeline`` holding the two bound columns.
    """
    alpha2 = inv_normal_cdf(1 - (1 - alpha) / 2)
    df = pd.DataFrame(index=self.timeline)

    if ci_labels is None:
        # Label with the `alpha` actually used for the quantile (previously
        # `self.alpha`, which mislabels the columns whenever the two differ).
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
    self.ci_labels = ci_labels

    df[ci_labels[0]] = self.cumulative_hazard_.values * \
        np.exp(alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values)
    df[ci_labels[1]] = self.cumulative_hazard_.values * \
        np.exp(-alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values)
    return df
def _bounds(self, alpha, ci_labels):
    """
    Build symmetric bound columns around ``self.survival_function_``.

    The margin is ``z * sqrt(self._lambda_variance_) * timeline * survival``.
    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]`` or
    None for labels derived from ``self._label`` and ``alpha``.
    """
    z = inv_normal_cdf((1.0 + alpha) / 2.0)
    bounds = pd.DataFrame(index=self.timeline)

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    lambda_std = np.sqrt(self._lambda_variance_)
    survival = self.survival_function_
    margin = z * (lambda_std * self.timeline[:, None] * survival)
    bounds[ci_labels[0]] = survival + margin
    bounds[ci_labels[1]] = survival - margin
    return bounds
def _compute_confidence_bounds_of_transform(self, transform, alpha, ci_labels):
    """
    Confidence bounds for a transform of the fitted parameters.

    The gradient of ``transform`` with respect to each parameter is evaluated
    along ``self.timeline`` and combined with ``self.variance_matrix_`` to
    give a standard deviation at every time; the bounds are the transform
    +/- ``z`` times that standard deviation.

    Parameters
    -----------
    transform: function
        a function of two arguments: ``params``, an iterable that stores the
        parameters, and ``times``, a numpy vector representing some timeline.
        It must use autograd imports (scipy and numpy).
    alpha: float
        the quantile used is ``inv_normal_cdf((1 + alpha) / 2)``.
    ci_labels: tuple
        ``(upper_label, lower_label)``, or None for labels derived from
        ``self._label`` and ``alpha``.
    """
    z = inv_normal_cdf((1.0 + alpha) / 2.0)
    bounds = pd.DataFrame(index=self.timeline)

    # pylint: disable=no-value-for-parameter
    directional_gradient = make_jvp_reversemode(transform)(self._fitted_parameters_, self.timeline)

    # One row per parameter: the gradient along each unit basis vector.
    per_parameter_rows = []
    for basis in np.eye(len(self._fitted_parameters_)):
        per_parameter_rows.append(directional_gradient(basis))
    gradient_at_times = np.vstack(per_parameter_rows)

    std_of_transform = np.sqrt(
        np.einsum("nj,jk,nk->n", gradient_at_times.T, self.variance_matrix_, gradient_at_times.T)
    )

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    margin = z * std_of_transform
    bounds[ci_labels[0]] = transform(self._fitted_parameters_, self.timeline) + margin
    bounds[ci_labels[1]] = transform(self._fitted_parameters_, self.timeline) - margin
    return bounds
def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
    """
    Confidence intervals for the kernel-smoothed hazard, labelled with
    ``self.ci_labels``.

    Parameter:
      bandwidth: the bandwidth to use in the Epanechnikov kernel.
      hazard_: a computed (n,) numpy array of estimated hazard rates. If None,
        the first column of ``self.smoothed_hazard_(bandwidth)`` is used.
    """
    if hazard_ is None:
        hazard_ = self.smoothed_hazard_(bandwidth).values[:, 0]

    times = self.timeline
    z = inv_normal_cdf(1 - (1 - self.alpha) / 2)

    # NOTE: this overwrites the first entry of the stored `_cumulative_sq` in place.
    self._cumulative_sq.iloc[0] = 0
    increments = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0])
    has_jump = increments.values != 0.0  # only consider the points with jumps

    kernel_sq = epanechnikov_kernel(times[:, None], times[has_jump][None, :], bandwidth) ** 2
    std_hazard = np.sqrt(1.0 / (bandwidth ** 2) * np.dot(kernel_sq, increments.values[has_jump]))

    log_shift = z * std_hazard / hazard_
    values = {
        self.ci_labels[0]: hazard_ * np.exp(log_shift),
        self.ci_labels[1]: hazard_ * np.exp(-log_shift),
    }
    return pd.DataFrame(values, index=times)
def summary(self):
    """Summary statistics describing the fit.

    Returns
    -------
    df : DataFrame
        Indexed like ``self.params_``, with columns coef, exp(coef),
        se(coef), lower/upper bounds for coef and exp(coef), z, p and
        -log2(p).
    """
    ci = 100 * (1 - self.alpha)
    z = inv_normal_cdf(1 - self.alpha / 2)
    lower_key = "%g%% lower-bound" % ci
    upper_key = "%g%% upper-bound" % ci

    # Floating-point warnings are silenced while the table is assembled.
    with np.errstate(invalid="ignore", divide="ignore", over="ignore", under="ignore"):
        table = pd.DataFrame(index=self.params_.index)
        table["coef"] = self.params_
        table["exp(coef)"] = self.hazard_ratios_
        table["se(coef)"] = self.standard_errors_
        table["coef lower %g%%" % ci] = self.confidence_intervals_[lower_key]
        table["coef upper %g%%" % ci] = self.confidence_intervals_[upper_key]
        table["exp(coef) lower %g%%" % ci] = self.hazard_ratios_ * np.exp(-z * self.standard_errors_)
        table["exp(coef) upper %g%%" % ci] = self.hazard_ratios_ * np.exp(z * self.standard_errors_)
        table["z"] = self._compute_z_values()
        table["p"] = self._compute_p_values()
        table["-log2(p)"] = -np.log2(table["p"])
        return table
def _bounds(self, alpha, ci_labels):
    """
    Build symmetric bound columns around
    ``self.cumulative_hazard_at_times(self.timeline)``.

    The variance of the cumulative hazard is approximated from the diagonal
    of ``inv(self._jacobian)`` and the partial derivatives of
    ``(lambda_ * T) ** rho_`` with respect to ``lambda_`` and ``rho_``.
    ``ci_labels`` is a length-2 sequence ``[upper_label, lower_label]`` or
    None for labels derived from ``self._label`` and ``alpha``.
    """
    z = inv_normal_cdf((1.0 + alpha) / 2.0)
    bounds = pd.DataFrame(index=self.timeline)

    var_lambda_, var_rho_ = inv(self._jacobian).diagonal()

    lambda_, rho, T = self.lambda_, self.rho_, self.timeline
    # Partial derivatives of the cumulative hazard (lambda_ * T) ** rho.
    d_lambda = rho / lambda_ * (lambda_ * T) ** rho
    d_rho = np.log(lambda_ * T) * (lambda_ * T) ** rho
    std_cumulative_hazard = np.sqrt(var_lambda_ * d_lambda ** 2 + var_rho_ * d_rho ** 2)

    if ci_labels is None:
        ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
    assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

    margin = z * std_cumulative_hazard
    bounds[ci_labels[0]] = self.cumulative_hazard_at_times(self.timeline) + margin
    bounds[ci_labels[1]] = self.cumulative_hazard_at_times(self.timeline) - margin
    return bounds
def plot(self, columns=None, parameters=None, **errorbar_kwargs):
    """
    Produces a visual representation of the coefficients, including their standard errors and magnitudes.

    Parameters
    ----------
    columns : list, optional
        specify a subset of the columns (variables from the training data) to plot
    parameters : list, optional
        specify a subset of the parameters to plot
    errorbar_kwargs:
        pass in additional plotting commands to matplotlib errorbar command.
        The ``ax`` entry (filled in by ``set_kwargs_ax``) is the axis drawn on.

    Returns
    -------
    ax: matplotlib axis
        the matplotlib axis that be edited.

    """
    set_kwargs_ax(errorbar_kwargs)
    ax = errorbar_kwargs.pop("ax")
    errorbar_kwargs.setdefault("c", "k")
    errorbar_kwargs.setdefault("fmt", "s")
    errorbar_kwargs.setdefault("markerfacecolor", "white")
    errorbar_kwargs.setdefault("markeredgewidth", 1.25)
    errorbar_kwargs.setdefault("elinewidth", 1.25)
    errorbar_kwargs.setdefault("capsize", 3)

    z = inv_normal_cdf(1 - self.alpha / 2)

    params_ = self.params_.copy()
    standard_errors_ = self.standard_errors_.copy()

    if columns is not None:
        assert isinstance(columns, list), "columns must be a list"
        params_ = params_.loc[:, columns]
        standard_errors_ = standard_errors_.loc[:, columns]
    if parameters is not None:
        assert isinstance(parameters, list), "parameter must be a list"
        params_ = params_.loc[parameters]
        standard_errors_ = standard_errors_.loc[parameters]

    columns = params_.index

    hazards = params_.loc[columns].to_frame(name="coefs")
    hazards["se"] = z * standard_errors_.loc[columns]
    hazards = hazards.swaplevel(1, 0).sort_index()

    yaxis_locations = list(range(len(columns) - 1, -1, -1))

    ax.errorbar(hazards["coefs"], yaxis_locations, xerr=hazards["se"], **errorbar_kwargs)

    # Draw the zero reference line without letting it change the y-limits.
    best_ylim = ax.get_ylim()
    ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
    ax.set_ylim(best_ylim)

    if isinstance(columns[0], tuple):
        tick_labels = ["%s: %s" % (p, c) for (p, c) in hazards.index]
    else:
        tick_labels = [i for i in hazards.index]

    # Label the axis we drew on; pyplot's yticks/xlabel act on the *current*
    # axis, which is not necessarily `ax`.
    ax.set_yticks(yaxis_locations)
    ax.set_yticklabels(tick_labels)
    ax.set_xlabel("log(accelerated failure rate) (%g%% CI)" % ((1 - self.alpha) * 100))

    return ax
def plot(self, columns=None, parameter=None, **errorbar_kwargs):
    """
    Produces a visual representation of the coefficients, including their standard errors and magnitudes.

    Parameters
    ----------
    columns : list, optional
        specify a subset of the columns to plot
    parameter : optional
        label(s) passed to ``.loc`` to select rows of ``params_`` and
        ``standard_errors_`` before plotting
    errorbar_kwargs:
        pass in additional plotting commands to matplotlib errorbar command.
        The ``ax`` entry (filled in by ``set_kwargs_ax``) is the axis drawn on.

    Returns
    -------
    ax: matplotlib axis
        the matplotlib axis that be edited.

    """
    set_kwargs_ax(errorbar_kwargs)
    ax = errorbar_kwargs.pop("ax")
    errorbar_kwargs.setdefault("c", "k")
    errorbar_kwargs.setdefault("fmt", "s")
    errorbar_kwargs.setdefault("markerfacecolor", "white")
    errorbar_kwargs.setdefault("markeredgewidth", 1.25)
    errorbar_kwargs.setdefault("elinewidth", 1.25)
    errorbar_kwargs.setdefault("capsize", 3)

    z = inv_normal_cdf(1 - self.alpha / 2)

    params_ = self.params_.copy()
    standard_errors_ = self.standard_errors_.copy()

    if columns is not None:
        params_ = params_.loc[:, columns]
        standard_errors_ = standard_errors_.loc[:, columns]
    if parameter is not None:
        params_ = params_.loc[parameter]
        standard_errors_ = standard_errors_.loc[parameter]

    columns = params_.index

    hazards = params_.loc[columns].to_frame(name="coefs")
    hazards["se"] = z * standard_errors_.loc[columns]

    # Sort by coefficient value; with a MultiIndex, sort within each
    # first-level group.
    if isinstance(hazards.index, pd.MultiIndex):
        hazards = hazards.groupby(level=0, group_keys=False).apply(
            lambda x: x.sort_values(by="coefs", ascending=True)
        )
    else:
        hazards = hazards.sort_values(by="coefs", ascending=True)

    yaxis_locations = list(range(len(columns)))

    ax.errorbar(hazards["coefs"], yaxis_locations, xerr=hazards["se"], **errorbar_kwargs)

    # Draw the zero reference line without letting it change the y-limits.
    best_ylim = ax.get_ylim()
    ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
    ax.set_ylim(best_ylim)

    if isinstance(columns[0], tuple):
        tick_labels = ["%s: %s" % (c, p) for (p, c) in hazards.index]
    else:
        tick_labels = [i for i in hazards.index]

    # Label the axis we drew on; pyplot's yticks/xlabel act on the *current*
    # axis, which is not necessarily `ax`.
    ax.set_yticks(yaxis_locations)
    ax.set_yticklabels(tick_labels)
    ax.set_xlabel("log(accelerated failure rate) (%g%% CI)" % ((1 - self.alpha) * 100))

    return ax