Example #1
def z_test(Z, alpha):
    """
    Pedantically returns (None, p-value) if the test is inconclusive,
    else returns (True, p-value).
    """
    p = p_value_normal(Z)
    if (Z < inv_normal_cdf((1. - alpha) / 2.)) or (Z > inv_normal_cdf((1. + alpha) / 2.)):
        # reject the null hypothesis
        return True, p  # TODO
    else:
        # cannot reject the null hypothesis
        return None, p
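The two helpers used above are not defined in this listing. A minimal stand-in sketch, assuming they are the standard normal quantile and a two-sided normal p-value (an assumption, not the original implementations):

from scipy.stats import norm

def inv_normal_cdf(p):
    # inverse CDF (quantile function) of the standard normal
    return norm.ppf(p)

def p_value_normal(Z):
    # two-sided p-value for a standard-normal statistic
    return 2 * (1 - norm.cdf(abs(Z)))

# With alpha read as a confidence level, Z = 2.1 lies outside (-1.96, 1.96):
# z_test(2.1, 0.95)  ->  (True, 0.0357...)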
Example #2
 def _bounds(self, cumulative_sq_):
     alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
     df = pd.DataFrame(index=self.timeline)
     name = self.survival_function_.columns[0]
     df["%s_upper_%.2f" % (name, self.alpha)] = self.survival_function_.values ** (
         np.exp(alpha2 * cumulative_sq_ / np.log(self.survival_function_.values))
     )
     df["%s_lower_%.2f" % (name, self.alpha)] = self.survival_function_.values ** (
         np.exp(-alpha2 * cumulative_sq_ / np.log(self.survival_function_.values))
     )
     return df
 def _compute_confidence_intervals(self):
     z = inv_normal_cdf(1 - self.alpha / 2)
     se = self.standard_errors_
     hazards = self.hazards_.values
     return pd.DataFrame(
         np.c_[hazards - z * se, hazards + z * se], columns=["lower-bound", "upper-bound"], index=self.hazards_.index
     )
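A note on the exponent trick in _bounds above: writing the bound as S**exp(+/- z*sigma/log S) keeps it inside (0, 1), which the plain S +/- z*se interval does not guarantee. A toy check with made-up numbers:

import numpy as np

S = np.array([0.9, 0.5, 0.1])        # hypothetical survival estimates
sigma = np.array([0.05, 0.1, 0.2])   # hypothetical cumulative variances
z = 1.96
upper = S ** np.exp(z * sigma / np.log(S))
lower = S ** np.exp(-z * sigma / np.log(S))
# both bounds stay strictly inside (0, 1) and bracket S
assert ((0 < lower) & (lower <= S) & (S <= upper) & (upper < 1)).all()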
Example #4
 def _bounds(self, cumulative_sq_):
     alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
     df = pd.DataFrame(index=self.timeline)
     name = self.cumulative_hazard_.columns[0]
     df["%s_upper_%.2f" % (name, self.alpha)] = self.cumulative_hazard_.values * np.exp(
         alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values
     )
     df["%s_lower_%.2f" % (name, self.alpha)] = self.cumulative_hazard_.values * np.exp(
         -alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values
     )
     return df
Example #5
    def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
        """
        Parameters:
          bandwidth: the bandwidth to use in the Epanechnikov kernel.
          hazard_: a computed (n,) numpy array of estimated hazard rates. If None, uses naf.smoothed_hazard_
        """
        if hazard_ is None:
            hazard_ = self.smoothed_hazard_(bandwidth).values[:, 0]

        timeline = self.timeline
        alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
        name = "smoothed-" + self.cumulative_hazard_.columns[0]
        self._cumulative_sq.iloc[0] = 0
        var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0])
        C = var_hazard_.values != 0.0  # only consider the points with jumps
        std_hazard_ = np.sqrt(
            1.0
            / (2 * bandwidth ** 2)
            * np.dot(
                epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth) ** 2, var_hazard_.values[C]
            )
        )
        values = {
            "%s_upper_%.2f" % (name, self.alpha): hazard_ * np.exp(alpha2 * std_hazard_ / hazard_),
            "%s_lower_%.2f" % (name, self.alpha): hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_),
        }
        return pd.DataFrame(values, index=timeline)
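epanechnikov_kernel is not shown in these listings. A minimal sketch under the textbook definition K(u) = 0.75*(1 - u**2) for |u| <= 1 (an assumption about the helper, not its verbatim source):

import numpy as np

def epanechnikov_kernel(t, s, bandwidth=1.0):
    # K((t - s) / b), zero outside |t - s| <= b; broadcasts over arrays
    u = (t - s) / bandwidth
    return 0.75 * (1 - u ** 2) * (np.abs(u) <= 1)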
Example #6
 def _compute_confidence_bounds_of_parameters(self):
      se = self._compute_standard_errors().loc['se']
     alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
     return pd.DataFrame([
         np.array([self.lambda_, self.rho_]) + alpha2 * se,
         np.array([self.lambda_, self.rho_]) - alpha2 * se,
     ], columns=['lambda_', 'rho_'], index=['upper-bound', 'lower-bound'])
 def _compute_confidence_intervals(self):
     alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
     se = self._compute_standard_errors()
     hazards = self.hazards_.values
     return pd.DataFrame(np.r_[hazards - alpha2 * se,
                               hazards + alpha2 * se],
                         index=['lower-bound', 'upper-bound'],
                         columns=self.hazards_.columns)
    def _bounds(self, lagged_survival, alpha, ci_labels):
        """Bounds are based on pg 411 of "Modelling Survival Data in Medical Research" David Collett 3rd Edition, which
        is derived from Greenwood's variance estimator. Confidence intervals are obtained using the delta method
        transformation of SE(log(-log(F_j))). This ensures that the confidence intervals all lie between 0 and 1.

        Formula for the variance follows:

        .. math::

            \mathrm{Var}(F_j(t)) = \sum_{t_i \le t} \left(F_j(t) - F_j(t_i)\right)^2 \frac{d_i}{n_i (n_i - d_i)}
                + \sum_{t_i \le t} S(t_{i-1})^2 \frac{d_i (n_i - d_i)}{n_i^3}
                - 2 \sum_{t_i \le t} \left(F_j(t) - F_j(t_i)\right) S(t_{i-1}) \frac{d_i}{n_i^2}

        Delta method transformation:

        .. math::

            \mathrm{SE}(\log(-\log F_j)) = \frac{\mathrm{SE}(F_j)}{F_j \, \left|\log F_j\right|}

        More information can be found at: https://support.sas.com/documentation/onlinedoc/stat/141/lifetest.pdf
        There is also an alternative method (Aalen), but it is not currently implemented.
        """
        # Preparing environment
        ci = 1 - alpha
        df = self.event_table.copy()
        df["Ft"] = self.cumulative_density_
        df["lagS"] = lagged_survival.fillna(1)
        if ci_labels is None:
            ci_labels = ["%s_upper_%g" % (self._label, ci), "%s_lower_%g" % (self._label, ci)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        # Have to loop through each time independently. Don't think there is a faster way
        all_vars = []
        for _, r in df.iterrows():
            sf = df.loc[df.index <= r.name].copy()
            F_t = float(r["Ft"])
            first_term = np.sum(
                (F_t - sf["Ft"]) ** 2 * sf["observed"] / sf["at_risk"] / (sf["at_risk"] - sf["observed"])
            )
            second_term = np.sum(
                sf["lagS"] ** 2
                / sf["at_risk"]
                * sf[self.label_cmprisk]
                / sf["at_risk"]
                * (sf["at_risk"] - sf[self.label_cmprisk])
                / sf["at_risk"]
            )
            third_term = np.sum((F_t - sf["Ft"]) / sf["at_risk"] * sf["lagS"] * sf[self.label_cmprisk] / sf["at_risk"])
            variance = first_term + second_term - 2 * third_term
            all_vars.append(variance)
        df["variance"] = all_vars

        # Calculating Confidence Intervals
        df["F_transformed"] = np.log(-np.log(df["Ft"]))
        df["se_transformed"] = np.sqrt(df["variance"]) / (df["Ft"] * np.absolute(np.log(df["Ft"])))
        zalpha = inv_normal_cdf(1 - alpha / 2)
        df[ci_labels[0]] = np.exp(-np.exp(df["F_transformed"] + zalpha * df["se_transformed"]))
        df[ci_labels[1]] = np.exp(-np.exp(df["F_transformed"] - zalpha * df["se_transformed"]))
        return df["variance"], df[ci_labels]
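A toy check of the log(-log) back-transform used in _bounds, with made-up numbers: adding +/- z*se on the transformed scale and mapping back through exp(-exp(.)) yields an interval strictly inside (0, 1):

import numpy as np

F, se_F, z = 0.3, 0.05, 1.96
g = np.log(-np.log(F))                # transformed estimate
se_g = se_F / (F * abs(np.log(F)))    # delta-method SE of log(-log F)
lo, hi = np.exp(-np.exp(g + z * se_g)), np.exp(-np.exp(g - z * se_g))
# exp(-exp(.)) is decreasing, so the +z side gives the lower end
assert 0 < lo < F < hi < 1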
Example #9
 def _compute_confidence_bounds_of_parameters(self):
     se = self._compute_standard_errors().loc['se']
     alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
      return pd.DataFrame([
          np.array([self.lambda_]) + alpha2 * se,
          np.array([self.lambda_]) - alpha2 * se,
      ], columns=['lambda_'], index=['upper-bound', 'lower-bound'])
Example #10
 def _compute_confidence_intervals(self):
     alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
     n = self.timeline.shape[0]
     d = self.cumulative_hazards_.shape[1]
     index = [['upper'] * n + ['lower'] * n, np.concatenate([self.timeline, self.timeline])]
     self.confidence_intervals_ = pd.DataFrame(np.zeros((2 * n, d)), index=index)
     self.confidence_intervals_.loc['upper'] = self.cumulative_hazards_.values + alpha2 * np.sqrt(self._variance.values)
     self.confidence_intervals_.loc['lower'] = self.cumulative_hazards_.values - alpha2 * np.sqrt(self._variance.values)
     return
 def _compute_confidence_intervals(self):
     z = inv_normal_cdf(1 - self.alpha / 2)
     std_error = np.sqrt(self.cumulative_variance_)
     return pd.concat(
         {
             "lower-bound": self.cumulative_hazards_ - z * std_error,
             "upper-bound": self.cumulative_hazards_ + z * std_error,
         }
     )
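Both variants above are the plain normal-approximation interval, estimate +/- z*se. A standalone sketch of the same pattern (the numbers are illustrative, and inv_normal_cdf is assumed to be the standard normal quantile):

import numpy as np
from scipy.stats import norm

alpha = 0.05
z = norm.ppf(1 - alpha / 2)            # ~1.96 for a 95% interval
estimate = np.array([0.20, -0.10])     # hypothetical point estimates
se = np.array([0.05, 0.08])            # hypothetical standard errors
lower, upper = estimate - z * se, estimate + z * se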
Example #12
 def _bounds(self, cumulative_sq_, alpha):
      # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
     alpha2 = inv_normal_cdf((1.0 + alpha) / 2.0)
     df = pd.DataFrame(index=self.timeline)
     name = self.survival_function_.columns[0]
     v = np.log(self.survival_function_.values)
     df["%s_upper_%.2f" % (name, self.alpha)] = np.exp(-np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
     df["%s_lower_%.2f" % (name, self.alpha)] = np.exp(-np.exp(np.log(-v) - alpha2 * np.sqrt(cumulative_sq_) / v))
     return df
Example #13
    def _bounds(self, lagged_survival, alpha, ci_labels):
        """Bounds are based on pg411 of "Modelling Survival Data in Medical Research" David Collett 3rd Edition, which
        is derived from Greenwood's variance estimator. Confidence intervals are obtained using the delta method
        transformation of SE(log(-log(F_j))). This ensures that the confidence intervals all lie between 0 and 1.

        Formula for the variance follows:
        Var(F_j) = sum((F_j(t) - F_j(t_i))**2 * d/(n*(n-d))) + sum(S(t_i-1)**2 * d*(n-d)/n**3)
                   - 2 * sum((F_j(t) - F_j(t_i)) * S(t_i-1) * d/n**2)

        Delta method transformation:
        SE(log(-log(F_j))) = SE(F_j) / (F_j * absolute(log(F_j)))

        More information can be found at: https://support.sas.com/documentation/onlinedoc/stat/141/lifetest.pdf
        There is also an alternative method (Aalen), but it is not currently implemented.
        """
        # Preparing environment
        df = self.event_table.copy()
        df["Ft"] = self.cumulative_density_
        df["lagS"] = lagged_survival.fillna(1)
        if ci_labels is None:
            ci_labels = [
                "%s_upper_%.2f" % (self._predict_label, alpha),
                "%s_lower_%.2f" % (self._predict_label, alpha)
            ]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        # Have to loop through each time independently. Don't think there is a faster way
        all_vars = []
        for _, r in df.iterrows():
            sf = df.loc[df.index <= r.name].copy()
            F_t = float(r["Ft"])
            sf["part1"] = (F_t - sf["Ft"]) ** 2 * (
                sf["observed"] / (sf["at_risk"] * (sf["at_risk"] - sf["observed"]))
            )
            sf["part2"] = sf["lagS"] ** 2 * sf[self.label_cmprisk] * (
                (sf["at_risk"] - sf[self.label_cmprisk]) / sf["at_risk"] ** 3
            )
            sf["part3"] = (F_t - sf["Ft"]) * sf["lagS"] * (
                sf[self.label_cmprisk] / sf["at_risk"] ** 2
            )
            variance = np.sum(sf["part1"]) + np.sum(sf["part2"]) - 2 * np.sum(sf["part3"])
            all_vars.append(variance)
        df["variance"] = all_vars

        # Calculating Confidence Intervals
        df["F_transformed"] = np.log(-np.log(df["Ft"]))
        df["se_transformed"] = np.sqrt(df["variance"]) / (df["Ft"] * np.absolute(np.log(df["Ft"])))
        zalpha = inv_normal_cdf((1.0 + alpha) / 2.0)
        df[ci_labels[0]] = np.exp(-np.exp(df["F_transformed"] +
                                          zalpha * df["se_transformed"]))
        df[ci_labels[1]] = np.exp(-np.exp(df["F_transformed"] -
                                          zalpha * df["se_transformed"]))
        return df["variance"], df[ci_labels]
    def plot(self, columns=None, display_significance_code=True, **errorbar_kwargs):
        """
        Produces a visual representation of the coefficients, including their standard errors and magnitudes.

        Parameters
        ----------
        columns : list, optional
            specify a subset of the columns to plot
        display_significance_code: bool, optional (default: True)
            display asterisks beside statistically significant variables
        errorbar_kwargs:
            pass in additional plotting commands to matplotlib errorbar command

        Returns
        -------
        ax: matplotlib axis
            the matplotlib axis that can be edited.

        """
        from matplotlib import pyplot as plt

        ax = errorbar_kwargs.pop("ax", None) or plt.figure().add_subplot(111)

        errorbar_kwargs.setdefault("c", "k")
        errorbar_kwargs.setdefault("fmt", "s")
        errorbar_kwargs.setdefault("markerfacecolor", "white")
        errorbar_kwargs.setdefault("markeredgewidth", 1.25)
        errorbar_kwargs.setdefault("elinewidth", 1.25)
        errorbar_kwargs.setdefault("capsize", 3)

        alpha2 = inv_normal_cdf((1.0 + self.alpha) / 2.0)

        if columns is None:
            columns = self.hazards_.columns

        yaxis_locations = list(range(len(columns)))
        summary = self.summary.loc[columns]
        symmetric_errors = alpha2 * self.standard_errors_[columns].squeeze().values.copy()
        hazards = self.hazards_[columns].values[0].copy()

        order = np.argsort(hazards)

        ax.errorbar(hazards[order], yaxis_locations, xerr=symmetric_errors[order], **errorbar_kwargs)
        best_ylim = ax.get_ylim()
        ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
        ax.set_ylim(best_ylim)

        if display_significance_code:
            tick_labels = [c + significance_code(p).strip() for (c, p) in summary["p"][order].items()]
        else:
            tick_labels = columns[order]

        plt.yticks(yaxis_locations, tick_labels)
        plt.xlabel("log(HR) (%g%% CI)" % (self.alpha * 100))

        return ax
Example #15
 def _compute_confidence_intervals(self):
     ci = 100 * (1 - self.alpha)
     z = inv_normal_cdf(1 - self.alpha / 2)
     std_error = np.sqrt(self.cumulative_variance_)
     return pd.concat(
         {
             "%g%% lower-bound" % ci: self.cumulative_hazards_ - z * std_error,
             "%g%% upper-bound" % ci: self.cumulative_hazards_ + z * std_error,
         }
     )
Example #16
 def _compute_confidence_intervals(self):
     ci = 100 * (1 - self.alpha)
     z = inv_normal_cdf(1 - self.alpha / 2)
     se = self.standard_errors_
     hazards = self.params_.values
     return pd.DataFrame(
         np.c_[hazards - z * se, hazards + z * se],
         columns=["%g%% lower-bound" % ci, "%g%% upper-bound" % ci],
         index=self.params_.index,
     )
Example #17
    def plot(self, columns=None, ax=None, **errorbar_kwargs):
        """
        Produces a visual representation of the coefficients, including their standard errors and magnitudes.

        Parameters
        ----------
        columns : list, optional
            specify a subset of the columns to plot
        errorbar_kwargs:
            pass in additional plotting commands to matplotlib errorbar command

        Returns
        -------
        ax: matplotlib axis
            the matplotlib axis that can be edited.

        """
        from matplotlib import pyplot as plt

        if ax is None:
            ax = plt.gca()

        errorbar_kwargs.setdefault("c", "k")
        errorbar_kwargs.setdefault("fmt", "s")
        errorbar_kwargs.setdefault("markerfacecolor", "white")
        errorbar_kwargs.setdefault("markeredgewidth", 1.25)
        errorbar_kwargs.setdefault("elinewidth", 1.25)
        errorbar_kwargs.setdefault("capsize", 3)

        z = inv_normal_cdf(1 - self.alpha / 2)

        if columns is None:
            user_supplied_columns = False
            columns = self.params_.index
        else:
            user_supplied_columns = True

        yaxis_locations = list(range(len(columns)))
        symmetric_errors = z * self.standard_errors_[columns].values.copy()
        hazards = self.params_[columns].values.copy()

        order = list(range(len(columns) - 1, -1, -1)) if user_supplied_columns else np.argsort(hazards)

        ax.errorbar(hazards[order], yaxis_locations, xerr=symmetric_errors[order], **errorbar_kwargs)
        best_ylim = ax.get_ylim()
        ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
        ax.set_ylim(best_ylim)

        tick_labels = [columns[i] for i in order]

        ax.set_yticks(yaxis_locations)
        ax.set_yticklabels(tick_labels)
        ax.set_xlabel("log(HR) (%g%% CI)" % ((1 - self.alpha) * 100))

        return ax
Example #18
 def _compute_confidence_bounds_of_parameters(self):
     se = self._compute_standard_errors().loc["se"]
     alpha2 = inv_normal_cdf((1.0 + self.alpha) / 2.0)
     return pd.DataFrame(
         [
             np.array([self.lambda_, self.rho_]) + alpha2 * se,
             np.array([self.lambda_, self.rho_]) - alpha2 * se
         ],
         columns=["lambda_", "rho_"],
         index=["upper-bound", "lower-bound"],
     )
Example #19
 def _compute_confidence_bounds_of_parameters(self):
     se = self._compute_standard_errors().loc["se"]
     alpha2 = inv_normal_cdf((1.0 + self.alpha) / 2.0)
     return pd.DataFrame(
         [
             self._fitted_parameters_ + alpha2 * se,
             self._fitted_parameters_ - alpha2 * se
         ],
         columns=self._fitted_parameter_names,
         index=["upper-bound", "lower-bound"],
     )
Example #20
 def _bounds(self, cumulative_sq_, alpha):
      # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
     alpha2 = inv_normal_cdf((1. + alpha) / 2.)
     df = pd.DataFrame(index=self.timeline)
     name = self.__estimate.columns[0]
     v = np.log(self.__estimate.values)
     df["%s_upper_%.2f" % (name, self.alpha)] = np.exp(
         -np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
     df["%s_lower_%.2f" % (name, self.alpha)] = np.exp(
         -np.exp(np.log(-v) - alpha2 * np.sqrt(cumulative_sq_) / v))
     return df
Example #21
 def _compute_confidence_intervals(self):
     alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
     n = self.timeline.shape[0]
     d = self.cumulative_hazards_.shape[1]
     index = [['upper'] * n + ['lower'] * n, np.concatenate([self.timeline, self.timeline])]
     self.confidence_intervals_ = pd.DataFrame(
         np.zeros((2 * n, d)), index=index, columns=self.cumulative_hazards_.columns)
      self.confidence_intervals_.loc['upper'] = self.cumulative_hazards_.values + \
          alpha2 * np.sqrt(self._variance.cumsum().values)
      self.confidence_intervals_.loc['lower'] = self.cumulative_hazards_.values - \
          alpha2 * np.sqrt(self._variance.cumsum().values)
     return
Example #22
    def _bounds(self, cumulative_sq_, alpha, ci_labels):
        # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
        alpha2 = inv_normal_cdf((1. + alpha) / 2.)
        df = pd.DataFrame(index=self.timeline)
        v = np.log(self.__estimate.values)

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = np.exp(-np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
        df[ci_labels[1]] = np.exp(-np.exp(np.log(-v) - alpha2 * np.sqrt(cumulative_sq_) / v))
        return df
Example #23
    def _bounds(self, cumulative_sq_, ci_labels):
        # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdf
        alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
        df = pd.DataFrame(index=self.timeline)
        v = np.log(self.__estimate.values)

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, self.alpha), "%s_lower_%.2f" % (self._label, self.alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = np.exp(-np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
        df[ci_labels[1]] = np.exp(-np.exp(np.log(-v) - alpha2 * np.sqrt(cumulative_sq_) / v))
        return df
Example #24
    def _bounds(self, cumulative_sq_, alpha, ci_labels):
        z = inv_normal_cdf(1 - alpha / 2)
        df = pd.DataFrame(index=self.timeline)

        if ci_labels is None:
            ci_labels = ["%s_lower_%g" % (self._label, 1 - alpha), "%s_upper_%g" % (self._label, 1 - alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
        self.ci_labels = ci_labels

        cum_hazard_ = self.cumulative_hazard_.values
        df[ci_labels[0]] = cum_hazard_ * np.exp(-z * np.sqrt(cumulative_sq_) / np.where(cum_hazard_ == 0, 1, cum_hazard_))
        df[ci_labels[1]] = cum_hazard_ * np.exp(z * np.sqrt(cumulative_sq_) / np.where(cum_hazard_ == 0, 1, cum_hazard_))
        return df
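The np.where guard in _bounds above avoids division by zero at times before the first event, where the cumulative hazard is exactly 0; since the bound is H*exp(+/- z*sqrt(var)/H), a zero H yields a zero bound regardless of the divisor. A toy illustration:

import numpy as np

H = np.array([0.0, 0.1, 0.4])        # hypothetical cumulative hazard
var = np.array([0.0, 0.01, 0.03])    # hypothetical cumulative variance
z = 1.96
safe_H = np.where(H == 0, 1, H)      # dummy divisor where H == 0
lower = H * np.exp(-z * np.sqrt(var) / safe_H)   # stays 0 where H == 0, no warnings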
    def plot(self, columns=None, **errorbar_kwargs):
        """
        Produces a visual representation of the coefficients, including their standard errors and magnitudes.

        Parameters
        ----------
        columns : list, optional
            specify a subset of the columns to plot
        errorbar_kwargs:
            pass in additional plotting commands to matplotlib errorbar command

        Returns
        -------
        ax: matplotlib axis
            the matplotlib axis that can be edited.

        """
        from matplotlib import pyplot as plt

        ax = errorbar_kwargs.pop("ax", None) or plt.figure().add_subplot(111)

        errorbar_kwargs.setdefault("c", "k")
        errorbar_kwargs.setdefault("fmt", "s")
        errorbar_kwargs.setdefault("markerfacecolor", "white")
        errorbar_kwargs.setdefault("markeredgewidth", 1.25)
        errorbar_kwargs.setdefault("elinewidth", 1.25)
        errorbar_kwargs.setdefault("capsize", 3)

        z = inv_normal_cdf(1 - self.alpha / 2)

        if columns is None:
            columns = self.hazards_.index

        yaxis_locations = list(range(len(columns)))
        symmetric_errors = z * self.standard_errors_[columns].to_frame().squeeze(axis=1).values.copy()
        hazards = self.hazards_[columns].values.copy()

        order = np.argsort(hazards)

        ax.errorbar(hazards[order], yaxis_locations, xerr=symmetric_errors[order], **errorbar_kwargs)
        best_ylim = ax.get_ylim()
        ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
        ax.set_ylim(best_ylim)

        tick_labels = [columns[i] for i in order]

        ax.set_yticks(yaxis_locations)
        ax.set_yticklabels(tick_labels)
        ax.set_xlabel("log(HR) (%g%% CI)" % ((1 - self.alpha) * 100))

        return ax
Example #26
    def _bounds(self, alpha, ci_labels):
        alpha2 = inv_normal_cdf((1.0 + alpha) / 2.0)
        df = pd.DataFrame(index=self.timeline)

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        std = np.sqrt(self._lambda_variance_)
        cum_hazard = self.cumulative_hazard_
        error = std * self.timeline[:, None]
        df[ci_labels[0]] = cum_hazard + alpha2 * error
        df[ci_labels[1]] = cum_hazard - alpha2 * error
        return df
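The error term above follows from the exponential model: with H(t) = lambda*t, Var(H(t)) = t**2 * Var(lambda), so the standard error scales linearly in t. A quick sanity check on simulated draws (made-up numbers):

import numpy as np

rng = np.random.default_rng(0)
lam_draws = rng.normal(0.3, 0.02, size=100_000)  # hypothetical sampling distribution
t = 5.0
# variance of lambda*t equals t**2 times the variance of lambda
assert np.isclose(np.var(lam_draws * t), t ** 2 * np.var(lam_draws))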
Example #27
    def _bounds(self, cumulative_sq_, alpha, ci_labels):
        alpha2 = inv_normal_cdf(1 - (1 - alpha) / 2)
        df = pd.DataFrame(index=self.timeline)

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
        self.ci_labels = ci_labels

        df[ci_labels[0]] = self.cumulative_hazard_.values * \
            np.exp(alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values)
        df[ci_labels[1]] = self.cumulative_hazard_.values * \
            np.exp(-alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values)
        return df
    def _bounds(self, cumulative_sq_, alpha, ci_labels):
        # This method calculates confidence intervals using the exponential Greenwood formula.
        # See https://www.math.wustl.edu/%7Esawyer/handouts/greenwood.pdf
        z = inv_normal_cdf(1 - alpha / 2)
        df = pd.DataFrame(index=self.timeline)
        v = np.log(self.__estimate.values)

        if ci_labels is None:
            ci_labels = ["%s_upper_%g" % (self._label, 1 - alpha), "%s_lower_%g" % (self._label, 1 - alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = np.exp(-np.exp(np.log(-v) + z * np.sqrt(cumulative_sq_) / v))
        df[ci_labels[1]] = np.exp(-np.exp(np.log(-v) - z * np.sqrt(cumulative_sq_) / v))
        return df
Example #29
    def _bounds(self, cumulative_sq_, alpha, ci_labels):
        # This method calculates confidence intervals using the exponential Greenwood formula.
        # See https://www.math.wustl.edu/%7Esawyer/handouts/greenwood.pdf
        z = inv_normal_cdf(1 - alpha / 2)
        df = pd.DataFrame(index=self.timeline)
        v = np.log(self.__estimate.values)

        if ci_labels is None:
            ci_labels = ["%s_lower_%g" % (self._label, 1 - alpha), "%s_upper_%g" % (self._label, 1 - alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = np.exp(-np.exp(np.log(-v) - z * np.sqrt(cumulative_sq_) / v))
        df[ci_labels[1]] = np.exp(-np.exp(np.log(-v) + z * np.sqrt(cumulative_sq_) / v))
        return df.fillna(1.0)
Example #30
    def _bounds(self, cumulative_sq_, alpha, ci_labels):
        alpha2 = inv_normal_cdf(1 - (1 - alpha) / 2)
        df = pd.DataFrame(index=self.timeline)

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, self.alpha), "%s_lower_%.2f" % (self._label, self.alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
        self.ci_labels = ci_labels

        df[ci_labels[0]] = self.cumulative_hazard_.values * \
            np.exp(alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values)
        df[ci_labels[1]] = self.cumulative_hazard_.values * \
            np.exp(-alpha2 * np.sqrt(cumulative_sq_) / self.cumulative_hazard_.values)
        return df
Example #31
    def _bounds(self, alpha, ci_labels):
        alpha2 = inv_normal_cdf((1. + alpha) / 2.)
        df = pd.DataFrame(index=self.timeline)

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        std = np.sqrt(self._lambda_variance_)
        sv = self.survival_function_
        error = std * self.timeline[:, None] * sv
        df[ci_labels[0]] = sv + alpha2 * error
        df[ci_labels[1]] = sv - alpha2 * error
        return df
Example #32
    def _compute_confidence_bounds_of_transform(self, transform, alpha,
                                                ci_labels):
        """
        This computes the confidence intervals of a transform of the parameters. Ex: take
        the fitted parameters, a function/transform and the variance matrix and give me
        back confidence intervals of the transform.

        Parameters
        -----------
        transform: function
            must be a function of two parameters:
                ``params``, an iterable that stores the parameters
                ``times``, a numpy vector representing some timeline
            the function must use autograd imports (scipy and numpy)
        alpha: float
            confidence level
        ci_labels: tuple

        """
        alpha2 = inv_normal_cdf((1.0 + alpha) / 2.0)
        df = pd.DataFrame(index=self.timeline)

        # pylint: disable=no-value-for-parameter
        gradient_of_cum_hazard_at_mle = make_jvp_reversemode(transform)(
            self._fitted_parameters_, self.timeline)

        gradient_at_times = np.vstack([
            gradient_of_cum_hazard_at_mle(basis)
            for basis in np.eye(len(self._fitted_parameters_))
        ])

        std_cumulative_hazard = np.sqrt(
            np.einsum("nj,jk,nk->n", gradient_at_times.T,
                      self.variance_matrix_, gradient_at_times.T))

        if ci_labels is None:
            ci_labels = [
                "%s_upper_%.2f" % (self._label, alpha),
                "%s_lower_%.2f" % (self._label, alpha)
            ]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = transform(
            self._fitted_parameters_,
            self.timeline) + alpha2 * std_cumulative_hazard
        df[ci_labels[1]] = transform(
            self._fitted_parameters_,
            self.timeline) - alpha2 * std_cumulative_hazard
        return df
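A hypothetical usage sketch for the method above, with a Weibull cumulative hazard as the transform; the fitted_model name and the call itself are assumptions for illustration:

def weibull_cumulative_hazard(params, times):
    # H(t) = (t / lambda_) ** rho_; plain arithmetic ops are autograd-traceable
    lambda_, rho_ = params
    return (times / lambda_) ** rho_

# Hypothetical call, assuming fitted_model exposes the method above:
# ci_df = fitted_model._compute_confidence_bounds_of_transform(
#     weibull_cumulative_hazard, alpha=0.95, ci_labels=None
# )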
Example #33
    def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
        """
        Parameters:
          bandwidth: the bandwidth to use in the Epanechnikov kernel.
          hazard_: a computed (n,) numpy array of estimated hazard rates. If None, uses naf.smoothed_hazard_
        """
        if hazard_ is None:
            hazard_ = self.smoothed_hazard_(bandwidth).values[:, 0]

        timeline = self.timeline
        alpha2 = inv_normal_cdf(1 - (1 - self.alpha) / 2)
        self._cumulative_sq.iloc[0] = 0
        var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0])
        C = (var_hazard_.values != 0.0)  # only consider the points with jumps
        std_hazard_ = np.sqrt(
            1. / (bandwidth ** 2)
            * np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth) ** 2, var_hazard_.values[C])
        )
        values = {
            self.ci_labels[0]: hazard_ * np.exp(alpha2 * std_hazard_ / hazard_),
            self.ci_labels[1]: hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_)
        }
        return pd.DataFrame(values, index=timeline)
Example #34
    def summary(self):
        """Summary statistics describing the fit.

        Returns
        -------
        df : DataFrame
            Contains columns coef, exp(coef), se(coef), confidence bounds for coef and
            exp(coef), z, p, and -log2(p)"""
        ci = 100 * (1 - self.alpha)
        z = inv_normal_cdf(1 - self.alpha / 2)
        with np.errstate(invalid="ignore", divide="ignore", over="ignore", under="ignore"):
            df = pd.DataFrame(index=self.params_.index)
            df["coef"] = self.params_
            df["exp(coef)"] = self.hazard_ratios_
            df["se(coef)"] = self.standard_errors_
            df["coef lower %g%%" % ci] = self.confidence_intervals_["%g%% lower-bound" % ci]
            df["coef upper %g%%" % ci] = self.confidence_intervals_["%g%% upper-bound" % ci]
            df["exp(coef) lower %g%%" % ci] = self.hazard_ratios_ * np.exp(-z * self.standard_errors_)
            df["exp(coef) upper %g%%" % ci] = self.hazard_ratios_ * np.exp(z * self.standard_errors_)
            df["z"] = self._compute_z_values()
            df["p"] = self._compute_p_values()
            df["-log2(p)"] = -np.log2(df["p"])
            return df
Example #35
    def _bounds(self, alpha, ci_labels):
        alpha2 = inv_normal_cdf((1. + alpha) / 2.)
        df = pd.DataFrame(index=self.timeline)
        var_lambda_, var_rho_ = inv(self._jacobian).diagonal()

        def _dH_d_lambda(lambda_, rho, T):
            return rho / lambda_ * (lambda_ * T) ** rho

        def _dH_d_rho(lambda_, rho, T):
            return np.log(lambda_ * T) * (lambda_ * T) ** rho

        def sensitivity_analysis(lambda_, rho, var_lambda_, var_rho_, T):
            return var_lambda_ * _dH_d_lambda(lambda_, rho, T) ** 2 + var_rho_ * _dH_d_rho(lambda_, rho, T) ** 2

        std_cumulative_hazard = np.sqrt(sensitivity_analysis(self.lambda_, self.rho_, var_lambda_, var_rho_, self.timeline))

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = self.cumulative_hazard_at_times(self.timeline) + alpha2 * std_cumulative_hazard
        df[ci_labels[1]] = self.cumulative_hazard_at_times(self.timeline) - alpha2 * std_cumulative_hazard
        return df
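The closed-form partial derivatives inside sensitivity_analysis can be sanity-checked against finite differences of H(t) = (lambda*t)**rho (made-up numbers):

import numpy as np

lam, rho, t, eps = 0.7, 1.5, 2.0, 1e-6
H = lambda l, r: (l * t) ** r
dH_dlam = rho / lam * (lam * t) ** rho           # matches _dH_d_lambda above
dH_drho = np.log(lam * t) * (lam * t) ** rho     # matches _dH_d_rho above
assert np.isclose(dH_dlam, (H(lam + eps, rho) - H(lam - eps, rho)) / (2 * eps))
assert np.isclose(dH_drho, (H(lam, rho + eps) - H(lam, rho - eps)) / (2 * eps))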
Example #37
    def plot(self, columns=None, parameters=None, **errorbar_kwargs):
        """
        Produces a visual representation of the coefficients, including their standard errors and magnitudes.

        Parameters
        ----------
        columns : list, optional
            specify a subset of the columns (variables from the training data) to plot
        parameters : list, optional
            specify a subset of the parameters to plot
        errorbar_kwargs:
            pass in additional plotting commands to matplotlib errorbar command

        Returns
        -------
        ax: matplotlib axis
            the matplotlib axis that can be edited.

        """
        from matplotlib import pyplot as plt

        set_kwargs_ax(errorbar_kwargs)
        ax = errorbar_kwargs.pop("ax")
        errorbar_kwargs.setdefault("c", "k")
        errorbar_kwargs.setdefault("fmt", "s")
        errorbar_kwargs.setdefault("markerfacecolor", "white")
        errorbar_kwargs.setdefault("markeredgewidth", 1.25)
        errorbar_kwargs.setdefault("elinewidth", 1.25)
        errorbar_kwargs.setdefault("capsize", 3)

        z = inv_normal_cdf(1 - self.alpha / 2)

        params_ = self.params_.copy()
        standard_errors_ = self.standard_errors_.copy()

        if columns is not None:
            assert isinstance(columns, list), "columns must be a list"
            params_ = params_.loc[:, columns]
            standard_errors_ = standard_errors_.loc[:, columns]
        if parameters is not None:
            assert isinstance(parameters, list), "parameters must be a list"
            params_ = params_.loc[parameters]
            standard_errors_ = standard_errors_.loc[parameters]

        columns = params_.index

        hazards = params_.loc[columns].to_frame(name="coefs")

        hazards["se"] = z * standard_errors_.loc[columns]
        hazards = hazards.swaplevel(1, 0).sort_index()

        yaxis_locations = list(range(len(columns) - 1, -1, -1))

        ax.errorbar(hazards["coefs"], yaxis_locations, xerr=hazards["se"], **errorbar_kwargs)

        # set zero hline
        best_ylim = ax.get_ylim()
        ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
        ax.set_ylim(best_ylim)

        if isinstance(columns[0], tuple):
            tick_labels = ["%s: %s" % (p, c) for (p, c) in hazards.index]
        else:
            tick_labels = [i for i in hazards.index]

        plt.yticks(yaxis_locations, tick_labels)
        plt.xlabel("log(accelerated failure rate) (%g%% CI)" % ((1 - self.alpha) * 100))

        return ax
Example #38
    def plot(self, columns=None, parameter=None, **errorbar_kwargs):
        """
        Produces a visual representation of the coefficients, including their standard errors and magnitudes.

        Parameters
        ----------
        columns : list, optional
            specify a subset of the columns to plot
        parameter : list, optional
            specify a subset of the parameters to plot
        errorbar_kwargs:
            pass in additional plotting commands to matplotlib errorbar command

        Returns
        -------
        ax: matplotlib axis
            the matplotlib axis that can be edited.

        """
        from matplotlib import pyplot as plt

        set_kwargs_ax(errorbar_kwargs)
        ax = errorbar_kwargs.pop("ax")
        errorbar_kwargs.setdefault("c", "k")
        errorbar_kwargs.setdefault("fmt", "s")
        errorbar_kwargs.setdefault("markerfacecolor", "white")
        errorbar_kwargs.setdefault("markeredgewidth", 1.25)
        errorbar_kwargs.setdefault("elinewidth", 1.25)
        errorbar_kwargs.setdefault("capsize", 3)

        z = inv_normal_cdf(1 - self.alpha / 2)

        params_ = self.params_.copy()
        standard_errors_ = self.standard_errors_.copy()

        if columns is not None:
            params_ = params_.loc[:, columns]
            standard_errors_ = standard_errors_.loc[:, columns]
        if parameter is not None:
            params_ = params_.loc[parameter]
            standard_errors_ = standard_errors_.loc[parameter]

        columns = params_.index

        hazards = params_.loc[columns].to_frame(name="coefs")
        hazards["se"] = z * standard_errors_.loc[columns]

        if isinstance(hazards.index, pd.MultiIndex):
            hazards = hazards.groupby(level=0, group_keys=False).apply(
                lambda x: x.sort_values(by="coefs", ascending=True))
        else:
            hazards = hazards.sort_values(by="coefs", ascending=True)

        yaxis_locations = list(range(len(columns)))

        ax.errorbar(hazards["coefs"], yaxis_locations, xerr=hazards["se"], **errorbar_kwargs)
        best_ylim = ax.get_ylim()
        ax.vlines(0, -2, len(columns) + 1, linestyles="dashed", linewidths=1, alpha=0.65)
        ax.set_ylim(best_ylim)

        if isinstance(columns[0], tuple):
            tick_labels = ["%s: %s" % (c, p) for (p, c) in hazards.index]
        else:
            tick_labels = [i for i in hazards.index]

        plt.yticks(yaxis_locations, tick_labels)
        plt.xlabel("log(accelerated failure rate) (%g%% CI)" % ((1 - self.alpha) * 100))

        return ax