# Example #1
# File: stats.py  Project: sean-reed/ramsmod
def log_rank_test(t1, d1, t2, d2):
    """
    Performs a log-rank test to evaluate the null hypothesis that
    two groups have the same reliability from right-censored failure data.

    :param t1: Survival times for the observations in the failure data
    for group 1.
    :param d1: Indicator variable values showing if observations
    were failures (value 1) or right-censored (value 0) for group 1.
    :param t2: Survival times of the observations in the failure data
    for group 2.
    :param d2: Indicator variable values showing if observations
    were failures (value 1) or right-censored (value 0) for group 2.
    :return: A tuple containing a Pandas DataFrame with a table of results from
    the calculations used to perform the test (columns 'tf', 'm1f', 'm2f',
    'n1f', 'n2f', 'e1f' and 'e2f'), the log-rank test statistic, the
    estimated variance of the statistic distribution and the calculated
    P-value for the test (upper tail of a chi-squared distribution with one
    degree of freedom).
    """
    # Convert inputs to pd.Series if not already.
    t1 = convert_to_pd_series(t1)
    d1 = convert_to_pd_series(d1)
    t2 = convert_to_pd_series(t2)
    d2 = convert_to_pd_series(d2)

    # Pool both groups; a fresh index avoids duplicate labels in the pooled data.
    t = pd.concat([t1, t2], ignore_index=True)
    d = pd.concat([d1, d2], ignore_index=True)

    # Ordered distinct failure times across both groups.
    tf = pd.Series(t[d == 1].unique()).sort_values(ignore_index=True)

    # Failure times per group, filtered once rather than per ordered time.
    failures_1 = t1[d1 == 1]
    failures_2 = t2[d2 == 1]

    # Observed failures at each ordered failure time.
    m1 = tf.apply(lambda x: (failures_1 == x).sum())
    m2 = tf.apply(lambda x: (failures_2 == x).sum())
    # Number at risk at each ordered failure time.
    n1 = tf.apply(lambda x: (t1 >= x).sum())
    n2 = tf.apply(lambda x: (t2 >= x).sum())
    # Expected failures under null hypothesis.
    e1 = n1 / (n1 + n2) * (m1 + m2)
    e2 = n2 / (n1 + n2) * (m1 + m2)

    table = pd.DataFrame({
        'tf': tf,
        'm1f': m1,
        'm2f': m2,
        'n1f': n1,
        'n2f': n2,
        'e1f': e1,
        'e2f': e2
    })

    # Calculate log-rank statistic.
    num = n1 * n2 * (m1 + m2) * (n1 + n2 - m1 - m2)
    den = (n1 + n2).pow(2) * (n1 + n2 - 1)
    # A single unit at risk gives 0 / 0 (NaN); such terms contribute no variance.
    var = (num / den).fillna(0).sum()
    log_rank_stat = (m1.sum() - e1.sum()) ** 2 / var
    p = chi2(1).sf(log_rank_stat)

    return table, log_rank_stat, var, p
# Example #2
# File: stats.py  Project: sean-reed/ramsmod
def mantel_test(t_min_1, t_max_1, t_min_2, t_max_2):
    """
    Performs a Mantel test to evaluate the null hypothesis that
    two groups have the same reliability from interval-censored failure data.

    :param t_min_1: Exclusive lower bounds of the failure intervals
    for the observations from the group 1 failure data.
    :param t_max_1: Inclusive upper bounds of the failure intervals
    for the observations from the group 1 failure data.
    :param t_min_2: Exclusive lower bounds of the failure intervals
    for the observations from the group 2 failure data.
    :param t_max_2: Inclusive upper bounds of the failure intervals
    for the observations from the group 2 failure data.
    :return: A tuple containing a Pandas DataFrame with a table containing results from
    calculations used to perform the test (columns 't_min', 't_max', 'later',
    'earlier' and 'v', indexed by observation number starting at 1), the Mantel
    test statistic, the estimated variance in the test statistic and the
    calculated two-sided P-value for the test.
    """
    # Convert inputs to pd.Series if not already.
    t_min_1 = convert_to_pd_series(t_min_1)
    t_max_1 = convert_to_pd_series(t_max_1)
    t_min_2 = convert_to_pd_series(t_min_2)
    t_max_2 = convert_to_pd_series(t_max_2)

    # Pool the two groups (group 1 first) and work on plain positional arrays.
    lower = pd.concat([t_min_1, t_min_2], ignore_index=True).to_numpy()
    upper = pd.concat([t_max_1, t_max_2], ignore_index=True).to_numpy()

    n_1 = t_min_1.size
    n_2 = t_min_2.size
    n = n_1 + n_2

    # later[i]: number of observations whose interval ends at or before the
    # start of observation i's interval (observation i definitely failed later).
    later = np.array([(lo >= upper).sum() for lo in lower], dtype=float)
    # earlier[i]: number of observations whose interval starts at or after the
    # end of observation i's interval (observation i definitely failed earlier).
    earlier = np.array([(hi <= lower).sum() for hi in upper], dtype=float)

    v = later - earlier

    # Build the table from arrays: passing Series together with an explicit
    # 1-based index would make pandas reindex them by label, shifting the
    # bounds by one row relative to the positional 'later'/'earlier'/'v' columns.
    table = pd.DataFrame(
        {
            't_min': lower,
            't_max': upper,
            'later': later,
            'earlier': earlier,
            'v': v
        },
        index=range(1, n + 1))
    table.index.name = "Observation #"

    var = n_1 * n_2 * np.power(v, 2).sum() / ((n_1 + n_2) * (n_1 + n_2 - 1))
    sd = np.sqrt(var)
    # Test statistic: sum of the scores of the group 1 observations.
    w = v[:n_1].sum()
    # Two-sided P-value from a zero-mean normal with standard deviation sd.
    p = norm.sf(abs(w), scale=sd) * 2

    return table, w, var, p
# Example #3
def plot_interval_censored(tmin, tmax, ax=None, show_legend=True):
    """
    Returns a plot of observations from interval-censored failure data.

    Each observation is drawn on its own row (y value is the 1-based
    observation number): a circle for an exact failure (tmin equal to tmax),
    a right-pointing triangle at tmin for a right-censored observation
    (tmax is infinite) and a horizontal line from tmin to tmax otherwise.

    :param tmin: The exclusive lower bounds of the failure time intervals.
    :param tmax: The inclusive upper bound of the failure time intervals
    (use np.inf for right-censored observations).
    :param ax: Matplotlib axes on which to plot, if None then one will be created.
    :param show_legend: Boolean that is True if legend should be added to axes and
    False otherwise.
    :return: Matplotlib axes containing the plot.
    """
    tmin = convert_to_pd_series(tmin)
    tmax = convert_to_pd_series(tmax)

    if ax is None:
        fig = plt.figure()  # Create plot figure.
        ax = fig.add_subplot()  # Create the axes to plot on.

    # Add the observations to the axes.
    for row, (lower, upper) in enumerate(zip(tmin, tmax), start=1):
        if lower == upper:
            # Exact failure observation.
            ax.scatter(lower, row, color='b', marker='o')
        elif upper == np.inf:
            # Right-censored observation.
            ax.scatter(lower, row, color='b', marker='>')
        else:
            # Interval-censored observation.
            ax.hlines(row, lower, upper, color='b')

    ax.set_ylabel('Observation #')

    # Add legend to axes.
    if show_legend:
        # NOTE(review): the two marker entries below were reconstructed to match
        # the markers drawn above; confirm the label text against the project.
        legend_elements = [
            Line2D([0], [0], color='b', label='Interval-censored'),
            Line2D([0], [0], color='b', marker='o', linestyle='None',
                   label='Exact failure'),
            Line2D([0], [0], color='b', marker='>', linestyle='None',
                   label='Right-censored'),
        ]
        ax.legend(handles=legend_elements, loc='upper right')

    return ax
# Example #4
def plot_right_censored(t, d, ax=None, show_legend=True):
    """
    Returns a plot of observations from right-censored failure data.

    Each observation is drawn on its own row (y value is the 1-based
    observation number): a circle at its time for an observed failure and a
    right-pointing triangle for a right-censored observation.

    :param t: Survival times for each observation.
    :param d: Indicator variable value for each observation, where
    value 1 indicates exact failure observed and 0 indicates failure was right-censored.
    :param ax: Matplotlib axes on which to plot, if None then the current axes
    (plt.gca()) are used.
    :param show_legend: Boolean that is True if legend should be added to axes and
    False otherwise.

    :return: Matplotlib axes containing the plot.
    """
    t = convert_to_pd_series(t)
    d = convert_to_pd_series(d)

    if ax is None:
        ax = plt.gca()

    # Add the observations to the axes.
    for i in range(len(t)):
        if d.iloc[i]:
            # Failure observation.
            ax.scatter(t.iloc[i], i + 1, color='b', marker='o')
        else:
            # Right-censored observation.
            ax.scatter(t.iloc[i], i + 1, color='b', marker='>')

    ax.set_ylabel('Observation #')

    # Add legend to axes.
    if show_legend:
        # NOTE(review): legend entries were reconstructed to match the markers
        # drawn above; confirm the label text against the project.
        legend_elements = [
            Line2D([0], [0], color='b', marker='o', linestyle='None',
                   label='Failure'),
            Line2D([0], [0], color='b', marker='>', linestyle='None',
                   label='Right-censored'),
        ]
        ax.legend(handles=legend_elements, loc='upper right')

    return ax
# Example #5
def kaplan_meier_fit(t, d, ci=None, alpha=0.05):
    """
    Produces a table from right-censored failure data containing data relating to
    the Kaplan-Meier estimate of the reliability function.

    :param t: Survival times of the observations in the failure data.
    :param d: Indicator variable values showing if observations were failures
    (value 1) or right-censored (value 0).
    :param ci: string with value 'gw' for Greenwood's and 'egw' for exponential
    Greenwood's confidence interval bounds, or None (default) for no bounds.
    :param alpha: float giving level of significance for confidence interval (i.e.
    giving (1-alpha)*100% confidence level it contains true reliability). Default is 0.05.
    :return: Pandas DataFrame (index named 'f') with columns 't', 'm', 'q', 'n'
     and 'R' containing the ordered failure times (starting from time 0) and
     corresponding number of observed failures, number right-censored in the
     interval starting at that time, number at risk and Kaplan-Meier reliability
     estimates respectively. Columns 'CI Lower' and 'CI Upper' are added when
     ci is given.
    :raises ValueError: if t and d differ in size or ci is not a recognised value.
    """
    t = convert_to_pd_series(t)
    d = convert_to_pd_series(d)

    if t.size != d.size:
        raise ValueError("times and observed must be equal size.")

    # Get the ordered failure times, with time 0 included as the origin.
    # pd.concat is used because Series.append was removed in pandas 2.0.
    only_failures = t[d == 1]
    failures_with_zero = pd.concat([only_failures, pd.Series([0])],
                                   ignore_index=True)
    unique_failures = pd.Series(failures_with_zero.unique())
    tf = unique_failures.sort_values(ignore_index=True)

    # Get the number of failures observed at the ordered failure times.
    mf = tf.apply(lambda x: (only_failures == x).sum())

    # Get the number right-censored in the interval from each ordered failure
    # time up to the next one. total_q[f] is the cumulative count censored
    # before t_(f+1); differencing gives the per-interval counts.
    only_censored = t[d == 0]
    total_q = tf.apply(lambda x: (only_censored < x).sum()).shift(periods=-1)
    qf = total_q.diff()
    qf.iloc[0] = total_q.iloc[0]  # Set censored between t_(0) and t_(1).
    qf.iloc[-1] = (only_censored >= tf.iloc[-1]
                   ).sum()  # Set censored after final ordered failure time.

    # Get the number at risk to fail at the ordered failure times.
    nf = tf.apply(lambda x: (t >= x).sum())

    # Calculate conditional survival probability
    # at the ordered failure times.
    surv_probs = 1 - (mf / nf)

    # Calculate Kaplan-Meier reliability estimates at the
    # ordered failure times.
    r = surv_probs.cumprod()

    # Create Pandas DataFrame with results.
    km_table = pd.DataFrame({'t': tf, 'm': mf, 'q': qf, 'n': nf, 'R': r})
    km_table.index.name = "f"

    # Add confidence intervals.
    if ci == 'gw':
        km_table['CI Lower'], km_table['CI Upper'] = _greenwoods_ci(
            mf, nf, r, alpha)
    elif ci == 'egw':
        km_table['CI Lower'], km_table['CI Upper'] = _exp_greenwoods_ci(
            mf, nf, r, alpha)
    elif ci is not None:
        raise ValueError(
            "Invalid ci value, must be 'gw' for Greenwood's or 'egw' for exponential Greenwood's"
            " confidence interval.")

    return km_table
# Example #6
def turnbull_fit(tmin, tmax, tol=0.001):
    """
    Computes Turnbull estimates of reliability from interval censored
    failure data.

    :param tmin: Lower bounds of failure times for observations.
    :param tmax: Upper bounds of failure times for observations. An observation
    with tmin equal to tmax is treated as an exact failure.
    :param tol: Tolerance for the iterative procedure, iteration terminates when
    the maximum difference from the previous reliability estimate at any time
    is no greater than this value.
    :return: Pandas DataFrame with column 't' containing the interval end point
    time values and 'R' containing the corresponding Turnbull reliability
    estimates.
    :raises ValueError: if tmin and tmax differ in size.
    """
    tmin = convert_to_pd_series(tmin)
    tmax = convert_to_pd_series(tmax)

    if tmin.size != tmax.size:
        raise ValueError("t_min and t_max must be equal size.")

    # Time grid: 0 plus every distinct bound, in ascending order.
    # pd.concat is used because Series.append was removed in pandas 2.0.
    t = np.sort(
        pd.concat([pd.Series([0]), tmin, tmax], ignore_index=True).unique())

    # Initial reliability function as equal reduction at each time grid point.
    reliabilities = np.linspace(1.0, 0, len(t))

    # Form alphas matrix describing if each observation (matrix rows)
    # could fail (True) or not (False) in each grid interval (matrix columns).
    m = len(t) - 1
    obs_lower = tmin.to_numpy()[:, np.newaxis]
    obs_upper = tmax.to_numpy()[:, np.newaxis]
    grid_lower = t[:-1][np.newaxis, :]
    grid_upper = t[1:][np.newaxis, :]
    # Exact failures may sit on the grid interval's upper end point; censored
    # intervals must start strictly before it.
    starts_in_time = np.where(obs_lower == obs_upper,
                              obs_lower <= grid_upper,
                              obs_lower < grid_upper)
    alphas = (starts_in_time & (obs_upper > grid_lower)).astype(bool)

    while True:
        # Compute estimated failures in each interval.
        p = -np.diff(reliabilities)  # Interval failure probabilities.
        p_alphas = alphas * p
        # Normalise each observation's row so it contributes one failure in total.
        d = ((p_alphas.T / p_alphas.sum(axis=1)).T.sum(axis=0))
        # Compute number at risk in each interval.
        y = np.cumsum(d[::-1])[::-1]
        updated_reliabilities = np.insert(np.cumprod(1 - (d / y)), 0, 1)

        difference = np.max(np.abs(updated_reliabilities - reliabilities))
        reliabilities = updated_reliabilities
        if difference <= tol:
            break

    turnbull_table = pd.DataFrame({
        't': t,
        'R': reliabilities
    },
                                  index=range(1, m + 2))

    return turnbull_table