예제 #1
0
파일: stats.py 프로젝트: sean-reed/ramsmod
def log_rank_test(t1, d1, t2, d2):
    """
    Run a log-rank test on right-censored failure data from two groups.

    The null hypothesis under test is that both groups have the same
    reliability.

    :param t1: Survival times of the group 1 observations.
    :param d1: Indicator values for group 1: 1 where the observation was a
    failure, 0 where it was right-censored.
    :param t2: Survival times of the group 2 observations.
    :param d2: Indicator values for group 2: 1 where the observation was a
    failure, 0 where it was right-censored.
    :return: Tuple ``(table, log_rank_stat, var, p)`` where ``table`` is a
    Pandas DataFrame with one row per ordered failure time (columns 'tf',
    'm1f', 'm2f', 'n1f', 'n2f', 'e1f', 'e2f'), ``log_rank_stat`` is the test
    statistic, ``var`` its estimated variance and ``p`` the P-value taken
    from a chi-squared distribution with one degree of freedom.
    """
    # Normalise every input to a pd.Series.
    t1, d1, t2, d2 = (convert_to_pd_series(values)
                      for values in (t1, d1, t2, d2))

    # Distinct failure times pooled over both groups, in ascending order.
    pooled_times = pd.concat([t1, t2])
    pooled_flags = pd.concat([d1, d2])
    pooled_failures = pooled_times[pooled_flags == 1]
    tf = pd.Series(pooled_failures.unique()).sort_values(ignore_index=True)

    failures_1 = t1[d1 == 1]
    failures_2 = t2[d2 == 1]

    def observed_at_each(group_failures):
        # Number of failures of one group at each ordered failure time.
        return tf.apply(lambda when: (group_failures == when).sum())

    def at_risk_at_each(group_times):
        # Number of one group's observations still at risk at each time.
        return tf.apply(lambda when: (group_times >= when).sum())

    m1 = observed_at_each(failures_1)
    m2 = observed_at_each(failures_2)
    n1 = at_risk_at_each(t1)
    n2 = at_risk_at_each(t2)

    m_total = m1 + m2
    n_total = n1 + n2

    # Expected failures in each group if the null hypothesis holds.
    e1 = n1 / n_total * m_total
    e2 = n2 / n_total * m_total

    table = pd.DataFrame({
        'tf': tf,
        'm1f': m1,
        'm2f': m2,
        'n1f': n1,
        'n2f': n2,
        'e1f': e1,
        'e2f': e2,
    })

    # Variance contribution of each failure time; a 0/0 term (single
    # observation left at risk) is counted as zero.
    numerator = n1 * n2 * m_total * (n_total - m_total)
    denominator = n_total ** 2 * (n_total - 1)
    var = sum((numerator / denominator).fillna(0))

    observed_minus_expected = sum(m1) - sum(e1)
    log_rank_stat = observed_minus_expected ** 2 / var
    p = chi2.sf(log_rank_stat, 1)

    return table, log_rank_stat, var, p
예제 #2
0
파일: stats.py 프로젝트: sean-reed/ramsmod
def mantel_test(t_min_1, t_max_1, t_min_2, t_max_2):
    """
    Performs a Mantel test to evaluate the null hypothesis that
    two groups have the same reliability from interval-censored failure data.

    :param t_min_1: Exclusive lower bounds of the failure intervals
    for the observations from the group 1 failure data.
    :param t_max_1: Inclusive upper bounds of the failure intervals
    for the observations from the group 1 failure data.
    :param t_min_2: Exclusive lower bounds of the failure intervals
    for the observations from the group 2 failure data.
    :param t_max_2: Inclusive upper bounds of the failure intervals
    for the observations from the group 2 failure data.
    :return: A tuple ``(table, w, var, p)``. ``table`` is a Pandas DataFrame
    indexed by observation number (1-based, group 1 first) with columns
    't_min', 't_max', 'later', 'earlier' and 'v'; ``w`` is the Mantel test
    statistic (sum of 'v' over group 1); ``var`` is the estimated variance of
    the statistic; ``p`` is the two-sided P-value from a zero-mean normal
    distribution with that variance.
    :raises ValueError: If the lower and upper bounds of a group differ in size.
    """
    # Convert inputs to pd.Series if not already.
    t_min_1 = convert_to_pd_series(t_min_1)
    t_max_1 = convert_to_pd_series(t_max_1)
    t_min_2 = convert_to_pd_series(t_min_2)
    t_max_2 = convert_to_pd_series(t_max_2)

    if t_min_1.size != t_max_1.size or t_min_2.size != t_max_2.size:
        raise ValueError(
            "t_min and t_max must be equal size within each group.")

    n_1 = t_min_1.size
    n_2 = t_min_2.size
    n = n_1 + n_2

    # Pool the two groups (group 1 first) and work on plain arrays so that
    # everything below is purely positional.
    lower = pd.concat([t_min_1, t_min_2], ignore_index=True).to_numpy()
    upper = pd.concat([t_max_1, t_max_2], ignore_index=True).to_numpy()

    # later[i]: observations whose interval ends at or before the start of
    # interval i. earlier[i]: observations whose interval starts at or after
    # the end of interval i.
    later = np.array([(lo >= upper).sum() for lo in lower], dtype=float)
    earlier = np.array([(hi <= lower).sum() for hi in upper], dtype=float)

    v = later - earlier

    # Raw arrays are passed on purpose: a Series indexed 0..n-1 would be
    # re-aligned to the 1..n index, shifting the bounds by one row and
    # leaving NaN in the final row.
    table = pd.DataFrame(
        {
            't_min': lower,
            't_max': upper,
            'later': later,
            'earlier': earlier,
            'v': v
        },
        index=range(1, n + 1))
    table.index.name = "Observation #"

    var = n_1 * n_2 * (v ** 2).sum() / ((n_1 + n_2) * (n_1 + n_2 - 1))
    sd = np.sqrt(var)
    # Test statistic: total score of the group 1 observations.
    w = v[:n_1].sum()
    # Two-sided tail probability.
    p = norm.sf(abs(w), scale=sd) * 2

    return table, w, var, p
예제 #3
0
def plot_interval_censored(tmin, tmax, ax=None, show_legend=True):
    """
    Returns a plot of observations from interval-censored failure data.

    Each observation is drawn on its own row (y = observation number,
    starting at 1): a dot when the bounds are equal (exact failure), a
    right-pointing marker at the lower bound when the upper bound is
    infinite (right-censored), otherwise a horizontal line spanning the
    interval.

    :param tmin: The exclusive lower bounds of the failure time intervals.
    :param tmax: The inclusive upper bound of the failure time intervals
    (use np.inf for right-censored observations).
    :param ax: Matplotlib axes on which to plot, if None then one will be created.
    :param show_legend: Boolean that is True if legend should be added to axes and
    False otherwise.
    :return: Matplotlib axes containing the plot.
    :raises ValueError: If tmin and tmax differ in size.
    """
    tmin = convert_to_pd_series(tmin)
    tmax = convert_to_pd_series(tmax)

    # Checked explicitly because zip() below would otherwise silently drop
    # the unmatched observations.
    if tmin.size != tmax.size:
        raise ValueError("tmin and tmax must be equal size.")

    if ax is None:
        fig = plt.figure()  # Create plot figure.
        ax = fig.add_subplot()  # Create the axes to plot on.

    # Add the observations to the axes.
    for row, (lower, upper) in enumerate(zip(tmin, tmax), start=1):
        if lower == upper:
            # Exact failure observation.
            ax.scatter(lower, row, color='b', marker='o')
        elif upper == np.inf:  # np.infty alias was removed in NumPy 2.0.
            # Right-censored observation.
            ax.scatter(lower, row, color='b', marker='>')
        else:
            # Interval-censored observation.
            ax.hlines(row, lower, upper, color='b')

    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_ylabel('Observation #')
    ax.set_xlabel('Time')
    # Anchor both axes at zero; the upper limits are left unchanged.
    ax.set_xlim(0)
    ax.set_ylim(0)

    # Add legend to axes.
    if show_legend:
        legend_elements = [
            Line2D([0], [0], color='b', label='Interval-censored'),
            Line2D([0], [0],
                   color='w',
                   markeredgecolor='b',
                   markerfacecolor='b',
                   marker='>',
                   label='Right-censored'),
            Line2D([0], [0],
                   color='w',
                   markeredgecolor='b',
                   markerfacecolor='b',
                   marker='o',
                   label='Exact')
        ]
        ax.legend(handles=legend_elements, loc='upper right')

    return ax
예제 #4
0
def plot_right_censored(t, d, ax=None, show_legend=True):
    """
    Plot the observations of a right-censored failure data set.

    Every observation is drawn as a single marker on its own row
    (y = observation number, starting at 1): a dot for an observed failure
    and a right-pointing marker for a right-censored observation.

    :param t: Survival times for each observation.
    :param d: Indicator variable value for each observation, where a truthy
    value (1) marks an observed failure and a falsy value (0) marks a
    right-censored observation.
    :param ax: Matplotlib axes on which to plot; when None the current axes
    returned by ``plt.gca()`` are used.
    :param show_legend: Boolean that is True if legend should be added to axes and
    False otherwise.

    :return: Matplotlib axes containing the plot.
    """
    t = convert_to_pd_series(t)
    d = convert_to_pd_series(d)

    ax = plt.gca() if ax is None else ax

    # One marker per observation; the shape encodes failure vs. censoring.
    for position, time in enumerate(t):
        marker = 'o' if d.iloc[position] else '>'
        ax.scatter(time, position + 1, color='b', marker=marker)

    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_ylabel('Observation #')

    if not show_legend:
        return ax

    def marker_handle(marker, label):
        # Legend proxy showing only a blue marker (the line itself is white).
        return Line2D([0], [0],
                      color='w',
                      markeredgecolor='b',
                      markerfacecolor='b',
                      marker=marker,
                      label=label)

    ax.legend(handles=[
        marker_handle('>', 'Right-censored'),
        marker_handle('o', 'Exact'),
    ])

    return ax
예제 #5
0
def kaplan_meier_fit(t, d, ci=None, alpha=0.05):
    """
    Produces a table from right-censored failure data containing data relating to
    the Kaplan-Meier estimate of the reliability function.

    :param t: Survival times of the observations in the failure data.
    :param d: Indicator variable values showing if observations were failures (value 1) or right-censored (value 0).
    :param ci: string with value 'gw' for Greenwood's and 'egw' for exponential
    Greenwood's confidence interval bounds, or None (default) for no bounds.
    :param alpha: float giving level of significance for confidence interval (i.e.
    giving (1-alpha)*100% confidence level it contains true reliability). Default is 0.05.
    :return: Pandas DataFrame with index named 'f' and columns 't', 'm', 'q',
     'n' and 'R' containing the ordered failure times (time 0 is always
     included as the first row) and the corresponding number of observed
     failures, number right-censored in the interval starting at that time,
     number at risk and Kaplan-Meier reliability estimates respectively.
     When ``ci`` is given, columns 'CI Lower' and 'CI Upper' are added.
    :raises ValueError: If ``t`` and ``d`` differ in size, or ``ci`` is not
     one of None, 'gw' or 'egw'.
    """
    t = convert_to_pd_series(t)
    d = convert_to_pd_series(d)

    if t.size != d.size:
        raise ValueError("times and observed must be equal size.")

    # Get the ordered failure times, with time zero always included.
    # pd.concat is used because Series.append was removed in pandas 2.0.
    only_failures = t[d == 1]
    failures_with_zero = pd.concat([only_failures, pd.Series([0])],
                                   ignore_index=True)
    unique_failures = pd.Series(failures_with_zero.unique())
    tf = unique_failures.sort_values(ignore_index=True)

    # Get the number of failures observed at the ordered failure times.
    mf = tf.apply(lambda x: (only_failures == x).sum())

    # Get the number right-censored in the interval that starts at each
    # ordered failure time: cumulative counts of censoring strictly before
    # the *next* failure time, differenced to give per-interval counts.
    only_censored = t[d == 0]
    total_q = tf.apply(lambda x: (only_censored < x).sum()).shift(periods=-1)
    qf = total_q.diff()
    qf.iloc[0] = total_q.iloc[0]  # Set censored between t_(0) and t_(1).
    qf.iloc[-1] = (only_censored >= tf.iloc[-1]
                   ).sum()  # Set censored after final ordered failure time.

    # Get the number at risk to fail at the ordered failure times.
    nf = tf.apply(lambda x: (t >= x).sum())

    # Calculate conditional survival probability
    # at the ordered failure times.
    surv_probs = 1 - (mf / nf)

    # Calculate Kaplan-Meier reliability estimates at the
    # ordered failure times.
    r = surv_probs.cumprod()

    # Create Pandas DataFrame with results.
    km_table = pd.DataFrame({'t': tf, 'm': mf, 'q': qf, 'n': nf, 'R': r})
    km_table.index.name = "f"

    # Add confidence intervals.
    if ci == 'gw':
        km_table['CI Lower'], km_table['CI Upper'] = _greenwoods_ci(
            mf, nf, r, alpha)
    elif ci == 'egw':
        km_table['CI Lower'], km_table['CI Upper'] = _exp_greenwoods_ci(
            mf, nf, r, alpha)
    elif ci is not None:
        raise ValueError(
            "Invalid ci value, must be 'gw' for Greenwood's or 'egw' for exponential Greenwood's"
            " confidence interval.")

    return km_table
예제 #6
0
def turnbull_fit(tmin, tmax, tol=0.001):
    """
    Computes Turnbull estimates of reliability from interval censored
    failure data.

    Parameters
    ----------
    tmin: Lower bounds of failure times for observations.
    tmax: Upper bounds of failure times for observations.
    tol: Tolerance for the iterative procedure, iteration terminates when the
     maximum difference from the previous reliability estimate at any time is
     no greater than the tolerance.

    Returns
    -------
    Pandas DataFrame (index starting at 1) with column 't' containing the
     sorted unique interval end point time values (time 0 included) and 'R'
     containing the corresponding Turnbull reliability estimates.

    Raises
    ------
    ValueError: If tmin and tmax differ in size.
    """
    tmin = convert_to_pd_series(tmin)
    tmax = convert_to_pd_series(tmax)

    if tmin.size != tmax.size:
        raise ValueError("t_min and t_max must be equal size.")

    # Time grid: zero plus every distinct interval bound, ascending.
    # pd.concat is used because Series.append was removed in pandas 2.0.
    t = np.sort(pd.concat([pd.Series([0]), tmin, tmax]).unique())

    # Initial reliability function as equal reduction at each time grid point.
    reliabilities = np.linspace(1.0, 0, len(t))

    # Form alphas matrix describing if each observation (matrix rows)
    # could fail (True) or not (False) in each grid interval (matrix columns).
    m = len(t) - 1
    obs_lower = tmin.to_numpy()[:, np.newaxis]
    obs_upper = tmax.to_numpy()[:, np.newaxis]
    grid_lower = t[:-1]
    grid_upper = t[1:]
    # An exact failure observation (equal bounds) also matches the grid
    # interval whose upper end coincides with its failure time.
    is_exact = obs_lower == obs_upper
    starts_in_time = np.where(is_exact,
                              obs_lower <= grid_upper,
                              obs_lower < grid_upper)
    alphas = (starts_in_time & (obs_upper > grid_lower)).astype(bool)

    while True:
        # Compute estimated failures in each interval.
        p = -np.diff(reliabilities)  # Interval failure probabilities.
        p_alphas = alphas * p
        # Normalise each observation's row to sum to one, then total
        # the columns.
        d = ((p_alphas.T / p_alphas.sum(axis=1)).T.sum(axis=0))
        # Compute number at risk in each interval (reverse cumulative sum).
        y = np.cumsum(d[::-1])[::-1]
        updated_reliabilities = np.insert(np.cumprod(1 - (d / y)), 0, 1)

        difference = np.max(np.abs(updated_reliabilities - reliabilities))
        reliabilities = updated_reliabilities
        if difference <= tol:
            break

    turnbull_table = pd.DataFrame({
        't': t,
        'R': reliabilities
    },
                                  index=range(1, m + 2))

    return turnbull_table