Example #1
def multivariate_logrank_test(event_durations, groups, event_observed=None, alpha=0.95, t_0=-1, **kwargs):
    """
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
      be equal when n=2):

    H_0: all event series are from the same generating process
    H_A: there exists at least one group that differs from the others.

    Parameters:
      event_durations: a (n,) numpy array of the (partial) lifetimes of all individuals
      groups: a (n,) numpy array of unique group labels for each individual.
      event_observed: a (n,) numpy array of event observations: 1 if observed death, 0 if censored. Defaults
          to all observed.
      alpha: the level of significance desired.
      t_0: the final time to compare the series' up to. Defaults to all.
      kwargs: add keywords and meta-data to the experiment summary.

    Returns:
      summary: a print-friendly summary of the statistical test
      p_value: the p-value
      test_result: True if we reject the null, (pedantically) None if we cannot reject the null.

    """
    assert event_durations.shape[0] == groups.shape[0], "event_durations must be the same shape as groups"

    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))

    unique_groups, rm, obs, _ = group_survival_table_from_events(groups, event_durations, event_observed, np.zeros_like(event_durations), t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    N_j = obs.sum(0).values
    n_ij = (rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0))
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev = n_ij.mul(d_i / n_i, axis='index').sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    V_ = n_ij.mul(np.sqrt(d_i) / n_i, axis='index').fillna(1)
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] + ev

    # take the first n-1 groups
    U = Z_j.iloc[:-1].dot(np.linalg.inv(V[:-1, :-1]).dot(Z_j.iloc[:-1]))  # Z.T*inv(V)*Z

    # compute the p-values and tests
    test_result, p_value = chisq_test(U, n_groups - 1, alpha)
    summary = pretty_print_summary(test_result, p_value, U, t_0=t_0, test='logrank',
                                   alpha=alpha, null_distribution='chi squared',
                                   df=n_groups - 1, **kwargs)

    return summary, p_value, test_result
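
A minimal usage sketch for the variant above, which returns a plain (summary, p_value, test_result) tuple rather than an object; the toy arrays are the ones used in the docstring examples further down this page, and the print calls are only an assumption about how the summary is meant to be consumed.

import numpy as np

T = np.array([5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7])   # durations
G = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2])   # group labels
E = np.array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])   # 1 = observed, 0 = censored

summary, p_value, test_result = multivariate_logrank_test(T, G, E, alpha=0.95)
print(summary)    # print-friendly summary of the test
print(p_value)    # p-value under the chi-squared null distribution
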
Example #2
def test_group_survival_table_from_events_on_waltons_data():
    df = load_waltons()
    first_obs = np.zeros(df.shape[0])
    g, removed, observed, censored = utils.group_survival_table_from_events(df['group'], df['T'], df['E'], first_obs)
    assert len(g) == 2
    assert all(removed.columns == ['removed:miR-137', 'removed:control'])
    assert all(removed.index == observed.index)
    assert all(removed.index == censored.index)
Example #3
def test_group_survival_table_from_events_on_waltons_data():
    df = load_waltons()
    first_obs = np.zeros(df.shape[0])
    g, removed, observed, censored = utils.group_survival_table_from_events(
        df["group"], df["T"], df["E"], first_obs)
    assert len(g) == 2
    assert all(removed.columns == ["removed:miR-137", "removed:control"])
    assert all(removed.index == observed.index)
    assert all(removed.index == censored.index)
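
For orientation, a small sketch of inspecting the grouped survival table that the two tests above assert against; the column names come straight from those assertions, while the head() calls are just illustrative.

import numpy as np
from lifelines.datasets import load_waltons
from lifelines.utils import group_survival_table_from_events

df = load_waltons()
first_obs = np.zeros(df.shape[0])
g, removed, observed, censored = group_survival_table_from_events(df["group"], df["T"], df["E"], first_obs)

print(g)                # the two unique group labels
print(removed.head())   # columns such as 'removed:miR-137' and 'removed:control'
print(observed.head())  # observed events per group, indexed by event time
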
Example #4
def multivariate_logrank_test(
        event_durations,
        groups,
        event_observed=None,
        t_0=-1,
        weightings=None,
        **kwargs) -> StatisticalResult:  # pylint: disable=too-many-locals
    r"""
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
    be equal when n=2):

    .. math::
        \begin{align}
         & H_0: h_1(t) = h_2(t) = h_3(t) = ... = h_n(t) \\
         & H_A: \text{there exists at least one group that differs from the others.}
        \end{align}


    Parameters
    ----------

    event_durations: iterable
        a (n,) list-like representing the (possibly partial) durations of all individuals

    groups: iterable
        a (n,) list-like of unique group labels for each individual.

    event_observed: iterable, optional
        a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed.

    t_0: float, optional (default=-1)
        the period under observation, -1 for all time.

    weightings: str, optional
        apply a weighted logrank test: options are "wilcoxon" for Wilcoxon (also known as Breslow), "tarone-ware"
        for Tarone-Ware, "peto" for Peto test and "fleming-harrington" for Fleming-Harrington test.
        These are useful for testing for early or late differences in the survival curve. For the Fleming-Harrington
        test, keyword arguments p and q must also be provided with non-negative values.

        Weightings are applied at the ith ordered failure time, :math:`t_{i}`, according to:
            Wilcoxon: :math:`n_i`
            Tarone-Ware: :math:`\sqrt{n_i}`
            Peto: :math:`\bar{S}(t_i)`
            Fleming-Harrington: :math:`\hat{S}(t_i)^p \times (1 - \hat{S}(t_i))^q`

            where :math:`n_i` is the number at risk just prior to time :math:`t_{i}`, :math:`\bar{S}(t_i)` is
            Peto-Peto's modified survival estimate and :math:`\hat{S}(t_i)` is the left-continuous
            Kaplan-Meier survival estimate at time :math:`t_{i}`.

    kwargs:
        add keywords and meta-data to the experiment summary.


    Returns
    -------

    StatisticalResult
       a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Examples
    --------

    .. code:: python

        df = pd.DataFrame({
           'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
           'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
           'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        })
        result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
        result.test_statistic
        result.p_value
        result.print_summary()


        # numpy example
        G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
        E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
        result = multivariate_logrank_test(T, G, E)
        result.test_statistic


    See Also
    --------
    pairwise_logrank_test
    logrank_test
    """
    kwargs.setdefault("test_name", "multivariate_logrank_test")

    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    assert n == np.max(event_durations.shape) == np.max(
        event_observed.shape), "inputs must be of the same length."
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.asarray(x).reshape(n)),
        [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    n_ij = rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0)
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev_i = n_ij.mul(d_i / n_i, axis="index")

    # compute weightings for log-rank alternatives
    if weightings is None:
        w_i = np.ones(d_i.shape[0])
    elif weightings == "wilcoxon":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Wilcoxon")
        w_i = n_i
    elif weightings == "tarone-ware":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Tarone-Ware")
        w_i = np.sqrt(n_i)
    elif weightings == "peto":
        kwargs["test_name"] = kwargs["test_name"].replace("logrank", "Peto")
        # Peto-Peto's modified survival estimates.
        w_i = np.cumprod(1.0 - ev_i.sum(1) / (n_i + 1))
    elif weightings == "fleming-harrington":
        if "p" in kwargs:
            p = kwargs["p"]
            if p < 0:
                raise ValueError("p must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument p for Flemington-Harrington test statistic"
            )
        if "q" in kwargs:
            q = kwargs["q"]
            if q < 0:
                raise ValueError("q must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument q for Flemington-Harrington test statistic"
            )
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Flemington-Harrington")
        kmf = KaplanMeierFitter().fit(event_durations, event_observed=event_observed)
        # Left-continuous Kaplan-Meier survival estimate.
        s = kmf.survival_function_.to_numpy().flatten()[:-1]
        w_i = np.power(s, p) * np.power(1.0 - s, q)
    else:
        raise ValueError("Invalid value for weightings.")

    # apply weights to observed and expected
    N_j = obs.mul(w_i, axis=0).sum(0).values
    ev = ev_i.mul(w_i, axis=0).sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) /
               (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2
    n_ij["_"] = n_i.values
    V_ = (n_ij.mul(w_i, axis=0)).mul(np.sqrt(factor),
                                     axis="index").fillna(0)  # weighted V_
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] - V[-1, ix]
    V = V[:-1, :-1]

    # take the first n-1 groups
    U = Z_j.iloc[:-1] @ np.linalg.pinv(
        V[:-1, :-1]) @ Z_j.iloc[:-1]  # Z.T*inv(V)*Z

    # compute the p-values and tests
    p_value = _chisq_test_p_value(U, n_groups - 1)
    return StatisticalResult(p_value,
                             U,
                             t_0=t_0,
                             null_distribution="chi squared",
                             degrees_of_freedom=n_groups - 1,
                             **kwargs)
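
The weightings documented above can be exercised like this; a sketch only, reusing the toy data from the docstring examples (the p=0, q=1 choice for Fleming-Harrington is an arbitrary illustration of a late-difference weighting).

from lifelines.statistics import multivariate_logrank_test

T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]

# Wilcoxon/Breslow weighting (w_i = n_i) emphasises early differences.
wilcoxon = multivariate_logrank_test(T, G, E, weightings="wilcoxon")

# Fleming-Harrington requires non-negative p and q; q > 0 up-weights late times.
fh = multivariate_logrank_test(T, G, E, weightings="fleming-harrington", p=0, q=1)

print(wilcoxon.test_statistic, wilcoxon.p_value)
print(fh.test_statistic, fh.p_value)
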
Example #5
def test_group_survival_table_from_events_works_with_series():
    df = pd.DataFrame([[1, True, 3], [1, True, 3], [4, False, 2]], columns=["duration", "E", "G"])
    ug, _, _, _ = utils.group_survival_table_from_events(df.G, df.duration, df.E, np.array([[0, 0, 0]]))
    npt.assert_array_equal(ug, np.array([3, 2]))
Example #6
def multivariate_logrank_test(event_durations,
                              groups,
                              event_observed=None,
                              alpha=0.95,
                              t_0=-1,
                              **kwargs):  # pylint: disable=too-many-locals
    """
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
      be equal when n=2):

    H_0: all event series are from the same generating process
    H_A: there exists at least one group that differs from the others.

    Parameters:
      event_durations: a (n,) numpy array of the (partial) lifetimes of all individuals
      groups: a (n,) numpy array of unique group labels for each individual.
      event_observed: a (n,) numpy array of event observations: 1 if observed death, 0 if censored. Defaults
          to all observed.
      alpha: the level of significance desired.
      t_0: the final time to compare the series' up to. Defaults to all.
      kwargs: add keywords and meta-data to the experiment summary.

    Returns:
      results: a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'test_result'

    Example:

        >> df = pd.DataFrame({
            'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
            'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
            'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        })
        >> result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
        >> result.test_statistic
        >> result.p_value


        >> # numpy example
        >> G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        >> T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
        >> E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
        >> result = multivariate_logrank_test(T, G, E)
        >> result.test_statistic



    """
    if not (0 < alpha <= 1.0):
        raise ValueError("alpha parameter must be between 0 and 1.")

    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    assert (n == np.max(event_durations.shape) == np.max(
        event_observed.shape)), "inputs must be of the same length."
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.asarray(x).reshape(n)),
        [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    N_j = obs.sum(0).values
    n_ij = rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0)
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev = n_ij.mul(d_i / n_i, axis="index").sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) /
               (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2
    n_ij["_"] = n_i.values
    V_ = n_ij.mul(np.sqrt(factor), axis="index").fillna(0)
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] - V[-1, ix]
    V = V[:-1, :-1]

    # take the first n-1 groups
    U = Z_j.iloc[:-1].dot(np.linalg.pinv(V[:-1, :-1])).dot(
        Z_j.iloc[:-1])  # Z.T*inv(V)*Z

    # compute the p-values and tests
    test_result, p_value = chisq_test(U, n_groups - 1, alpha)

    return StatisticalResult(test_result,
                             p_value,
                             U,
                             t_0=t_0,
                             alpha=alpha,
                             null_distribution="chi squared",
                             df=n_groups - 1,
                             **kwargs)
Example #7
def multivariate_logrank_test(event_durations, groups, event_observed=None,
                              alpha=0.95, t_0=-1, **kwargs):
    """
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
      be equal when n=2):

    H_0: all event series are from the same generating process
    H_A: there exists at least one group that differs from the others.

    Parameters:
      event_durations: a (n,) numpy array of the (partial) lifetimes of all individuals
      groups: a (n,) numpy array of unique group labels for each individual.
      event_observed: a (n,) numpy array of event observations: 1 if observed death, 0 if censored. Defaults
          to all observed.
      alpha: the level of significance desired.
      t_0: the final time to compare the series' up to. Defaults to all.
      kwargs: add keywords and meta-data to the experiment summary.

    Returns:
      results: a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'test_result'

    """
    if not (0 < alpha <= 1.):
        raise ValueError('alpha parameter must be between 0 and 1.')

    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    assert n == np.max(event_durations.shape) == np.max(event_observed.shape), "inputs must be of the same length."
    groups, event_durations, event_observed = map(lambda x: pd.Series(np.asarray(x).reshape(n,)), [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    N_j = obs.sum(0).values
    n_ij = (rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0))
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev = n_ij.mul(d_i / n_i, axis='index').sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) / (n_i - 1)).replace(np.inf, 1)) * d_i
    n_ij['_'] = n_i.values
    V_ = n_ij.mul(np.sqrt(factor) / n_i, axis='index').fillna(1)
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = -V[-1, ix] + V[ix, ix]
    V = V[:-1, :-1]

    # take the first n-1 groups
    U = Z_j.iloc[:-1].dot(np.linalg.pinv(V[:-1, :-1]).dot(Z_j.iloc[:-1]))  # Z.T*inv(V)*Z

    # compute the p-values and tests
    test_result, p_value = chisq_test(U, n_groups - 1, alpha)

    return StatisticalResult(test_result, p_value, U, t_0=t_0,
                             alpha=alpha, null_distribution='chi squared',
                             df=n_groups - 1, **kwargs)
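
To make the "compute the factors needed" block concrete, here is a tiny numeric sketch with hypothetical counts of the observed-minus-expected vector Z_j that every implementation on this page builds; it sums to zero, which is exactly what the assert checks.

import numpy as np

n_ij = np.array([[5, 5],    # at risk in each group just before t_1
                 [4, 3]])   # at risk just before t_2
d_i = np.array([1, 2])      # total deaths at t_1, t_2
n_i = n_ij.sum(axis=1)      # total at risk at each time: [10, 7]

# expected deaths per group: the group's share of the risk set times d_i
expected = (n_ij * (d_i / n_i)[:, None]).sum(axis=0)
observed = np.array([2, 1])             # hypothetical observed deaths per group
Z_j = observed - expected
print(expected, Z_j, Z_j.sum())         # Z_j sums to ~0
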
Example #8
def multivariate_logrank_test(event_durations,
                              groups,
                              event_observed=None,
                              t_0=-1,
                              **kwargs):  # pylint: disable=too-many-locals
    r"""
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
    be equal when n=2):

    .. math::
        \begin{align}
         & H_0: h_1(t) = h_2(t) = h_3(t) = ... = h_n(t) \\
         & H_A: \text{there exists at least one group that differs from the others.}
        \end{align}


    Parameters
    ----------

    event_durations: iterable
        a (n,) list-like representing the (possibly partial) durations of all individuals

    groups: iterable
        a (n,) list-like of unique group labels for each individual.

    event_observed: iterable, optional
        a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed.

    t_0: float, optional (default=-1)
        the period under observation, -1 for all time.

    kwargs:
        add keywords and meta-data to the experiment summary.


    Returns
    -------

    StatisticalResult
       a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Examples
    --------

    >>> df = pd.DataFrame({
    >>>    'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>    'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>    'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
    >>> })
    >>> result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
    >>> result.test_statistic
    >>> result.p_value
    >>> result.print_summary()


    >>> # numpy example
    >>> G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
    >>> T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
    >>> E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
    >>> result = multivariate_logrank_test(T, G, E)
    >>> result.test_statistic


    See Also
    --------
    pairwise_logrank_test
    logrank_test
    """
    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    assert n == np.max(event_durations.shape) == np.max(
        event_observed.shape), "inputs must be of the same length."
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.asarray(x).reshape(n)),
        [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    N_j = obs.sum(0).values
    n_ij = rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0)
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev = n_ij.mul(d_i / n_i, axis="index").sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) /
               (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2
    n_ij["_"] = n_i.values
    V_ = n_ij.mul(np.sqrt(factor), axis="index").fillna(0)
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] - V[-1, ix]
    V = V[:-1, :-1]

    # take the first n-1 groups
    U = Z_j.iloc[:-1] @ np.linalg.pinv(
        V[:-1, :-1]) @ Z_j.iloc[:-1]  # Z.T*inv(V)*Z

    # compute the p-values and tests
    p_value = chisq_test(U, n_groups - 1)

    return StatisticalResult(p_value,
                             U,
                             t_0=t_0,
                             null_distribution="chi squared",
                             degrees_of_freedom=n_groups - 1,
                             **kwargs)
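
The chisq_test helper used above is internal to lifelines, but for a statistic U with n_groups - 1 degrees of freedom the p-value should amount to the chi-squared survival function; a sketch of that relationship with scipy, under the assumption that the helper does nothing beyond this.

from scipy import stats

def chisq_p_value_sketch(U, degrees_of_freedom):
    # P(X >= U) for X ~ chi-squared with the given degrees of freedom.
    return stats.chi2.sf(U, degrees_of_freedom)
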
Example #9
def multivariate_logrank_test(event_durations, groups, event_observed=None, 
                              alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
    """
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
      be equal when n=2):

    H_0: all event series are from the same generating process
    H_A: there exists at least one group that differs from the others.

    Parameters:
      event_durations: a (n,) numpy array of the (partial) lifetimes of all individuals
      groups: a (n,) numpy array of unique group labels for each individual.
      event_observed: a (n,) numpy array of event observations: 1 if observed death, 0 if censored. Defaults
          to all observed.
      alpha: the level of significance desired.
      t_0: the final time to compare the series' up to. Defaults to all.
      suppress_print: if True, do not print the summary. Default False.
      kwargs: add keywords and meta-data to the experiment summary.

    Returns:
      summary: a print-friendly summary of the statistical test
      p_value: the p-value
      test_result: True if we reject the null, (pedantically) None if we cannot reject the null.

    """
    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))

    n = max(event_durations.shape)
    assert n == max(event_durations.shape) == max(event_observed.shape), "inputs must be of the same length."
    groups, event_durations, event_observed = map(lambda x: pd.Series(np.reshape(x, (n,))), [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(groups, event_durations, event_observed, np.zeros_like(event_durations), t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    N_j = obs.sum(0).values
    n_ij = (rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0))
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev = n_ij.mul(d_i / n_i, axis='index').sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    V_ = n_ij.mul(np.sqrt(d_i) / n_i, axis='index').fillna(1)
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] + ev

    # take the first n-1 groups
    U = Z_j.iloc[:-1].dot(np.linalg.pinv(V[:-1, :-1]).dot(Z_j.iloc[:-1]))  # Z.T*inv(V)*Z

    # compute the p-values and tests
    test_result, p_value = chisq_test(U, n_groups - 1, alpha)
    summary = pretty_print_summary(test_result, p_value, U, t_0=t_0, test='logrank',
                                   alpha=alpha, null_distribution='chi squared',
                                   df=n_groups - 1, **kwargs)

    if not suppress_print:
        print(summary)
    return summary, p_value, test_result
Example #10
def test_group_survival_table_from_events_works_with_series():
    df = pd.DataFrame([[1, True, 3], [1, True, 3], [4, False, 2]], columns=['duration', 'E', 'G'])
    ug, _, _, _ = utils.group_survival_table_from_events(df.G, df.duration, df.E, np.array([[0, 0, 0]]))
    npt.assert_array_equal(ug, np.array([3, 2]))
Example #11
def multivariate_logrank_test(event_durations,
                              groups,
                              event_observed=None,
                              alpha=0.95,
                              t_0=-1,
                              **kwargs):
    """
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
      be equal when n=2):

    H_0: all event series are from the same generating process
    H_A: there exists at least one group that differs from the others.

    Parameters:
      event_durations: a (n,) numpy array of the (partial) lifetimes of all individuals
      groups: a (n,) numpy array of unique group labels for each individual.
      event_observed: a (n,) numpy array of event observations: 1 if observed death, 0 if censored. Defaults
          to all observed.
      alpha: the level of significance desired.
      t_0: the final time to compare the series' up to. Defaults to all.
      kwargs: add keywords and meta-data to the experiment summary.

    Returns:
      results: a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'test_result'

    """
    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    assert n == np.max(event_durations.shape) == np.max(
        event_observed.shape), "inputs must be of the same length."
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.reshape(x, (n, ))),
        [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    N_j = obs.sum(0).values
    n_ij = (rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0))
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev = n_ij.mul(d_i / n_i, axis='index').sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) / (n_i - 1)).replace(np.inf, 1)) * d_i
    n_ij['_'] = n_i.values
    V_ = n_ij.mul(np.sqrt(factor) / n_i, axis='index').fillna(1)
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = -V[-1, ix] + V[ix, ix]
    V = V[:-1, :-1]

    # take the first n-1 groups
    U = Z_j.iloc[:-1].dot(np.linalg.pinv(V[:-1, :-1]).dot(
        Z_j.iloc[:-1]))  # Z.T*inv(V)*Z

    # compute the p-values and tests
    test_result, p_value = chisq_test(U, n_groups - 1, alpha)

    return StatisticalResult(test_result,
                             p_value,
                             U,
                             t_0=t_0,
                             test='logrank',
                             alpha=alpha,
                             null_distribution='chi squared',
                             df=n_groups - 1,
                             **kwargs)
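
Every docstring on this page claims the test "should be equal" to logrank_test when n=2; a quick sketch of checking that claim on hypothetical two-group data, using the two-sample logrank_test from lifelines.statistics.

import numpy as np
from lifelines.statistics import logrank_test, multivariate_logrank_test

T = np.array([6, 7, 10, 15, 19, 25, 8, 12, 14, 20, 22, 30])
E = np.array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1])
G = np.array([0] * 6 + [1] * 6)

two_sample = logrank_test(T[G == 0], T[G == 1], E[G == 0], E[G == 1])
multi = multivariate_logrank_test(T, G, E)
print(two_sample.test_statistic, multi.test_statistic)  # expected to agree
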