Python PHReg 예제들, statsmodels.duration.hazard_regression.PHReg Python 예제들

예제 #1

0

파일 보기

파일: test_phreg.py 프로젝트: bashtage/statsmodels

    def test_fit_regularized(self):

        # Data set sizes
        for n,p in (50,2),(100,5):

            # Penalty weights
            for js,s in enumerate([0,0.1]):

                coef_name = "coef_%d_%d_%d" % (n, p, js)
                params = getattr(survival_enet_r_results, coef_name)

                fname = "survival_data_%d_%d.csv" % (n, p)
                time, status, entry, exog = self.load_file(fname)

                exog -= exog.mean(0)
                exog /= exog.std(0, ddof=1)

                model = PHReg(time, exog, status=status, ties='breslow')
                sm_result = model.fit_regularized(alpha=s)

                # The agreement isn't very high, the issue may be on
                # the R side.  See below for further checks.
                assert_allclose(sm_result.params, params, rtol=0.3)

                # The penalized log-likelihood that we are maximizing.
                def plf(params):
                    llf = model.loglike(params) / len(time)
                    L1_wt = 1
                    llf = llf - s * ((1 - L1_wt)*np.sum(params**2) / 2 + L1_wt*np.sum(np.abs(params)))
                    return llf

                # Confirm that we are doing better than glmnet.
                llf_r = plf(params)
                llf_sm = plf(sm_result.params)
                assert_equal(np.sign(llf_sm - llf_r), 1)

예제 #2

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_fit_regularized(self):

        # Data set sizes
        for n,p in (50,2),(100,5):

            # Penalty weights
            for js,s in enumerate([0,0.1]):

                coef_name = "coef_%d_%d_%d" % (n, p, js)
                coef = getattr(survival_enet_r_results, coef_name)

                fname = "survival_data_%d_%d.csv" % (n, p)
                time, status, entry, exog = self.load_file(fname)

                exog -= exog.mean(0)
                exog /= exog.std(0, ddof=1)

                mod = PHReg(time, exog, status=status, ties='breslow')
                rslt = mod.fit_regularized(alpha=s)

                # The agreement isn't very high, the issue may be on
                # their side.  They seem to use some approximations
                # that we are not using.
                assert_allclose(rslt.params, coef, rtol=0.3)

                # Smoke test for summary
                smry = rslt.summary()

예제 #3

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_formula(self):

        np.random.seed(34234)
        time = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200,4))
        entry = np.zeros_like(time)
        entry[0:10] = time[0:10] / 2

        df = pd.DataFrame({"time": time, "status": status,
                           "exog1": exog[:, 0], "exog2": exog[:, 1],
                           "exog3": exog[:, 2], "exog4": exog[:, 3],
                           "entry": entry})

        mod1 = PHReg(time, exog, status, entry=entry)
        rslt1 = mod1.fit()

        fml = "time ~ 0 + exog1 + exog2 + exog3 + exog4"
        mod2 = PHReg.from_formula(fml, df, status=status,
                                  entry=entry)
        rslt2 = mod2.fit()

        mod3 = PHReg.from_formula(fml, df, status="status",
                                  entry="entry")
        rslt3 = mod3.fit()

        assert_allclose(rslt1.params, rslt2.params)
        assert_allclose(rslt1.params, rslt3.params)
        assert_allclose(rslt1.bse, rslt2.bse)
        assert_allclose(rslt1.bse, rslt3.bse)

예제 #4

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_summary(self):
        # smoke test
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200,4))

        mod = PHReg(time, exog, status)
        rslt = mod.fit()
        rslt.summary()

예제 #5

0

파일 보기

파일: test_phreg.py 프로젝트: 5267/statsmodels

    def test_summary(self):
        # smoke test
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200,4))

        mod = PHReg(time, exog, status)
        rslt = mod.fit()
        smry = rslt.summary()

        strata = np.kron(np.arange(50), np.ones(4))
        mod = PHReg(time, exog, status, strata=strata)
        rslt = mod.fit()
        smry = rslt.summary()
        msg = "3 strata dropped for having no events"
        assert_(msg in str(smry))

        groups = np.kron(np.arange(25), np.ones(8))
        mod = PHReg(time, exog, status)
        rslt = mod.fit(groups=groups)
        smry = rslt.summary()

        entry = np.random.uniform(0.1, 0.8, 200) * time
        mod = PHReg(time, exog, status, entry=entry)
        rslt = mod.fit()
        smry = rslt.summary()
        msg = "200 observations have positive entry times"
        assert_(msg in str(smry))

예제 #6

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_offset(self):

        np.random.seed(34234)
        time = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200,4))

        mod1 = PHReg(time, exog, status)
        rslt1 = mod1.fit()
        offset = exog[:,0] * rslt1.params[0]
        exog = exog[:, 1:]

        mod2 = PHReg(time, exog, status, offset=offset)
        rslt2 = mod2.fit()

        assert_allclose(rslt2.params, rslt1.params[1:])

예제 #7

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_predict_formula(self):

        n = 100
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=n)
        status = np.random.randint(0, 2, n).astype(np.float64)
        exog = np.random.uniform(1, 2, size=(n, 2))

        df = pd.DataFrame({"time": time, "status": status,
                           "exog1": exog[:, 0], "exog2": exog[:, 1]})

        fml = "time ~ 0 + exog1 + np.log(exog2) + exog1*exog2"
        model1 = PHReg.from_formula(fml, df, status=status)
        result1 = model1.fit()

        from patsy import dmatrix
        dfp = dmatrix(model1.data.design_info.builder, df)

        pr1 = result1.predict()
        pr2 = result1.predict(exog=df)
        pr3 = model1.predict(result1.params, exog=dfp) # No standard errors
        pr4 = model1.predict(result1.params, cov_params=result1.cov_params(), exog=dfp)

        prl = (pr1, pr2, pr3, pr4)
        for i in range(4):
            for j in range(i):
                assert_allclose(prl[i].predicted_values, prl[j].predicted_values)

        prl = (pr1, pr2, pr4)
        for i in range(3):
            for j in range(i):
                assert_allclose(prl[i].standard_errors, prl[j].standard_errors)

예제 #8

0

파일 보기

파일: test_phreg.py 프로젝트: mrajancsr/statsmodels

    def test_predict(self):
        # All smoke tests. We should be able to convert the lhr and hr
        # tests into real tests against R.  There are many options to
        # this function that may interact in complicated ways.  Only a
        # few key combinations are tested here.
        np.random.seed(34234)
        endog = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200, 4))

        mod = PHReg(endog, exog, status)
        rslt = mod.fit()
        rslt.predict()
        for pred_type in "lhr", "hr", "cumhaz", "surv":
            rslt.predict(pred_type=pred_type)
            rslt.predict(endog=endog[0:10], pred_type=pred_type)
            rslt.predict(endog=endog[0:10], exog=exog[0:10, :], pred_type=pred_type)

예제 #9

0

파일 보기

파일: test_phreg.py 프로젝트: 5267/statsmodels

    def do1(self, fname, ties, entry_f, strata_f):

        # Read the test data.
        time, status, entry, exog = self.load_file(fname)
        n = len(time)

        vs = fname.split("_")
        n = int(vs[2])
        p = int(vs[3].split(".")[0])
        ties1 = ties[0:3]

        # Needs to match the kronecker statement in survival.R
        strata = np.kron(range(5), np.ones(n // 5))

        # No stratification or entry times
        mod = PHReg(time, exog, status, ties=ties)
        phrb = mod.fit(**args)
        coef_r, se_r, time_r, hazard_r = get_results(n, p, None, ties1)
        assert_allclose(phrb.params, coef_r, rtol=1e-3)
        assert_allclose(phrb.bse, se_r, rtol=1e-4)
        time_h, cumhaz, surv = phrb.baseline_cumulative_hazard[0]

        # Entry times but no stratification
        phrb = PHReg(time, exog, status, entry=entry,
                     ties=ties).fit(**args)
        coef, se, time_r, hazard_r = get_results(n, p, "et", ties1)
        assert_allclose(phrb.params, coef, rtol=1e-3)
        assert_allclose(phrb.bse, se, rtol=1e-3)

        # Stratification but no entry times
        phrb = PHReg(time, exog, status, strata=strata,
                      ties=ties).fit(**args)
        coef, se, time_r, hazard_r = get_results(n, p, "st", ties1)
        assert_allclose(phrb.params, coef, rtol=1e-4)
        assert_allclose(phrb.bse, se, rtol=1e-4)

        # Stratification and entry times
        phrb = PHReg(time, exog, status, entry=entry,
                     strata=strata, ties=ties).fit(**args)
        coef, se, time_r, hazard_r = get_results(n, p, "et_st", ties1)
        assert_allclose(phrb.params, coef, rtol=1e-3)
        assert_allclose(phrb.bse, se, rtol=1e-4)

        #smoke test
        time_h, cumhaz, surv = phrb.baseline_cumulative_hazard[0]

예제 #10

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_get_distribution(self):
        # Smoke test
        np.random.seed(34234)
        exog = np.random.normal(size=(200, 2))
        lin_pred = exog.sum(1)
        elin_pred = np.exp(-lin_pred)
        time = -elin_pred * np.log(np.random.uniform(size=200))

        mod = PHReg(time, exog)
        rslt = mod.fit()

        dist = rslt.get_distribution()

        fitted_means = dist.mean()
        true_means = elin_pred
        fitted_var = dist.var()
        fitted_sd = dist.std()
        sample = dist.rvs()

예제 #11

0

파일 보기

파일: test_phreg.py 프로젝트: bashtage/statsmodels

    def test_formula_args(self):

        np.random.seed(34234)
        n = 200
        time = 50 * np.random.uniform(size=n)
        status = np.random.randint(0, 2, size=n).astype(np.float64)
        exog = np.random.normal(size=(200, 2))
        offset = np.random.uniform(size=n)
        entry = np.random.uniform(0, 1, size=n) * time

        df = pd.DataFrame({"time": time, "status": status, "x1": exog[:, 0],
                           "x2": exog[:, 1], "offset": offset, "entry": entry})
        model1 = PHReg.from_formula("time ~ x1 + x2", status="status", offset="offset",
                                    entry="entry", data=df)
        result1 = model1.fit()
        model2 = PHReg.from_formula("time ~ x1 + x2", status=df.status, offset=df.offset,
                                    entry=df.entry, data=df)
        result2 = model2.fit()
        assert_allclose(result1.params, result2.params)
        assert_allclose(result1.bse, result2.bse)

예제 #12

0

파일 보기

파일: test_phreg.py 프로젝트: bashtage/statsmodels

    def test_get_distribution(self):
        np.random.seed(34234)
        n = 200
        exog = np.random.normal(size=(n, 2))
        lin_pred = exog.sum(1)
        elin_pred = np.exp(-lin_pred)
        time = -elin_pred * np.log(np.random.uniform(size=n))
        status = np.ones(n)
        status[0:20] = 0
        strata = np.kron(range(5), np.ones(n // 5))

        mod = PHReg(time, exog, status=status, strata=strata)
        rslt = mod.fit()

        dist = rslt.get_distribution()

        fitted_means = dist.mean()
        true_means = elin_pred
        fitted_var = dist.var()
        fitted_sd = dist.std()
        sample = dist.rvs()

예제 #13

0

파일 보기

파일: test_phreg.py 프로젝트: bashtage/statsmodels

    def test_formula_cat_interactions(self):

        time = np.r_[1, 2, 3, 4, 5, 6, 7, 8, 9]
        status = np.r_[1, 1, 0, 0, 1, 0, 1, 1, 1]
        x1 = np.r_[1, 1, 1, 2, 2, 2, 3, 3, 3]
        x2 = np.r_[1, 2, 3, 1, 2, 3, 1, 2, 3]
        df = pd.DataFrame({"time": time, "status": status,
                           "x1": x1, "x2": x2})

        model1 = PHReg.from_formula("time ~ C(x1) + C(x2) + C(x1)*C(x2)", status="status",
                                    data=df)
        assert_equal(model1.exog.shape, [9, 8])

예제 #14

0

파일 보기

파일: test_phreg.py 프로젝트: soumyadsanyal/statsmodels

    def test_post_estimation(self):
        # All regression tests
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200,4))

        mod = PHReg(time, exog, status)
        rslt = mod.fit()
        mart_resid = rslt.martingale_residuals
        assert_allclose(np.abs(mart_resid).sum(), 120.72475743348433)

        w_avg = rslt.weighted_covariate_averages
        assert_allclose(np.abs(w_avg[0]).sum(0),
               np.r_[7.31008415, 9.77608674,10.89515885, 13.1106801])

        bc_haz = rslt.baseline_cumulative_hazard
        v = [np.mean(np.abs(x)) for x in bc_haz[0]]
        w = np.r_[23.482841556421608, 0.44149255358417017,
                  0.68660114081275281]
        assert_allclose(v, w)

        score_resid = rslt.score_residuals
        v = np.r_[ 0.50924792, 0.4533952, 0.4876718, 0.5441128]
        w = np.abs(score_resid).mean(0)
        assert_allclose(v, w)

        groups = np.random.randint(0, 3, 200)
        mod = PHReg(time, exog, status)
        rslt = mod.fit(groups=groups)
        robust_cov = rslt.cov_params()
        v = [0.00513432, 0.01278423, 0.00810427, 0.00293147]
        w = np.abs(robust_cov).mean(0)
        assert_allclose(v, w, rtol=1e-6)

        s_resid = rslt.schoenfeld_residuals
        ii = np.flatnonzero(np.isfinite(s_resid).all(1))
        s_resid = s_resid[ii, :]
        v = np.r_[0.85154336, 0.72993748, 0.73758071, 0.78599333]
        assert_allclose(np.abs(s_resid).mean(0), v)

예제 #15

0

파일 보기

파일: _kernel_estimates.py 프로젝트: arnab0000/Internships

def _kernel_cumincidence(time, status, exog, kfunc, freq_weights,
                         dimred=True):
    """
    Calculates cumulative incidence functions using kernels.

    Parameters
    ----------
    time : array_like
        The observed time values
    status : array_like
        The status values.  status == 0 indicates censoring,
        status == 1, 2, ... are the events.
    exog : array_like
        Covariates such that censoring becomes independent of
        outcome times conditioned on the covariate values.
    kfunc : function
        A kernel function
    freq_weights : array_like
        Optional frequency weights
    dimred : bool
        If True, proportional hazards regression models are used to
        reduce exog to two columns by predicting overall events and
        censoring in two separate models.  If False, exog is used
        directly for calculating kernel weights without dimension
        reduction.
    """

    # Reorder so time is ascending
    ii = np.argsort(time)
    time = time[ii]
    status = status[ii]
    exog = exog[ii, :]
    nobs = len(time)

    # Convert the unique times to ranks (0, 1, 2, ...)
    utime, rtime = np.unique(time, return_inverse=True)

    # Last index where each unique time occurs.
    ie = np.searchsorted(time, utime, side='right') - 1

    ngrp = int(status.max())

    # All-cause status
    statusa = (status >= 1).astype(np.float64)

    if freq_weights is not None:
        freq_weights = freq_weights / freq_weights.sum()

    ip = []
    sp = [None] * nobs
    n_risk = [None] * nobs
    kd = [None] * nobs
    for k in range(ngrp):
        status0 = (status == k + 1).astype(np.float64)

        # Dimension reduction step
        if dimred:
            sfe = PHReg(time, exog, status0).fit()
            fitval_e = sfe.predict().predicted_values
            sfc = PHReg(time, exog, 1 - status0).fit()
            fitval_c = sfc.predict().predicted_values
            exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))
            exog2d -= exog2d.mean(0)
            exog2d /= exog2d.std(0)
        else:
            exog2d = exog

        ip0 = 0
        for i in range(nobs):

            if k == 0:
                kd1 = exog2d - exog2d[i, :]
                kd1 = kfunc(kd1)
                kd[i] = kd1

            # Get the local all-causes survival function
            if k == 0:
                denom = np.cumsum(kd[i][::-1])[::-1]
                num = kd[i] * statusa
                rat = num / denom
                tr = 1e-15
                ii = np.flatnonzero((denom < tr) & (num < tr))
                rat[ii] = 0
                ratc = 1 - rat
                ratc = np.clip(ratc, 1e-10, np.inf)
                lrat = np.log(ratc)
                prat = np.cumsum(lrat)[ie]
                sf = np.exp(prat)
                sp[i] = np.r_[1, sf[:-1]]
                n_risk[i] = denom[ie]

            # Number of cause-specific deaths at each unique time.
            d0 = np.bincount(rtime, weights=status0*kd[i],
                             minlength=len(utime))

            # The cumulative incidence function probabilities.  Carry
            # forward once the effective sample size drops below 1.
            ip1 = np.cumsum(sp[i] * d0 / n_risk[i])
            jj = len(ip1) - np.searchsorted(n_risk[i][::-1], 1)
            if jj < len(ip1):
                ip1[jj:] = ip1[jj - 1]
            if freq_weights is None:
                ip0 += ip1
            else:
                ip0 += freq_weights[i] * ip1

        if freq_weights is None:
            ip0 /= nobs

        ip.append(ip0)

    return utime, ip

예제 #16

0

파일 보기

파일: _kernel_estimates.py 프로젝트: BranYang/statsmodels

def _kernel_cumincidence(time, status, exog, kfunc, freq_weights,
                         dimred=True):
    """
    Calculates cumulative incidence functions using kernels.

    Parameters
    ----------
    time : array-like
        The observed time values
    status : array-like
        The status values.  status == 0 indicates censoring,
        status == 1, 2, ... are the events.
    exog : array-like
        Covariates such that censoring becomes independent of
        outcome times conditioned on the covariate values.
    kfunc : function
        A kernel function
    freq_weights : array-like
        Optional frequency weights
    dimred : boolean
        If True, proportional hazards regression models are used to
        reduce exog to two columns by predicting overall events and
        censoring in two separate models.  If False, exog is used
        directly for calculating kernel weights without dimension
        reduction.
    """

    # Reorder so time is ascending
    ii = np.argsort(time)
    time = time[ii]
    status = status[ii]
    exog = exog[ii, :]
    nobs = len(time)

    # Convert the unique times to ranks (0, 1, 2, ...)
    utime, rtime = np.unique(time, return_inverse=True)

    # Last index where each unique time occurs.
    ie = np.searchsorted(time, utime, side='right') - 1

    ngrp = int(status.max())

    # All-cause status
    statusa = (status >= 1).astype(np.float64)

    if freq_weights is not None:
        freq_weights = freq_weights / freq_weights.sum()

    ip = []
    sp = [None] * nobs
    n_risk = [None] * nobs
    kd = [None] * nobs
    for k in range(ngrp):
        status0 = (status == k + 1).astype(np.float64)

        # Dimension reduction step
        if dimred:
            sfe = PHReg(time, exog, status0).fit()
            fitval_e = sfe.predict().predicted_values
            sfc = PHReg(time, exog, 1 - status0).fit()
            fitval_c = sfc.predict().predicted_values
            exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))
            exog2d -= exog2d.mean(0)
            exog2d /= exog2d.std(0)
        else:
            exog2d = exog

        ip0 = 0
        for i in range(nobs):

            if k == 0:
                kd1 = exog2d - exog2d[i, :]
                kd1 = kfunc(kd1)
                kd[i] = kd1

            # Get the local all-causes survival function
            if k == 0:
                denom = np.cumsum(kd[i][::-1])[::-1]
                num = kd[i] * statusa
                rat = num / denom
                tr = 1e-15
                ii = np.flatnonzero((denom < tr) & (num < tr))
                rat[ii] = 0
                ratc = 1 - rat
                ratc = np.clip(ratc, 1e-10, np.inf)
                lrat = np.log(ratc)
                prat = np.cumsum(lrat)[ie]
                sf = np.exp(prat)
                sp[i] = np.r_[1, sf[:-1]]
                n_risk[i] = denom[ie]

            # Number of cause-specific deaths at each unique time.
            d0 = np.bincount(rtime, weights=status0*kd[i],
                             minlength=len(utime))

            # The cumulative incidence function probabilities.  Carry
            # forward once the effective sample size drops below 1.
            ip1 = np.cumsum(sp[i] * d0 / n_risk[i])
            jj = len(ip1) - np.searchsorted(n_risk[i][::-1], 1)
            if jj < len(ip1):
                ip1[jj:] = ip1[jj - 1]
            if freq_weights is None:
                ip0 += ip1
            else:
                ip0 += freq_weights[i] * ip1

        if freq_weights is None:
            ip0 /= nobs

        ip.append(ip0)

    return utime, ip

예제 #17

0

파일 보기

파일: _kernel_estimates.py 프로젝트: BranYang/statsmodels

def _kernel_survfunc(time, status, exog, kfunc, freq_weights):
    """
    Estimate the marginal survival function under dependent censoring.

    Parameters
    ----------
    time : array-like
        The observed times for each subject
    status : array-like
        The status for each subject (1 indicates event, 0 indicates
        censoring)
    exog : array-like
        Covariates such that censoring is independent conditional on
        exog
    kfunc : function
        Kernel function
    freq_weights : array-like
        Optional frequency weights

    Returns
    -------
    probs : array-like
        The estimated survival probabilities
    times : array-like
        The times at which the survival probabilities are estimated

    References
    ----------
    Zeng, Donglin 2004. Estimating Marginal Survival Function by
    Adjusting for Dependent Censoring Using Many Covariates. The
    Annals of Statistics 32 (4): 1533 55.
    doi:10.1214/009053604000000508.
    http://arxiv.org/pdf/math/0409180.pdf
    """

    # Dimension reduction step
    sfe = PHReg(time, exog, status).fit()
    fitval_e = sfe.predict().predicted_values
    sfc = PHReg(time, exog, 1 - status).fit()
    fitval_c = sfc.predict().predicted_values
    exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))

    n = len(time)
    ixd = np.flatnonzero(status == 1)

    # For consistency with standard KM, only compute the survival
    # function at the times of observed events.
    utime = np.unique(time[ixd])

    # Reorder everything so time is ascending
    ii = np.argsort(time)
    time = time[ii]
    status = status[ii]
    exog2d = exog2d[ii, :]

    # Last index where each evaluation time occurs.
    ie = np.searchsorted(time, utime, side='right') - 1

    if freq_weights is not None:
        freq_weights = freq_weights / freq_weights.sum()

    sprob = 0.
    for i in range(n):

        kd = exog2d - exog2d[i, :]
        kd = kfunc(kd)

        denom = np.cumsum(kd[::-1])[::-1]
        num = kd * status
        rat = num / denom
        tr = 1e-15
        ii = np.flatnonzero((denom < tr) & (num < tr))
        rat[ii] = 0
        ratc = 1 - rat
        ratc = np.clip(ratc, 1e-12, np.inf)
        lrat = np.log(ratc)
        prat = np.cumsum(lrat)[ie]
        prat = np.exp(prat)

        if freq_weights is None:
            sprob += prat
        else:
            sprob += prat * freq_weights[i]

    if freq_weights is None:
        sprob /= n

    return sprob, utime

예제 #18

0

파일 보기

파일: _kernel_estimates.py 프로젝트: arnab0000/Internships

def _kernel_survfunc(time, status, exog, kfunc, freq_weights):
    """
    Estimate the marginal survival function under dependent censoring.

    Parameters
    ----------
    time : array_like
        The observed times for each subject
    status : array_like
        The status for each subject (1 indicates event, 0 indicates
        censoring)
    exog : array_like
        Covariates such that censoring is independent conditional on
        exog
    kfunc : function
        Kernel function
    freq_weights : array_like
        Optional frequency weights

    Returns
    -------
    probs : array_like
        The estimated survival probabilities
    times : array_like
        The times at which the survival probabilities are estimated

    References
    ----------
    Zeng, Donglin 2004. Estimating Marginal Survival Function by
    Adjusting for Dependent Censoring Using Many Covariates. The
    Annals of Statistics 32 (4): 1533 55.
    doi:10.1214/009053604000000508.
    https://arxiv.org/pdf/math/0409180.pdf
    """

    # Dimension reduction step
    sfe = PHReg(time, exog, status).fit()
    fitval_e = sfe.predict().predicted_values
    sfc = PHReg(time, exog, 1 - status).fit()
    fitval_c = sfc.predict().predicted_values
    exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))

    n = len(time)
    ixd = np.flatnonzero(status == 1)

    # For consistency with standard KM, only compute the survival
    # function at the times of observed events.
    utime = np.unique(time[ixd])

    # Reorder everything so time is ascending
    ii = np.argsort(time)
    time = time[ii]
    status = status[ii]
    exog2d = exog2d[ii, :]

    # Last index where each evaluation time occurs.
    ie = np.searchsorted(time, utime, side='right') - 1

    if freq_weights is not None:
        freq_weights = freq_weights / freq_weights.sum()

    sprob = 0.
    for i in range(n):

        kd = exog2d - exog2d[i, :]
        kd = kfunc(kd)

        denom = np.cumsum(kd[::-1])[::-1]
        num = kd * status
        rat = num / denom
        tr = 1e-15
        ii = np.flatnonzero((denom < tr) & (num < tr))
        rat[ii] = 0
        ratc = 1 - rat
        ratc = np.clip(ratc, 1e-12, np.inf)
        lrat = np.log(ratc)
        prat = np.cumsum(lrat)[ie]
        prat = np.exp(prat)

        if freq_weights is None:
            sprob += prat
        else:
            sprob += prat * freq_weights[i]

    if freq_weights is None:
        sprob /= n

    return sprob, utime

예제 #19

0

파일 보기

파일: test_phreg.py 프로젝트: nmlook/statsmodels

    def test_summary(self):
        # smoke test
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=200)
        status = np.random.randint(0, 2, 200).astype(np.float64)
        exog = np.random.normal(size=(200, 4))

        mod = PHReg(time, exog, status)
        rslt = mod.fit()
        smry = rslt.summary()

        strata = np.kron(np.arange(50), np.ones(4))
        mod = PHReg(time, exog, status, strata=strata)
        rslt = mod.fit()
        smry = rslt.summary()
        msg = "3 strata dropped for having no events"
        assert_(msg in str(smry))

        groups = np.kron(np.arange(25), np.ones(8))
        mod = PHReg(time, exog, status)
        rslt = mod.fit(groups=groups)
        smry = rslt.summary()

        entry = np.random.uniform(0.1, 0.8, 200) * time
        mod = PHReg(time, exog, status, entry=entry)
        rslt = mod.fit()
        smry = rslt.summary()
        msg = "200 observations have positive entry times"
        assert_(msg in str(smry))