예제 #1
0
def test_cross_validator_returns_k_results():
    """k_fold_cross_validation must return exactly one score per fold."""
    fitter = CoxPHFitter()
    for k in (3, 5):
        scores = utils.k_fold_cross_validation(
            fitter, load_regression_dataset(), duration_col='T', event_col='E', k=k)
        assert len(scores) == k
예제 #2
0
    def train(self, X, Y):
        """Fit the wrapped survival model; the call signature depends on its type.

        X is the covariate frame; Y holds 'SurvivalTime' and 'Event' columns.
        """
        model_type = str(type(self.model))
        durations = Y['SurvivalTime']
        events = Y['Event']

        if 'semi_parametric' in model_type or 'multi_task' in model_type:
            self.model.fit(X=X, T=durations, E=events,
                           init_method='zeros', num_epochs=500)

        if 'survival_forest' in model_type:
            self.model.fit(X=X, T=durations, E=events,
                           max_features='all', max_depth=20,
                           sample_size_pct=0.33)

        # lifelines models are instead scored via k-fold cross validation.
        if 'lifelines' in model_type:
            k_fold_cross_validation(self.model,
                                    pd.concat([X, Y], axis=1),
                                    'SurvivalTime',
                                    event_col='Event',
                                    k=5)
예제 #3
0
def test_cross_validator_returns_k_results():
    """Cross validation should yield k scores for k folds."""
    model = CoxPHFitter()

    three_fold = utils.k_fold_cross_validation(
        model, load_regression_dataset(), duration_col="T", event_col="E", k=3
    )
    assert len(three_fold) == 3

    five_fold = utils.k_fold_cross_validation(
        model, load_regression_dataset(), duration_col="T", event_col="E", k=5
    )
    assert len(five_fold) == 5
예제 #4
0
def test_cross_validator_with_specific_loss_function():
    """A custom evaluation_measure should change the reported scores."""

    def square_loss(y_actual, y_pred):
        return ((y_actual - y_pred) ** 2).mean()

    model = CoxPHFitter()
    squared = utils.k_fold_cross_validation(
        model, load_regression_dataset(), evaluation_measure=square_loss,
        duration_col='T', event_col='E')
    default = utils.k_fold_cross_validation(
        model, load_regression_dataset(), duration_col='T', event_col='E')
    assert list(squared) != list(default)
예제 #5
0
def test_cross_validator_with_specific_loss_function():
    """Scores under a squared-error measure differ from the default measure."""
    def square_loss(y_actual, y_pred):
        return ((y_actual - y_pred) ** 2).mean()

    cox = CoxPHFitter()
    results_sq = utils.k_fold_cross_validation(
        cox,
        load_regression_dataset(),
        evaluation_measure=square_loss,
        duration_col='T',
        event_col='E')
    results_con = utils.k_fold_cross_validation(
        cox,
        load_regression_dataset(),
        duration_col='T',
        event_col='E')
    assert list(results_sq) != list(results_con)
예제 #6
0
def test_cross_validator_returns_fitters_k_results():
    """A list of fitters yields one list of k scores per fitter."""
    cf = CoxPHFitter()
    fitters = [cf, cf]
    for k in (3, 5):
        results = utils.k_fold_cross_validation(
            fitters, load_regression_dataset(), duration_col='T', event_col='E', k=k)
        assert len(results) == 2
        assert len(results[0]) == len(results[1]) == k
예제 #7
0
def test_cross_validator_returns_fitters_k_results():
    """Two fitters in a list produce two score-lists of length k each."""
    model = CoxPHFitter()
    pair = [model, model]

    k3 = utils.k_fold_cross_validation(pair, load_regression_dataset(),
                                       duration_col="T", event_col="E", k=3)
    assert len(k3) == 2
    assert len(k3[0]) == len(k3[1]) == 3

    k5 = utils.k_fold_cross_validation(pair, load_regression_dataset(),
                                       duration_col="T", event_col="E", k=5)
    assert len(k5) == 2
    assert len(k5[0]) == len(k5[1]) == 5
예제 #8
0
    def test_crossval_for_cox_ph_normalized(self, data_pred2, data_pred1):
        """3-fold CV c-index on standardized data should exceed 0.9."""
        cf = CoxPHFitter()
        for data_pred in [data_pred1, data_pred2]:
            data_norm = data_pred.copy()

            # Standardize to mean 0 / std 1; 'x2' exists in only one fixture.
            columns = ['t', 'x1']
            if 'x2' in data_norm.columns:
                columns.append('x2')
            for col in columns:
                series = data_norm[col]
                series -= np.mean(series)
                series /= np.std(series)
                data_norm[col] = series

            scores = k_fold_cross_validation(cf, data_norm,
                                             duration_col='t',
                                             event_col='E', k=3,
                                             predictor='predict_partial_hazard')

            # Partial hazards rank inversely to survival time, hence 1 - mean.
            mean_score = 1 - np.mean(scores)
            expected = 0.9
            msg = "Expected min-mean c-index {:.2f} < {:.2f}"
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #9
0
    def test_crossval_for_cox_ph_normalized(self, data_pred2, data_pred1):
        """Cross-validated partial-hazard c-index on z-scored data must beat 0.9."""

        def _zscore(values):
            # In-place standardization to mean 0, std 1.
            values -= np.mean(values)
            values /= np.std(values)
            return values

        cf = CoxPHFitter()
        for data_pred in (data_pred1, data_pred2):
            data_norm = data_pred.copy()
            data_norm['t'] = _zscore(data_norm['t'])
            data_norm['x1'] = _zscore(data_norm['x1'])
            if 'x2' in data_norm.columns:
                data_norm['x2'] = _zscore(data_norm['x2'])

            scores = k_fold_cross_validation(cf, data_norm,
                                             duration_col='t',
                                             event_col='E', k=3,
                                             predictor='predict_partial_hazard')

            mean_score = 1 - np.mean(scores)  # partial hazards are anti-concordant
            expected = 0.9
            msg = "Expected min-mean c-index {:.2f} < {:.2f}"
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #10
0
    def test_crossval_for_cox_ph_with_normalizing_times(
            self, data_pred2, data_pred1):
        """Standardizing the duration column should not degrade the CV score."""
        cf = CoxPHFitter()

        for data_pred in (data_pred1, data_pred2):
            data_norm = data_pred.copy()
            t = data_norm['t']
            # z-score the durations: mean 0, std 1
            t -= np.mean(t)
            t /= np.std(t)
            data_norm['t'] = t

            scores = k_fold_cross_validation(
                cf, data_norm, duration_col='t', event_col='E', k=3,
                predictor='predict_partial_hazard')

            mean_score = 1 - np.mean(scores)

            expected = 0.9
            assert mean_score > expected, \
                "Expected min-mean c-index {:.2f} < {:.2f}".format(expected, mean_score)
예제 #11
0
def test_cross_validator_with_specific_loss_function():
    """CV with an explicit scoring_method runs and returns fold scores."""
    cf = CoxPHFitter()
    results_sq = utils.k_fold_cross_validation(
        cf,
        load_regression_dataset(),
        scoring_method="concordance_index",
        duration_col="T",
        event_col="E")
    # BUG FIX: the test previously made no assertion at all, so it could
    # never fail on a broken return value.
    assert len(results_sq) > 0
예제 #12
0
def test_cross_validator_with_predictor_and_kwargs():
    """predictor_kwargs must be forwarded to the named predictor method."""
    cf = CoxPHFitter()
    results_06 = utils.k_fold_cross_validation(cf,
                                               load_regression_dataset(),
                                               duration_col='T',
                                               k=3,
                                               predictor="predict_percentile",
                                               predictor_kwargs={'p': 0.6})
    # BUG FIX: the return value was previously ignored and nothing was
    # asserted; k=3 folds must yield 3 scores (matches the sibling test).
    assert len(results_06) == 3
예제 #13
0
def test_cross_validator_with_predictor():
    """Using predict_expectation as the predictor still yields k scores."""
    cf = CoxPHFitter()
    results = utils.k_fold_cross_validation(cf,
                                            load_regression_dataset(),
                                            duration_col='T',
                                            event_col='E',
                                            k=3,
                                            predictor="predict_expectation")
    # BUG FIX: previously no assertion was made, so the test could not fail
    # on a broken result; the sibling version of this test asserts len == 3.
    assert len(results) == 3
예제 #14
0
def main():
    """Univariate Cox screening on simulated data.

    For each of 50 random features: fit a single-feature Cox model on 100
    simulated training samples, skip features whose p-value exceeds 0.05,
    then report the 3-fold cross-validated c-index and a held-out
    (25-sample) validation c-index.
    """
    # simulating a feature matrix for 100 samples with 50 features
    data = np.random.random((100, 50))
    # simulating time of observations (days) min 10 days, max 2500 days
    observed_time = np.random.randint(10, 2500, (100))
    # simulating event (death) 0 did not occur 1 occured
    observed_event = np.random.randint(0, 2, (100))

    # independent hold-out set for validation
    test_data = np.random.random((25, 50))
    test_observed_time = np.random.randint(10, 2500, (25))
    test_observed_event = np.random.randint(0, 2, (25))

    for feature_id, feature_vect in enumerate(data.T):
        dataframe = pd.DataFrame({
            'feature nb{0}'.format(feature_id): feature_vect,
            'event': observed_event,
            'time': observed_time
        })

        # building a Cox PH model to see the significance of each independent feature
        cox_model = CoxPHFitter()

        cox_model.fit(dataframe, duration_col='time', event_col='event')

        # p-value of the single covariate in this univariate fit
        pvalue = cox_model.summary.p[0]
        print('pvalue: {0} for feature nb: {1}'.format(pvalue, feature_id))

        if pvalue > 0.05:
            print('feature nb {0} not overall significant!'.format(feature_id))
            continue

        # test the robustness: score close / higher to 0.7 is a good sign
        scores = k_fold_cross_validation(cox_model,
                                         dataframe,
                                         duration_col='time',
                                         event_col='event',
                                         k=3)

        print('score (mean) (c-index) for {0}'.format(np.mean(scores)))

        # validate the features on the test set
        test_dataframe = pd.DataFrame({
            'feature nb{0}'.format(feature_id):
            test_data.T[feature_id],
            'event':
            test_observed_event,
            'time':
            test_observed_time
        })

        inferred_time = cox_model.predict_expectation(test_dataframe)

        validation_c_index = concordance_index(test_observed_time,
                                               inferred_time,
                                               test_observed_event)

        print('validation c-index: {0}'.format(validation_c_index))
예제 #15
0
def test_cross_validator_with_predictor_and_kwargs():
    """predictor_kwargs should reach predict_percentile; k=3 gives 3 scores."""
    model = CoxPHFitter()
    results_06 = utils.k_fold_cross_validation(
        model,
        load_regression_dataset(),
        duration_col="T",
        k=3,
        predictor="predict_percentile",
        predictor_kwargs={"p": 0.6},
    )
    assert len(results_06) == 3
예제 #16
0
def test_cross_validator_with_predictor():
    """predict_expectation-based cross validation returns k scores."""
    model = CoxPHFitter()
    fold_scores = utils.k_fold_cross_validation(
        model,
        load_regression_dataset(),
        duration_col="T",
        event_col="E",
        k=3,
        predictor="predict_expectation",
    )
    assert len(fold_scores) == 3
예제 #17
0
def crossValidate(name, fitter):
    """Print the mean k-fold CV score of *fitter* on the module-level `times`
    frame (defined elsewhere in this file), framed by begin/end banners."""
    from lifelines.utils import k_fold_cross_validation
    import numpy as np
    print("Cross Validating " + name)
    fold_scores = k_fold_cross_validation(fitter,
                                          times,
                                          duration_col='time',
                                          event_col='success')
    print(np.mean(fold_scores))
    print("End cross-validation of " + name)
예제 #18
0
    def test_crossval_for_aalen_add(self, data_pred2, data_pred1):
        """Repeated 3-fold CV of AalenAdditiveFitter should average > 0.9."""
        aaf = AalenAdditiveFitter()
        for data_pred in [data_pred1, data_pred2]:
            mean_scores = []
            for repeat in range(20):
                scores = k_fold_cross_validation(aaf, data_pred,
                                                 duration_col='t',
                                                 event_col='E', k=3)
                mean_scores.append(np.mean(scores))

            expected = 0.90
            mean_score = np.mean(mean_scores)
            msg = "Expected min-mean c-index {:.2f} < {:.2f}"
            # BUG FIX: the failure message previously formatted
            # np.mean(scores) — the last repeat only — instead of the
            # aggregate value that the assertion actually compares.
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #19
0
    def test_crossval_for_aalen_add(self, data_pred2, data_pred1):
        """Repeated 3-fold CV of AalenAdditiveFitter should average > 0.9."""
        aaf = AalenAdditiveFitter()
        for data_pred in [data_pred1, data_pred2]:
            mean_scores = []
            for repeat in range(20):
                scores = k_fold_cross_validation(aaf, data_pred,
                                                 duration_col='t',
                                                 event_col='E', k=3)
                mean_scores.append(np.mean(scores))

            expected = 0.90
            mean_score = np.mean(mean_scores)
            msg = "Expected min-mean c-index {:.2f} < {:.2f}"
            # BUG FIX: the message previously called scores.mean(), but
            # k_fold_cross_validation returns a plain list, so a failing
            # assertion would raise AttributeError instead of reporting;
            # it also reported the wrong (last-repeat) value.
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #20
0
    def test_crossval_for_cox_ph(self, data_pred2, data_pred1):
        """3-fold CV c-index for CoxPHFitter should exceed 0.9 on both fixtures."""
        cf = CoxPHFitter()

        for dataset in (data_pred1, data_pred2):
            scores = k_fold_cross_validation(
                cf, dataset, duration_col='t', event_col='E', k=3,
                predictor='predict_partial_hazard')

            # Partial hazards are anti-concordant with survival time.
            mean_score = 1 - np.mean(scores)

            expected = 0.9
            msg = "Expected min-mean c-index {:.2f} < {:.2f}"
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #21
0
    def test_crossval_for_cox_ph(self, data_pred2, data_pred1):
        """Cross-validation sanity check for the plain Cox PH model."""
        model = CoxPHFitter()
        expected = 0.9
        msg = "Expected min-mean c-index {:.2f} < {:.2f}"

        for frame in [data_pred1, data_pred2]:
            fold_scores = k_fold_cross_validation(model, frame,
                                                  duration_col='t',
                                                  event_col='E', k=3,
                                                  predictor='predict_partial_hazard')
            # 1 - mean because predict_partial_hazard ranks inversely.
            mean_score = 1 - np.mean(fold_scores)
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #22
0
    def test_crossval_for_cox_ph_with_normalizing_times(self, data_pred2, data_pred1):
        """CV score should remain > 0.9 after z-scoring the duration column."""
        cf = CoxPHFitter()

        for data_pred in [data_pred1, data_pred2]:
            data_norm = data_pred.copy()
            durations = data_norm['t']
            # standardize durations to mean 0, std 1
            durations -= np.mean(durations)
            durations /= np.std(durations)
            data_norm['t'] = durations

            scores = k_fold_cross_validation(cf, data_norm,
                                             duration_col='t',
                                             event_col='E', k=3,
                                             predictor='predict_partial_hazard')

            mean_score = 1 - np.mean(scores)

            expected = 0.9
            msg = "Expected min-mean c-index {:.2f} < {:.2f}"
            assert mean_score > expected, msg.format(expected, mean_score)
예제 #23
0
def cox_regression_experiment():
    """Fit a heavily-penalized Cox PH model on merged visit features and
    report 5-fold cross-validation scores.

    Side effects: writes the combined matrix to 'allPatient_now.csv' and
    prints the model summary plus CV statistics.
    """
    # First 2100 patients, dropping the last two feature columns.
    dynamic_features = np.load('pick_5_visit_features_merge_1.npy')[
        0:2100, :, :-2]
    # NOTE(review): astype returns a new array that is discarded here, so
    # this line has no effect — confirm whether an int cast was intended.
    dynamic_features.astype(np.int32)
    # Label column -4 presumably holds the event indicator — TODO confirm.
    labels = np.load('pick_5_visit_labels_merge_1.npy')[:, :, -4].reshape(
        -1, dynamic_features.shape[1], 1)
    # Flatten (patient, visit) pairs into rows of 93 features + 1 label.
    data = np.concatenate((dynamic_features, labels), axis=2).reshape(-1, 94)
    data_set = pd.DataFrame(data)
    # lifelines expects string column names.
    col_list = list(data_set.columns.values)
    new_col = [str(x) for x in col_list]
    data_set.columns = new_col
    np.savetxt('allPatient_now.csv', data_set, delimiter=',')
    print(list(data_set.columns.values))
    cph = CoxPHFitter(penalizer=100)
    # Column '0' is the duration; column '93' is the event indicator.
    cph.fit(data_set, duration_col='0', event_col='93', show_progress=True)
    cph.print_summary()
    # cph.plot(columns=['15','20','21','25'])
    # plt.savefig('cox model' + '.png', format='png')

    scores = k_fold_cross_validation(cph, data_set, '0', event_col='93', k=5)
    print(scores)
    print(np.mean(scores))
    print(np.std(scores))
예제 #24
0
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation
import pandas as pd
import numpy as np




# Load the feature matrix and the two outcome files.
# NOTE(review): absolute user-specific paths — parameterize before reuse.
data = pd.read_csv('/home/xinfy/Desktop/lasurv/project_BC_surv/coxnnet_py3mo/coxnetv1/ssBRCA/datacxph/x.csv')
ytime = pd.read_csv('/home/xinfy/Desktop/lasurv/project_BC_surv/coxnnet_py3mo/coxnetv1/ssBRCA/datacxph/ytime.csv')
ystatus = pd.read_csv('/home/xinfy/Desktop/lasurv/project_BC_surv/coxnnet_py3mo/coxnetv1/ssBRCA/datacxph/ystatus.csv')


# Attach survival time and event indicator to the feature frame.
data['time'] = ytime['time'].values
data['event'] = ystatus['status'].values



# Penalized Cox model, scored by 10-fold cross-validated concordance.
cph = CoxPHFitter(penalizer=0.05)
# #cph.fit(dataset, duration_col='time', event_col='event', show_progress=True)
scores = k_fold_cross_validation(cph, data, duration_col='time', event_col='event', k=10, scoring_method="concordance_index")

print(np.asarray(scores))

#print(np.asarray(scores).mean(),np.asarray(scores).std())
예제 #25
0
def test_cross_validator_with_stratified_cox_model():
    """Smoke test: k-fold CV should run with a strata-configured Cox model."""
    stratified_model = CoxPHFitter(strata=['race'])
    utils.k_fold_cross_validation(
        stratified_model, load_rossi(), duration_col='week', event_col='arrest')
    smysub = smy.loc[smy['type'] == sel].copy()
    axes[0, j].errorbar(x=np.arange(smysub.shape[0]),
                        y=np.exp(smysub['coef']),
                        marker='o',
                        linestyle='',
                        yerr=np.exp(smysub['err']))
    axes[0, j].set_title('exp(beta) coefs for {}'.format(
        ['manufacturer', 'capacity'][j]))
    axes[0, j].set_xlim([-0.1, len(smysub) - 0.9])
    axes[0, j].set_xticks(np.arange(smysub.shape[0]))
    axes[0,
         j].set_xticklabels([t.split('[')[1][2:-1] for t in smysub['index']])

cx1 = sa.CoxPHFitter(normalize=False)  # NOTE(review): `sa` alias is defined elsewhere in this file
# 5-fold CV scores using expected survival time as the predictor;
# `dft` (with 'maxhours'/'failed' columns) is built earlier in the file.
scores = k_fold_cross_validation(cx1,
                                 dft,
                                 k=5,
                                 duration_col='maxhours',
                                 event_col='failed',
                                 predictor='predict_expectation')

# Box plot of the fold scores, annotated with their mean value.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 2))
sns.boxplot(scores, vert=False, color='lightblue', ax=axes, showmeans=True)
axes.annotate('{:.3f}'.format(np.mean(scores)),
              xy=(np.mean(scores), 1),
              xycoords='data',
              xytext=(10, 10),
              textcoords='offset points',
              color='r',
              fontsize=12)
axes.set_xlim([0.5, 1])
예제 #27
0
def test_cross_validator_with_predictor():
    """predict_expectation-based CV returns one score per fold."""
    fitter = CoxPHFitter()
    fold_scores = utils.k_fold_cross_validation(
        fitter, load_regression_dataset(), duration_col='T',
        event_col='E', k=3, predictor="predict_expectation")
    assert len(fold_scores) == 3
예제 #28
0
var3 0.2186     1.2443    0.0758 2.8836 0.0039      0.0700      0.3672  **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Concordance = 0.580
"""

# Plot the fitted Cox model coefficients.
cph.plot()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')
aaf.plot()

# Predict individual survival curves for two subjects.
X = regression_dataset.drop(['E', 'T'], axis=1)
aaf.predict_survival_function(
    X.iloc[10:12]).plot()  # get the unique survival functions of two subjects

# 10-fold cross-validated scores for the Cox model.
scores = k_fold_cross_validation(cph,
                                 regression_dataset,
                                 duration_col='T',
                                 event_col='E',
                                 k=10)
print(scores)
print(np.mean(scores))
print(np.std(scores))

plt.show()
#==============================================================================
#==============================================================================
예제 #29
0
    def __linear_small(self, is_death, train_data_path, basepath):
        """Linear (Cox proportional-hazards) pipeline for the small dataset.

        Steps:
          1. one-hot encode the categorical attributes with patsy,
          2. keep only attributes that are univariately significant (p < 0.05),
          3. estimate c-indexes with 10000 rounds of .632 bootstrap,
          4. estimate c-indexes with 2000 rounds of 10-fold cross-validation.

        Parameters
        ----------
        is_death : bool
            True -> model outcome1/survivaltime1; False -> outcome2/survivaltime2.
        train_data_path : str
            CSV file with the training data.
        basepath : str
            Directory that receives the k-fold statistics CSV.
        """
        small_dataset_file = train_data_path
        small_dataset = pandas.read_csv(small_dataset_file,
                                        encoding='UTF-8',
                                        index_col=[0])
        # Identifier columns carry no modeling information.
        del small_dataset['patient_id']
        del small_dataset['name']

        # Dummy-variable (one-hot) handling: build a patsy formula where
        # categorical columns are wrapped in C(...).
        formular = ''
        classify_attr = {
            'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol',
            'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'HCV',
            'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic', 'LipidD',
            'CaPB', 'NCaPB', 'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD',
            'access', 'ESRDcause', 'hypertension', 'DM',
            'cardiovasculardisease', 'cerebrovasculardisease', 'bleeding',
            'malignancy', 'ablocker', 'bblocker'
        }
        for column in small_dataset.columns:
            if column in classify_attr:
                formular = formular + 'C(' + column + ')+'
            else:
                formular = formular + column + '+'
        formular = formular[:-1]  # drop the trailing '+'

        # '-1' removes the intercept column from the design matrix.
        small_dataset = patsy.dmatrix(formular + '-1',
                                      small_dataset,
                                      return_type='dataframe')
        # Select the outcome pair to model and the matching output file names.
        if is_death:
            T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                                'survivaltime2', 'outcome2')
            attr_file, p632_file, var_file, kfold_file = (
                'lm_significant_attrs.txt', 'lm_stats632.csv',
                'lm_statvar.txt', 'lm_statskfold.csv')
            beta_file, p_file = ('lm_coef.csv', 'lm_p.csv')
        else:
            T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                                'survivaltime1', 'outcome1')
            attr_file, p632_file, var_file, kfold_file = (
                'lm_significant_attrs_e.txt', 'lm_stats632_e.csv',
                'lm_statvar_e.txt', 'lm_statskfold_e.csv')
            beta_file, p_file = ('lm_coef_e.csv', 'lm_p_e.csv')
        # Drop the outcome pair that is not being modeled.
        del small_dataset[T_false]
        del small_dataset[E_false]

        # Univariate screening: keep attributes whose single-covariate Cox
        # fit is significant at p < 0.05.
        significant_attrs = list()
        for column in small_dataset.columns:
            # print('column', column)
            if column in {T_true, E_true}:
                continue
            subset = small_dataset[[column, T_true, E_true]]
            # print('subset', subset)
            try:
                cox = CoxPHFitter()
                cox.fit(subset, T_true, E_true)
                # print('cox.summary['p'][0]:', cox.summary['p'][0])
                if cox.summary['p'][0] < 0.05:
                    significant_attrs.append(column)
            except Exception:
                # NOTE(review): non-converging fits are silently skipped.
                continue
        # Persist the selected attribute names, one per line.
        output = open(attr_file, mode='w')
        for attr in significant_attrs:
            output.write(attr + '\n')
        output.close()

        # Round-trip through the file (also normalizes whitespace).
        # NOTE(review): `input` shadows the builtin of the same name.
        input = open(attr_file)
        significant_attrs = [line.strip() for line in input.readlines()]
        input.close()

        significant_attrs.append(T_true)
        significant_attrs.append(E_true)
        print('linear_small ## sign_attr : %d' % len(significant_attrs))

        small_dataset = small_dataset[significant_attrs]

        # 10000 rounds of .632 bootstrap.
        count = 0
        stats632 = list()
        statscoef = list()
        statspvalue = list()
        while count < 10000:  # linear-model (Cox) training loop
            try:
                # Sample with replacement for training ...
                train_set = small_dataset.take(
                    numpy.random.randint(0,
                                         len(small_dataset),
                                         size=len(small_dataset)))
                # ... and test on the out-of-bag rows.
                # NOTE(review): DataFrame.ix is removed in modern pandas.
                test_set = small_dataset.ix[set(
                    small_dataset.index).difference(set(train_set.index))]

                train_set.index = range(len(train_set))
                test_set.index = range(len(test_set))

                cox = CoxPHFitter()
                cox.fit(train_set, T_true, E_true)
                # Negate partial hazards: higher hazard -> shorter survival.
                train_cindex = concordance_index(
                    cox.durations,
                    -cox.predict_partial_hazard(cox.data).values.ravel(),
                    cox.event_observed)

                statscoef.append(cox.summary[['coef']].T)
                statspvalue.append(cox.summary[['p']].T)

                # c-index on the out-of-bag test set
                test_actual_T = test_set[T_true].copy()
                test_actual_E = test_set[E_true].copy()
                test_variable = test_set[test_set.columns.difference(
                    [T_true, E_true])]
                test_predictT = cox.predict_expectation(test_variable)

                # c-index on the full small dataset
                all_actual_T = small_dataset[T_true].copy()
                all_actual_E = small_dataset[E_true].copy()
                all_variable = small_dataset[small_dataset.columns.difference(
                    [T_true, E_true])]
                all_predictT = cox.predict_expectation(all_variable)

                try:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT,
                                                    test_actual_E)
                    all_cindex = concordance_index(all_actual_T, all_predictT,
                                                   all_actual_E)
                except Exception:
                    # Fall back to the uncensored c-index if the event
                    # columns cannot be used.
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT)
                    all_cindex = concordance_index(all_actual_T, all_predictT)

                stats632.append([train_cindex, test_cindex, all_cindex])
                count += 1
                print('632 -> %d' % count)
            except Exception:
                # NOTE(review): any failure in a round is retried without
                # counting; a systematic failure would loop forever.
                continue
        stats632_df = pandas.DataFrame(stats632,
                                       columns=['train', 'test', 'all'])
        stats632_df.to_csv(p632_file, encoding='UTF-8')

        # Persist per-round coefficients and p-values.
        statscoef_df = pandas.DataFrame(
            pandas.concat(statscoef, ignore_index=True))
        statscoef_df.to_csv(beta_file, encoding='UTF-8')
        statspvalue_df = pandas.DataFrame(
            pandas.concat(statspvalue, ignore_index=True))
        statspvalue_df.to_csv(p_file, encoding='UTF-8')

        # 2000 rounds of 10-fold cross-validation.
        count = 0
        statskfold = list()
        while count < 2000:
            try:
                cox = CoxPHFitter()
                scores = k_fold_cross_validation(cox, small_dataset, T_true,
                                                 E_true, 10)
                statskfold.append(scores)
                count += 1
                print('k-fold -> %d' % count)
            except Exception:
                continue
        statskfold_df = pandas.DataFrame(statskfold)
        statskfold_df.to_csv(basepath + "/" + kfold_file, encoding='UTF-8')
예제 #30
0
# Drop highly-correlated features (threshold 0.8), keeping one of each pair.
all_features_drop_corr, de_corr_features = RandDropCorr(
    all_features_drop_low_var, 0.8)
all_features_drop_corr.columns = de_corr_features

# Re-attach the survival outcome columns; the id column is not a covariate.
all_features_reduced = pd.concat(
    [all_features_drop_corr, survival_df_filtered],
    axis=1).drop('case_submitter_id', axis=1)

# Elastic-net penalized Cox model (mostly L1: l1_ratio=0.9).
my_cph = CoxPHFitter(penalizer=0.005, l1_ratio=0.9)
# haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1).to_csv('truth_reg_vars.csv')
# my_cph.fit(haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1), duration_col = 'days_to_death', event_col='vital_status')
my_cph.fit(all_features_reduced,
           duration_col='days_to_death',
           event_col='vital_status')
my_cph.print_summary()

# Refit after dropping two features — presumably problematic/collinear ones;
# TODO confirm the selection rationale.
haha = all_features_reduced.drop(
    ['original_glrlm_GrayLevelNonUniformityNormalized_1'], axis=1)
haha = haha.drop(['original_glcm_SumEntropy_1'], axis=1)

my_cph.fit(haha, duration_col='days_to_death', event_col='vital_status')
my_cph.print_summary()

# 10-fold cross-validated concordance of the reduced feature set.
scores = k_fold_cross_validation(my_cph,
                                 all_features_reduced,
                                 duration_col='days_to_death',
                                 event_col='vital_status',
                                 k=10,
                                 scoring_method="concordance_index")
# NOTE(review): the mean below is computed but neither printed nor stored.
np.mean(scores)
# Kaplan-Meier curves for treated vs untreated patients.
# NOTE(review): DataFrame.ix is removed in modern pandas, and the bare
# `print x` statements further below are Python 2 syntax.
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.ix[tx, survival_col], event_observed=df.ix[tx, censor_col], label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True,  ci_show=False)


kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.ix[~tx, survival_col], event_observed=df.ix[~tx, censor_col], label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True,  ci_show=False )

# Annotate the plot with at-risk counts and save it.
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title ('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the two treatment groups.
results = logrank_test(df.ix[tx, survival_col], df.ix[~tx, survival_col], df.ix[tx, censor_col], df.ix[~tx, censor_col], alpha=.99 )
results.print_summary()

# Univariate Cox regression on age, then 10-fold cross validation.
cox = CoxPHFitter(normalize=False)
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col, include_likelihood=True)
cox.print_summary()

scores = k_fold_cross_validation(cox, df_age, survival_col, event_col=censor_col, k=10)
print scores
print 'Mean score', np.mean(scores)
print 'Std', np.std(scores)
 
예제 #32
0
    aft.fit(times, duration_col='time', event_col='success')
    aft.print_summary(3)

    #aft = WeibullAFTFitter().fit(times, 'time', 'success', ancillary_df=True)
    save(name + 'aft', aft.plot())
    fitters[name] = aft
    crossValidate(name, aft)
    print("END " + name)

print('EXAMPLE DATA FOLLOWS')
from lifelines import AalenAdditiveFitter, CoxPHFitter
from lifelines.datasets import load_regression_dataset
from lifelines.utils import k_fold_cross_validation
import numpy as np

df = load_regression_dataset()

# Build the three models we'd like to compare, then print each model's
# mean cross-validation score in turn (Cox first, then both Aalen fits).
aaf_1 = AalenAdditiveFitter(coef_penalizer=0.5)
aaf_2 = AalenAdditiveFitter(coef_penalizer=10)
cph = CoxPHFitter()

for model in (cph, aaf_1, aaf_2):
    print(
        np.mean(k_fold_cross_validation(model, df, duration_col='T',
                                        event_col='E')))
예제 #33
0
# Load the pre-selected feature indices and subset the expression matrix.
# NOTE(review): the bare `print x` statements below are Python 2 syntax.
features = genfromtxt(features_path, delimiter=',')[1:].astype(int)
#features = random.sample(range(0, 15939), 23)
columns = columns[features]
for i, column in enumerate(columns):
    columns[i] = column.strip()

# Assemble the modeling frame: selected features + event + time columns.
X = dataset["data"]
X = X[:,features]
data = pd.DataFrame(X, columns = columns)
event = dataset["cencoring"]
time = dataset["labels"][:,5]
data["event"] = event.ravel()
data["time"] = time
    
# 3-fold cross-validated concordance for a Cox PH model.
cf = CoxPHFitter()
scores = k_fold_cross_validation(cf, data, 'time', event_col='event', k=3)
print scores
print np.mean(scores)
print np.std(scores)


# Encode the molecular subtypes and plot the overall Kaplan-Meier curve.
le = preprocessing.LabelEncoder()
subtypes = le.fit_transform(dataset["subtypes"])
data["subtype"] = subtypes 
T = data["time"]
C = data["event"]

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=C)
kmf.plot(title = 'Survival Day Profile of Breast Cancer Patients')
def main():
    """Fit Cox proportional-hazards regressions of word survival on
    frequency and diffusion covariates, then compare covariate sets
    via k-fold cross-validated concordance scores.

    Reads TSV covariate matrices from --data_dir and writes regression
    summaries and concordance scores to --out_dir.
    """
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    out_dir = args.out_dir
    # collect data
    vocab = get_default_vocab()
    # tf: per-word normalized log frequency over time; the diffusion
    # matrices are restricted to the vocabulary and NaN-filled with 0.
    tf = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm_log.tsv'),
                     sep='\t',
                     index_col=0)
    D_L = pd.read_csv(os.path.join(data_dir, '2013_2016_3gram_residuals.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_U = pd.read_csv(os.path.join(data_dir,
                                   '2013_2016_user_diffusion_log.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_S = pd.read_csv(os.path.join(data_dir,
                                   '2013_2016_subreddit_diffusion_log.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_T = pd.read_csv(os.path.join(data_dir,
                                   '2013_2016_thread_diffusion_log.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    #     growth_words = get_growth_words()
    #     growth_decline_words, split_points = get_growth_decline_words_and_params()
    success_words, fail_words, split_points = get_success_fail_words()

    # Round split points up to whole time steps.
    split_points = split_points.apply(lambda x: int(ceil(x)))
    # organize into survival df
    combined_words = fail_words + success_words
    V = len(combined_words)
    # deaths: 1 for failed words, 0 otherwise.
    # NOTE(review): `deaths` is not referenced again in this function —
    # possibly dead code; confirm before removing.
    # NOTE(review): `pd.np` was deprecated and removed in pandas >= 1.0;
    # this code assumes an older pandas.
    deaths = pd.Series(pd.np.zeros(V), index=combined_words)
    deaths.loc[fail_words] = 1
    N = tf.shape[1]
    # Successful (right-censored) words get the full timeframe N as
    # their split point.
    split_points_combined = pd.concat([
        split_points,
        pd.Series([
            N,
        ] * len(success_words), index=success_words)
    ],
                                      axis=0)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(fail_words, success_words,
                                    split_points_combined, covariates,
                                    covariate_names)
    # NOTE(review): `survival_df_nan` is computed but never used —
    # presumably a leftover debugging check for rows with NaNs.
    survival_df_nan = survival_df[survival_df.isnull().any(axis=1)]

    # full timeframe test
    # fit regression using all covariates and all data up to and including time of death
    scaler = StandardScaler()
    survival_df_norm = survival_df.copy()
    survival_df_norm[covariate_names] = scaler.fit_transform(
        survival_df_norm[covariate_names])
    cox_model = CoxPHFitter()
    event_var = 'death'
    time_var = 't'
    cox_model.fit(survival_df_norm, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir,
                                          'cox_regression_all_data.txt')
    # Redirect stdout so print_summary() lands in the output file.
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
        sys.stdout = orig_stdout

    # fixed timeframe test
    # fit regression using all covariates and only data up to first m months
    m = 3
    death_words = list(fail_words)
    right_censored_words = list(success_words)
    combined_words = death_words + right_censored_words
    # Pretend every word "dies" at month m so covariates are only drawn
    # from the first m months.
    fixed_death_times = pd.Series(pd.np.repeat(m, len(combined_words)),
                                  index=combined_words)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(death_words, right_censored_words,
                                    fixed_death_times, covariates,
                                    covariate_names)
    # now provide the actual death/censorship times
    N = tf.shape[1]
    death_times = pd.concat([
        split_points.loc[death_words],
        pd.Series([
            N,
        ] * len(right_censored_words),
                  index=right_censored_words)
    ],
                            axis=0)
    survival_df['t'] = death_times
    cox_model = CoxPHFitter()
    # NOTE(review): reuses `scaler` with fit_transform, so scaling is
    # refit on this dataframe — the earlier fit is discarded.
    survival_df.loc[:, covariate_names] = scaler.fit_transform(
        survival_df.loc[:, covariate_names])
    cox_model.fit(survival_df, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir,
                                          'cox_regression_first_%d.txt' % (m))
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
        sys.stdout = orig_stdout

    # concordance values
    # set up multiple models with different feature sets
    # then run 10-fold cross-validation to generate concordance scores
    # and plot distributions
    cv = 10
    # NOTE(review): `feature_sets` is never used, and `cv = 10` is
    # assigned again below — both look like leftovers.
    feature_sets = []

    covariate_sets = [['f'], ['f', 'D_L'], ['f', 'D_U', 'D_S', 'D_T'],
                      ['f', 'D_L', 'D_U', 'D_S', 'D_T']]
    covariate_set_names = ['f', 'f+L', 'f+S', 'f+L+S']
    covariate_set_scores = {}
    cv = 10
    # NOTE(review): `izip` is Python 2 (itertools.izip) — this module
    # appears to target Python 2.
    for covariate_set, covariate_set_name in izip(covariate_sets,
                                                  covariate_set_names):
        survival_df_relevant = survival_df.loc[:, covariate_set +
                                               [time_var, event_var]]
        cox_model = CoxPHFitter()
        # One concordance score per fold for this covariate set.
        scores = k_fold_cross_validation(cox_model,
                                         survival_df_relevant,
                                         time_var,
                                         event_col=event_var,
                                         k=cv)
        covariate_set_scores[covariate_set_name] = scores
    covariate_set_scores = pd.DataFrame(covariate_set_scores).transpose()
    score_names = ['score_%d' % (i) for i in range(cv)]
    covariate_set_scores.columns = score_names
    # significance test between f and f+C, f+D, f+C+D concordance scores
    pval_thresh = 0.05
    baseline_scores = covariate_set_scores.loc['f', score_names]
    covariate_test_names = ['f+L', 'f+S', 'f+L+S']
    # bonferroni correction = alpha / 3
    pval_corrected = pval_thresh / len(covariate_test_names)
    covariate_set_scores.loc[:, 'pval_thresh'] = pval_corrected
    # Welch's t-test of each enriched covariate set against frequency-only.
    for covariate_test_name in covariate_test_names:
        covariate_test_scores = covariate_set_scores.loc[covariate_test_name,
                                                         score_names]
        t_stat, pval = ttest_ind(covariate_test_scores,
                                 baseline_scores,
                                 equal_var=False)
        covariate_set_scores.loc[covariate_test_name, 't_test'] = t_stat
        covariate_set_scores.loc[covariate_test_name, 'pval'] = pval
    # write to file
    out_file = os.path.join(
        out_dir, 'cox_regression_concordance_%d_fold_scores.tsv' % (cv))
    covariate_set_scores.to_csv(out_file, sep='\t', index=True)
예제 #35
0
def test_cross_validator_with_stratified_cox_model():
    """Smoke test: k-fold CV completes for a Cox model stratified on race."""
    fitter = CoxPHFitter(strata=["race"])
    utils.k_fold_cross_validation(
        fitter, load_rossi(), duration_col="week", event_col="arrest"
    )
예제 #36
0
def test_cross_validator_with_stratified_cox_model():
    """Cross-validation should run without error when the Cox model
    stratifies on the 'race' column of the Rossi dataset."""
    stratified_cox = CoxPHFitter(strata=['race'])
    rossi_df = load_rossi()
    utils.k_fold_cross_validation(stratified_cox, rossi_df,
                                  duration_col='week', event_col='arrest')