Example #1
    def get_c_index(self):

        features = self.get_X_test()
        target = self.get_T_test()
        event = self.get_E_test()

        return concordance_index(self.model_forest, features,
                                 target, event)
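
For context, concordance_index is pysurvival's metric (pysurvival.utils.metrics.concordance_index), and the get_X_test / get_T_test / get_E_test accessors and the model_forest attribute are assumed to be defined elsewhere in the surrounding class. A minimal standalone call, with illustrative variable names, is sketched below:

from pysurvival.utils.metrics import concordance_index

# fitted_model: any fitted pysurvival model (names here are illustrative, not from the example)
# X_test: covariate DataFrame; T_test: survival/censoring times; E_test: event indicators (1 = event, 0 = censored)
c_index = concordance_index(fitted_model, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index))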
Example #2
    def cv_survival(self, cv=10, params={}, num_trees=1000,
                    balance_classes=False, verbose=True):

        self.verify_best_num_feats()

        # Check if the best hyperparameters were processed
        params = self.check_params(params)

        kf = KFold(n_splits=cv, shuffle=True, random_state=self.seed)

        scores = []
        models = []
        datasets = []

        df_cv = self.df_train.copy()

        for fold, (index_train, index_test) in enumerate(kf.split(df_cv), 1):
            if verbose:
                print('Fold {}'.format(fold))

            data_train = df_cv.iloc[index_train].reset_index(drop=True)
            data_test = df_cv.iloc[index_test].reset_index(drop=True)

            # Creating the X, T and E inputs
            X_train, X_test = data_train[self.features], data_test[self.features]
            T_train, T_test = data_train[self.target].values, data_test[self.target].values
            E_train, E_test = data_train[self.event].values, data_test[self.event].values

            weights = self.compute_event_weights(E_train, balance_classes)

            X_train = X_train[self.feature_importance[:self.best_num_feats]]
            X_test = X_test[self.feature_importance[:self.best_num_feats]]

            # Creating model
            model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
            model_forest.fit(X_train, T_train, E_train, seed=self.seed, weights=weights, **params)

            # Store the fold score so the average across folds can be computed later
            scores.append(concordance_index(model_forest, X_test, T_test, E_test))

        # Refit model with all training data
        self.fit_model(num_trees=num_trees,
                       params=params,
                       balance_classes=balance_classes)

        scores = np.array(scores)
        self.cv_score = np.mean(scores)
        if verbose:
            print('CV Score: {:.3f}'.format(self.cv_score))
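
The method above leans on helpers defined elsewhere in the class (verify_best_num_feats, check_params, compute_event_weights, fit_model) and on the feature_importance / best_num_feats attributes. A hypothetical call, assuming such an object named analysis has already been constructed and tuned, might look like:

# Hypothetical usage; 'analysis' and its helper methods are assumed and not shown in the example.
analysis.cv_survival(cv=5, num_trees=500, balance_classes=True)
print('Mean C-index across folds: {:.3f}'.format(analysis.cv_score))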
Example #3
def compute_scores(model, table, timepoints, variables):
    c_indexes = []
    for i in timepoints:
        table.loc[:, 'disease_progress_temp'] = table['disease_progress']
        table.loc[:, 'PFS_temp'] = table['PFS']
        table.loc[table.PFS > i, 'disease_progress_temp'] = 0
        table.loc[table.PFS > i, 'PFS_temp'] = i
        c_indexes.append(
            concordance_index(model, table[variables], table['PFS_temp'],
                              table['disease_progress_temp']))
    brier_scores = brier_score(model,
                               table[variables],
                               table['PFS'],
                               table['disease_progress'],
                               t_max=84,
                               figure_size=(20, 6.5))
    return c_indexes, brier_scores
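
compute_scores measures time-dependent discrimination by administratively censoring follow-up at each horizon: rows whose PFS exceeds the timepoint i are marked event-free and their time is capped at i before the C-index is computed. A usage sketch, assuming a fitted pysurvival model and a DataFrame containing PFS, disease_progress and the covariate columns, could be:

# Illustrative call; 'model', 'df', the timepoints and the variable names are assumptions.
timepoints = [12, 24, 36, 48, 60]   # horizons on the same time scale as PFS
variables = ['age', 'grade']        # covariates the model was fitted on
c_indexes, brier_scores = compute_scores(model, df, timepoints, variables)
for t, ci in zip(timepoints, c_indexes):
    print('C-index up to t={}: {:.2f}'.format(t, ci))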
Example #4
E_rsf_train, E_rsf_test = E_rsf[index_train], E_rsf[index_test]

km_base_model = KaplanMeierModel()
km_base_model.fit(T_rsf_test, E_rsf_test)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=21)

rsf = RandomSurvivalForestModel(num_trees=200)
#rsf_num_trees = [100,200,300,500]
#rsf_max_depth = [5,10,15,20,25,30,35,40,45,50]
#rsf_min_node_size = [3, 5, 7, 9]
#param_grid_rsf = {'num_trees':rsf_num_trees,'max_depth':rsf_max_depth,'min_node_size':rsf_min_node_size}
rsf.fit(X_rsf_train, T_rsf_train, E_rsf_train, max_features='sqrt', max_depth=36, min_node_size=4, seed=21)
#rsf_cv = RandomizedSearchCV(rsf, param_distributions=param_grid_rsf, cv=cv,scoring='accuracy',random_state=42,)
#rsf_cv.fit(X_rsf_train,T_rsf_train,E_rsf_train)
c_index = concordance_index(rsf,X_rsf_test,T_rsf_test,E_rsf_test)
print('C-index: {:0.2f}'.format(c_index))
ibs = integrated_brier_score(rsf, X_rsf_test, T_rsf_test, E_rsf_test)
print('IBS: {:0.2f}'.format(ibs))

# Initializing the figure
fig, ax = plt.subplots(figsize=(8, 4))

# Extracting the data points that experienced an event
choices = np.argwhere((E_rsf_test == 1.) & (T_rsf_test >= 1)).flatten()
np.random.seed(16)
#k = np.random.choice( choices, 1)[0]

# Saving the event times of those subjects
t = T_rsf_test[choices]
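
The cell above stops after selecting the subjects who experienced an event. One plausible continuation, sketched below, plots the RSF-predicted survival curve for one of those subjects against the Kaplan-Meier baseline; it assumes X_rsf_test is a DataFrame and relies on pysurvival's predict_survival method and the fitted models' times / survival attributes:

# Sketch only: compare the prediction for the first uncensored subject with the KM baseline.
k = choices[0]
surv_pred = rsf.predict_survival(X_rsf_test.values[k, :]).flatten()
ax.plot(rsf.times, surv_pred, label='RSF prediction')
ax.plot(km_base_model.times, km_base_model.survival, linestyle='--', label='Kaplan-Meier baseline')
ax.axvline(t[0], linestyle=':', color='red', label='observed event time')
ax.set_xlabel('Time')
ax.set_ylabel('Survival probability')
ax.legend()
plt.show()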
Example #5


# In[ ]:

# In[36]:

csf = ConditionalSurvivalForestModel(num_trees=100)
csf.fit(train[features],
        train['PFS'],
        train['disease_progress'],
        max_features=1,
        max_depth=5,
        min_node_size=2)

c_index = concordance_index(csf, test[features], test['PFS'],
                            test['disease_progress'])
print('C-index: {:.2f}'.format(c_index))

ibs = integrated_brier_score(csf,
                             test[features],
                             test['PFS'],
                             test['disease_progress'],
                             t_max=84,
                             figure_size=(20, 6.5))
print('IBS: {:.2f}'.format(ibs))

results = compare_to_actual(csf,
                            test[features],
                            test['PFS'],
                            test['disease_progress'],
                            is_at_risk=False,
                            alpha=0.05,
                            minprop=0.1)

# In[ ]:

# Computing variable importance
csf.variable_importance_table.head(5)

# In order to assess the model performance, we previously split the original dataset into training and testing sets, so that we can now compute its performance metrics on the testing set:
#
# The C-index represents the global assessment of the model discrimination power: this is the model’s ability to correctly provide a reliable ranking of the survival times based on the individual risk scores. In general, when the C-index is close to 1, the model has an almost perfect discriminatory power; but if it is close to 0.5, it has no ability to discriminate between low and high risk subjects.

# In[ ]:

from pysurvival.utils.metrics import concordance_index
c_index = concordance_index(csf, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index))  #0.83

# The Brier score measures the average discrepancies between the status and the estimated probabilities at a given time. Thus, the lower the score (usually below 0.25), the better the predictive performance. To assess the overall error measure across multiple time points, the Integrated Brier Score (IBS) is usually computed as well.

# In[ ]:

from pysurvival.utils.display import integrated_brier_score
ibs = integrated_brier_score(csf,
                             X_test,
                             T_test,
                             E_test,
                             t_max=12,
                             figure_size=(15, 5))
print('IBS: {:.2f}'.format(ibs))
 E_train, E_test = data_train['event'].values, data_test['event'].values
 structure = [{
     'activation': 'SELU',
     'num_units': num_units
 }, {
     'activation': 'SELU',
     'num_units': num_units2
 }]
 nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
 nonlinear_coxph.fit(X_train,
                     T_train,
                     E_train,
                     lr=lr,
                     init_method='xav_uniform',
                     dropout=dropout)
 c_index = concordance_index(nonlinear_coxph, X_test,
                             T_test, E_test)
 c_index_df = c_index_df.append({"c_index": c_index},
                                ignore_index=True)
 print(c_index_df)
 if len(c_index_df) == 10:
     mean = c_index_df["c_index"].mean()
     dataframe_hp = dataframe_hp.append(
         {
             "lr": lr,
             "dropout": dropout,
             "num_units": num_units,
             "num_units2": num_units2,
             "num_layers": 2,
             "activation": "SELU/SELU",
             "mean": mean
         },
         ignore_index=True)
 T_train, T_test = data_train['time'].values, data_test[
     'time'].values
 E_train, E_test = data_train['event'].values, data_test[
     'event'].values
 structure = [{'activation': 'RELU', 'num_units': 25}]
 nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
 start = time.time()
 nonlinear_coxph.fit(X_train,
                     T_train,
                     E_train,
                     lr=0.0001,
                     init_method='xav_uniform',
                     dropout=0.2)
 stop = time.time()
 duration = stop - start
 c_index = concordance_index(nonlinear_coxph, X_test, T_test,
                             E_test)
 c_index_train = concordance_index(nonlinear_coxph, X_train,
                                   T_train, E_train)
 c_index_multiple_models = c_index_multiple_models.append(
     {
         "c_index_test": c_index,
         "c_index_train": c_index_train,
         "duration": duration
     },
     ignore_index=True)
 print(c_index_multiple_models)
 if len(c_index_multiple_models) == 10:
     mean_test = c_index_multiple_models["c_index_test"].mean()
     mean_train = c_index_multiple_models["c_index_train"].mean()
     mean_duration = c_index_multiple_models["duration"].mean()
     c_index_df = c_index_df.append({"c_index_test": mean_test},
    for b in max_depth:
        for c in min_node:
            cc = []
            kf = StratifiedKFold(n_splits=7, random_state=42, shuffle=True)
            i = 1
            for train_index, test_index in kf.split(Xtemp,Etemp):
                X1_train, X1_test = Xtemp.loc[train_index], Xtemp.loc[test_index]
                X_train, X_test = X1_train[featuresTemp], X1_test[featuresTemp]
                T_train, T_test = X1_train['NumDays'].values, X1_test['NumDays'].values
                E_train, E_test = Etemp.loc[train_index].values, Etemp.loc[test_index].values
                xst = RandomSurvivalForestModel(num_trees=a)
                xst.fit(X_train, T_train, E_train, max_features='sqrt', max_depth=b,
                        min_node_size=c, num_threads=-1,
                        sample_size_pct=0.63, importance_mode='normalized_permutation',
                        seed=None, save_memory=False)
                c_index = concordance_index(xst, X_test, T_test, E_test)
                cc.append(c_index)
                i = i+1
            print(a, b, c, mean(cc))

CI = []
IBS = []
best_num_tree = 15
best_depth = 10
best_min_node = 5
k_folds = 4

i = 1
kf = StratifiedKFold(n_splits=k_folds, random_state=1, shuffle=True)
for train_index, test_index in kf.split(X, E):

def run_pysurvival_with_repetitions(data,
                                    features,
                                    survival,
                                    event,
                                    models,
                                    test_ratio,
                                    repetitions=10):

    num_samples = len(data.index)
    print('Number of Samples:', num_samples)
    ''' Initialize Outputs '''
    outputs = initialize_outputs(models, features)
    ''' Run Survival Model N times '''
    for _ in range(repetitions):
        ''' Dataset Splitting '''
        index_train, index_test = train_test_split(range(num_samples),
                                                   test_size=test_ratio)
        data_train = data.loc[index_train].reset_index(drop=True)
        data_test = data.loc[index_test].reset_index(drop=True)

        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train[survival].values, data_test[
            survival].values
        E_train, E_test = data_train[event].values, data_test[event].values
        ''' Run Cox '''
        if 'cox' in models:
            coxph = CoxPHModel()
            coxph.fit(X_train,
                      T_train,
                      E_train,
                      lr=0.0001,
                      l2_reg=1e-2,
                      init_method='zeros',
                      verbose=False)
            c_index = concordance_index(coxph, X_test, T_test, E_test)
            outputs['cox']['c_index'].append(c_index)
            ibs = integrated_brier_score(coxph,
                                         X_test,
                                         T_test,
                                         E_test,
                                         t_max=None)
            outputs['cox']['ibs'].append(ibs)
            for idx, i in enumerate(features):
                outputs['cox']['weights'][i].append(coxph.weights[idx])
        ''' Run RSF '''
        if 'rsf' in models:
            rsf = RandomSurvivalForestModel(num_trees=200)
            rsf.fit(X_train,
                    T_train,
                    E_train,
                    max_features="sqrt",
                    max_depth=5,
                    min_node_size=20)
            c_index = concordance_index(rsf, X_test, T_test, E_test)
            outputs['rsf']['c_index'].append(c_index)
            ibs = integrated_brier_score(rsf,
                                         X_test,
                                         T_test,
                                         E_test,
                                         t_max=None)
            outputs['rsf']['ibs'].append(ibs)
            for key, value in rsf.variable_importance.items():
                outputs['rsf']['importance'][key].append(value)
        ''' Run Deepsurv '''
        if 'deepsurv' in models:
            structure = [{
                'activation': 'ReLU',
                'num_units': 128
            }, {
                'activation': 'ReLU',
                'num_units': 128
            }, {
                'activation': 'ReLU',
                'num_units': 128
            }]

            nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
            nonlinear_coxph.fit(X_train,
                                T_train,
                                E_train,
                                lr=1e-4,
                                init_method='xav_uniform',
                                verbose=False)
            c_index = concordance_index(nonlinear_coxph, X_test, T_test,
                                        E_test)
            outputs['deepsurv']['c_index'].append(c_index)
            ibs = integrated_brier_score(nonlinear_coxph,
                                         X_test,
                                         T_test,
                                         E_test,
                                         t_max=None)
            outputs['deepsurv']['ibs'].append(ibs)

    return outputs
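
run_pysurvival_with_repetitions relies on an initialize_outputs helper (not shown) that pre-builds the nested result dictionaries. A hypothetical call and summary, with illustrative argument values, might be:

import numpy as np

# Hypothetical usage; 'df', the column names and initialize_outputs are assumptions.
outputs = run_pysurvival_with_repetitions(df,
                                          features=['age', 'grade'],
                                          survival='time',
                                          event='event',
                                          models=['cox', 'rsf', 'deepsurv'],
                                          test_ratio=0.3,
                                          repetitions=10)
for name in ['cox', 'rsf', 'deepsurv']:
    print('{}: mean C-index {:.3f}, mean IBS {:.3f}'.format(
        name, np.mean(outputs[name]['c_index']), np.mean(outputs[name]['ibs'])))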
Example #11
    data_train, data_test = train_test_split(data,
                                             test_size=0.2,
                                             random_state=iter)
    X_train = data_train[[
        "functional_proteins_cluster", "immunoregulatory_protein_cluster",
        "coexpression_cluster", "age", "grade", "Architecture"
    ]]
    T_train = data_train["Recurrence_time"]
    E_train = data_train["Recurrence"]

    X_test = data_test[[
        "functional_proteins_cluster", "immunoregulatory_protein_cluster",
        "coexpression_cluster", "age", "grade", "Architecture"
    ]]
    T_test = data_test["Recurrence_time"]
    E_test = data_test["Recurrence"]

    # Fit the RSF on the training set
    fitted = rf.fit(X=X_train, T=T_train, E=E_train, max_depth=5, seed=iter)
    concordance = concordance_index(rf,
                                    X,
                                    T,
                                    E,
                                    include_ties=True,
                                    additional_results=False)
    concordances.append(concordance)

mean_concordance = np.mean(np.array(concordances))

print("Mean concordance index: " + str(round(mean_concordance, 4)))
Example #12
def cv_train_and_report_model(X,
                              T,
                              E,
                              show=True,
                              num_tree=10,
                              max_depth=1,
                              min_node=2,
                              kf=None,
                              prep_model=None):
    if prep_model is None:

        def _prep_model(X, T, E):
            xst = RandomSurvivalForestModel(num_trees=num_tree)
            xst.fit(X,
                    T,
                    E,
                    max_features='sqrt',
                    max_depth=max_depth,
                    min_node_size=min_node,
                    num_threads=-1,
                    sample_size_pct=0.63,
                    importance_mode='normalized_permutation',
                    seed=None,
                    save_memory=False)
            return xst

        prep_model = _prep_model
    i = 1
    if kf is None:
        kf = StratifiedKFold(n_splits=10, shuffle=True)
    cis = []
    ibss = []
    for train_index, test_index in kf.split(X, E):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        T_train, T_test = T.iloc[train_index], T.iloc[test_index]
        E_train, E_test = E.iloc[train_index], E.iloc[test_index]
        #xst = RandomSurvivalForestModel(num_trees=num_tree)
        #xst.fit(X_train, T_train, E_train, max_features = 'sqrt', max_depth = max_depth,
        #    min_node_size = min_node, num_threads = -1,
        #    sample_size_pct = 0.63, importance_mode = 'normalized_permutation',
        #    seed = None, save_memory=False )
        xst = prep_model(X_train, T_train, E_train)
        c_index = concordance_index(xst, X_test, T_test, E_test)

        if show:
            print('\n {} of kfold {}'.format(i, kf.n_splits))
            print('C-index: {:.2f}'.format(c_index))
            results = compare_to_actual(xst,
                                        X_test,
                                        T_test,
                                        E_test,
                                        is_at_risk=True,
                                        figure_size=(16, 6),
                                        metrics=['rmse', 'mean', 'median'])
            ibs = integrated_brier_score(xst,
                                         X_test,
                                         T_test,
                                         E_test,
                                         t_max=100,
                                         figure_size=(15, 5))
            print('IBS: {:.2f}'.format(ibs))
        else:
            ibs = ibs_no_figure(xst, X_test, T_test, E_test, t_max=100)
        cis.append(c_index)
        ibss.append(ibs)
        i = i + 1
    return cis, ibss
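
A usage sketch for cv_train_and_report_model, assuming X is a covariate DataFrame and T and E are pandas Series of times and event indicators on the same index (show=True is kept because the show=False branch calls an ibs_no_figure helper that is not shown here):

import numpy as np

# Illustrative call; X, T and E are assumed to come from the user's own dataset.
cis, ibss = cv_train_and_report_model(X, T, E,
                                      show=True,
                                      num_tree=200,
                                      max_depth=10,
                                      min_node=5)
print('Mean C-index: {:.3f}, mean IBS: {:.3f}'.format(np.mean(cis), np.mean(ibss)))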