예제 #1
0
# Fit a Kaplan-Meier model on the test-set T/E arrays.
# NOTE(review): presumably a non-parametric baseline (per the variable name);
# it is not used again in this excerpt.
km_base_model = KaplanMeierModel()
km_base_model.fit(T_rsf_test, E_rsf_test)

# NOTE(review): `cv` is only referenced by the commented-out
# RandomizedSearchCV call below; no active line in this excerpt uses it.
cv = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=21)

# Random survival forest with 200 trees, fitted once with fixed
# hyper-parameters; the randomized search over the grid below is disabled.
rsf = RandomSurvivalForestModel(num_trees=200)
#rsf_num_trees = [100,200,300,500]
#rsf_max_depth = [5,10,15,20,25,30,35,40,45,50]
#rsf_min_node_size = [3, 5, 7, 9]
#param_grid_rsf = {'num_trees':rsf_num_trees,'max_depth':rsf_max_depth,'min_node_size':rsf_min_node_size}
rsf.fit(X_rsf_train,T_rsf_train,E_rsf_train,max_features='sqrt',max_depth=36,min_node_size=4,seed=21)
#rsf_cv = RandomizedSearchCV(rsf, param_distributions=param_grid_rsf, cv=cv,scoring='accuracy',random_state=42,)
#rsf_cv.fit(X_rsf_train,T_rsf_train,E_rsf_train)

# Evaluate the fitted forest on the held-out test arrays and print both
# metrics rounded to two decimals.
c_index = concordance_index(rsf,X_rsf_test,T_rsf_test,E_rsf_test)
print('C-index: {:0.2f}'.format(c_index))
ibs = integrated_brier_score(rsf, X_rsf_test, T_rsf_test, E_rsf_test)
print('IBS: {:0.2f}'.format(ibs))

# Initializing the figure
fig, ax = plt.subplots(figsize=(8, 4))

# Selecting the test rows that experienced an event (E == 1) with T >= 1.
# `choices` holds the flattened indices returned by np.argwhere.
choices = np.argwhere((E_rsf_test==1.)&(T_rsf_test>=1)).flatten()
np.random.seed(16)
#k = np.random.choice( choices, 1)[0]

# Saving the time of event
# NOTE(review): with the random pick above commented out, `t` is indexed by
# the whole `choices` array, i.e. it holds the times of every selected row
# rather than of a single data-point. If T_rsf_test is a pandas Series,
# confirm that `[]` with these indices selects the intended rows.
t = T_rsf_test[choices]

# Computing the Survival function for all times t
#survival = rsf.predict_survival(X_rsf_test.values[t, :]).flatten()
예제 #2
0
# Fit a conditional survival forest (100 trees) on the training frame.
# NOTE(review): presumably 'PFS' is the time column and 'disease_progress'
# the event indicator (second and third positional arguments of fit) — confirm.
csf = ConditionalSurvivalForestModel(num_trees=100)
csf.fit(train[features],
        train['PFS'],
        train['disease_progress'],
        max_features=1,
        max_depth=5,
        min_node_size=2)

# Concordance index of the fitted model on the test frame.
c_index = concordance_index(csf, test[features], test['PFS'],
                            test['disease_progress'])
print('C-index: {:.2f}'.format(c_index))

# Integrated Brier score on the test frame, with t_max=84.
ibs = integrated_brier_score(csf,
                             test[features],
                             test['PFS'],
                             test['disease_progress'],
                             t_max=84,
                             figure_size=(20, 6.5))
print('IBS: {:.2f}'.format(ibs))

# Compare the model's predictions with the actual test outcomes; the
# requested metrics are RMSE, mean and median.
results = compare_to_actual(csf,
                            test[features],
                            test['PFS'],
                            test['disease_progress'],
                            is_at_risk=False,
                            figure_size=(16, 6),
                            metrics=['rmse', 'mean', 'median'])

# `compute_scores` is a helper defined outside this excerpt; it is called with
# the model, the test frame, the even time points 0, 2, ..., 84 and the
# feature list, and its two return values are unpacked as C-index and Brier
# score.
csf_c_index, csf_brier_score = compute_scores(csf, test,
                                              list(np.arange(0, 86, 2)),
                                              features)
# The C-index represents the global assessment of the model discrimination power: this is the model’s ability to correctly provide a reliable ranking of the survival times based on the individual risk scores. In general, when the C-index is close to 1, the model has an almost perfect discriminatory power; but if it is close to 0.5, it has no ability to discriminate between low and high risk subjects.

# In[ ]:

# NOTE(review): `csf`, `X_test`, `T_test` and `E_test` are defined outside
# this excerpt.
from pysurvival.utils.metrics import concordance_index
c_index = concordance_index(csf, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index))  # 0.83 — presumably the value seen in the original run; TODO confirm

# The Brier score measures the average discrepancies between the status and the estimated probabilities at a given time. Thus, the lower the score (usually below 0.25), the better the predictive performance. To assess the overall error measure across multiple time points, the Integrated Brier Score (IBS) is usually computed as well.

# In[ ]:

# Integrated Brier score on the test set, with t_max=12.
from pysurvival.utils.display import integrated_brier_score
ibs = integrated_brier_score(csf,
                             X_test,
                             T_test,
                             E_test,
                             t_max=12,
                             figure_size=(15, 5))
print('IBS: {:.2f}'.format(ibs))

# Now that we have built a model that seems to perform well, let's compare the time series of the actual and predicted number of customers who stop doing business with the SaaS company, for each time t.

# In[ ]:

from pysurvival.utils.display import compare_to_actual
results = compare_to_actual(csf,
                            X_test,
                            T_test,
                            E_test,
                            is_at_risk=False,
                            figure_size=(16, 6),
예제 #4
0
# Fitting the model
# NOTE: despite the variable name `csf`, the model instantiated here is a
# RandomSurvivalForestModel (200 trees).
csf = RandomSurvivalForestModel(num_trees=200)
csf.fit(X_train, T_train, E_train, max_features='sqrt',
        max_depth=5, min_node_size=20)


# Bare expression: it only shows the table when run in a notebook/REPL cell.
csf.variable_importance_table


# Concordance index of the fitted model on the test set.
from pysurvival.utils.metrics import concordance_index
c_index = concordance_index(csf, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index)) # 0.83 — presumably the value seen in the original run; TODO confirm


# Integrated Brier score on the test set, with t_max=12.
from pysurvival.utils.display import integrated_brier_score
ibs = integrated_brier_score(csf, X_test, T_test, E_test, t_max=12,
    figure_size=(12,5))
print('IBS: {:.2f}'.format(ibs))


# Drop two columns from the feature list.
# NOTE: np.setdiff1d returns the sorted unique values, so the resulting
# `features` list is de-duplicated and may be in a different order than
# before.
to_remove = ['estadoCivil_outro', 'ano']
features = np.setdiff1d(features, to_remove).tolist()


# Creating the X, T and E inputs
# Training inputs come from `df`, test inputs from `data_test`;
# `time_column` and `event_column` are defined outside this excerpt.
X_train, X_test = df[features], data_test[features]
T_train, T_test = df[time_column], data_test[time_column]
E_train, E_test = df[event_column], data_test[event_column]


# NOTE(review): this instance is never fitted or read in this excerpt (each
# fold builds its own `xst`). It is kept because it rebinds the module-level
# name `csf`, which code outside this view may rely on.
csf = RandomSurvivalForestModel(num_trees=200)

# K-fold evaluation: one forest is fitted per fold and its C-index and IBS are
# appended to `CI` and `IBS`. `kf`, `X`, `E`, `i`, `CI`, `IBS`, `features` and
# the `best_*` hyper-parameters are defined outside this excerpt.
for train_index, test_index in kf.split(X, E):
    print('\n {} of {}'.format(i, kf.n_splits))

    # `kf` is presumably a scikit-learn splitter, whose split() yields
    # *positional* row indices. Rows are therefore selected with .iloc:
    # label-based .loc only gives the same rows when the index is the default
    # 0..n-1 range, and otherwise picks the wrong rows or raises KeyError.
    X1_train, X1_test = X.iloc[train_index], X.iloc[test_index]
    X_train, X_test = X1_train[features], X1_test[features]
    # The duration column 'NumDays' is read from X itself.
    T_train, T_test = X1_train['NumDays'].values, X1_test['NumDays'].values
    E_train, E_test = E.iloc[train_index].values, E.iloc[test_index].values

    xst = RandomSurvivalForestModel(num_trees=best_num_tree)
    xst.fit(X_train, T_train, E_train, max_features='sqrt',
            max_depth=best_depth, min_node_size=best_min_node,
            num_threads=-1, sample_size_pct=0.63,
            importance_mode='normalized_permutation',
            seed=None, save_memory=False)
    c_index = concordance_index(xst, X_test, T_test, E_test)

    results = compare_to_actual(xst, X_test, T_test, E_test, is_at_risk=True,
                                figure_size=(16, 6),
                                metrics=['rmse', 'mean', 'median'])
    ibs = integrated_brier_score(xst, X_test, T_test, E_test, t_max=2000,
                                 figure_size=(15, 5))
    CI.append(c_index)
    IBS.append(ibs)
    print('C-index: {:.2f}'.format(c_index))
    print('IBS: {:.2f}'.format(ibs))
    i = i + 1

# Save the model for use in pipeline
#save_model(xst, '../pipeline/survival_model.zip')         

# NOTE: `xst` here is whatever model the last loop iteration left behind.
# Bare expression: it only shows the table when run in a notebook/REPL cell.
xst.variable_importance_table.head(20)     
# Predict survival for `onlyPredictionData` with its last two columns
# dropped and the remainder transposed.
# NOTE(review): the transpose suggests the frame stores one sample per
# column — confirm against where `onlyPredictionData` is built.
preds = xst.predict_survival(onlyPredictionData.iloc[:,:-2].transpose())
# Transpose the predictions and write them to an Excel file in the current
# working directory.
preds_df = pd.DataFrame(preds).T
preds_df.to_excel('preds.xlsx')            
            
            
예제 #6
0
def cv_train_and_report_model(X,
                              T,
                              E,
                              show=True,
                              num_tree=10,
                              max_depth=1,
                              min_node=2,
                              kf=None,
                              prep_model=None,
                              t_max=100):
    """Cross-validate a survival model and collect per-fold C-index and IBS.

    For every fold produced by ``kf.split(X, E)`` a model is built on the
    training part with ``prep_model`` and scored on the held-out part.

    Args:
        X: Feature table. Rows are selected with ``.iloc``, so a pandas
            object is expected.
        T: Durations, selected with ``.iloc`` and aligned row-wise with ``X``.
        E: Event indicators, selected with ``.iloc``; also passed to
            ``kf.split`` as the second argument.
        show: When true, print the per-fold scores and call the plotting
            helpers ``compare_to_actual`` and ``integrated_brier_score``.
            When false, the IBS is computed with ``ibs_no_figure`` instead
            and nothing is printed.
        num_tree: Number of trees for the default model factory.
        max_depth: Maximum tree depth for the default model factory.
        min_node: Minimum node size for the default model factory.
        kf: Splitter exposing ``split(X, y)`` (and ``n_splits`` when ``show``
            is true). Defaults to ``StratifiedKFold(n_splits=10,
            shuffle=True)``.
        prep_model: Callable ``(X_train, T_train, E_train) -> fitted model``.
            Defaults to fitting a ``RandomSurvivalForestModel`` with the
            hyper-parameters above; ``num_tree``, ``max_depth`` and
            ``min_node`` are ignored when a custom callable is given.
        t_max: Upper time bound forwarded to the Brier-score helpers.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        A tuple ``(cis, ibss)`` of two lists holding the C-index and the
        integrated Brier score of each fold, in fold order.

    Note:
        Neither the default splitter nor the default forest is given a fixed
        seed, so results presumably vary from run to run.
    """
    if prep_model is None:

        def _prep_model(X, T, E):
            # Default factory: random survival forest with the requested
            # hyper-parameters.
            xst = RandomSurvivalForestModel(num_trees=num_tree)
            xst.fit(X,
                    T,
                    E,
                    max_features='sqrt',
                    max_depth=max_depth,
                    min_node_size=min_node,
                    num_threads=-1,
                    sample_size_pct=0.63,
                    importance_mode='normalized_permutation',
                    seed=None,
                    save_memory=False)
            return xst

        prep_model = _prep_model

    if kf is None:
        kf = StratifiedKFold(n_splits=10, shuffle=True)

    cis = []
    ibss = []
    for fold, (train_index, test_index) in enumerate(kf.split(X, E), start=1):
        # split() yields positional indices, hence .iloc rather than .loc.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        T_train, T_test = T.iloc[train_index], T.iloc[test_index]
        E_train, E_test = E.iloc[train_index], E.iloc[test_index]

        xst = prep_model(X_train, T_train, E_train)
        c_index = concordance_index(xst, X_test, T_test, E_test)

        if show:
            print('\n {} of kfold {}'.format(fold, kf.n_splits))
            print('C-index: {:.2f}'.format(c_index))
            # Called for its output only; the returned value is not needed.
            compare_to_actual(xst,
                              X_test,
                              T_test,
                              E_test,
                              is_at_risk=True,
                              figure_size=(16, 6),
                              metrics=['rmse', 'mean', 'median'])
            ibs = integrated_brier_score(xst,
                                         X_test,
                                         T_test,
                                         E_test,
                                         t_max=t_max,
                                         figure_size=(15, 5))
            print('IBS: {:.2f}'.format(ibs))
        else:
            # Same metric via a helper defined elsewhere, without the figure.
            ibs = ibs_no_figure(xst, X_test, T_test, E_test, t_max=t_max)

        cis.append(c_index)
        ibss.append(ibs)
    return cis, ibss