# Kaplan-Meier baseline fitted on the *_test durations and event flags.
km_base_model = KaplanMeierModel()
km_base_model.fit(T_rsf_test, E_rsf_test)

# Repeated stratified 5-fold splitter (2 repeats); it was referenced by the
# hyper-parameter search that is currently disabled (see note below).
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=21)

# Random survival forest with fixed hyper-parameters.  A RandomizedSearchCV
# over num_trees / max_depth / min_node_size was sketched here but is
# disabled, so the values passed to fit() below are used directly.
rsf = RandomSurvivalForestModel(num_trees=200)
rsf.fit(
    X_rsf_train,
    T_rsf_train,
    E_rsf_train,
    max_features='sqrt',
    max_depth=36,
    min_node_size=4,
    seed=21,
)

# Concordance index and integrated Brier score on the *_test data.
c_index = concordance_index(rsf, X_rsf_test, T_rsf_test, E_rsf_test)
print('C-index: {:0.2f}'.format(c_index))
ibs = integrated_brier_score(rsf, X_rsf_test, T_rsf_test, E_rsf_test)
print('IBS: {:0.2f}'.format(ibs))

# Initializing the figure
fig, ax = plt.subplots(figsize=(8, 4))

# Positions of the test samples that experienced an event with duration >= 1.
choices = np.argwhere((E_rsf_test == 1.) & (T_rsf_test >= 1)).flatten()
np.random.seed(16)
# k = np.random.choice(choices, 1)[0]

# Event times of every selected sample (the single random pick is disabled).
t = T_rsf_test[choices]

# Computing the survival function for all times t (disabled):
# survival = rsf.predict_survival(X_rsf_test.values[t, :]).flatten()
# Conditional survival forest trained on the `train` frame; 'PFS' is passed
# as the duration column and 'disease_progress' as the event column.
csf = ConditionalSurvivalForestModel(num_trees=100)
csf.fit(
    train[features],
    train['PFS'],
    train['disease_progress'],
    max_features=1,
    max_depth=5,
    min_node_size=2,
)

# Concordance index on the `test` frame.
c_index = concordance_index(
    csf,
    test[features],
    test['PFS'],
    test['disease_progress'],
)
print('C-index: {:.2f}'.format(c_index))

# Integrated Brier score evaluated up to t_max=84.
ibs = integrated_brier_score(
    csf,
    test[features],
    test['PFS'],
    test['disease_progress'],
    t_max=84,
    figure_size=(20, 6.5),
)
print('IBS: {:.2f}'.format(ibs))

# Actual vs. predicted comparison with RMSE / mean / median error metrics.
results = compare_to_actual(
    csf,
    test[features],
    test['PFS'],
    test['disease_progress'],
    is_at_risk=False,
    figure_size=(16, 6),
    metrics=['rmse', 'mean', 'median'],
)

# Scores from the compute_scores helper on the time grid 0, 2, ..., 84.
csf_c_index, csf_brier_score = compute_scores(
    csf,
    test,
    list(np.arange(0, 86, 2)),
    features,
)
# The C-index represents the global assessment of the model's discrimination power: this is the model’s ability to correctly provide a reliable ranking of the survival times based on the individual risk scores. In general, when the C-index is close to 1, the model has almost perfect discriminatory power; but if it is close to 0.5, it has no ability to discriminate between low- and high-risk subjects. # In[ ]: from pysurvival.utils.metrics import concordance_index c_index = concordance_index(csf, X_test, T_test, E_test) print('C-index: {:.2f}'.format(c_index)) #0.83 # The Brier score measures the average discrepancy between the status and the estimated probabilities at a given time. Thus, the lower the score (usually below 0.25), the better the predictive performance. To assess the overall error measure across multiple time points, the Integrated Brier Score (IBS) is usually computed as well. # In[ ]: from pysurvival.utils.display import integrated_brier_score ibs = integrated_brier_score(csf, X_test, T_test, E_test, t_max=12, figure_size=(15, 5)) print('IBS: {:.2f}'.format(ibs)) # Now that we have built a model that seems to perform well, let's compare the time series of the actual and predicted number of customers who stop doing business with the SaaS company, for each time t. # In[ ]: from pysurvival.utils.display import compare_to_actual results = compare_to_actual(csf, X_test, T_test, E_test, is_at_risk=False, figure_size=(16, 6),
from pysurvival.utils.metrics import concordance_index
from pysurvival.utils.display import integrated_brier_score

# Fit a 200-tree random survival forest on the current training inputs.
csf = RandomSurvivalForestModel(num_trees=200)
csf.fit(
    X_train,
    T_train,
    E_train,
    max_features='sqrt',
    max_depth=5,
    min_node_size=20,
)
# Bare expression: shows the importance table when run in a notebook cell.
csf.variable_importance_table

# Concordance index on the test inputs.
c_index = concordance_index(csf, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index))  # 0.83

# Integrated Brier score evaluated up to t_max=12.
ibs = integrated_brier_score(
    csf,
    X_test,
    T_test,
    E_test,
    t_max=12,
    figure_size=(12, 5),
)
print('IBS: {:.2f}'.format(ibs))

# Drop two features and rebuild the model inputs from the reduced list.
to_remove = ['estadoCivil_outro', 'ano']
features = np.setdiff1d(features, to_remove).tolist()

# Creating the X, T and E inputs
X_train = df[features]
X_test = data_test[features]
T_train = df[time_column]
T_test = data_test[time_column]
E_train = df[event_column]
E_test = data_test[event_column]

# Fresh, not yet fitted forest for the reduced feature set.
csf = RandomSurvivalForestModel(num_trees=200)
# Cross-validation of a random survival forest.  `i`, `kf`, `X`, `E`,
# `features`, `CI`, `IBS` and the `best_*` settings are expected to be
# defined before this point.
for train_index, test_index in kf.split(X, E):
    print('\n {} of {}'.format(i, kf.n_splits))

    # split() yields positional row indices, so rows must be selected with
    # .iloc.  Label-based .loc only gives the same rows when the index is the
    # default 0..n-1 range and otherwise picks wrong rows or raises KeyError.
    X1_train, X1_test = X.iloc[train_index], X.iloc[test_index]
    X_train, X_test = X1_train[features], X1_test[features]
    T_train, T_test = X1_train['NumDays'].values, X1_test['NumDays'].values
    E_train, E_test = E.iloc[train_index].values, E.iloc[test_index].values

    # Fit one forest per fold with the previously selected hyper-parameters.
    xst = RandomSurvivalForestModel(num_trees=best_num_tree)
    xst.fit(
        X_train,
        T_train,
        E_train,
        max_features='sqrt',
        max_depth=best_depth,
        min_node_size=best_min_node,
        num_threads=-1,
        sample_size_pct=0.63,
        importance_mode='normalized_permutation',
        seed=None,
        save_memory=False,
    )

    # Per-fold evaluation: C-index, actual-vs-predicted comparison, IBS.
    c_index = concordance_index(xst, X_test, T_test, E_test)
    results = compare_to_actual(
        xst,
        X_test,
        T_test,
        E_test,
        is_at_risk=True,
        figure_size=(16, 6),
        metrics=['rmse', 'mean', 'median'],
    )
    ibs = integrated_brier_score(
        xst,
        X_test,
        T_test,
        E_test,
        t_max=2000,
        figure_size=(15, 5),
    )
    CI.append(c_index)
    IBS.append(ibs)
    print('C-index: {:.2f}'.format(c_index))
    print('IBS: {:.2f}'.format(ibs))
    i = i + 1

# NOTE(review): the statements below are assumed to run once after the loop,
# i.e. on the model of the last fold -- confirm against the original layout.

# Save the model for use in pipeline
# save_model(xst, '../pipeline/survival_model.zip')

# Bare expression: shows the top-20 importances when run in a notebook cell.
xst.variable_importance_table.head(20)

# NOTE(review): the prediction frame is transposed before predict_survival;
# presumably onlyPredictionData stores one sample per column -- confirm.
preds = xst.predict_survival(onlyPredictionData.iloc[:, :-2].transpose())
preds_df = pd.DataFrame(preds).T
preds_df.to_excel('preds.xlsx')
def cv_train_and_report_model(X, T, E, show=True, num_tree=10, max_depth=1,
                              min_node=2, kf=None, prep_model=None,
                              t_max=100):
    """Cross-validate a survival model and collect per-fold C-index and IBS.

    Parameters
    ----------
    X, T, E
        Objects supporting positional ``.iloc`` row selection.  They are
        handed to the model as ``(X, T, E)`` -- presumably features,
        durations and event indicators, following the pysurvival convention.
        ``E`` is also passed to ``kf.split`` as the stratification target.
    show : bool
        When true, print the per-fold scores and call the plotting helpers
        ``compare_to_actual`` and ``integrated_brier_score``.  When false,
        compute the IBS with ``ibs_no_figure`` and print nothing.
    num_tree, max_depth, min_node
        Hyper-parameters of the default ``RandomSurvivalForestModel``;
        unused when ``prep_model`` is supplied.
    kf
        Splitter exposing ``split(X, E)``.  Defaults to
        ``StratifiedKFold(n_splits=10, shuffle=True)``.
    prep_model : callable, optional
        ``prep_model(X_train, T_train, E_train)`` must return a fitted model.
    t_max
        Upper time bound forwarded to the IBS computation (default 100).

    Returns
    -------
    tuple[list, list]
        ``(cis, ibss)``: the C-index and the IBS of every fold, in split order.
    """
    if prep_model is None:
        def _prep_model(X_fit, T_fit, E_fit):
            # Default model: random survival forest with the given settings.
            forest = RandomSurvivalForestModel(num_trees=num_tree)
            forest.fit(X_fit, T_fit, E_fit,
                       max_features='sqrt',
                       max_depth=max_depth,
                       min_node_size=min_node,
                       num_threads=-1,
                       sample_size_pct=0.63,
                       importance_mode='normalized_permutation',
                       seed=None,
                       save_memory=False)
            return forest
        prep_model = _prep_model

    if kf is None:
        kf = StratifiedKFold(n_splits=10, shuffle=True)

    n_folds = None
    if show:
        # Repeated splitters (e.g. RepeatedStratifiedKFold) expose no
        # `n_splits` attribute, so fall back to get_n_splits() for the label.
        n_folds = getattr(kf, 'n_splits', None)
        if n_folds is None:
            n_folds = kf.get_n_splits(X, E)

    cis = []
    ibss = []
    for fold, (train_index, test_index) in enumerate(kf.split(X, E), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        T_train, T_test = T.iloc[train_index], T.iloc[test_index]
        E_train, E_test = E.iloc[train_index], E.iloc[test_index]

        model = prep_model(X_train, T_train, E_train)
        c_index = concordance_index(model, X_test, T_test, E_test)

        if show:
            print('\n {} of kfold {}'.format(fold, n_folds))
            print('C-index: {:.2f}'.format(c_index))
            # Called for its plot only; the returned metrics are not used.
            compare_to_actual(model, X_test, T_test, E_test,
                              is_at_risk=True,
                              figure_size=(16, 6),
                              metrics=['rmse', 'mean', 'median'])
            ibs = integrated_brier_score(model, X_test, T_test, E_test,
                                         t_max=t_max, figure_size=(15, 5))
            print('IBS: {:.2f}'.format(ibs))
        else:
            ibs = ibs_no_figure(model, X_test, T_test, E_test, t_max=t_max)

        cis.append(c_index)
        ibss.append(ibs)

    return cis, ibss