def get_c_index(self):
    features = self.get_X_test()
    target = self.get_T_test()
    event = self.get_E_test()
    return concordance_index(self.model_forest, features, target, event)
def cv_survival(self, cv=10, params={}, num_trees=1000, balance_classes=False,
                verbose=True):
    self.verify_best_num_feats()  # Check if the best hyperparameters were processed
    params = self.check_params(params)
    kf = KFold(n_splits=cv, shuffle=True, random_state=self.seed)
    scores = []
    models = []
    datasets = []
    df_cv = self.df_train.copy()
    for fold, (index_train, index_test) in enumerate(kf.split(df_cv), 1):
        if verbose:
            print('Fold {}'.format(fold))
        data_train = df_cv.iloc[index_train].reset_index(drop=True)
        data_test = df_cv.iloc[index_test].reset_index(drop=True)

        # Creating the X, T and E inputs
        X_train, X_test = data_train[self.features], data_test[self.features]
        T_train, T_test = data_train[self.target].values, data_test[self.target].values
        E_train, E_test = data_train[self.event].values, data_test[self.event].values
        weights = self.compute_event_weights(E_train, balance_classes)
        X_train = X_train[self.feature_importance[:self.best_num_feats]]
        X_test = X_test[self.feature_importance[:self.best_num_feats]]

        # Creating model
        model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
        model_forest.fit(X_train, T_train, E_train, seed=self.seed,
                         weights=weights, **params)

        # Append score for post calculation average of folds
        scores.append(concordance_index(model_forest, X_test, T_test, E_test))

    # Refit model with all training data
    self.fit_model(num_trees=num_trees, params=params,
                   balance_classes=balance_classes)

    scores = np.array(scores)
    self.cv_score = np.mean(scores)
    if verbose:
        print('CV Score: {:.3f}'.format(self.cv_score))
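# A minimal usage sketch (not from the original source): it assumes `pipeline`
# is an instance of the wrapper class that defines cv_survival above, already
# initialized with training data, feature importances and best_num_feats.
# The hyperparameters below are illustrative.
pipeline.cv_survival(cv=5,
                     params={'max_depth': 10, 'min_node_size': 5},
                     num_trees=500,
                     balance_classes=True,
                     verbose=True)
print('Mean CV C-index: {:.3f}'.format(pipeline.cv_score))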
def compute_scores(model, table, timepoints, variables):
    c_indexes = []
    for i in timepoints:
        # Censor events and truncate follow-up times at timepoint i before
        # computing the C-index
        table.loc[:, 'disease_progress_temp'] = table['disease_progress']
        table.loc[:, 'PFS_temp'] = table['PFS']
        table.loc[table.PFS > i, 'disease_progress_temp'] = 0
        table.loc[table.PFS > i, 'PFS_temp'] = i
        c_indexes.append(
            concordance_index(model, table[variables], table['PFS_temp'],
                              table['disease_progress_temp']))
    brier_scores = brier_score(model, table[variables], table['PFS'],
                               table['disease_progress'], t_max=84,
                               figure_size=(20, 6.5))
    return c_indexes, brier_scores
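# Illustrative call to compute_scores (not part of the original code): it
# assumes the fitted `csf` model, the `test` DataFrame with 'PFS' and
# 'disease_progress' columns, and the `features` list used in the cells below.
# A copy of the table is passed because compute_scores adds temporary columns.
eval_timepoints = [12, 24, 36, 48, 60]
c_indexes, brier_scores = compute_scores(csf, test.copy(), eval_timepoints, features)
for t, ci in zip(eval_timepoints, c_indexes):
    print('C-index with follow-up truncated at t={}: {:.3f}'.format(t, ci))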
E_rsf_train, E_rsf_test = E_rsf[index_train], E_rsf[index_test]

km_base_model = KaplanMeierModel()
km_base_model.fit(T_rsf_test, E_rsf_test)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=21)
rsf = RandomSurvivalForestModel(num_trees=200)
# rsf_num_trees = [100, 200, 300, 500]
# rsf_max_depth = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# rsf_min_node_size = [3, 5, 7, 9]
# param_grid_rsf = {'num_trees': rsf_num_trees, 'max_depth': rsf_max_depth,
#                   'min_node_size': rsf_min_node_size}
rsf.fit(X_rsf_train, T_rsf_train, E_rsf_train, max_features='sqrt',
        max_depth=36, min_node_size=4, seed=21)
# rsf_cv = RandomizedSearchCV(rsf, param_distributions=param_grid_rsf, cv=cv,
#                             scoring='accuracy', random_state=42)
# rsf_cv.fit(X_rsf_train, T_rsf_train, E_rsf_train)

c_index = concordance_index(rsf, X_rsf_test, T_rsf_test, E_rsf_test)
print('C-index: {:0.2f}'.format(c_index))
ibs = integrated_brier_score(rsf, X_rsf_test, T_rsf_test, E_rsf_test)
print('IBS: {:0.2f}'.format(ibs))

# Initializing the figure
fig, ax = plt.subplots(figsize=(8, 4))

# Randomly extracting a data-point that experienced an event
choices = np.argwhere((E_rsf_test == 1.) & (T_rsf_test >= 1)).flatten()
np.random.seed(16)
# k = np.random.choice(choices, 1)[0]

# Saving the time of event
t = T_rsf_test[choices]
    return c_indexes, brier_scores


# In[ ]:


# In[36]:


csf = ConditionalSurvivalForestModel(num_trees=100)
csf.fit(train[features], train['PFS'], train['disease_progress'],
        max_features=1, max_depth=5, min_node_size=2)

c_index = concordance_index(csf, test[features], test['PFS'],
                            test['disease_progress'])
print('C-index: {:.2f}'.format(c_index))

ibs = integrated_brier_score(csf, test[features], test['PFS'],
                             test['disease_progress'], t_max=84,
                             figure_size=(20, 6.5))
print('IBS: {:.2f}'.format(ibs))

results = compare_to_actual(csf, test[features], test['PFS'],
                            test['disease_progress'], is_at_risk=False,
                            alpha=0.05, minprop=0.1)


# In[ ]:


# Computing variable importance
csf.variable_importance_table.head(5)

# In order to assess the model performance, we previously split the original
# dataset into training and testing sets, so that we can now compute the
# performance metrics on the testing set.
#
# The C-index is a global assessment of the model's discriminative power: its
# ability to provide a reliable ranking of the survival times based on the
# individual risk scores. A C-index close to 1 indicates almost perfect
# discrimination, while a value close to 0.5 indicates no ability to
# discriminate between low- and high-risk subjects.

# In[ ]:


from pysurvival.utils.metrics import concordance_index
c_index = concordance_index(csf, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index))  # 0.83

# The Brier score measures the average discrepancy between the observed status
# and the estimated probability at a given time. The lower the score (usually
# below 0.25), the better the predictive performance. To summarize the error
# across multiple time points, the Integrated Brier Score (IBS) is usually
# computed as well.

# In[ ]:


from pysurvival.utils.display import integrated_brier_score
ibs = integrated_brier_score(csf, X_test, T_test, E_test, t_max=12,
                             figure_size=(15, 5))
print('IBS: {:.2f}'.format(ibs))
    'event'].values

structure = [{'activation': 'SELU', 'num_units': num_units},
             {'activation': 'SELU', 'num_units': num_units2}]

nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
nonlinear_coxph.fit(X_train, T_train, E_train, lr=lr,
                    init_method='xav_uniform', dropout=dropout)

c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
c_index_df = c_index_df.append({"c_index": c_index}, ignore_index=True)
print(c_index_df)

if len(c_index_df) == 10:
    mean = c_index_df["c_index"].mean()
    dataframe_hp = dataframe_hp.append(
        {
            "lr": lr,
            "dropout": dropout,
            "num_units": num_units,
            "num_units2": num_units2,
            "num_layers": 2,
            "activation": "SELU/SELU",
            "mean": mean
        },
T_train, T_test = data_train['time'].values, data_test['time'].values
E_train, E_test = data_train['event'].values, data_test['event'].values

structure = [{'activation': 'RELU', 'num_units': 25}]
nonlinear_coxph = NonLinearCoxPHModel(structure=structure)

start = time.time()
nonlinear_coxph.fit(X_train, T_train, E_train, lr=0.0001,
                    init_method='xav_uniform', dropout=0.2)
stop = time.time()
duration = stop - start

c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
c_index_train = concordance_index(nonlinear_coxph, X_train, T_train, E_train)

c_index_multiple_models = c_index_multiple_models.append(
    {
        "c_index_test": c_index,
        "c_index_train": c_index_train,
        "duration": duration
    },
    ignore_index=True)
print(c_index_multiple_models)

if len(c_index_multiple_models) == 10:
    mean_test = c_index_multiple_models["c_index_test"].mean()
    mean_train = c_index_multiple_models["c_index_train"].mean()
    mean_duration = c_index_multiple_models["duration"].mean()
    c_index_df = c_index_df.append({"c_index_test": mean_test},
    for b in max_depth:
        for c in min_node:
            cc = []
            kf = StratifiedKFold(n_splits=7, random_state=42, shuffle=True)
            i = 1
            for train_index, test_index in kf.split(Xtemp, Etemp):
                X1_train, X1_test = Xtemp.loc[train_index], Xtemp.loc[test_index]
                X_train, X_test = X1_train[featuresTemp], X1_test[featuresTemp]
                T_train, T_test = X1_train['NumDays'].values, X1_test['NumDays'].values
                E_train, E_test = Etemp.loc[train_index].values, Etemp.loc[test_index].values

                xst = RandomSurvivalForestModel(num_trees=a)
                xst.fit(X_train, T_train, E_train, max_features='sqrt',
                        max_depth=b, min_node_size=c, num_threads=-1,
                        sample_size_pct=0.63,
                        importance_mode='normalized_permutation',
                        seed=None, save_memory=False)
                c_index = concordance_index(xst, X_test, T_test, E_test)
                cc.append(c_index)
                i = i + 1
            print(a, b, c, mean(cc))

CI = []
IBS = []
best_num_tree = 15
best_depth = 10
best_min_node = 5
k_folds = 4
i = 1
kf = StratifiedKFold(n_splits=k_folds, random_state=1, shuffle=True)
for train_index, test_index in kf.split(X, E):
def run_pysurvival_with_repetitions(data, features, survival, event, models,
                                    test_ratio, repetitions=10):
    num_samples = len(data.index)
    print('Number of Samples:', num_samples)

    ''' Initialize Outputs '''
    outputs = initialize_outputs(models, features)

    ''' Run Survival Model N times '''
    for _ in range(repetitions):

        ''' Dataset Splitting '''
        index_train, index_test = train_test_split(range(num_samples),
                                                   test_size=test_ratio)
        data_train = data.loc[index_train].reset_index(drop=True)
        data_test = data.loc[index_test].reset_index(drop=True)

        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train[survival].values, data_test[survival].values
        E_train, E_test = data_train[event].values, data_test[event].values

        ''' Run Cox '''
        if 'cox' in models:
            coxph = CoxPHModel()
            coxph.fit(X_train, T_train, E_train, lr=0.0001, l2_reg=1e-2,
                      init_method='zeros', verbose=False)
            c_index = concordance_index(coxph, X_test, T_test, E_test)
            outputs['cox']['c_index'].append(c_index)
            ibs = integrated_brier_score(coxph, X_test, T_test, E_test, t_max=None)
            outputs['cox']['ibs'].append(ibs)
            for idx, i in enumerate(features):
                outputs['cox']['weights'][i].append(coxph.weights[idx])

        ''' Run RSF '''
        if 'rsf' in models:
            rsf = RandomSurvivalForestModel(num_trees=200)
            rsf.fit(X_train, T_train, E_train, max_features="sqrt",
                    max_depth=5, min_node_size=20)
            c_index = concordance_index(rsf, X_test, T_test, E_test)
            outputs['rsf']['c_index'].append(c_index)
            ibs = integrated_brier_score(rsf, X_test, T_test, E_test, t_max=None)
            outputs['rsf']['ibs'].append(ibs)
            for key, value in rsf.variable_importance.items():
                outputs['rsf']['importance'][key].append(value)

        ''' Run Deepsurv '''
        if 'deepsurv' in models:
            structure = [{'activation': 'ReLU', 'num_units': 128},
                         {'activation': 'ReLU', 'num_units': 128},
                         {'activation': 'ReLU', 'num_units': 128}]
            nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
            nonlinear_coxph.fit(X_train, T_train, E_train, lr=1e-4,
                                init_method='xav_uniform', verbose=False)
            c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
            outputs['deepsurv']['c_index'].append(c_index)
            ibs = integrated_brier_score(nonlinear_coxph, X_test, T_test, E_test,
                                         t_max=None)
            outputs['deepsurv']['ibs'].append(ibs)

    return outputs
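# Hypothetical call to run_pysurvival_with_repetitions: `df`, the column names
# and the model list are placeholders, and `initialize_outputs` is the helper
# referenced above (its definition is not shown in this section).
outputs = run_pysurvival_with_repetitions(data=df,
                                          features=['age', 'grade'],
                                          survival='time',
                                          event='event',
                                          models=['cox', 'rsf'],
                                          test_ratio=0.3,
                                          repetitions=10)
print('Mean RSF C-index over repetitions: {:.3f}'.format(
    np.mean(outputs['rsf']['c_index'])))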
    data_train, data_test = train_test_split(data, test_size=0.2,
                                             random_state=iter)

    X_train = data_train[[
        "functional_proteins_cluster", "immunoregulatory_protein_cluster",
        "coexpression_cluster", "age", "grade", "Architecture"
    ]]
    T_train = data_train["Recurrence_time"]
    E_train = data_train["Recurrence"]

    X_test = data_test[[
        "functional_proteins_cluster", "immunoregulatory_protein_cluster",
        "coexpression_cluster", "age", "grade", "Architecture"
    ]]
    T_test = data_test["Recurrence_time"]
    E_test = data_test["Recurrence"]

    # Fit the RSF according to the training set
    fitted = rf.fit(X=X_train, T=T_train, E=E_train, max_depth=5, seed=iter)

    # Concordance index evaluated on the full dataset (X, T, E)
    concordance = concordance_index(rf, X, T, E, include_ties=True,
                                    additional_results=False)
    concordances.append(concordance)

mean_concordance = np.mean(np.array(concordances))
print("Mean concordance index: " + str(round(mean_concordance, 4)))
def cv_train_and_report_model(X, T, E, show=True, num_tree=10, max_depth=1,
                              min_node=2, kf=None, prep_model=None):
    if prep_model is None:
        def _prep_model(X, T, E):
            xst = RandomSurvivalForestModel(num_trees=num_tree)
            xst.fit(X, T, E, max_features='sqrt', max_depth=max_depth,
                    min_node_size=min_node, num_threads=-1,
                    sample_size_pct=0.63,
                    importance_mode='normalized_permutation',
                    seed=None, save_memory=False)
            return xst
        prep_model = _prep_model

    i = 1
    if kf is None:
        kf = StratifiedKFold(n_splits=10, shuffle=True)

    cis = []
    ibss = []
    for train_index, test_index in kf.split(X, E):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        T_train, T_test = T.iloc[train_index], T.iloc[test_index]
        E_train, E_test = E.iloc[train_index], E.iloc[test_index]

        # xst = RandomSurvivalForestModel(num_trees=num_tree)
        # xst.fit(X_train, T_train, E_train, max_features='sqrt',
        #         max_depth=max_depth, min_node_size=min_node, num_threads=-1,
        #         sample_size_pct=0.63,
        #         importance_mode='normalized_permutation',
        #         seed=None, save_memory=False)
        xst = prep_model(X_train, T_train, E_train)

        c_index = concordance_index(xst, X_test, T_test, E_test)
        if show:
            print('\n {} of kfold {}'.format(i, kf.n_splits))
            print('C-index: {:.2f}'.format(c_index))
            results = compare_to_actual(xst, X_test, T_test, E_test,
                                        is_at_risk=True, figure_size=(16, 6),
                                        metrics=['rmse', 'mean', 'median'])
            ibs = integrated_brier_score(xst, X_test, T_test, E_test,
                                         t_max=100, figure_size=(15, 5))
            print('IBS: {:.2f}'.format(ibs))
        else:
            ibs = ibs_no_figure(xst, X_test, T_test, E_test, t_max=100)
        cis.append(c_index)
        ibss.append(ibs)
        i = i + 1
    return cis, ibss
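# Example invocation of cv_train_and_report_model (added for illustration),
# assuming `X` (feature DataFrame), `T` (duration Series) and `E` (event
# Series) are defined as in the surrounding cells; the hyperparameters are
# illustrative. With show=False this path relies on the `ibs_no_figure`
# helper referenced in the function above.
cis, ibss = cv_train_and_report_model(X, T, E,
                                      show=False,
                                      num_tree=100,
                                      max_depth=10,
                                      min_node=5)
print('Mean C-index: {:.3f}, mean IBS: {:.3f}'.format(np.mean(cis), np.mean(ibss)))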