def fit_model(self, num_trees=3000, params=None, balance_classes=False):
    """
    Fit a usable model on all the training data, after parameter
    optimization has been done.

    The fitted forest is stored in ``self.model_forest``; nothing is returned.

    Parameters:
    -----------
    * num_trees, int (default=3000)
        Passed as ``num_trees`` to ``ConditionalSurvivalForestModel``.
    * params, dict or None (default=None)
        Extra keyword arguments for ``fit``. They go through
        ``self.check_params`` first. ``None`` means "no extra parameters".
    * balance_classes, bool (default=False)
        Forwarded to ``self.compute_event_weights`` together with the
        training events.
    """
    features = self.get_X_train()
    target = self.get_T_train()
    event = self.get_E_train()
    weights = self.compute_event_weights(event, balance_classes)

    # Build a fresh dict per call: a ``{}`` default in the signature would be
    # one shared object that any in-place change would leak into later calls.
    params = self.check_params({} if params is None else params)

    model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
    model_forest.fit(features, target, event, seed=self.seed,
                     weights=weights, **params)
    self.model_forest = model_forest
def cv_survival(self, cv=10, params=None, num_trees=1000,
                balance_classes=False, verbose=True):
    """
    Cross-validate a conditional survival forest on ``self.df_train``.

    For every fold a forest is fitted on the training part, restricted to the
    first ``self.best_num_feats`` entries of ``self.feature_importance``, and
    scored with ``concordance_index`` on the held-out part. The mean score is
    stored in ``self.cv_score``. Once all folds are done the model is refitted
    on all the training data through ``self.fit_model``.

    Parameters:
    -----------
    * cv, int (default=10)
        Number of folds (``n_splits`` of ``KFold``).
    * params, dict or None (default=None)
        Extra keyword arguments for ``fit``. They go through
        ``self.check_params`` first. ``None`` means "no extra parameters".
    * num_trees, int (default=1000)
        Number of trees of each forest, including the final refit.
    * balance_classes, bool (default=False)
        Forwarded to ``self.compute_event_weights`` and ``self.fit_model``.
    * verbose, bool (default=True)
        Print the fold number while running and the final CV score.
    """
    self.verify_best_num_feats()  # Check if the best hyperparameters were processed

    # Build a fresh dict per call instead of sharing one ``{}`` default.
    params = self.check_params({} if params is None else params)

    kf = KFold(n_splits=cv, shuffle=True, random_state=self.seed)
    # The selected feature subset is the same for every fold.
    selected_features = self.feature_importance[:self.best_num_feats]

    scores = []
    df_cv = self.df_train.copy()
    for fold, (index_train, index_test) in enumerate(kf.split(df_cv), 1):
        if verbose:
            print('Fold {}'.format(fold))

        data_train = df_cv.iloc[index_train].reset_index(drop=True)
        data_test = df_cv.iloc[index_test].reset_index(drop=True)

        # Creating the X, T and E inputs
        X_train = data_train[self.features][selected_features]
        X_test = data_test[self.features][selected_features]
        T_train, T_test = data_train[self.target].values, data_test[self.target].values
        E_train, E_test = data_train[self.event].values, data_test[self.event].values

        weights = self.compute_event_weights(E_train, balance_classes)

        # Creating model
        model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
        model_forest.fit(X_train, T_train, E_train, seed=self.seed,
                         weights=weights, **params)

        # Append score for post calculation average of folds
        scores.append(concordance_index(model_forest, X_test, T_test, E_test))

    # Refit model with all training data
    self.fit_model(num_trees=num_trees, params=params,
                   balance_classes=balance_classes)

    self.cv_score = np.mean(np.array(scores))
    if verbose:
        print('CV Score: {:.3f}'.format(self.cv_score))
def process_feature_importance(self, num_trees=500, params=None,
                               balance_classes=True, verbose=True):
    """
    Process the feature importance, so the library can operate with a reduced
    number of features (a way to address the curse of dimensionality).

    A conditional survival forest is fitted on the whole of ``self.df``.
    Then, for up to 20 rounds, only the first 80% of the rows of the model's
    ``variable_importance_table['feature']`` are kept and the forest is
    refitted on them. The table is presumably sorted by decreasing
    importance; confirm against pysurvival. The ``feature`` column of the
    last fitted model is handed to ``self.set_feature_importance``.

    Parameters:
    -----------
    * num_trees, int (default=500)
        Number of trees of every forest that is fitted.
    * params, dict or None (default=None)
        Extra keyword arguments for ``fit``. They go through
        ``self.check_params`` first. ``None`` means "no extra parameters".
    * balance_classes, bool (default=True)
        Forwarded to ``self.compute_event_weights``.
    * verbose, bool (default=True)
        Print progress messages. ``False`` silences them.
    """
    num_rounds = 20
    keep_fraction = 0.8

    # Build a fresh dict per call instead of sharing one ``{}`` default.
    params = self.check_params({} if params is None else params)

    if verbose:
        print('Started processing feature importance. This may take a while.')

    # use whole data to process feature importance
    df = self.df.copy()

    # Creating the X, T and E inputs
    X = df[self.features]
    T = df[self.target].values
    E = df[self.event].values
    weights = self.compute_event_weights(E, balance_classes)

    model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
    model_forest.fit(X, T, E, seed=self.seed, weights=weights, **params)

    for _ in range(num_rounds):
        current_size = model_forest.variable_importance_table.shape[0]
        new_size = int(keep_fraction * current_size)
        if new_size < 1:
            # Shrinking further would leave no feature to fit on; keep the
            # ranking of the last model that was actually fitted.
            break

        if verbose:
            print('Tamanho: {}'.format(current_size))
            print('Novo Tamanho: {}'.format(new_size))

        kept = model_forest.variable_importance_table['feature'].iloc[:new_size]
        X = X[kept]

        model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
        model_forest.fit(X, T, E, seed=self.seed, weights=weights, **params)

    self.set_feature_importance(model_forest.variable_importance_table['feature'])
def load_model(path_file):
    """ Load the model and its parameters from a .zip file

    The file is first loaded into a generic ``BaseModel`` to read the stored
    model name. An empty model of the matching class is then created and the
    loaded attributes are copied into it.

    Parameters:
    -----------
    * path_file, str
        address of the file where the model will be loaded from

    Returns:
    --------
    * pysurvival_model : Pysurvival object
        Pysurvival model

    Raises:
    -------
    * NotImplementedError
        if the stored model name matches no known pysurvival model,
        including an SVM name that is neither 'linear' nor 'kernel'
    """

    # Initializing a base model
    from pysurvival.models import BaseModel
    base_model = BaseModel()

    # Temporary loading the model
    base_model.load(path_file)
    model_name = base_model.name
    name = model_name.lower()

    # Loading the actual Pysurvival model. The order of the checks matters:
    # e.g. 'linearmultitask' must be tested before the SVM 'linear' case.
    if 'kaplanmeier' in name:
        if 'smooth' in name:
            from pysurvival.models.non_parametric import SmoothKaplanMeierModel
            pysurvival_model = SmoothKaplanMeierModel()
        else:
            from pysurvival.models.non_parametric import KaplanMeierModel
            pysurvival_model = KaplanMeierModel()

    elif 'linearmultitask' in name:
        from pysurvival.models.multi_task import LinearMultiTaskModel
        pysurvival_model = LinearMultiTaskModel()

    elif 'neuralmultitask' in name:
        from pysurvival.models.multi_task import NeuralMultiTaskModel
        # Placeholder structure needed by the constructor; the attributes
        # copied from the loaded model below replace the instance state.
        structure = [{'activation': 'relu', 'num_units': 128}, ]
        pysurvival_model = NeuralMultiTaskModel(structure=structure)

    elif 'exponential' in name:
        from pysurvival.models.parametric import ExponentialModel
        pysurvival_model = ExponentialModel()

    elif 'weibull' in name:
        from pysurvival.models.parametric import WeibullModel
        pysurvival_model = WeibullModel()

    elif 'gompertz' in name:
        from pysurvival.models.parametric import GompertzModel
        pysurvival_model = GompertzModel()

    elif 'loglogistic' in name:
        from pysurvival.models.parametric import LogLogisticModel
        pysurvival_model = LogLogisticModel()

    elif 'lognormal' in name:
        from pysurvival.models.parametric import LogNormalModel
        pysurvival_model = LogNormalModel()

    elif 'simulation' in name:
        from pysurvival.models.simulations import SimulationModel
        pysurvival_model = SimulationModel()

    elif 'coxph' in name:
        if 'nonlinear' in name:
            from pysurvival.models.semi_parametric import NonLinearCoxPHModel
            pysurvival_model = NonLinearCoxPHModel()
        else:
            from pysurvival.models.semi_parametric import CoxPHModel
            pysurvival_model = CoxPHModel()

    elif 'random' in name and 'survival' in name:
        from pysurvival.models.survival_forest import RandomSurvivalForestModel
        pysurvival_model = RandomSurvivalForestModel()

    elif 'extra' in name and 'survival' in name:
        from pysurvival.models.survival_forest import ExtraSurvivalTreesModel
        pysurvival_model = ExtraSurvivalTreesModel()

    elif 'condi' in name and 'survival' in name:
        from pysurvival.models.survival_forest import ConditionalSurvivalForestModel
        pysurvival_model = ConditionalSurvivalForestModel()

    elif 'svm' in name and 'linear' in name:
        from pysurvival.models.svm import LinearSVMModel
        pysurvival_model = LinearSVMModel()

    elif 'svm' in name and 'kernel' in name:
        from pysurvival.models.svm import KernelSVMModel
        pysurvival_model = KernelSVMModel()

    else:
        # Also reached for an SVM name that is neither linear nor kernel,
        # which previously fell through with no model bound at all.
        raise NotImplementedError(
            '{} is not a valid pysurvival model.'.format(model_name))

    # Transferring the components
    pysurvival_model.__dict__.update(copy.deepcopy(base_model.__dict__))

    return pysurvival_model
concordance_index(model, table[variables], table['PFS_temp'], table['disease_progress_temp'])) brier_scores = brier_score(model, table[variables], table['PFS'], table['disease_progress'], t_max=84, figure_size=(20, 6.5)) return c_indexes, brier_scores # In[ ]: # In[36]: csf = ConditionalSurvivalForestModel(num_trees=100) csf.fit(train[features], train['PFS'], train['disease_progress'], max_features=1, max_depth=5, min_node_size=2) c_index = concordance_index(csf, test[features], test['PFS'], test['disease_progress']) print('C-index: {:.2f}'.format(c_index)) ibs = integrated_brier_score(csf, test[features], test['PFS'], test['disease_progress'],
def build_forest(self, num_trees=500):
    """Store a new ``ConditionalSurvivalForestModel`` built with ``num_trees`` in ``self.model``."""
    forest = ConditionalSurvivalForestModel(num_trees=num_trees)
    self.model = forest
# Creating the X, T and E inputs
X_train = data_train[features]
X_test = data_test[features]
T_train = data_train[time_column]
T_test = data_test[time_column]
E_train = data_train[event_column]
E_test = data_test[event_column]

# Let's now fit a Conditional Survival Forest model to the training set.
#
# Note: The choice of the hyper-parameters was obtained using grid-search
# selection, not displayed in this tutorial.

# In[ ]:

from pysurvival.models.survival_forest import ConditionalSurvivalForestModel

# Fitting the model
csf = ConditionalSurvivalForestModel(num_trees=200)
csf.fit(
    X_train,
    T_train,
    E_train,
    max_features='sqrt',
    max_depth=5,
    min_node_size=20,
    alpha=0.05,
    minprop=0.1,
)

# In[ ]:

# Computing variables importance
csf.variable_importance_table.head(5)

# In order to assess the model performance, we previously split the original
# dataset into training and testing sets, so that we can now compute its
# performance metrics on the testing set:
def build_random_forest(self, num_trees=500):
    """Store a new ``RandomSurvivalForestModel`` built with ``num_trees`` in ``self.model``."""
    forest = RandomSurvivalForestModel(num_trees=num_trees)
    self.model = forest
def build_extra_survival_trees(self, num_trees=500):
    """Store a new ``ExtraSurvivalTreesModel`` built with ``num_trees`` in ``self.model``."""
    trees = ExtraSurvivalTreesModel(num_trees=num_trees)
    self.model = trees
def build_cox(self):
    """Store a new ``CoxPHFitter`` with ``penalizer=0.01`` in ``self.model``."""
    fitter = CoxPHFitter(penalizer=0.01)
    self.model = fitter
def build_multitask(self):
    """Store a new ``LinearMultiTaskModel`` (default arguments) in ``self.model``."""
    multitask = LinearMultiTaskModel()
    self.model = multitask
def build_logNormal(self):
    """Store a new ``LogNormalAFTFitter`` with ``penalizer=0.01`` in ``self.model``."""
    fitter = LogNormalAFTFitter(penalizer=0.01)
    self.model = fitter
def build_weibullAFT(self):
    """Store a new ``WeibullAFTFitter`` with ``penalizer=0.01`` in ``self.model``."""
    fitter = WeibullAFTFitter(penalizer=0.01)
    self.model = fitter
def build_piecewise_exponential_regression(self):
    """Store a new ``PiecewiseExponentialRegressionFitter`` in ``self.model``.

    The fitter is created with breakpoints ``[1, 400]`` and ``penalizer=0.01``.
    """
    fitter = PiecewiseExponentialRegressionFitter(
        breakpoints=[1, 400],
        penalizer=0.01,
    )
    self.model = fitter
def build_aalenAdditive(self):
    """Store a new ``AalenAdditiveFitter`` in ``self.model``.

    The fitter is created with ``coef_penalizer=0.01`` and
    ``smoothing_penalizer=1000``.
    """
    fitter = AalenAdditiveFitter(
        coef_penalizer=0.01,
        smoothing_penalizer=1000,
    )
    self.model = fitter
class Model:
    """Common train / predict / score interface over several survival models.

    One of the ``build_*`` methods must be called first to create
    ``self.model``. The other methods look at the string form of the model's
    type (``str(type(self.model))``) to decide which library API to call:
    'pysurvival' and 'lifelines' are the substrings that are checked.
    """

    def __init__(self, name):
        self.name = name

    # We define different models
    def build_aalenAdditive(self):
        """Use an ``AalenAdditiveFitter`` as the wrapped model."""
        self.model = AalenAdditiveFitter(coef_penalizer=0.01,
                                         smoothing_penalizer=1000)

    def build_piecewise_exponential_regression(self):
        """Use a ``PiecewiseExponentialRegressionFitter`` as the wrapped model."""
        self.model = PiecewiseExponentialRegressionFitter(
            breakpoints=[1, 400], penalizer=0.01)

    def build_weibullAFT(self):
        """Use a ``WeibullAFTFitter`` as the wrapped model."""
        self.model = WeibullAFTFitter(penalizer=0.01)

    def build_logNormal(self):
        """Use a ``LogNormalAFTFitter`` as the wrapped model."""
        self.model = LogNormalAFTFitter(penalizer=0.01)

    def build_cox(self):
        """Use a ``CoxPHFitter`` as the wrapped model."""
        self.model = CoxPHFitter(penalizer=0.01)

    def build_multitask(self):
        """Use a ``LinearMultiTaskModel`` as the wrapped model."""
        self.model = LinearMultiTaskModel()

    def build_random_forest(self, num_trees=500):
        """Use a ``RandomSurvivalForestModel`` with ``num_trees`` trees."""
        self.model = RandomSurvivalForestModel(num_trees=num_trees)

    def build_extra_survival_trees(self, num_trees=500):
        """Use an ``ExtraSurvivalTreesModel`` with ``num_trees`` trees."""
        self.model = ExtraSurvivalTreesModel(num_trees=num_trees)

    def build_forest(self, num_trees=500):
        """Use a ``ConditionalSurvivalForestModel`` with ``num_trees`` trees."""
        self.model = ConditionalSurvivalForestModel(num_trees=num_trees)

    def _is_pysurvival(self):
        """Return True when the wrapped model's type string mentions pysurvival."""
        return 'pysurvival' in str(type(self.model))

    def train(self, X, Y):
        """Fit the wrapped model on ``X`` and ``Y``.

        ``Y`` must provide the 'SurvivalTime' and 'Event' columns. The fit
        call depends on the model type.
        """
        model_type = str(type(self.model))

        if ('semi_parametric' in model_type) or ('multi_task' in model_type):
            self.model.fit(X=X, T=Y['SurvivalTime'], E=Y['Event'],
                           init_method='zeros', num_epochs=500)

        if 'survival_forest' in model_type:
            self.model.fit(X=X, T=Y['SurvivalTime'], E=Y['Event'],
                           max_features='all', max_depth=20,
                           sample_size_pct=0.33)

        if 'lifelines' in model_type:
            # else we want to fit a model from lifeline library using cross
            # validation; the returned scores are not used here
            k_fold_cross_validation(self.model, pd.concat([X, Y], axis=1),
                                    'SurvivalTime', event_col='Event', k=5)

    def predict_survival_function(self, X):
        """Return the survival function predicted by the wrapped model for ``X``.

        pysurvival models expose it as ``predict_survival`` and lifelines
        fitters as ``predict_survival_function``.
        """
        if self._is_pysurvival():
            return self.model.predict_survival(X)
        return self.model.predict_survival_function(X)

    def predict_expectation(self, X):
        """Return the model's expectation for ``X``, or its risk for pysurvival.

        The expectation is different depending on the package we use: as the
        expectation does not exist in pysurvival, ``predict_risk`` is used
        there instead.
        """
        if self._is_pysurvival():
            return self.model.predict_risk(X)
        return self.model.predict_expectation(X)

    def _format_predictions(self, X, index):
        """Predict for ``X`` and return a frame with 'SurvivalTime' and 'Event'.

        'Event' is filled with NaN. ``index`` becomes the index of the frame.
        """
        Y_prediction = self.predict_expectation(X)
        if self._is_pysurvival():
            # predict_risk gives a risk, not a time: subtracting it from
            # 10 * max(risk) makes a higher risk map to a smaller value.
            Y_prediction = 10 * max(Y_prediction) - Y_prediction
            Y_prediction = pd.DataFrame(Y_prediction, index=index,
                                        columns=['SurvivalTime'])
        else:
            Y_prediction = pd.DataFrame(Y_prediction.values, index=index,
                                        columns=['SurvivalTime'])
        Y_prediction['Event'] = np.nan
        return Y_prediction

    def c_index(self, X, Y):
        """Return ``cindex(Y, predictions)`` for the predictions made on ``X``."""
        return cindex(Y, self._format_predictions(X, Y.index))

    def predict_and_format(self, X, filename):
        """Return the predictions for ``X`` as a frame indexed like ``X``.

        ``filename`` is accepted for compatibility but is currently unused:
        nothing is written to disk.
        """
        return self._format_predictions(X, X.index)

    def fit_and_score(self, X, Y):
        """Train and score the model on each column of ``X`` separately.

        For each feature the model is trained only on that feature and its
        c-index is computed on the same data. Returns a ``pd.Series`` of
        scores indexed by feature name, sorted in decreasing order.
        """
        scores = []
        for feature in X.columns.values:
            Xj = X[feature].values.reshape((len(X), 1))
            self.train(Xj, Y)
            scores.append(self.c_index(Xj, Y))
        return pd.Series(scores, index=X.columns).sort_values(ascending=False)