Example #1
    def fit_model(self,
                  num_trees=3000,
                  params={},
                  balance_classes=False):
        """
        Function used to fit an usable model, with all
        trainig data, after doing parameters optimization.
        """

        features = self.get_X_train()
        target = self.get_T_train()
        event = self.get_E_train()

        #weights = self.compute_event_weights(self.df_train['DISEASE_FREE_STATUS_RECURRED/PROGRESSED'], balance_classes)
        weights = self.compute_event_weights(event, balance_classes)


        params = self.check_params(params)

        model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)

        model_forest.fit(features,
                         target,
                         event,
                         seed=self.seed,
                         weights=weights,
                         **params)

        self.model_forest = model_forest
Example #2
    def cv_survival(self, cv=10, params={}, num_trees=1000,
                    balance_classes=False, verbose=True):
        """
        Run k-fold cross-validation with the conditional survival forest,
        store the mean concordance index in self.cv_score and refit the
        model on the full training set.
        """

        self.verify_best_num_feats()

        # Check if the best hyperparameters were processed
        params = self.check_params(params)

        kf = KFold(n_splits=cv, shuffle=True, random_state=self.seed)

        scores = []
        models = []
        datasets = []

        df_cv = self.df_train.copy()

        for fold, (index_train, index_test) in enumerate(kf.split(df_cv), 1):
            if verbose:
                print('Fold {}'.format(fold))

            data_train = df_cv.iloc[index_train].reset_index(drop=True)
            data_test = df_cv.iloc[index_test].reset_index(drop=True)

            # Creating the X, T and E inputs
            X_train, X_test = data_train[self.features], data_test[self.features]
            T_train, T_test = data_train[self.target].values, data_test[self.target].values
            E_train, E_test = data_train[self.event].values, data_test[self.event].values

            weights = self.compute_event_weights(E_train, balance_classes)

            X_train = X_train[self.feature_importance[:self.best_num_feats]]
            X_test = X_test[self.feature_importance[:self.best_num_feats]]

            # Creating model
            model_forest = ConditionalSurvivalForestModel(num_trees=num_trees,)
            model_forest.fit(X_train, T_train, E_train, seed=self.seed, weights=weights, **params)

            # Append score for post calculation average of folds
            scores.append(concordance_index(model_forest, X_test, T_test, E_test))

        # Refit model with all training data
        self.fit_model(num_trees=num_trees,
                       params=params,
                       balance_classes=balance_classes)

        scores = np.array(scores)
        self.cv_score = np.mean(scores)
        if verbose:
            print('CV Score: {:.3f}'.format(self.cv_score))
Example #3
    def process_feature_importance(self, num_trees=500, params={},
                                   balance_classes=True, verbose=True):
        """
        Process the feature importance so that the pipeline can operate
        with a reduced number of features, as a way to address the
        curse of dimensionality.
        """

        params = self.check_params(params)

        if verbose:
            print('Started processing feature importance. This may take a while.')

        # use whole data to process feature importance
        df = self.df.copy()

        # Creating the X, T and E inputs
        X = df[self.features]
        T = df[self.target].values
        E = df[self.event].values

        weights = self.compute_event_weights(E, balance_classes)

        model_forest = ConditionalSurvivalForestModel(num_trees=num_trees,)
        model_forest.fit(X, T, E, seed=self.seed, weights=weights, **params)

        # Iteratively keep the 80% most important features and refit the forest
        for i in range(20):

            tamanho = model_forest.variable_importance_table.shape[0]
            novo_tamanho = int(0.8 * tamanho)

            if verbose:
                print('Current number of features: {}'.format(tamanho))
                print('Reduced number of features: {}'.format(novo_tamanho))

            X = X[model_forest.variable_importance_table['feature'].iloc[:novo_tamanho]]

            model_forest = ConditionalSurvivalForestModel(num_trees=num_trees)
            model_forest.fit(X, T, E, seed=self.seed, weights=weights, **params)

        self.set_feature_importance(model_forest.variable_importance_table['feature'])
Example #4
def load_model(path_file):
    """ Load the model and its parameters from a .zip file 

    Parameters:
    -----------
    * path_file, str
        address of the file where the model will be loaded from 

    Returns:
    --------
    * pysurvival_model : Pysurvival object
        Pysurvival model
    """

    import copy  # needed for the deepcopy of the base model's attributes below

    # Initializing a base model
    from pysurvival.models import BaseModel
    base_model = BaseModel()

    # Temporary loading the model
    base_model.load(path_file)
    model_name = base_model.name

    # Loading the actual Pysurvival model - Kaplan-Meier
    if 'kaplanmeier' in model_name.lower():

        if 'smooth' in model_name.lower():
            from pysurvival.models.non_parametric import SmoothKaplanMeierModel
            pysurvival_model = SmoothKaplanMeierModel()

        else:
            from pysurvival.models.non_parametric import KaplanMeierModel
            pysurvival_model = KaplanMeierModel()

    elif 'linearmultitask' in model_name.lower():

        from pysurvival.models.multi_task import LinearMultiTaskModel
        pysurvival_model = LinearMultiTaskModel()

    elif 'neuralmultitask' in model_name.lower():

        from pysurvival.models.multi_task import NeuralMultiTaskModel
        structure = [
            {
                'activation': 'relu',
                'num_units': 128
            },
        ]
        pysurvival_model = NeuralMultiTaskModel(structure=structure)

    elif 'exponential' in model_name.lower():

        from pysurvival.models.parametric import ExponentialModel
        pysurvival_model = ExponentialModel()

    elif 'weibull' in model_name.lower():

        from pysurvival.models.parametric import WeibullModel
        pysurvival_model = WeibullModel()

    elif 'gompertz' in model_name.lower():

        from pysurvival.models.parametric import GompertzModel
        pysurvival_model = GompertzModel()

    elif 'loglogistic' in model_name.lower():

        from pysurvival.models.parametric import LogLogisticModel
        pysurvival_model = LogLogisticModel()

    elif 'lognormal' in model_name.lower():

        from pysurvival.models.parametric import LogNormalModel
        pysurvival_model = LogNormalModel()

    elif 'simulation' in model_name.lower():

        from pysurvival.models.simulations import SimulationModel
        pysurvival_model = SimulationModel()

    elif 'coxph' in model_name.lower():

        if 'nonlinear' in model_name.lower():
            from pysurvival.models.semi_parametric import NonLinearCoxPHModel
            pysurvival_model = NonLinearCoxPHModel()

        else:
            from pysurvival.models.semi_parametric import CoxPHModel
            pysurvival_model = CoxPHModel()

    elif 'random' in model_name.lower() and 'survival' in model_name.lower():

        from pysurvival.models.survival_forest import RandomSurvivalForestModel
        pysurvival_model = RandomSurvivalForestModel()

    elif 'extra' in model_name.lower() and 'survival' in model_name.lower():

        from pysurvival.models.survival_forest import ExtraSurvivalTreesModel
        pysurvival_model = ExtraSurvivalTreesModel()

    elif 'condi' in model_name.lower() and 'survival' in model_name.lower():

        from pysurvival.models.survival_forest import ConditionalSurvivalForestModel
        pysurvival_model = ConditionalSurvivalForestModel()

    elif 'svm' in model_name.lower():

        if 'linear' in model_name.lower():

            from pysurvival.models.svm import LinearSVMModel
            pysurvival_model = LinearSVMModel()

        elif 'kernel' in model_name.lower():

            from pysurvival.models.svm import KernelSVMModel
            pysurvival_model = KernelSVMModel()

    else:
        raise NotImplementedError(
            '{} is not a valid pysurvival model.'.format(model_name))

    # Transferring the components
    pysurvival_model.__dict__.update(copy.deepcopy(base_model.__dict__))
    del base_model

    return pysurvival_model
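
# A minimal usage sketch (hypothetical path), assuming the file was written earlier
# with pysurvival's companion save_model utility:
#
#     from pysurvival.utils import save_model
#     save_model(fitted_model, 'models/csf_model.zip')
#
restored_model = load_model('models/csf_model.zip')
print(type(restored_model).__name__)  # e.g. ConditionalSurvivalForestModel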
Example #5
            concordance_index(model, table[variables], table['PFS_temp'],
                              table['disease_progress_temp']))
    brier_scores = brier_score(model,
                               table[variables],
                               table['PFS'],
                               table['disease_progress'],
                               t_max=84,
                               figure_size=(20, 6.5))
    return c_indexes, brier_scores


# In[ ]:

# In[36]:

csf = ConditionalSurvivalForestModel(num_trees=100)
csf.fit(train[features],
        train['PFS'],
        train['disease_progress'],
        max_features=1,
        max_depth=5,
        min_node_size=2)

c_index = concordance_index(csf, test[features], test['PFS'],
                            test['disease_progress'])
print('C-index: {:.2f}'.format(c_index))

ibs = integrated_brier_score(csf,
                             test[features],
                             test['PFS'],
                             test['disease_progress'],
                             t_max=84)  # closing argument assumed, mirroring the brier_score call above
Example #6
 def build_forest(self, num_trees=500):
     self.model = ConditionalSurvivalForestModel(num_trees=num_trees)
# Creating the X, T and E inputs
X_train, X_test = data_train[features], data_test[features]
T_train, T_test = data_train[time_column], data_test[time_column]
E_train, E_test = data_train[event_column], data_test[event_column]

# Let's now fit a Conditional Survival Forest model to the training set.
#
# Note: the hyper-parameters were chosen with a grid-search selection that is not shown in this tutorial.
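
# A rough sketch of what such a grid search could look like (an assumption, not
# part of the original tutorial): each candidate is scored with the concordance
# index on a held-out slice of the training set.
import itertools
from pysurvival.models.survival_forest import ConditionalSurvivalForestModel
from pysurvival.utils.metrics import concordance_index

split = int(0.8 * len(X_train))
best = None
for max_features, max_depth, min_node_size in itertools.product(
        ['sqrt', 'log2'], [3, 5, 7], [10, 20, 40]):
    candidate = ConditionalSurvivalForestModel(num_trees=200)
    candidate.fit(X_train.iloc[:split], T_train.iloc[:split], E_train.iloc[:split],
                  max_features=max_features, max_depth=max_depth,
                  min_node_size=min_node_size)
    score = concordance_index(candidate, X_train.iloc[split:],
                              T_train.iloc[split:], E_train.iloc[split:])
    if best is None or score > best[0]:
        best = (score, max_features, max_depth, min_node_size)

print('Best hyper-parameters (validation C-index {:.3f}): {}'.format(best[0], best[1:]))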

# In[ ]:

from pysurvival.models.survival_forest import ConditionalSurvivalForestModel

# Fitting the model
csf = ConditionalSurvivalForestModel(num_trees=200)
csf.fit(X_train,
        T_train,
        E_train,
        max_features='sqrt',
        max_depth=5,
        min_node_size=20,
        alpha=0.05,
        minprop=0.1)

# In[ ]:

# Computing variable importance
csf.variable_importance_table.head(5)

# In order to assess the model performance, we previously split the original dataset into training and testing sets, so that we can now compute its performance metrics on the testing set:
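
# A sketch of those evaluation metrics with pysurvival's utilities, reusing the
# X_test / T_test / E_test variables created above:
from pysurvival.utils.metrics import concordance_index, integrated_brier_score

c_index = concordance_index(csf, X_test, T_test, E_test)   # ranking quality
ibs = integrated_brier_score(csf, X_test, T_test, E_test)  # calibration over time
print('C-index: {:.2f}  IBS: {:.2f}'.format(c_index, ibs))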
Example #8
 def build_random_forest(self, num_trees=500):
     self.model = RandomSurvivalForestModel(num_trees=num_trees)
Example #9
 def build_extra_survival_trees(self, num_trees=500):
     self.model = ExtraSurvivalTreesModel(num_trees=num_trees)
Example #10
 def build_cox(self):
     self.model = CoxPHFitter(penalizer=0.01)
Example #11
 def build_multitask(self):
     self.model = LinearMultiTaskModel()
Example #12
 def build_logNormal(self):
     self.model = LogNormalAFTFitter(penalizer=0.01)
Example #13
 def build_weibullAFT(self):
     self.model = WeibullAFTFitter(penalizer=0.01)
Example #14
 def build_piecewise_exponential_regression(self):
     self.model = PiecewiseExponentialRegressionFitter(breakpoints=[1, 400],
                                                       penalizer=0.01)
Example #15
 def build_aalenAdditive(self):
     self.model = AalenAdditiveFitter(coef_penalizer=0.01,
                                      smoothing_penalizer=1000)
Example #16
# Imports assumed for this snippet (the lifelines and pysurvival classes used below);
# the project-specific `cindex` metric used in c_index() is assumed to be defined elsewhere.
import numpy as np
import pandas as pd
from lifelines import (AalenAdditiveFitter, CoxPHFitter, LogNormalAFTFitter,
                       PiecewiseExponentialRegressionFitter, WeibullAFTFitter)
from lifelines.utils import k_fold_cross_validation
from pysurvival.models.multi_task import LinearMultiTaskModel
from pysurvival.models.survival_forest import (ConditionalSurvivalForestModel,
                                               ExtraSurvivalTreesModel,
                                               RandomSurvivalForestModel)


class Model:
    def __init__(self, name):
        self.name = name

    # We define different models

    def build_aalenAdditive(self):
        self.model = AalenAdditiveFitter(coef_penalizer=0.01,
                                         smoothing_penalizer=1000)

    def build_piecewise_exponential_regression(self):
        self.model = PiecewiseExponentialRegressionFitter(breakpoints=[1, 400],
                                                          penalizer=0.01)

    def build_weibullAFT(self):
        self.model = WeibullAFTFitter(penalizer=0.01)

    def build_logNormal(self):
        self.model = LogNormalAFTFitter(penalizer=0.01)

    def build_cox(self):
        self.model = CoxPHFitter(penalizer=0.01)

    def build_multitask(self):
        self.model = LinearMultiTaskModel()

    def build_random_forest(self, num_trees=500):
        self.model = RandomSurvivalForestModel(num_trees=num_trees)

    def build_extra_survival_trees(self, num_trees=500):
        self.model = ExtraSurvivalTreesModel(num_trees=num_trees)

    def build_forest(self, num_trees=500):
        self.model = ConditionalSurvivalForestModel(num_trees=num_trees)

    def train(self, X, Y):  # the fit method depends on the model type
        if ('semi_parametric' in str(type(
                self.model))) or ('multi_task' in str(type(self.model))):
            self.model.fit(X=X,
                           T=Y['SurvivalTime'],
                           E=Y['Event'],
                           init_method='zeros',
                           num_epochs=500)

        if 'survival_forest' in str(type(self.model)):
            self.model.fit(X=X,
                           T=Y['SurvivalTime'],
                           E=Y['Event'],
                           max_features='all',
                           max_depth=20,
                           sample_size_pct=0.33)

        # Otherwise, a lifelines model is fitted with k-fold cross-validation
        if 'lifelines' in str(type(self.model)):
            k_fold_cross_validation(self.model,
                                    pd.concat([X, Y], axis=1),
                                    'SurvivalTime',
                                    event_col='Event',
                                    k=5)
            # self.model.fit(pd.concat([X, Y], axis=1), 'SurvivalTime', event_col='Event', show_progress=False)

    def predict_survival_function(self, X):
        # pysurvival models expose predict_survival(), lifelines fitters predict_survival_function()
        if 'pysurvival' in str(type(self.model)):
            return self.model.predict_survival(X)
        return self.model.predict_survival_function(X)

    def predict_expectation(self, X):
        # The expectation differs between packages: pysurvival models do not
        # provide predict_expectation, so we use predict_risk instead
        if 'pysurvival' in str(type(self.model)):
            return self.model.predict_risk(X)
        else:
            return self.model.predict_expectation(X)

    def c_index(self, X, Y):
        Y_prediction = self.predict_expectation(X)

        # Y_prediction = 2 * max(Y_prediction) - Y_prediction
        if 'pysurvival' in str(type(self.model)):
            # Invert the risk score so that a higher risk maps to a shorter pseudo survival time
            Y_prediction = 10 * max(Y_prediction) - Y_prediction
            Y_prediction = pd.DataFrame(Y_prediction,
                                        index=Y.index,
                                        columns=['SurvivalTime'])
        else:
            Y_prediction = pd.DataFrame(Y_prediction.values,
                                        index=Y.index,
                                        columns=['SurvivalTime'])

        Y_prediction['Event'] = np.nan  # Y['Event']
        return cindex(Y, Y_prediction)

    def predict_and_format(self, X, filename):
        Y_prediction = self.predict_expectation(X)

        if 'pysurvival' in str(type(self.model)):
            Y_prediction = 10 * max(Y_prediction) - Y_prediction
            Y_prediction = pd.DataFrame(Y_prediction,
                                        index=X.index,
                                        columns=['SurvivalTime'])
        else:
            Y_prediction = pd.DataFrame(Y_prediction.values,
                                        index=X.index,
                                        columns=['SurvivalTime'])

        Y_prediction['Event'] = np.nan  # Y['Event']
        # Y_prediction.to_csv(filename)
        return Y_prediction

    def fit_and_score(self, X, Y):
        # For each feature: build a model, train it on that feature only, and compute the c-index score
        scores = []
        for feature in X.columns.values:
            Xj = X[feature].values.reshape((len(X), 1))

            self.train(Xj, Y)
            scores.append(self.c_index(Xj, Y))

        scores = pd.Series(scores,
                           index=X.columns).sort_values(ascending=False)
        return scores
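
# A minimal usage sketch (hypothetical data): X_train / X_test are feature DataFrames
# and Y_train / Y_test are DataFrames with the 'SurvivalTime' and 'Event' columns
# that train() and c_index() expect.
model = Model('conditional_forest')
model.build_forest(num_trees=300)
model.train(X_train, Y_train)
print('Test C-index:', model.c_index(X_test, Y_test))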