Example #1
    def fit(self,
            X,
            T,
            E,
            init_method='glorot_normal',
            lr=1e-2,
            max_iter=100,
            l2_reg=1e-2,
            alpha=0.95,
            tol=1e-3,
            verbose=True):
        """
        Fit a proportional hazards regression model, using Efron's
        approximation method to handle tied event times.

        As the Hessian matrix of the log-likelihood can be 
        calculated without too much effort, the model parameters are 
        computed using the Newton-Raphson optimization scheme:
                W_new = W_old - lr*<Hessian^(-1), gradient>
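
        In NumPy terms, the update above can be sketched as follows 
        (illustrative only; `gradient` and `Hessian` denote the gradient and 
        Hessian of the regularized log-likelihood evaluated at `W_old`):

                step  = np.linalg.solve(Hessian, gradient)  # <Hessian^(-1), gradient>
                W_new = W_old - lr*step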
        
        Arguments:
        ---------
        * `X` : **array-like**, *shape=(n_samples, n_features)* --
            The input samples.

        * `T` : **array-like** -- 
            The target values describing when the event of interest or 
            censoring occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred 
            i.e.: E[i]=1 corresponds to an event, and E[i] = 0 means censoring, 
            for all i.

        * `init_method` : **str** *(default = 'glorot_normal')* -- 
            Initialization method to use. Here are the possible options:

            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer
            * `he_normal`: He normal initializer
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix
            
        * `lr`: **float** *(default=1e-2)* -- 
            Learning rate used in the optimization

        * `max_iter`: **int** *(default=100)* -- 
            The maximum number of iterations in the Newton optimization

        * `l2_reg`: **float** *(default=1e-2)* -- 
            L2 regularization parameter for the model coefficients

        * `alpha`: **float** *(default=0.95)* -- 
            Confidence level used to compute the confidence intervals 
            of the model coefficients

        * `tol`: **float** *(default=1e-3)* -- 
            Tolerance for stopping criteria

        * `verbose`: **bool** *(default=True)* -- 
            Whether or not to produce detailed logging about the modeling
 
        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.semi_parametric import CoxPHModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score
        #%pylab inline  # To use with Jupyter notebooks


        #### 2 - Generating the dataset from a Log-Logistic parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'log-logistic',  
                               risk_type = 'linear',
                               censored_parameter = 10.1, 
                               alpha = 0.1, beta=1.2 )

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features = 3)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values


        #### 4 - Creating an instance of the Cox PH model and fitting the data.
        # Building the model
        coxph = CoxPHModel()
        coxph.fit(X_train, T_train, E_train, lr=0.5, l2_reg=1e-2, 
            init_method='zeros')


        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(coxph, X_test, T_test, E_test) #0.92
        print('C-index: {:.2f}'.format(c_index))

        ibs = integrated_brier_score(coxph, X_test, T_test, E_test, t_max=10, 
                    figure_size=(20, 6.5) )

        References:
        -----------
        * https://en.wikipedia.org/wiki/Proportional_hazards_model#Tied_times
        * Efron, Bradley (1977). "The Efficiency of Cox's Likelihood 
          Function for Censored Data". Journal of the American Statistical 
          Association. 72 (359): 557-565. 
        """

        # Collecting the dataset dimensions and feature names
        N, self.num_vars = X.shape
        if isinstance(X, pd.DataFrame):
            self.variables = X.columns.tolist()
        else:
            self.variables = ['x_{}'.format(i) for i in range(self.num_vars)]

        # Checking the format of the data
        X, T, E = utils.check_data(X, T, E)
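
        # Sorting the data in decreasing order of T so that, for each sample, 
        # the risk set is a contiguous prefix of the arrays (a common 
        # convention for Cox partial-likelihood computations)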
        order = np.argsort(-T)
        T = T[order]
        E = E[order]
        X = self.scaler.fit_transform(X[order, :])
        self.std_scale = np.sqrt(self.scaler.var_)

        # Initializing the model
        self.model = _CoxPHModel()

        # Creating the time axis
        self.model.get_times(T, E)

        # Initializing the parameters
        W = np.zeros(self.num_vars)
        W = opt.initialization(init_method, W, False).flatten()
        W = W.astype(np.float64)

        # Optimizing to find best parameters
        epsilon = 1e-9
        self.model.newton_optimization(X, T, E, W, lr, l2_reg, tol, epsilon,
                                       max_iter, verbose)

        # Saving the Cython attributes in the Python object
        self.weights = np.array(self.model.W)
        self.loss = self.model.loss
        self.times = np.array(self.model.times)
        self.gradient = np.array(self.model.gradient)
        self.Hessian = np.array(self.model.Hessian)
        self.inv_Hessian = np.array(self.model.inv_Hessian)
        self.loss_values = np.array(self.model.loss_values)
        self.grad2_values = np.array(self.model.grad2_values)

        # Computing baseline functions
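        # (the per-sample partial hazards exp(<X, W>) are used to estimate the 
        # baseline hazard and baseline survival at the observed event times)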
        score = np.exp(np.dot(X, self.weights))
        baselines = _baseline_functions(score, T, E)

        # Saving the Cython attributes in the Python object
        self.baseline_hazard = np.array(baselines[1])
        self.baseline_survival = np.array(baselines[2])
        del self.model
        self.get_time_buckets()

        # Calculating summary
        self.get_summary(alpha)

        return self
Example #2
    def fit(self,
            X,
            T,
            E,
            init_method='glorot_uniform',
            optimizer='adam',
            lr=1e-4,
            num_epochs=1000,
            dropout=0.2,
            batch_normalization=False,
            bn_and_dropout=False,
            l2_reg=1e-5,
            verbose=True):
        """ 
        Fit the estimator based on the given parameters.

        Parameters:
        -----------
        * `X` : **array-like**, *shape=(n_samples, n_features)* --
            The input samples.

        * `T` : **array-like** -- 
            The target values describing when the event of interest or censoring
            occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred i.e.: 
            E[i]=1 corresponds to an event, and E[i] = 0 means censoring, 
            for all i.

        * `init_method` : **str** *(default = 'glorot_uniform')* -- 
            Initialization method to use. Here are the possible options:

            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer 
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer
            * `he_normal`: He normal initializer
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix

        * `optimizer`:  **str** *(default = 'adam')* -- 
            Iterative method for optimizing a differentiable objective function.
            Here are the possible options:

            - `adadelta`
            - `adagrad`
            - `adam`
            - `adamax`
            - `rmsprop`
            - `sparseadam`
            - `sgd`

        * `lr`: **float** *(default=1e-4)* -- 
            Learning rate used in the optimization

        * `num_epochs`: **int** *(default=1000)* -- 
            The number of iterations in the optimization

        * `dropout`: **float** *(default=0.2)* -- 
            Randomly sets a fraction of the input units to 0 at each update 
            during training, which helps prevent overfitting.

        * `l2_reg`: **float** *(default=1e-5)* -- 
            L2 regularization parameter for the model coefficients

        * `batch_normalization`: **bool** *(default=False)* -- 
            Whether or not to apply Batch Normalization

        * `bn_and_dropout`: **bool** *(default=False)* -- 
            Whether or not to apply Batch Normalization and Dropout at the same time

        * `verbose`: **bool** *(default=True)* -- 
            Whether or not to produce detailed logging about the modeling
                

        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.semi_parametric import NonLinearCoxPHModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score
        #%matplotlib inline  # To use with Jupyter notebooks

        #### 2 - Generating the dataset from a nonlinear Weibull parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'weibull',  
                               risk_type = 'Gaussian',
                               censored_parameter = 2.1, 
                               alpha = 0.1, beta=3.2 )

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features=3)

        # Showing a few data-points 
        dataset.head(2)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values


        #### 4 - Creating an instance of the NonLinear CoxPH model and fitting 
        # the data.

        # Defining the MLP structure. Here we will build a network with one 
        # hidden layer of 150 units and `BentIdentity` as its activation function
        structure = [ {'activation': 'BentIdentity', 'num_units': 150},  ]

        # Building the model
        nonlinear_coxph = NonLinearCoxPHModel(structure=structure) 
        nonlinear_coxph.fit(X_train, T_train, E_train, lr=1e-3, 
            init_method='xav_uniform')


        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
        print('C-index: {:.2f}'.format(c_index))

        ibs = integrated_brier_score(nonlinear_coxph, X_test, T_test, E_test, 
            t_max=10, figure_size=(20, 6.5) )

        """

        # Checking data format (i.e.: transforming into numpy array)
        X, T, E = utils.check_data(X, T, E)

        # Extracting data parameters
        N, self.num_vars = X.shape
        input_shape = self.num_vars

        # Scaling data
        if self.auto_scaler:
            X_original = self.scaler.fit_transform(X)
        else:
            # Keeping the raw inputs when automatic scaling is disabled
            X_original = X

        # Sorting X, T, E in descending order according to T
        order = np.argsort(-T)
        T = T[order]
        E = E[order]
        X_original = X_original[order, :]
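
        # The distinct event times define the model's time axis and time buckets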
        self.times = np.unique(T[E.astype(bool)])
        self.nb_times = len(self.times)
        self.get_time_buckets()

        # Initializing the model
        model = nn.NeuralNet(input_shape, 1, self.structure, init_method,
                             dropout, batch_normalization, bn_and_dropout)

        # Converting the input data into a tensor
        X = torch.cuda.FloatTensor(X_original)

        # Computing the Risk and Fail tensors
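        # (as the name `risk_fail_matrix` suggests, these appear to be indicator 
        # matrices linking each distinct event time to the samples still at risk 
        # and to the samples that failed at that time)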
        Risk, Fail = self.risk_fail_matrix(T, E)
        Risk = torch.cuda.FloatTensor(Risk)
        Fail = torch.cuda.FloatTensor(Fail)

        # Computing Efron's matrices
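        # (these appear to be precomputed coefficient masks that let Efron's 
        # tie correction be applied with dense tensor operations)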
        Efron_coef, Efron_one, Efron_anti_one = self.efron_matrix()
        Efron_coef = torch.cuda.FloatTensor(Efron_coef)
        Efron_one = torch.cuda.FloatTensor(Efron_one)
        Efron_anti_one = torch.cuda.FloatTensor(Efron_anti_one)

        # Performing first-order optimization
        model, loss_values = opt.optimize(self.loss_function,
                                          model,
                                          optimizer,
                                          lr,
                                          num_epochs,
                                          verbose,
                                          X=X,
                                          Risk=Risk,
                                          Fail=Fail,
                                          Efron_coef=Efron_coef,
                                          Efron_one=Efron_one,
                                          Efron_anti_one=Efron_anti_one,
                                          l2_reg=l2_reg)

        # Saving attributes
        self.model = model.eval()
        self.loss_values = loss_values

        # Computing baseline functions
        x = torch.cuda.FloatTensor(X_original)

        # Calculating the risk scores
        score = np.exp(self.model(x).data.cpu().numpy().flatten())
        baselines = _baseline_functions(score, T, E)

        # Saving the Cython attributes in the Python object
        self.times = np.array(baselines[0])
        self.baseline_hazard = np.array(baselines[1])
        self.baseline_survival = np.array(baselines[2])

        return self