def Aalen_model(df, l2=0.01, coeff_pen=0.1, smooth_pen=0.1):
    '''Fit an Aalen additive hazards regression to ``df`` and return the fitter.

    The regression model is

        hazard(t) = b_0(t) + b_1(t)*x_1 + ... + b_N(t)*x_N

    i.e., the hazard rate is a linear function of the covariates.

    Parameters
    df: Pandas dataframe. Must contain a column called "Total_years" (used as
        the duration column) and a column called "censored" (handed to the
        fitter as ``event_col``); True/False or 1/0 values.
        NOTE(review): the column is named "censored" but is passed as the
        event column -- confirm its polarity matches what the fitter expects.
    l2: Accepted for backward compatibility; this function never reads it.
    coeff_pen = 0.1: Forwarded as ``coef_penalizer``. An L2 penalty on the
        size of the coefficients, intended to stabilise the estimates when
        covariates are highly correlated. Recommended, even if small.
    smooth_pen = 0.1: Forwarded as ``smoothing_penalizer``. An L2 penalty on
        the difference between coefficients at adjacent time points.

    Fixed, non-adjustable settings:
    fit_intercept = False. Add a column of 1 to ``df`` to model the baseline
        hazard.
    nn_cumulative_hazard = True. Negative hazard values are forced to zero.

    Output: the AalenAdditiveFitter instance after fitting on df.'''
    fitter_options = {
        'fit_intercept': False,
        'coef_penalizer': coeff_pen,
        'smoothing_penalizer': smooth_pen,
        'nn_cumulative_hazard': True,
    }
    fitted_model = AalenAdditiveFitter(**fitter_options)
    fitted_model.fit(df, 'Total_years', event_col='censored')
    return fitted_model
示例#2
0
 def __init__(self,
              predictive_relationship,
              ranks,
              max_iter=200,
              init='kmeans',
              tol=1e-3,
              stopping=None,
              n_jobs=1,
              verbose=0,
              random_state=None,
              coef_penalizer=1.0,
              fit_intercept=True):
     """Store the predictive relationship and run both initialisers.

     ``predictive_relationship`` is kept on the instance unchanged.
     ``coef_penalizer`` and ``fit_intercept`` are handed to
     ``AalenAdditiveFitter.__init__``; ``ranks``, ``max_iter``, ``init``,
     ``tol``, ``stopping``, ``n_jobs``, ``verbose`` and ``random_state``
     are handed to ``DFMF.__init__``.
     """
     self.predictive_relationship = predictive_relationship

     # Each initialiser is called explicitly so that it only receives the
     # keyword arguments that belong to it.
     survival_options = dict(coef_penalizer=coef_penalizer,
                             fit_intercept=fit_intercept)
     AalenAdditiveFitter.__init__(self, **survival_options)

     fusion_options = dict(ranks=ranks,
                           max_iter=max_iter,
                           init=init,
                           tol=tol,
                           stopping=stopping,
                           n_jobs=n_jobs,
                           verbose=verbose,
                           random_state=random_state)
     DFMF.__init__(self, **fusion_options)
示例#3
0
    def test_aalen_additive_fit_with_censor(self, block):
        """Visual test: overlay fitted and generating cumulative hazards.

        Simulates hazard rates and lifetimes, marks events with a
        Bernoulli(0.99) draw, fits an AalenAdditiveFitter and draws one
        subplot per coefficient column over the time range 0..15.
        """
        n_subjects = 2500
        n_covariates = 6
        times = np.linspace(0, 70, 10000)
        hz, coef, X = generate_hazard_rates(n_subjects, n_covariates, times)
        X.columns = coef.columns
        integrated = cumulative_integral(coef.values, times)
        cumulative_hazards = pd.DataFrame(integrated,
                                          index=times,
                                          columns=coef.columns)

        lifetimes = generate_random_lifetimes(hz, times)
        # Infinite simulated lifetimes are replaced by the value 10.
        lifetimes[np.isinf(lifetimes)] = 10
        X["T"] = lifetimes
        X["E"] = np.random.binomial(1, 0.99, n_subjects)

        fitter = AalenAdditiveFitter()
        fitter.fit(X, "T", "E")

        n_panels = n_covariates + 1
        for panel in range(n_panels):
            ax = self.plt.subplot(n_panels, 1, panel + 1)
            col = cumulative_hazards.columns[panel]
            # Generating curve first, fitted estimate on the same axes.
            ax = cumulative_hazards[col].loc[:15].plot(ax=ax)
            ax = fitter.plot(loc=slice(0, 15), ax=ax, columns=[col])

        self.plt.title("test_aalen_additive_fit_with_censor")
        self.plt.show(block=block)
    def fit(self, X, y, **fit_params):
        """Fit an AalenAdditiveFitter on ``X`` joined with the targets in ``y``.

        The duration column (and the event column, when ``self.event_col`` is
        set) is copied from ``y`` into a copy of ``X``. The fitted model is
        stored on ``self.estimator`` and ``self`` is returned. Extra keyword
        arguments are forwarded to the fitter's ``fit``.
        """
        duration_name = self.duration_column
        event_name = self.event_col

        # Work on a copy so the caller's frame is not modified.
        frame = X.copy()
        frame[duration_name] = y[duration_name]
        if event_name is not None:
            frame[event_name] = y[event_name]

        # NOTE(review): every entry of get_params() is passed to the
        # constructor -- confirm they are all accepted by AalenAdditiveFitter.
        model = AalenAdditiveFitter(**self.get_params())
        model.fit(frame,
                  duration_col=duration_name,
                  event_col=event_name,
                  timeline=self.timeline,
                  id_col=self.id_col,
                  **fit_params)
        self.estimator = model
        return self
    def fit(self, X, y, **fit_params):
        """Train an AalenAdditiveFitter from features ``X`` and targets ``y``.

        ``y`` supplies the duration column and, when ``self.event_col`` is
        not None, the event column; both are written into a copy of ``X``.
        Additional keyword arguments go straight to the fitter's ``fit``.
        Returns ``self`` with the fitted model available as
        ``self.estimator``.
        """
        training_frame = X.copy()  # leave the caller's X untouched

        target_columns = [self.duration_column]
        if self.event_col is not None:
            target_columns.append(self.event_col)
        for column in target_columns:
            training_frame[column] = y[column]

        # NOTE(review): get_params() is forwarded wholesale to the
        # constructor -- confirm it only yields fitter arguments.
        constructor_args = self.get_params()
        fitted = AalenAdditiveFitter(**constructor_args)
        fitted.fit(training_frame,
                   duration_col=self.duration_column,
                   event_col=self.event_col,
                   timeline=self.timeline,
                   id_col=self.id_col,
                   **fit_params)
        self.estimator = fitted
        return self
示例#6
0
    def score_model(self):
        """Cross-validate an Aalen additive model and print the fold scores.

        Loads the data with ``self.sample_size`` temporarily set to 100000,
        builds a design matrix from the fixed model specification, runs
        5-fold cross validation with 'SRV_TIME_MON' as the duration column
        and 'CENSORED' as the event column, then prints the scores, their
        mean and their standard deviation. Returns None.
        """
        # Override the sample size only while loading. Restoring it in
        # ``finally`` keeps the instance consistent even if loading raises.
        # (Presumably load_and_clean_data reads self.sample_size -- confirm.)
        previous_sample_size = self.sample_size
        self.sample_size = 100000
        try:
            df, _ = self.load_and_clean_data()
        finally:
            self.sample_size = previous_sample_size

        # create the model
        aaf = AalenAdditiveFitter()

        # define fields for the model
        modelspec = 'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
        X = pt.dmatrix(modelspec, df, return_type='dataframe')
        X = X.join(df[['SRV_TIME_MON', 'CENSORED']])

        # NOTE(review): 'CENSORED' is used as the event column -- confirm the
        # flag has the polarity the fitter expects.
        scores = k_fold_cross_validation(aaf,
                                         X,
                                         'SRV_TIME_MON',
                                         event_col='CENSORED',
                                         k=5)
        print('\nCross Validation Scores: ')
        print(scores)
        print('Score Mean: {0:.4}'.format(np.mean(scores)))
        print('Score SD  : {0:.4}'.format(np.std(scores)))

        return
示例#7
0
    def fit(self, X, y, **fit_params):
        """Fit an AalenAdditiveFitter configured from this object's settings.

        The duration column (and the event column when ``self.event_col`` is
        not None) is copied from ``y`` into a copy of ``X``. The fitter is
        built from ``fit_intercept``, ``alpha``, ``coef_penalizer`` and
        ``smoothing_penalizer`` and fitted with this object's ``timeline``
        and ``id_col``; extra keyword arguments are forwarded to its ``fit``.
        Stores the result on ``self.estimator`` and returns ``self``.
        """
        frame = X.copy()  # the caller's X is left unmodified
        frame[self.duration_column] = y[self.duration_column]
        if self.event_col is not None:
            frame[self.event_col] = y[self.event_col]

        fitter_settings = dict(fit_intercept=self.fit_intercept,
                               alpha=self.alpha,
                               coef_penalizer=self.coef_penalizer,
                               smoothing_penalizer=self.smoothing_penalizer)
        fitted = AalenAdditiveFitter(**fitter_settings)
        fitted.fit(frame,
                   duration_col=self.duration_column,
                   event_col=self.event_col,
                   timeline=self.timeline,
                   id_col=self.id_col,
                   **fit_params)
        self.estimator = fitted
        return self
示例#8
0
    def prepare_model(self):
        """Load the cleaned data and return an AalenAdditiveFitter fitted to it.

        The design matrix is built from a fixed model specification and
        joined with the 'SRV_TIME_MON' (duration) and 'CENSORED' (event)
        columns. A progress message is printed when ``self.verbose`` is set.
        """
        # Only the data frame is needed; the second returned value is unused.
        df, _ = self.load_and_clean_data()

        fitter = AalenAdditiveFitter()

        modelspec = 'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
        design = pt.dmatrix(modelspec, df, return_type='dataframe')
        survival_columns = df[['SRV_TIME_MON', 'CENSORED']]
        design = design.join(survival_columns)

        if self.verbose:
            print('Creating Aalen Additive Model')

        fitter.fit(design, 'SRV_TIME_MON', 'CENSORED')
        return fitter
    def prepare_model(self):
        """Build and fit the Aalen additive model on the cleaned data.

        Returns the fitted AalenAdditiveFitter. The covariates come from a
        fixed model specification; 'SRV_TIME_MON' is the duration column and
        'CENSORED' is passed as the event column. When ``self.verbose`` is
        set, a message is printed before fitting.
        """
        # The second element returned by the loader is not used here.
        cleaned, _ = self.load_and_clean_data()

        model = AalenAdditiveFitter()

        modelspec = 'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
        features = pt.dmatrix(modelspec, cleaned, return_type='dataframe')
        features = features.join(cleaned[['SRV_TIME_MON', 'CENSORED']])

        if self.verbose:
            print('Creating Aalen Additive Model')

        model.fit(features, 'SRV_TIME_MON', 'CENSORED')
        return model
示例#10
0
    def test_aalen_additive_smoothed_plot(self, block):
        """Visual test: plot the smoothed hazards of a fit on simulated data.

        All but the last 500 rows of the smoothed hazards are drawn.
        """
        n_subjects = 2500
        n_covariates = 3
        times = np.linspace(0, 150, 5000)
        hz, coef, X = generate_hazard_rates(n_subjects, n_covariates, times)

        # Simulated lifetimes plus a small uniform jitter, then the event
        # flags; the random draws happen in this order.
        lifetimes = generate_random_lifetimes(hz, times)
        lifetimes = lifetimes + 0.1 * np.random.uniform(size=(n_subjects, 1))
        events = np.random.binomial(1, 0.8, size=n_subjects)
        X["T"] = lifetimes
        X["E"] = events

        # No intercept is fitted: per the original note, X already carries a
        # column of ones.
        fitter = AalenAdditiveFitter(coef_penalizer=0.1, fit_intercept=False)
        fitter.fit(X, "T", "E")

        smoothed = fitter.smoothed_hazards_(1)
        last_row = fitter.cumulative_hazards_.shape[0] - 500
        ax = smoothed.iloc[0:last_row].plot()
        ax.set_xlabel("time")
        ax.set_title("test_aalen_additive_smoothed_plot")
        self.plt.show(block=block)
示例#11
0
    def fit(self, R, y=None, Thetas=None):
        """
        Run the data fusion factorisation and, when ``y`` is given, fit an
        Aalen additive model on the resulting latent factors.

        R: dict
            Relations between types (forwarded to ``DFMF.fit``).

        y: array-like, dimensions (n x 1) or (n x 2), optional
            The first column is the time to predict.
            The second column is optional and is the event we are predicting.
            When omitted, no regression is fitted and ``regression_`` is
            set to False.

        Thetas: dict, optional
            Forwarded to ``DFMF.fit``. A new empty dict is used when omitted.

        Returns ``self``.
        """
        # The previous ``Thetas=dict()`` default was one dict object shared
        # by every call that omitted the argument; build a fresh one instead.
        if Thetas is None:
            Thetas = dict()

        self.random_state = check_random_state(self.random_state)
        if self.verbose:
            print("Fitting data fusion procedure")
        DFMF.fit(self, R, Thetas)
        if y is not None:
            y = self._check_y(y)
            self.outputs_ = y

        # Covariates for the regression: one column per latent factor of the
        # second type in the predictive relationship.
        t1, t2 = self.predictive_relationship
        factors = ["Factor " + str(i) for i in range(1, self.ranks[t2] + 1)]
        x = self.G_[t1, t1].dot(self.S_[t1, t2][0])
        X = pd.DataFrame(x, columns=factors)

        if y is not None:
            self.regression_ = True
            if self.verbose:
                print("Fitting Aalen additive model")
            if y.shape[1] == 2:
                # Durations and event indicators were both supplied.
                X['T'] = y[:, 0]
                X['E'] = y[:, 1]
                AalenAdditiveFitter.fit(self, X, 'T', event_col='E')
            else:
                # Durations only: no event column is passed to the fitter.
                X['T'] = y
                AalenAdditiveFitter.fit(self, X, 'T')
            self._fit_Kaplan_Meier()
        else:
            self.regression_ = False

        return self
示例#12
0
    def test_aalen_additive_plot(self, block):
        """Visual test: plot the fitted cumulative hazards on simulated data.

        Every subject is marked as an observed event. All but the last 100
        rows of the cumulative hazards are drawn.
        """
        n_subjects = 2500
        n_covariates = 3
        times = np.linspace(0, 70, 10000)
        hz, coef, X = generate_hazard_rates(n_subjects, n_covariates, times)

        lifetimes = generate_random_lifetimes(hz, times)
        # Infinite simulated lifetimes are replaced by the value 10.
        lifetimes[np.isinf(lifetimes)] = 10
        events = np.random.binomial(1, 1.0, size=n_subjects)
        X["T"] = lifetimes
        X["E"] = events

        # No intercept is fitted: per the original note, X already carries a
        # column of ones.
        fitter = AalenAdditiveFitter(coef_penalizer=0.1, fit_intercept=False)
        fitter.fit(X, "T", "E")

        last_row = fitter.cumulative_hazards_.shape[0] - 100
        ax = fitter.plot(iloc=slice(0, last_row))
        ax.set_xlabel("time")
        ax.set_title("test_aalen_additive_plot")
        self.plt.show(block=block)
def Aalen_model(df, l2 = 0.01, coeff_pen = 0.1, smooth_pen = 0.1):
    '''Create an Aalen Additive Fitter, fit it to ``df`` and return it.

    The fitted regression model is

        hazard(t) = b_0(t) + b_1(t)*x_1 + ... + b_N(t)*x_N

    i.e., the hazard rate is a linear function of the covariates.

    Parameters
    df: Pandas dataframe. The duration column must be called "Total_years".
        A column called "censored" (True/False or 1/0) is passed to the
        fitter as ``event_col``.
        NOTE(review): a column named "censored" is used as the event
        column -- confirm the flag's polarity is what the fitter expects.
    l2: Kept for backward compatibility; not used anywhere in this function.
    coeff_pen = 0.1: Passed as ``coef_penalizer``: an L2 penalty on the size
        of the coefficients, meant to improve the stability of the estimates
        when covariates are highly correlated. Recommended, even if small.
    smooth_pen = 0.1: Passed as ``smoothing_penalizer``: an L2 penalty on the
        difference between coefficients at adjacent time points.

    Other built-in, non-adjustable settings:
    fit_intercept = False. Add a column of 1 to model the baseline hazard.
    nn_cumulative_hazard = True. Forces negative hazard values to zero.

    Output: aaf instance fitted to df'''
    settings = dict(fit_intercept=False,
                    coef_penalizer=coeff_pen,
                    smoothing_penalizer=smooth_pen,
                    nn_cumulative_hazard=True)
    model = AalenAdditiveFitter(**settings)
    model.fit(df, 'Total_years', event_col='censored')
    return model
示例#14
0
    def test_aaf_panel_dataset(self, block):
        """Visual test: fit on the panel test dataset and show the plot."""
        panel = load_panel_test()

        fitter = AalenAdditiveFitter()
        # Rows are grouped per subject through the "id" column.
        fitter.fit(panel, duration_col="t", event_col="E", id_col="id")
        fitter.plot()

        self.plt.title("test_aaf_panel_dataset")
        self.plt.show(block=block)
示例#15
0
    def predict(self, R, Thetas=None, _type='cumulative_hazards', **kwargs):
        """
        Predict from the fitted regression for the data described by ``R``.

        Assuming that the type to refit is the first type of
         predictive_relationship

        R, Thetas: forwarded to ``_modify_test_data`` to build the covariate
            matrix. ``Thetas`` defaults to a new empty dict.
        _type: which prediction to return, one of 'cumulative_hazards'
            (uses optional keyword ``id_col``), 'survival_function',
            'percentile' (uses optional keyword ``p``, default 0), 'median'
            or 'expectation'.

        Raises Exception when no regression was fitted, and ValueError for
        an unknown ``_type``.
        """
        # The previous ``Thetas=dict()`` default was one dict object shared
        # by every call that omitted the argument; build a fresh one instead.
        if Thetas is None:
            Thetas = dict()

        if not self.regression_:
            raise Exception("No regression was fitted on the traning")

        X = self._modify_test_data(R, Thetas)

        if _type == 'cumulative_hazards':
            return AalenAdditiveFitter.predict_cumulative_hazard(
                self, X, id_col=kwargs.get('id_col', None))
        if _type == 'survival_function':
            return AalenAdditiveFitter.predict_survival_function(self, X)
        if _type == 'percentile':
            return AalenAdditiveFitter.predict_percentile(
                self, X, kwargs.get('p', 0))
        if _type == 'median':
            return AalenAdditiveFitter.predict_median(self, X)
        if _type == 'expectation':
            return AalenAdditiveFitter.predict_expectation(self, X)
        raise ValueError("Not avaialble type of prediction")
示例#16
0
    def run_survival_curve(self, df):
        ''' Used for testing only: fit on ``df``, then plot the survival
        function and print the expectation for one hand-written row.'''

        fitter = AalenAdditiveFitter()

        modelspec = 'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
        design = pt.dmatrix(modelspec, df, return_type='dataframe')
        design = design.join(df[['SRV_TIME_MON', 'CENSORED']])
        fitter.fit(design, 'SRV_TIME_MON', 'CENSORED')

        # INSERT VALUES TO TEST HERE
        # NOTE(review): the row must line up with the design-matrix columns
        # used for fitting -- confirm the number and order of values.
        test = np.array([[1., 1961., 52., 0, 0., 2., 1., 0., 4., 2.]])

        survival = fitter.predict_survival_function(test)
        survival.plot()
        plt.show()

        expectation = fitter.predict_expectation(test)
        print(expectation)
    def run_survival_curve(self, df):
        ''' Used for testing only: fits the model on ``df``, plots the
        predicted survival function for one manually entered row and prints
        that row's predicted expectation.'''

        model = AalenAdditiveFitter()

        modelspec = 'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
        features = pt.dmatrix(modelspec, df, return_type='dataframe')
        outcome = df[['SRV_TIME_MON', 'CENSORED']]
        features = features.join(outcome)
        model.fit(features, 'SRV_TIME_MON', 'CENSORED')

        # INSERT VALUES TO TEST HERE
        # NOTE(review): the values must match the fitted design-matrix
        # columns in number and order -- confirm before relying on the plot.
        test = np.array([[1., 1961., 52., 0, 0., 2., 1., 0., 4., 2.]])

        curve = model.predict_survival_function(test)
        curve.plot()
        plt.show()

        predicted = model.predict_expectation(test)
        print(predicted)
示例#18
0
# NOTE(review): other lifelines examples import `load_regression_dataset`;
# confirm that `generate_regression_dataset` exists in the installed version.
from lifelines.datasets import generate_regression_dataset
regression_dataset = generate_regression_dataset()
from lifelines import AalenAdditiveFitter, CoxPHFitter

# Cox proportional hazards fit on the same data.
cf = CoxPHFitter()
cf.fit(regression_dataset, duration_col='T', event_col='E')

# Aalen additive fit without a fitted intercept.
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')

# Covariates only. `columns - [...]` and `.ix` are no longer available in
# pandas, so the duration/event columns are dropped explicitly and the rows
# are picked by position. (`.ix[10:12]` may have been a label slice that also
# included row 12; positional slicing selects rows 10 and 11.)
x = regression_dataset.drop(['E', 'T'], axis=1)
aaf.predict_survival_function(x.iloc[10:12]).plot()
aaf.plot()
示例#19
0
n=200, number of events=189

       coef  exp(coef)  se(coef)      z      p  lower 0.95  upper 0.95
var1 0.2213     1.2477    0.0743 2.9796 0.0029      0.0757      0.3669  **
var2 0.0509     1.0522    0.0829 0.6139 0.5393     -0.1116      0.2134
var3 0.2186     1.2443    0.0758 2.8836 0.0039      0.0700      0.3672  **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Concordance = 0.580
"""

# Plot the Cox model (`cph` is fitted above this excerpt).
cph.plot()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')
aaf.plot()

# Covariates only: the event and duration columns are dropped before predicting.
X = regression_dataset.drop(['E', 'T'], axis=1)
aaf.predict_survival_function(
    X.iloc[10:12]).plot()  # get the unique survival functions of two subjects

# 10-fold cross validation of the Cox model; print the per-fold scores, then
# their mean and standard deviation.
scores = k_fold_cross_validation(cph,
                                 regression_dataset,
                                 duration_col='T',
                                 event_col='E',
                                 k=10)
print(scores)
print(np.mean(scores))
print(np.std(scores))
示例#20
0
# Preview of the dataset (`regression_dataset` is defined above this excerpt).
regression_dataset.head()

from lifelines import AalenAdditiveFitter, CoxPHFitter

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only. `columns - [...]` and `.ix` are no longer available in
# pandas, so the duration/event columns are dropped explicitly and the rows
# are picked by position. (`.ix[10:12]` may have been a label slice that also
# included row 12; positional slicing selects rows 10 and 11.)
x = regression_dataset.drop(['E', 'T'], axis=1)
aaf.predict_survival_function(x.iloc[10:12]).plot()  # survival functions of the two subjects at positions 10 and 11

aaf.plot()

# Drop rows whose duration is zero (`df` is loaded above this excerpt).
df = df[df['Duration'] != 0]

# Categorical covariates used for the model; one-hot encoded below.
df2 = df.loc[:, [
    'DISTRIBUTION CHANNEL', 'GENDER', 'SMOKER STATUS', 'PremiumPattern',
    'BENEFITS TYPE', 'BROKER COMM'
]]

#df2 = df.loc[:, ['GENDER', 'SMOKER STATUS', 'PremiumPattern']]

#df2 = df.loc[:, ['SMOKER STATUS', 'GENDER']]

df2 = pd.get_dummies(df2)

#T = df['Duration']

# Event flag: True exactly where LapseIndicator equals 1.
E = df['LapseIndicator'] == 1

df2['E'] = E
# NOTE(review): `T` is not assigned anywhere in this excerpt (the assignment
# above is commented out) -- confirm it is defined earlier in the script.
df2['T'] = T

aaf = AalenAdditiveFitter()
aaf.fit(df2, 'T', event_col='E', show_progress=True)
# Use a context manager so the pickle file is flushed and closed.
with open('Smoker_Gender_All.pkl', 'wb') as model_file:
    pickle.dump(aaf, model_file)
aaf.plot()

#cph = CoxPHFitter()
#cph.fit(df2, duration_col='T', event_col='E', show_progress=True, strata=['SMOKER STATUS_No','SMOKER STATUS_Yes',
#                                                                          'GENDER_F', 'GENDER_M'])
#pickle.dump(cph, open('Smoker_Gender_CPF.pkl', 'wb'))
#cph.plot()
示例#22
0
 def __init__(self, penalizer=0, include_recency=False):
     """Initialise the parent, then create the Aalen additive fitter.

     ``include_recency`` is forwarded to the parent initialiser and
     ``penalizer`` becomes the fitter's ``coef_penalizer``. The fitter is
     stored on ``self.cf``.
     """
     super().__init__(include_recency=include_recency)
     fitter = AalenAdditiveFitter(coef_penalizer=penalizer)
     self.cf = fitter
示例#23
0
def aalen_aditive(in_df):
    """Fit an Aalen additive model to ``in_df`` and return the fitted fitter.

    ``in_df`` must not contain missing values (checked with an assertion).
    'LivingDays' is used as the duration column and 'Dead' as the event
    column; no intercept is fitted.

    Returns the fitted AalenAdditiveFitter. Previously the model was
    discarded and None was returned.
    """
    assert not in_df.isnull().values.any(), "in_df must not contain missing values"
    aaf = AalenAdditiveFitter(fit_intercept=False)
    aaf.fit(in_df, 'LivingDays', event_col='Dead')
    return aaf
示例#24
0
    # Tail of a block whose header is above this excerpt; `aft`, `times` and
    # `name` are defined there.
    # Fit on `times` with 'time' as duration and 'success' as event column.
    aft.fit(times, duration_col='time', event_col='success')
    aft.print_summary(3)

    #aft = WeibullAFTFitter().fit(times, 'time', 'success', ancillary_df=True)
    # Save the plot under "<name>aft", remember the fitter and cross-validate it.
    save(name + 'aft', aft.plot())
    fitters[name] = aft
    crossValidate(name, aft)
    print("END " + name)

print('EXAMPLE DATA FOLLOWS')
from lifelines import AalenAdditiveFitter, CoxPHFitter
from lifelines.datasets import load_regression_dataset
from lifelines.utils import k_fold_cross_validation
import numpy as np

df = load_regression_dataset()

# The three models to compare: two Aalen additive fitters with different
# coefficient penalties and one Cox proportional hazards fitter.
aaf_1 = AalenAdditiveFitter(coef_penalizer=0.5)
aaf_2 = AalenAdditiveFitter(coef_penalizer=10)
cph = CoxPHFitter()

# Print the mean cross-validation score of each model: Cox first, then the
# two Aalen fitters.
for candidate in (cph, aaf_1, aaf_2):
    fold_scores = k_fold_cross_validation(candidate,
                                          df,
                                          duration_col='T',
                                          event_col='E')
    print(np.mean(fold_scores))
    # Tail of a loop whose header is above this excerpt; `naf`, `ax` and
    # `state_name` are defined there.
    sns.set()
    naf.plot(ax=ax, legend=False)
    plt.title(state_name)
    plt.xlim(0, 80)
# After the loop: tidy the layout, write the figure to a PDF, then show it.
plt.tight_layout()
# NOTE(review): absolute, user-specific output path.
plt.savefig(
    '/home/raed/Dropbox/INSE - 6320/Final Project/Cumulative_Hazard_for_each_State.pdf'
)
plt.show()

#Survival Regression using the following covariates : Couple Race, Income Range, State and Marriage Date
# The trailing "-1" removes the intercept from the patsy design matrix; the
# fitter below is created with fit_intercept=True instead.
X = patsy.dmatrix(
    'State + Couple_Race + Household_Income_Range + Husband_Education + Husband_Race + Marriage_Date -1',
    data,
    return_type='dataframe')
aaf = AalenAdditiveFitter(coef_penalizer=1.0, fit_intercept=True)
# Duration and event columns are taken from the source data.
X['T'] = data['Duration']
X['E'] = data['Divorce']
aaf.fit(X, 'T', event_col='E')

aaf.cumulative_hazards_.head()
sns.set()
# Plot a few selected coefficient curves ('baseline' is presumably the
# intercept added by fit_intercept=True -- confirm).
# NOTE(review): other calls in this file select rows with `loc=`/`iloc=`;
# confirm the installed lifelines version still accepts `ix=`.
aaf.plot(columns=[
    'State[Alabama]', 'baseline', 'Couple_Race[T.Same-Race]',
    'Household_Income_Range[T.42,830$ - 44,765$]'
],
         ix=slice(1, 15))
# NOTE(review): absolute, user-specific output path.
plt.savefig(
    '/home/raed/Dropbox/INSE - 6320/Final Project/Survival_Regression_for_Alabamae.pdf'
)
plt.show()
示例#26
0
# -*- coding: utf-8 -*-
# aalen additive

if __name__ == "__main__":
    import pandas as pd
    import time

    from lifelines import AalenAdditiveFitter
    from lifelines.datasets import load_rossi

    # Timing script: fit on the Rossi dataset stacked five times.
    rossi = load_rossi()
    df = pd.concat([rossi for _ in range(5)]).reset_index(drop=True)
    print("Size: ", df.shape)

    aaf = AalenAdditiveFitter()
    start_time = time.time()
    aaf.fit(df, duration_col="week", event_col="arrest")
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)
    print(aaf.score_)
示例#27
0
 def __init__(self):
     """Initialise the parent with a new AalenAdditiveFitter and the name
     of this instance's class."""
     fitter = AalenAdditiveFitter()
     label = self.__class__.__name__
     super(AalenAdditive, self).__init__(fitter, label)
示例#28
0
def kfoldcv(data, k=5, m=10, penalizer=0.5, timeinterval=np.linspace(1,20,20), duration_col='ndays_act', event_col='observed'):
    """
    Trains data with AalenAdditiveFitter and (k-fold) cross validate it.
    Based on lifelines library for survival analysis in Python.

    For every fold the model is fitted on the remaining rows. The rows of the
    test fold are then grouped by identical covariate values, and for each
    group the observed frequency of each lifetime is compared with the drop
    in predicted survival between consecutive time points.

    data: Pandas dataframe.
    k: number of folds
    m: number of time units to be included in the cross validation
    penalizer: argument of class AalenAdditiveFitter (lifelines library)
    timeinterval: argument of AalenAdditiveFitter().fit method. Time points that are fitted.
    duration_col: last column from data. It contains the lifetime of each case.
    event_col: second-to-last column from data. It contains the censorships. So far this function
                only works without censorships, that is, all death events must be observed.
                Therefore, it must be a column of ones.

    Prints: Average relative error of the predicted probabilities (once per fold).

    NOTE(review): this block uses a Python 2 print statement and legacy
    pandas APIs (`DataFrame.sort`, `.ix`, `Index - list`) -- confirm the
    target interpreter and pandas version before changing anything.
    NOTE(review): `scores` is filled but not returned in the visible code.
    """

    # NOTE(review): other code in this file passes `coef_penalizer=`; confirm
    # that `penalizer=` is accepted by the installed lifelines version.
    aaf = AalenAdditiveFitter(penalizer=penalizer)
    n, d = data.shape
    data = data.copy()
    # Shuffle the rows, then order them by the event column.
    data = data.reindex(np.random.permutation(data.index)).sort(event_col)
    scores = []
    # Fold label per row: the pattern 1..k repeated and cut to n entries.
    assignments = np.array((n // k + 1) * list(range(1, k + 1)))
    assignments = assignments[:n]

    # All columns except the duration and event columns.
    testing_columns = data.columns - [duration_col, event_col]

    for i in range(1, k + 1):

        # Rows labelled i form the test fold; the rest are used for training.
        ix = assignments == i
        training_data = data.ix[~ix]
        testing_data = data.ix[ix]

        # NOTE(review): T_actual, E_actual and X_testing are not used below.
        T_actual = testing_data[duration_col].values
        E_actual = testing_data[event_col].values
        X_testing = testing_data[testing_columns]

        aaf.fit(training_data, duration_col=duration_col, event_col=event_col, timeline=timeinterval)

        used_ind = []  # index labels already covered by an earlier group
        prec_sum = 0
        rel_sum = 0
        rel_error_list = []
        df = testing_data
        #ndays must be the last column, and observed the second-to-last
        for j,row in df.iterrows():

            if not j in used_ind:

                # All test rows whose covariates (every column but the last
                # two) equal those of row j.
                a = df[np.all(df.ix[:,0:-2].values==df.ix[j,:-2].values, axis=1)]
                list_ = a.index.tolist()
                used_ind += list_

                # Observed share of each lifetime value within the group.
                # NOTE(review): the column `ndays_act` is hard-coded here
                # instead of using `duration_col`.
                # NOTE(review): value_counts() orders by frequency by default,
                # so these entries may not line up with the time-ordered
                # predictions below -- confirm.
                actual_rate_series = a.ndays_act.value_counts() / a.shape[0]
                mini = min(actual_rate_series.shape[0], m)
                actual_rate = np.array(actual_rate_series)[:mini]
                # Predicted survival function for the group's first row.
                pred_df = aaf.predict_survival_function(a.iloc[0,:-2][None,:])
                pred_array = np.array(pred_df)              
                # Predicted rate per step: the drop in survival, starting from 1.
                pred_rate = np.zeros(mini)
                pred_rate[0] = 1 - pred_array[0]

                for alpha in range(1, mini):
                    pred_rate[alpha] = pred_array[alpha-1] - pred_array[alpha]

                # Relative error: absolute difference divided by the larger
                # of the two rates.
                maxi = np.maximum(pred_rate, actual_rate)
                rel_error = np.abs(pred_rate - actual_rate) / maxi
                rel_error_list.append(rel_error)
                # Share of time steps with a relative error of at most 0.15.
                # NOTE(review): both operands are ints, so under Python 2
                # without `from __future__ import division` this is floor
                # division -- confirm.
                succes_rate = len(rel_error[rel_error <= 0.15]) / mini
                # Weight both measures by the number of rows in the group.
                prec_sum += succes_rate * len(a)
                rel_sum += np.sum(rel_error) / mini * len(a)

        # Averages over all rows of the test fold.
        precision = prec_sum / df.shape[0]
        relative = rel_sum / df.shape[0]
        scores.append(precision)

        print "Average relative error: ", relative