def run_survival_curve(self, df):
    """Fit an Aalen additive model on ``df`` and inspect one hand-written case.

    Used for testing only.

    A design matrix is built from ``df`` with the patsy formula below, the
    ``SRV_TIME_MON`` and ``CENSORED`` columns are joined onto it, and an
    ``AalenAdditiveFitter`` is fitted with those two columns as the duration
    and event columns. The predicted survival function for the hard-coded
    sample is plotted and shown, and its predicted expectation is printed.

    Returns None.
    """
    formula = 'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
    design = pt.dmatrix(formula, df, return_type='dataframe')
    outcome_columns = df[['SRV_TIME_MON', 'CENSORED']]
    design = design.join(outcome_columns)

    fitter = AalenAdditiveFitter()
    fitter.fit(design, 'SRV_TIME_MON', 'CENSORED')

    # INSERT VALUES TO TEST HERE
    # NOTE(review): the sample has 10 values while the formula names 10
    # covariates plus patsy's intercept column -- confirm the expected width.
    sample = np.array([[1., 1961., 52., 0, 0., 2., 1., 0., 4., 2.]])

    survival_curve = fitter.predict_survival_function(sample)
    survival_curve.plot()
    plt.show()

    print(fitter.predict_expectation(sample))
def run_survival_curve(self, df):
    """Testing helper: fit, plot and print predictions for one fixed input.

    Steps, in order:
      1. build a patsy design matrix from ``df`` and append the
         ``SRV_TIME_MON`` / ``CENSORED`` columns;
      2. fit an ``AalenAdditiveFitter`` on it, using those two columns as
         duration and event columns;
      3. plot and show the predicted survival function of the hard-coded
         test vector;
      4. print the predicted expectation for the same vector.

    Nothing is returned.
    """
    duration_col, event_col = 'SRV_TIME_MON', 'CENSORED'
    model = AalenAdditiveFitter()

    covariates = pt.dmatrix(
        'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE',
        df,
        return_type='dataframe',
    )
    training_frame = covariates.join(df[[duration_col, event_col]])
    model.fit(training_frame, duration_col, event_col)

    # INSERT VALUES TO TEST HERE
    candidate = np.array([[1., 1961., 52., 0, 0., 2., 1., 0., 4., 2.]])

    predicted_curve = model.predict_survival_function(candidate)
    predicted_curve.plot()
    plt.show()

    expectation = model.predict_expectation(candidate)
    print(expectation)
    return None
def predict(self, R, Thetas=None, _type='cumulative_hazards', **kwargs):
    """
    Predict from the fitted regression for the test data ``R``.

    Assuming that the type to refit is the first type of
    predictive_relationship.

    Parameters:
        R: test data, passed to ``self._modify_test_data``.
        Thetas: optional mapping passed to ``self._modify_test_data``
            alongside ``R``. Defaults to a new empty dict on every call
            (a shared ``dict()`` default would leak state between calls if
            the helper ever mutated it).
        _type: which prediction to return; one of 'cumulative_hazards',
            'survival_function', 'percentile', 'median' or 'expectation'.
        **kwargs: ``id_col`` is forwarded for 'cumulative_hazards'
            (default None); ``p`` is forwarded for 'percentile' (default 0).

    Returns:
        Whatever the matching ``AalenAdditiveFitter.predict_*`` method
        returns for the modified test data.

    Raises:
        Exception: if ``self.regression_`` is falsy (no regression fitted).
        ValueError: if ``_type`` is not one of the supported names.
    """
    if not self.regression_:
        raise Exception("No regression was fitted on the traning")

    # Build the default per call instead of sharing one dict across calls.
    thetas = {} if Thetas is None else Thetas
    X = self._modify_test_data(R, thetas)

    if _type == 'cumulative_hazards':
        return AalenAdditiveFitter.predict_cumulative_hazard(
            self, X, id_col=kwargs.get('id_col', None))
    elif _type == 'survival_function':
        return AalenAdditiveFitter.predict_survival_function(self, X)
    elif _type == 'percentile':
        return AalenAdditiveFitter.predict_percentile(
            self, X, kwargs.get('p', 0))
    elif _type == 'median':
        return AalenAdditiveFitter.predict_median(self, X)
    elif _type == 'expectation':
        return AalenAdditiveFitter.predict_expectation(self, X)
    else:
        raise ValueError("Not avaialble type of prediction")
# First we select 4 random couples married in 2017, each from a different state.
# We subset the dataframe 'data' to a smaller dataframe of 4 couples only and
# compare our prediction outputs.
# NOTE(review): the comment above says 2017 but both rows below use
# Marriage_Date 2016 -- confirm which one is intended.
row = {
    'State[Alabama]': [0.0],
    'State[Maryland]': [1.0],
    'State[Mississippi]': [0.0],
    'State[New Hampshire]': [0.0],
    'Couple_Race[T.Same-Race]': [1.0],
    'Household_Income_Range[T.42,830$ - 44,765$]': [0.0],
    'Household_Income_Range[T.66,532$ - 70,303$]': [0.0],
    'Household_Income_Range[T.67,500$ - 75,000$]': [1.0],
    'Husband_Education[T.16+ years]': [1.0],
    'Husband_Education[T.Less than 12 years]': [0.0],
    'Husband_Race[T.Other Ethnic Groups]': [1.0],
    'Marriage_Date': [2016],
    'T': 1,
    'E': [0],
}
MD = pd.DataFrame(data=row)
print("MD couple's unique data point", MD)

# Plot the predicted values for this specific couple: cumulative hazard on
# the top panel, survival function on the bottom panel.
ax = plt.subplot(2, 1, 1)
aaf.predict_cumulative_hazard(MD).plot(ax=ax, legend=False)
# The row above is the Maryland couple (State[Maryland] == 1.0, saved as
# MarylandCouple.pdf); the title previously said "Mississippi".
plt.title('Maryland Couple predicted Hazard and Survival time')
ax = plt.subplot(2, 1, 2)
aaf.predict_survival_function(MD).plot(ax=ax, legend=False)
plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/MarylandCouple.pdf')
plt.show()

# Same idea for the Alabama couple; we choose the same education level and
# ethnicity to keep our comparison valid.
# NOTE(review): 'Husband_Education[T.16+ years]' is 0.0 here but 1.0 for the
# Maryland couple, so the education level is not actually the same -- confirm.
row = {
    'State[Alabama]': [1.0],
    'State[Maryland]': [0.0],
    'State[Mississippi]': [0.0],
    'State[New Hampshire]': [0.0],
    'Couple_Race[T.Same-Race]': [1.0],
    'Household_Income_Range[T.42,830$ - 44,765$]': [1.0],
    'Household_Income_Range[T.66,532$ - 70,303$]': [0.0],
    'Household_Income_Range[T.67,500$ - 75,000$]': [0.0],
    'Husband_Education[T.16+ years]': [0.0],
    'Husband_Education[T.Less than 12 years]': [0.0],
    'Husband_Race[T.Other Ethnic Groups]': [1.0],
    'Marriage_Date': [2016],
    'T': 1,
    'E': [0],
}
AL = pd.DataFrame(data=row)
print("AL couple's unique data point", AL)

# Plot the predicted cumulative hazard for this specific couple.
ax = plt.subplot(2, 1, 1)
aaf.predict_cumulative_hazard(AL).plot(ax=ax, legend=False)
from lifelines import AalenAdditiveFitter, CoxPHFitter
# NOTE(review): another snippet in this file imports `load_regression_dataset`
# from the same module -- confirm which loader the installed lifelines exposes.
from lifelines.datasets import generate_regression_dataset

regression_dataset = generate_regression_dataset()

# Fit a Cox proportional hazards model: 'T' is the duration column and 'E'
# the event column.
cf = CoxPHFitter()
cf.fit(regression_dataset, duration_col='T', event_col='E')

# Fit Aalen's additive model on the same data, without an intercept.
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')

# Covariates only. `columns - ['E', 'T']` relied on Index set-subtraction and
# `.ix`, neither of which exists in current pandas; `drop` and `iloc` work on
# old and new releases alike and keep the training column order.
x = regression_dataset.drop(['E', 'T'], axis=1)
# Positional, half-open slice: rows 10 and 11 (`.ix[10:12]` was label-based
# and also included row 12).
aaf.predict_survival_function(x.iloc[10:12]).plot()
aaf.plot()
# Using Cox Proportional Hazards model
# ('T' is the duration column, 'E' the event column.)
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only. `columns - ['E', 'T']` relied on Index set-subtraction and
# `.ix`, neither of which exists in current pandas; `drop` and `iloc` work on
# old and new releases alike and keep the training column order.
x = regression_dataset.drop(['E', 'T'], axis=1)
# Get the unique survival functions of two subjects: positional rows 10 and 11
# (`.ix[10:12]` was label-based and also pulled in row 12).
aaf.predict_survival_function(x.iloc[10:12]).plot()
aaf.plot()
var3 0.2186 1.2443 0.0758 2.8836 0.0039 0.0700 0.3672 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Concordance = 0.580 """
# NOTE(review): the line above closes a triple-quoted string that opens before
# this excerpt; presumably it holds sample summary output -- confirm.

# Plot the Cox model; `cph` is created before this excerpt.
cph.plot()

# Using Aalen's Additive model
# Fitted without an intercept, with 'T' as duration column and 'E' as event
# column.
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')
aaf.plot()

# Covariates only: drop the event and duration columns.
X = regression_dataset.drop(['E', 'T'], axis=1)
aaf.predict_survival_function(
    X.iloc[10:12]).plot()  # get the unique survival functions of two subjects

# 10-fold cross-validation of the Cox model on the same dataset; print the
# per-fold scores followed by their mean and standard deviation.
scores = k_fold_cross_validation(cph, regression_dataset, duration_col='T', event_col='E', k=10)
print(scores)
print(np.mean(scores))
print(np.std(scores))

# Display the figures created above.
plt.show()
#==============================================================================
#==============================================================================
from lifelines import AalenAdditiveFitter, CoxPHFitter
from lifelines.datasets import load_regression_dataset

# `T` (durations) and `E` (event flags) are defined before this excerpt.
naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)
# but instead of a survival_function_ being exposed, a cumulative_hazard_ is.

# Survival Regression
regression_dataset = load_regression_dataset()
regression_dataset.head()

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only. `columns - ['E', 'T']` relied on Index set-subtraction and
# `.ix`, neither of which exists in current pandas; `drop` and `iloc` work on
# old and new releases alike and keep the training column order.
x = regression_dataset.drop(['E', 'T'], axis=1)
# Get the unique survival functions of two subjects: positional rows 10 and 11
# (`.ix[10:12]` was label-based and also pulled in row 12).
aaf.predict_survival_function(x.iloc[10:12]).plot()
aaf.plot()
def kfoldcv(data, k=5, m=10, penalizer=0.5,
            timeinterval=np.linspace(1, 20, 20),
            duration_col='ndays_act', event_col='observed'):
    """
    Trains data with AalenAdditiveFitter and (k-fold) cross validates it.

    Based on the lifelines library for survival analysis in Python.

    Within every test fold, rows with identical covariates are grouped. For
    each group the observed share of each lifetime value is compared with
    the step-to-step drop of the survival function predicted for the group,
    over at most ``m`` values. A comparison counts as a success when its
    relative error is at most 0.15.

    data: Pandas dataframe. The covariates are every column except the last
          two.
    k: number of folds
    m: number of time units to be included in the cross validation
    penalizer: argument of class AalenAdditiveFitter (lifelines library)
    timeinterval: argument of AalenAdditiveFitter().fit method (``timeline``).
                  Time points that are fitted.
    duration_col: last column from data. It contains the lifetime of each
                  case.
    event_col: second-to-last column from data. It contains the censorships.
               So far this function only works without censorships, that is,
               all death events must be observed. Therefore, it must be a
               column of ones.

    Prints: Average relative error of the predicted probabilities.

    Returns None; the per-fold precision values are collected in a local
    list but not returned.
    """
    aaf = AalenAdditiveFitter(penalizer=penalizer)
    n = data.shape[0]

    # Shuffle the rows, then order them by the event column.
    # (`DataFrame.sort` no longer exists; `sort_values` is its replacement.)
    data = data.reindex(np.random.permutation(data.index)).sort_values(event_col)

    scores = []
    # Fold labels 1..k repeated round-robin and truncated to n rows.
    assignments = np.array((n // k + 1) * list(range(1, k + 1)))[:n]

    for fold in range(1, k + 1):
        in_fold = assignments == fold
        training_data = data.loc[~in_fold]
        testing_data = data.loc[in_fold]

        aaf.fit(training_data, duration_col=duration_col,
                event_col=event_col, timeline=timeinterval)

        # ndays must be the last column, and observed the second-to-last
        covariate_matrix = testing_data.iloc[:, :-2].values
        seen = set()  # index labels already covered by an earlier group
        prec_sum = 0
        rel_sum = 0

        for j, row in testing_data.iterrows():
            if j in seen:
                continue

            # All test rows whose covariates equal those of the current row.
            same_covariates = np.all(
                covariate_matrix == row.iloc[:-2].values, axis=1)
            group = testing_data[same_covariates]
            group_size = len(group)
            seen.update(group.index)

            # Observed share of each lifetime value inside the group. Uses
            # `duration_col` rather than a hard-coded `ndays_act` attribute.
            # NOTE(review): value_counts() orders by frequency, not by
            # lifetime, so position i is not necessarily time step i --
            # confirm this matches the intended comparison.
            actual_rate_series = (
                group[duration_col].value_counts() / float(group_size))
            mini = min(actual_rate_series.shape[0], m)
            actual_rate = np.array(actual_rate_series)[:mini]

            # Predict for the group's shared covariates as a 1 x d array.
            features = group.iloc[0, :-2].values[None, :]
            survival = np.asarray(
                aaf.predict_survival_function(features)).ravel()

            # Probability mass predicted for each step: 1 - S(t0), then
            # S(t[i-1]) - S(t[i]).
            pred_rate = np.zeros(mini)
            pred_rate[0] = 1 - survival[0]
            pred_rate[1:] = survival[:mini - 1] - survival[1:mini]

            maxi = np.maximum(pred_rate, actual_rate)
            rel_error = np.abs(pred_rate - actual_rate) / maxi

            # float() keeps this a true division on Python 2 as well.
            success_rate = np.count_nonzero(rel_error <= 0.15) / float(mini)
            prec_sum += success_rate * group_size
            rel_sum += np.sum(rel_error) / mini * group_size

        n_test = testing_data.shape[0]
        precision = prec_sum / n_test
        relative = rel_sum / n_test
        scores.append(precision)
        # NOTE(review): the original indentation was lost, so reporting once
        # per fold is an assumption -- confirm whether this belongs after
        # the fold loop instead.
        # Two spaces reproduce the output of the former print statement.
        print("Average relative error:  {}".format(relative))