def survival_ll_nelson_aalen(content):
    """Fit a Nelson-Aalen estimator and wrap its curves in an HTTP response.

    ``content`` must provide ``'times'`` (durations) and ``'events'``
    (event-observed flags). The JSON payload carries the cumulative hazard
    under ``'hazard'`` and its confidence interval under ``'confidence'``,
    both converted with ``DataFrame.to_dict()``.
    """
    fitter = NelsonAalenFitter()
    fitter.fit(content['times'], event_observed=content['events'])

    payload = {
        'hazard': fitter.cumulative_hazard_.to_dict(),
        'confidence': fitter.confidence_interval_.to_dict(),
    }
    # NOTE(review): ``ignore_nan`` is not a stdlib ``json.dumps`` keyword --
    # presumably ``json`` is simplejson in this module; confirm.
    body = json.dumps(payload, ignore_nan=True)
    return httpWrapper(body)
def concat_hazard_curve(T, C):
    """Fit an unsmoothed Nelson-Aalen estimator and sample it on a fixed grid.

    Parameters
    ----------
    T
        Durations.
    C
        Event-observed flags passed to ``fit`` as ``event_observed``.

    Returns
    -------
    tuple
        ``(cumulative_hazard, confidence_interval)`` as NumPy arrays, both
        reindexed onto the integer grid ``1 .. args.max_idx`` (``args`` is
        read from module scope). Grid points that are not present in the
        fitted timeline come back as NaN.
    """
    naf = NelsonAalenFitter(nelson_aalen_smoothing=False)
    naf.fit(T, event_observed=C)

    # reindex() takes the new index labels as ONE argument; the previous
    # code passed the two range bounds as separate positional arguments
    # (and, in the second call, ``args, max_idx`` instead of ``args.max_idx``).
    grid = range(1, args.max_idx + 1)
    return (
        naf.cumulative_hazard_.reindex(grid).values,
        naf.confidence_interval_.reindex(grid).values,
    )
def calcSurvHazardCat(df: pd.DataFrame, *, hazardcol: str = "hazard",) -> pd.DataFrame:
    """
    Attach the Nelson-Aalen cumulative hazard reached by each row.

    The estimator is fitted on the ``time`` and ``event`` columns of ``df``
    and then evaluated at every row's own ``time``; the result is an
    alternative to the raw (often censored) survival time.

    Parameters
    ----------
    df
        Data frame containing the columns ``time`` and ``event``.
    hazardcol
        Name of the column that receives the hazard values.

    Returns
    -------
    The same data frame object (modified in place) with ``hazardcol`` added.
    """
    durations = df["time"]
    events = df["event"]

    estimator = NelsonAalenFitter()
    estimator.fit(durations, events)

    # Evaluate the fitted cumulative hazard at each subject's own duration.
    survived_hazard = estimator.predict(durations)
    df[hazardcol] = survived_hazard.tolist()
    return df
def survival_ll_nelson_aalen(content):
    """Fit a Nelson-Aalen estimator and return survival, hazard and median.

    ``content`` must provide ``'times'`` (durations) and ``'events'``
    (event-observed flags).

    The JSON payload keeps the original keys:

    * ``'result'``  -- survival curve derived from the cumulative hazard as
      ``exp(-H(t))``.
    * ``'hazard'``  -- the Nelson-Aalen cumulative hazard ``H(t)``.
    * ``'median'``  -- first time at which the derived survival curve is
      at or below 0.5, or ``None`` when it never gets there.

    The previous body referenced an undefined bare name
    (``cumulative_hazard_``), a non-existent ``kmf.kmf`` attribute, and
    passed DataFrames straight to ``json.dumps``; everything is now computed
    from ``cumulative_hazard_`` and serialised via ``to_dict()``.
    """
    import numpy as np  # local import: only this block needs it

    naf = NelsonAalenFitter()
    naf.fit(content['times'], event_observed=content['events'])

    cumulative_hazard = naf.cumulative_hazard_
    survival = np.exp(-cumulative_hazard)

    # Median survival time: earliest timeline point with S(t) <= 0.5.
    at_or_below_half = survival.index[survival.iloc[:, 0] <= 0.5]
    median = float(at_or_below_half[0]) if len(at_or_below_half) else None

    return httpWrapper(
        json.dumps({
            'result': survival.to_dict(),
            'hazard': cumulative_hazard.to_dict(),
            'median': median,
        }))
def fit(
    self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None
):  # pylint: disable=too-many-arguments
    """
    Fit the estimator by exponentiating a Nelson-Aalen cumulative hazard.

    Parameters
    ----------
    durations: an array, or pd.Series, of length n
        duration subject was observed for
    timeline:
        return the best estimate at the values in timelines (positively increasing)
    event_observed: an array, or pd.Series, of length n
        True if the death was observed, False if the event was lost (right-censored).
        Defaults all True if event_observed==None
    entry: an array, or pd.Series, of length n
        relative time when a subject entered the study. This is useful for
        left-truncated observations, i.e the birth event was not observed.
        If None, defaults to all 0 (all birth events observed.)
    label: string
        a string to name the column of the estimate. Falls back to the
        label already stored on the instance, then to ``"BFH_estimate"``.
    alpha: float, optional (default=0.05)
        the alpha value in the confidence intervals. Overrides the
        initializing alpha for this call to fit only.
    ci_labels: iterable
        add custom column names to the generated confidence intervals as a
        length-2 list: [<lower-bound name>, <upper-bound name>].
        Default: <label>_lower_<alpha>
    weights: optional
        per-observation weights, forwarded unchanged to the underlying
        ``NelsonAalenFitter.fit`` call.

    Returns
    -------
    self, with new properties like ``survival_function_``.
    """
    self._label = coalesce(label, self._label, "BFH_estimate")
    alpha = coalesce(alpha, self.alpha)

    naf = NelsonAalenFitter(alpha=alpha)
    # ``weights`` used to be accepted here but silently dropped, while
    # ``self.weights`` was still copied back from the inner fitter.
    naf.fit(
        durations,
        event_observed=event_observed,
        timeline=timeline,
        label=self._label,
        entry=entry,
        ci_labels=ci_labels,
        weights=weights,
    )
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights = (
        naf.durations,
        naf.event_observed,
        naf.timeline,
        naf.entry,
        naf.event_table,
        naf.weights,
    )

    # estimation: S(t) = exp(-H(t)), applied to the estimate and its bounds
    self.survival_function_ = np.exp(-naf.cumulative_hazard_)
    self.confidence_interval_ = np.exp(-naf.confidence_interval_)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density = 1 - self.confidence_interval_

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"

    # plotting functions
    self.plot_survival_function = self.plot
    return self
def compute_terminal_node(self):
    """
    Turn this node into a leaf and fit its cumulative hazard estimator.

    A fresh NelsonAalenFitter is stored on ``self.chf`` and fitted with
    column 1 of ``self.y`` as durations and column 0 as the event
    indicator, evaluated on ``self.timeline``.

    :return: self
    """
    self.terminal = True
    self.chf = NelsonAalenFitter()
    durations, observed = self.y.iloc[:, 1], self.y.iloc[:, 0]
    self.chf.fit(durations, event_observed=observed, timeline=self.timeline)
    return self
def NelsonAelan_dash(T, C):
    """Draw the Nelson-Aalen estimate for (T, C) and pass the figure to ``pyplot``.

    The curve is plotted twice with the same title: once with default
    settings and once with ``ci_force_lines=True``. The current matplotlib
    figure is then handed to ``pyplot(..., legend=False)``.
    """
    plot_title = 'Nelson-Aalen Estimate'

    estimator = NelsonAalenFitter()
    estimator.fit(T, event_observed=C)

    estimator.plot(title=plot_title)
    estimator.plot(ci_force_lines=True, title=plot_title)

    figure = plt.gcf()
    pyplot(figure, legend=False)
def _vval2ByBootstrap(timeline, nstraps=1000):
    """Bootstrap the precision of the log-survival estimates of both arms.

    For each of ``nstraps`` resamples (rows of ``df`` drawn with
    replacement) a Nelson-Aalen estimator is fitted per treatment arm
    (``treatment_col`` equal to 0 and to 1), converted to a survival curve
    via ``exp(-H)`` and forward-filled onto ``timeline``.

    ``df``, ``treatment_col``, ``duration_col`` and ``event_col`` are free
    variables resolved from the surrounding scope.

    :param timeline: index of time points at which the curves are sampled.
    :param nstraps: number of bootstrap resamples.
    :return: array with one value per timeline point,
        ``1 / sqrt(var(log S1) + var(log S2))`` (NaNs ignored).
    """
    n_times = timeline.shape[0]
    sa1_b = np.zeros((n_times, nstraps))
    sa2_b = np.zeros((n_times, nstraps))

    for sampi in range(nstraps):
        tmp = df.sample(frac=1, replace=True, axis=0)
        for arm, store in ((0, sa1_b), (1, sa2_b)):
            # The mask MUST come from the resampled frame: the second arm
            # previously used ``df[treatment_col] == 1``, whose index does
            # not line up with the (duplicated) rows of ``tmp``.
            ind = tmp[treatment_col] == arm
            naf = NelsonAalenFitter()
            naf.fit(durations=tmp.loc[ind, duration_col],
                    event_observed=tmp.loc[ind, event_col])
            surv = np.exp(-naf.cumulative_hazard_.iloc[:, 0])
            store[:, sampi] = surv.reindex(timeline, method='ffill').values

    vval2 = 1 / np.sqrt(
        np.nanvar(np.log(sa1_b), axis=1) + np.nanvar(np.log(sa2_b), axis=1))
    return vval2
def _fit_kaplan_meier(self):
    """
    Lazily fit the overall Kaplan-Meier and Nelson-Aalen estimators.

    Does nothing when ``self.kmf_fit`` is already populated. Otherwise both
    estimators are fitted on ``self.time`` / ``self.event`` with
    ``self.label`` and stored as ``self.kmf_fit`` and ``self.naf_fit``.
    """
    if self.kmf_fit is None:
        # Overall (ungrouped) curves.
        km_estimator = KaplanMeierFitter()
        km_estimator.fit(self.time, event_observed=self.event, label=self.label)

        na_estimator = NelsonAalenFitter()
        na_estimator.fit(self.time, event_observed=self.event, label=self.label)

        # Publish only after both fits succeeded.
        self.kmf_fit, self.naf_fit = km_estimator, na_estimator
def test_naf_plotting_with_custom_colours(self, block):
    """Plot two Nelson-Aalen fits on one axes using explicit colours (red, black)."""
    first_sample = np.random.exponential(5, size=(200, 1))
    second_sample = np.random.exponential(1, size=500)

    fitter = NelsonAalenFitter()

    fitter.fit(first_sample)
    shared_ax = fitter.plot(color="r")

    # Refit the same estimator and overlay the second curve.
    fitter.fit(second_sample)
    fitter.plot(ax=shared_ax, color="k")

    self.plt.title("test_naf_plotting_with_custom_coloirs")
    self.plt.show(block=block)
def test_naf_plotting_slice(self, block):
    """Plot Nelson-Aalen fits restricted by ``loc`` and by ``iloc`` slices on one axes."""
    first_sample = np.random.exponential(5, size=(200, 1))
    second_sample = np.random.exponential(1, size=(200, 1))

    fitter = NelsonAalenFitter()

    # First curve: label-based slice covering everything from 0 upwards.
    fitter.fit(first_sample)
    shared_ax = fitter.plot(loc=slice(0, None))

    # Second curve: positional slice, drawn with forced CI lines.
    fitter.fit(second_sample)
    fitter.plot(ax=shared_ax, ci_force_lines=True, iloc=slice(100, 180))

    self.plt.title("test_naf_plotting_slice")
    self.plt.show(block=block)
def go():
    """Fit and persist one unsmoothed Nelson-Aalen estimator per gender group.

    Durations and event flags are loaded with ``concat_TC`` for all files,
    the male files and the female files. Each fitted estimator is
    serialised with ``dill`` to a path built from the group name and the
    ``args.mode``, ``args.min_length``, ``args.ignore_first`` and
    ``args.memory`` settings (``args`` and the file lists come from module
    scope).
    """
    # Function-call form prints the same text on Python 2 and Python 3.
    print(args)

    datasets = (
        ('all', concat_TC(all_files)),
        ('m', concat_TC(files_m)),
        ('f', concat_TC(files_f)),
    )
    for gender, (T, C) in datasets:
        naf = NelsonAalenFitter(nelson_aalen_smoothing=False)
        naf.fit(T, event_observed=C)

        out_path = (
            '/backup/home/jared/storage/foraging/cm/{}_{}_shuffle_{}_{}_{}'
            .format(gender, args.mode, args.min_length, args.ignore_first,
                    args.memory))
        # Context manager guarantees the handle is flushed and closed; the
        # previous bare open(...) left that to garbage collection.
        with open(out_path, 'wb') as out_file:
            dill.dump(naf, out_file)
def survival_plot_and_cox(self, df_arr, label=None, filename=''):
    """Overlay Kaplan-Meier curves for several cohorts and save the figure.

    Each frame in ``df_arr`` is fitted on its ``bcrmonth`` (duration) and
    ``bcrstatus`` (event) columns and drawn on a shared axes with censor
    marks, without confidence bands, and with the y-axis limited to [0, 1].
    The figure is written to ``filename + '.png'`` and ``filename + '.pdf'``.

    :param df_arr: sequence of data frames, one per curve.
    :param label: optional sequence of curve labels, parallel to ``df_arr``.
        When omitted or empty the fitter's own default label is used
        (the old ``label=[]`` default raised IndexError on the first curve).
    :param filename: output path without extension.
    :raises ValueError: if ``df_arr`` is empty (there is nothing to plot).
    """
    plt.clf()
    if len(df_arr) == 0:
        raise ValueError("df_arr must contain at least one data frame")

    palette = ['red', 'green', 'blue', 'cyan', 'orange', 'black']
    has_labels = label is not None and len(label) > 0

    kmf = KaplanMeierFitter()
    ax = None
    for idx, df_el in enumerate(df_arr):
        fit_kwargs = {'label': label[idx]} if has_labels else {}
        kmf.fit(df_el['bcrmonth'], df_el['bcrstatus'], **fit_kwargs)

        plot_kwargs = dict(
            show_censors=True,
            ci_show=False,
            # Cycle through the palette so more than six cohorts still plot.
            color=palette[idx % len(palette)],
            ylim=(0, 1),
        )
        if ax is not None:
            # Every curve after the first is drawn onto the first one's axes.
            plot_kwargs['ax'] = ax
        ax = kmf.plot(**plot_kwargs)

    fig = ax.get_figure()
    fig.savefig(filename + '.png')
    fig.savefig(filename + '.pdf', format='PDF')
def get_scores(model, y_test, delta_test, time_grid, surv_residual = False, cens_residual = False):
    """Evaluate ``get_score`` at every point of ``time_grid``.

    Parameters
    ----------
    model
        Object exposing ``training_data`` as ``(x_train, (y_train, delta_train))``
        and a ``predict`` method.
    y_test, delta_test
        Test-set values forwarded to ``get_score``; ``y_test`` is also
        log-transformed to build the test residuals.
    time_grid
        Iterable of time points; one score pair is produced per point.
    surv_residual, cens_residual
        Flags forwarded to ``get_score``. ``cens_residual`` additionally
        selects at which times the censoring survival curve is evaluated.

    Returns
    -------
    tuple of np.ndarray
        ``(bss, nblls)`` where ``bss`` collects the first value returned by
        ``get_score`` and ``nblls`` collects the NEGATED second value.

    NOTE(review): ``x_test`` is not a parameter of this function; it is
    resolved from an enclosing/global scope. Confirm that is intended.
    """
    n = y_test.shape[0]
    x_train, target = model.training_data
    y_train, delta_train = target

    # compute residuals exp(log(y) - prediction) for train and test data
    # NOTE(review): exp_residual_train is not referenced again in this function.
    exp_residual_train = np.nan_to_num(np.exp(np.log(y_train) - model.predict(x_train).reshape(-1)))
    exp_residual_test = np.nan_to_num(np.exp(np.log(y_test) - model.predict(x_test).reshape(-1)))

    # compute exp(-theta) from test data to evaluate accelerating component
    exp_predict_neg_test = np.nan_to_num(np.exp(-model.predict(x_test)).reshape(-1))

    # Nelson-Aalen fit on the training outcomes, and a Kaplan-Meier fit with
    # the event indicator flipped (1 - delta_train).
    naf_base = NelsonAalenFitter().fit(y_train, event_observed = delta_train)
    kmf_cens = KaplanMeierFitter().fit(y_train, event_observed = 1 - delta_train)

    # Evaluate the flipped-indicator curve either at the test residuals or
    # at the raw test times. If cens_residual compares equal to neither
    # True nor False, cens_test is left unbound.
    if cens_residual == True:
        cens_test = kmf_cens.survival_function_at_times(exp_residual_test)
    elif cens_residual == False:
        cens_test = kmf_cens.survival_function_at_times(y_test)

    bss = []
    nblls = []
    for t in time_grid:
        bs, nbll = get_score(n, t, y_test, delta_test, naf_base, kmf_cens, cens_test, exp_predict_neg_test, surv_residual, cens_residual, model)
        bss.append(bs)
        # sign is flipped before storing
        nblls.append(-nbll)

    return (np.array(bss), np.array(nblls))
def _estimateSurv(df, ind):
    """Fit Nelson-Aalen on the rows selected by ``ind`` and derive survival.

    ``duration_col``, ``event_col`` and ``_additive_var`` are resolved from
    the surrounding scope.

    :param df: data frame holding the duration and event columns.
    :param ind: row selector passed to ``df.loc``.
    :return: ``(naf, sa, varsa)`` -- the fitted estimator, the survival
        series ``exp(-H)`` named ``'surv'``, and the cumulative variance
        series named ``'surv_var'`` (index named ``'timeline'``).
    """
    naf = NelsonAalenFitter()
    naf.fit(durations=df.loc[ind, duration_col],
            event_observed=df.loc[ind, event_col])

    # Variance computation borrowed from lifelines.
    timeline = sorted(naf.timeline)
    event_table = naf.event_table
    deaths = event_table['observed']

    # Slowest line here: population at risk = everyone who has entered
    # minus everyone removed before the current row.
    entered_so_far = event_table['entrance'].cumsum()
    removed_before = event_table['removed'].cumsum().shift(1).fillna(0)
    population = entered_so_far - removed_before

    cumulative_var = np.cumsum(_additive_var(population, deaths))
    varsa = cumulative_var.reindex(timeline, method='pad')
    varsa.index.name = 'timeline'
    varsa.name = 'surv_var'

    sa = np.exp(-naf.cumulative_hazard_.iloc[:, 0])
    sa.name = 'surv'
    return naf, sa, varsa
def get_hazard_ratio_results(df, group_col, time_col, event_col):
    """Fit one Nelson-Aalen estimator per group.

    Rows with a missing value in any of the three columns are dropped. The
    event column is converted to category codes and the time column to
    float before fitting.

    :param df: input data frame.
    :param group_col: column whose distinct values define the groups.
    :param time_col: duration column.
    :param event_col: event indicator column.
    :return: list of fitted ``NelsonAalenFitter`` objects, one per group,
        each labelled ``"<group> (N=<group size>)"``; empty when no rows
        survive the NaN filter.
    """
    df = df[[event_col, time_col, group_col]].dropna()
    # NOTE(review): category codes are assigned by sorted distinct value, so
    # a column containing a single distinct value is coded as all 0.
    df[event_col] = df[event_col].astype('category').cat.codes
    df[time_col] = df[time_col].astype('float')

    models = []
    # Iterating a groupby over an empty frame yields nothing, so no
    # explicit emptiness check is needed.
    for name, grouped_df in df.groupby(group_col):
        durations = grouped_df[time_col]
        events = grouped_df[event_col]
        # format() accepts non-string group keys (ints, bools, ...), where
        # the previous ``name + "..."`` concatenation raised TypeError.
        group_label = "{} (N={})".format(name, len(durations))

        fitter = NelsonAalenFitter()
        fitter.fit(durations, event_observed=events, label=group_label)
        models.append(fitter)
    return models
def test_naf_plot_cumulative_hazard(self, block):
    """Draw ``plot()`` and ``plot_cumulative_hazard()`` of one fit on the same axes."""
    sample = np.random.exponential(5, size=(200, 1))

    fitter = NelsonAalenFitter()
    fitter.fit(sample)

    shared_ax = fitter.plot()
    fitter.plot_cumulative_hazard(ax=shared_ax, ci_force_lines=True)

    self.plt.title("I should have plotted the same thing, but different styles + color!")
    self.plt.show(block=block)
def plot_hazard(df, TName, EName=None, groupBy=None, splitBy=None, params=None):
    """Delegate to ``plot_any`` with a Nelson-Aalen fitter and a hazard y-label.

    :param df: data frame forwarded to ``plot_any``.
    :param TName: forwarded to ``plot_any`` as ``TName``.
    :param EName: forwarded to ``plot_any`` as ``EName``.
    :param groupBy: forwarded to ``plot_any`` as ``groupBy``.
    :param splitBy: forwarded to ``plot_any`` as ``splitBy``.
    :param params: optional dict of plot parameters. It is copied before
        ``'ylabel'`` is set to ``'Hazard_Rate'``, so neither the caller's
        dict nor a shared default is mutated (the old ``params={}`` default
        was modified in place and leaked state between calls).
    :return: whatever ``plot_any`` returns.
    """
    print('\tHazard')

    plot_params = dict(params) if params else {}
    plot_params['ylabel'] = 'Hazard_Rate'

    return plot_any(df,
                    fitter=NelsonAalenFitter(),
                    TName=TName,
                    EName=EName,
                    groupBy=groupBy,
                    splitBy=splitBy,
                    params=plot_params)
def get_surv(model, x_test, timegrid=None):
    '''
    Build per-subject survival curves from a baseline cumulative hazard.

    model: PyCox model class or compatibles; must expose ``training_data``
           as ``(x_train, (y_train, delta_train))`` and ``predict``
    x_test: covariate dataset to compute survival estimates
    timegrid: pass "train" to cap the time grid at the largest training
              outcome; any other value uses the largest baseline time

    Returns a DataFrame indexed by "duration" with one column per test
    subject (columns 0..n-1).
    '''
    # FutureWarning and RuntimeWarning are silenced process-wide from here on.
    for silenced in (FutureWarning, RuntimeWarning):
        warnings.simplefilter(action='ignore', category=silenced)

    x_train, (y_train, delta_train) = model.training_data

    # Training residuals exp(log(y) - prediction), with NaN/inf sanitised.
    log_time_train = np.log(y_train)
    exp_residual = np.nan_to_num(
        np.exp(log_time_train - model.predict(x_train).reshape(-1)))

    # exp(-theta) on the test data: the accelerating component.
    exp_predict = np.nan_to_num(np.exp(-model.predict(x_test)).reshape(-1))
    n_subjects = exp_predict.shape[0]

    # Baseline cumulative hazard estimated on the training residuals.
    H = NelsonAalenFitter().fit(exp_residual, event_observed=delta_train).cumulative_hazard_

    max_time = y_train.max() if timegrid == "train" else max(H.index)
    candidate_times = H.loc[H.index <= max_time].index.values

    # 10e7 is 1e8, so the cell budget is 5e8. Above it the grid is thinned
    # to evenly spaced quantiles of the candidate times.
    cell_budget = 5 * 10e7
    if H.shape[0] * n_subjects >= cell_budget:
        n_steps = round(cell_budget / n_subjects)
        levels = [step / n_steps for step in range(n_steps + 1)]
        time_grid = np.quantile(a=candidate_times,
                                q=levels,
                                interpolation='nearest')
    else:
        time_grid = candidate_times

    # Hazard increments between consecutive grid points, one column per subject.
    H_base = H.loc[time_grid].values.reshape(-1)
    increments = np.diff(H_base).reshape(-1, 1)
    h_base = np.repeat(increments, n_subjects, axis=1)

    # Conditional survival: exp(-cumulative scaled hazard).
    surv = pd.DataFrame(np.exp(-np.cumsum(h_base * exp_predict, axis=0)),
                        index=time_grid[1:],
                        columns=list(range(n_subjects)))
    surv.index.names = ["duration"]
    return surv
def test_naf_plot_cumulative_hazard_bandwith_1(self, block):
    """Plot the smoothed hazard (bandwidth 5.0) over the first 1700 timeline rows."""
    squared_sample = np.random.exponential(5, size=(2000, 1)) ** 2

    fitter = NelsonAalenFitter()
    fitter.fit(squared_sample)
    fitter.plot_hazard(bandwidth=5.0, iloc=slice(0, 1700))

    self.plt.title("test_naf_plot_cumulative_hazard_bandwith_1")
    self.plt.show(block=block)
def createHazardGraph(durations, event_observed):
    """Show the Nelson-Aalen cumulative hazard curve without confidence bands.

    :param durations: observed durations.
    :param event_observed: event-observed flags, parallel to ``durations``.
    """
    estimator = NelsonAalenFitter()
    estimator.fit(durations, event_observed=event_observed)
    estimator.plot(ci_show=False)

    plt.title("Hard Drive Nelson-Aalen Hazard Estimate")
    plt.ylabel("Cumulative Hazard")
    plt.show()
def plot_HR(df, with_ci=False):
    """Plot cumulative hazard for the top risk quartile against everyone else.

    Rows whose ``risk`` exceeds the 75th percentile form the "High_Risk"
    group; the remainder is "Low_Risk". Both Nelson-Aalen curves share one
    axes and the figure is saved to ``./hr_with_ci.png`` or
    ``./hr_without_ci.png`` depending on ``with_ci``.

    :param df: frame with ``days_survived``, ``death`` and ``risk`` columns.
    :param with_ci: draw confidence intervals when True.
    """
    durations = df['days_survived']
    events = df['death']
    estimator = NelsonAalenFitter()

    threshold = np.percentile(df['risk'], 75)
    is_high = df['risk'] > threshold
    is_low = ~is_high

    estimator.fit(durations[is_high], event_observed=events[is_high], label='High_Risk')
    shared_ax = estimator.plot(ci_show=with_ci)

    estimator.fit(durations[is_low], event_observed=events[is_low], label='Low_Risk')
    estimator.plot(ax=shared_ax, ci_show=with_ci)

    plt.ylim(0, .1)
    plt.xlabel("Days")
    plt.ylabel("Risk of Death")
    plt.title("Cardiovascular Death Risk over time (top quartile)")

    target = "./hr_with_ci.png" if with_ci else "./hr_without_ci.png"
    plt.savefig(target)
def get_surv(model, x_test, timegrid="train"):
    '''
    Build per-subject survival curves from a baseline cumulative hazard.

    model: PyCox model class or compatibles; must expose ``training_data``
           as ``(x_train, (y_train, delta_train))`` and ``predict``
    x_test: covariate dataset to compute survival estimates
    timegrid: when "train", rows beyond the largest training outcome are
              dropped from the result

    Returns a DataFrame indexed by "duration" with one column per test
    subject (columns 0..n-1).
    '''
    # FutureWarning is silenced process-wide from here on.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    x_train, (y_train, delta_train) = model.training_data

    # Training residuals exp(log(y) - prediction).
    log_time_train = np.log(y_train)
    exp_residual = np.exp(log_time_train - model.predict(x_train).reshape(-1))

    # exp(-theta) on the test data: the accelerating component.
    exp_predict = np.exp(-model.predict(x_test)).reshape(-1)
    n_subjects = exp_predict.shape[0]

    # Baseline cumulative hazard estimated on the training residuals.
    H = NelsonAalenFitter().fit(exp_residual, event_observed=delta_train).cumulative_hazard_

    # The first timeline point only anchors the differences below.
    time_grid = H.index.to_numpy()[1:]
    increments = np.diff(H.values.reshape(-1)).reshape(-1, 1)
    h_base = np.repeat(increments, n_subjects, axis=1)

    # Conditional survival: exp(-cumulative scaled hazard).
    surv = pd.DataFrame(np.exp(-np.cumsum(h_base * exp_predict, axis=0)),
                        index=time_grid,
                        columns=list(range(n_subjects)))
    surv.index.names = ["duration"]

    # Cap the grid at the largest training outcome
    # (to be comparable to survival predictions from PyCox models).
    if timegrid == "train":
        surv = surv.loc[surv.index <= y_train.max()]
    return surv
def _vval2ByBootstrap(timeline, nstraps=1000):
    """Bootstrap the precision of the log-survival estimates of both arms.

    Each of the ``nstraps`` iterations resamples ``df`` with replacement,
    fits a Nelson-Aalen estimator per treatment arm (``treatment_col``
    equal to 0 / 1), turns it into a survival curve with ``exp(-H)`` and
    forward-fills it onto ``timeline``.

    ``df``, ``treatment_col``, ``duration_col`` and ``event_col`` are free
    variables resolved from the surrounding scope.

    :param timeline: index of time points at which the curves are sampled.
    :param nstraps: number of bootstrap resamples.
    :return: array with one value per timeline point,
        ``1 / sqrt(var(log S1) + var(log S2))`` (NaNs ignored).
    """
    n_times = timeline.shape[0]
    sa1_b = np.zeros((n_times, nstraps))
    sa2_b = np.zeros((n_times, nstraps))

    def _bootstrap_survival(sample, arm):
        # Survival curve of one arm of the resampled frame on ``timeline``.
        ind = sample[treatment_col] == arm
        naf = NelsonAalenFitter()
        naf.fit(durations=sample.loc[ind, duration_col], event_observed=sample.loc[ind, event_col])
        surv = np.exp(-naf.cumulative_hazard_.iloc[:, 0])
        return surv.reindex(timeline, method='ffill').values

    for sampi in range(nstraps):
        tmp = df.sample(frac=1, replace=True, axis=0)
        # Both masks are derived from ``tmp``. The second arm used to be
        # masked with ``df[treatment_col] == 1``, which is indexed like the
        # original frame and cannot be aligned with the resampled rows.
        sa1_b[:, sampi] = _bootstrap_survival(tmp, 0)
        sa2_b[:, sampi] = _bootstrap_survival(tmp, 1)

    vval2 = 1/np.sqrt(np.nanvar(np.log(sa1_b), axis=1) + np.nanvar(np.log(sa2_b), axis=1))
    return vval2
def get_sa(request):
    """Render ``sa_test.html`` with Kaplan-Meier and Nelson-Aalen plot images.

    The two image paths (relative to the parent of this file's directory)
    are always placed in the template context under ``'kmf'`` and ``'naf'``.
    When either image file is missing, both are regenerated from the
    ``load_waltons()`` data set.

    :param request: the incoming request object, forwarded to ``render``.
    :return: the result of ``render``.
    """
    dirname = os.path.dirname(os.path.dirname(__file__)).replace('\\', '/')
    kmffile = '/images/test1.jpg'
    naffile = '/images/test2.jpg'
    context = {'kmf': kmffile, 'naf': naffile}

    # Regenerate when EITHER file is missing. The previous condition
    # (``not A and not B``) skipped regeneration when only one image was
    # absent, leaving the page with a dangling image reference.
    both_present = os.path.exists(dirname + kmffile) and os.path.exists(dirname + naffile)
    if not both_present:
        df = load_waltons()
        T = df['T']  # an array of durations
        E = df['E']  # boolean or binary array: whether the 'death' was observed (otherwise censored)

        kmf = KaplanMeierFitter(alpha=0.95)
        kmf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='KM_estimate', alpha=None, left_censorship=False,
                ci_labels=None)

        naf = NelsonAalenFitter(alpha=0.95, nelson_aalen_smoothing=True)
        naf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='NA_estimate', alpha=None, ci_labels=None)

        for fitter, image_path in ((kmf, kmffile), (naf, naffile)):
            figure = fitter.plot().get_figure()
            figure.savefig(dirname + image_path)
            # Close each figure once saved: keeps the second plot from being
            # drawn over the first and stops figures piling up across
            # requests.
            plt.close(figure)

    return render(request=request, template_name='sa_test.html', context=context)
# Candidate epsilon values and histogram bin settings taken from config.
EPS_LIST = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
bins0 = config.BIN0
bins1 = config.BIN1

df = pd.read_stata("wichert.dta")

# (normalised time, event flag) pairs. The zip is materialised as a list
# because it is both iterated and measured with len(); a bare zip() is a
# one-shot iterator without len() on Python 3.
data_ = list(zip(df.time/max(df.time), df.event.astype(int)))
# Keep only observations whose normalised time is at least config.GAMMA.
data = [(a, b) for (a, b) in data_ if a >= config.GAMMA]
print("[*] Remove #%d outliers" % (len(data_) - len(data)))
N = len(df)  # number of data points (counted before the GAMMA filter)

#kmf = KaplanMeierFitter()
(T, E) = zip(*data)
#kmf.fit(T, event_observed=E)

naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)
#ax = pyplot.subplot(121)
#naf.plot(ax=ax)
#ax = pyplot.subplot(122)
#kmf.plot(ax=ax)

# Cumulative hazard values of the fit on the filtered data.
true_value = naf.cumulative_hazard_.values
#naf.cumulative_hazard_.to_csv("naf.csv")
#pyplot.show()

# Normalised times split by event flag.
data0 = [a for (a, b) in data if b == 0]
data1 = [a for (a, b) in data if b == 1]
LD = LCL.load_lending_data(load_files,keep_status,keep_terms,keep_grades)
print('loaded {0} loans'.format(len(LD)))
print_figs = False

#%%
#load long/lat data for each zip-code
zip3_data = LCL.load_location_data(data_dir,group_by='zip3')
LD['zip3'] = LD['zip3'].astype(int)
LD = pd.merge(LD, zip3_data, how='inner', left_on='zip3', right_index=True)

#%% Compute hazard functions for each loan grade and term
term_bandwidths = [4., 8.] #list of NAF smoothing bandwidth (for each term)
naf = NelsonAalenFitter(nelson_aalen_smoothing=False) #init NAF model
all_hazards = {} #initialize dict to store hazard functions
for idx,term in enumerate(keep_terms): #compute all hazard functions for each term
    cur_data = LD[LD.term==term]
    lifetimes = cur_data['num_pymnts'].copy() #lifetime is number of payments received
    # .loc with a boolean mask replaces the removed Series.ix accessor
    lifetimes.loc[cur_data.loan_status == 'Fully Paid'] = term #if the loan is fully paid set the lifetime to the full term
    is_observed = cur_data.loan_status.isin(['Charged Off']) #observed loans are just the ones that have been charged off, rest are censored
    all_hazards[term] = np.zeros((len(keep_grades),term+1)) #initialize matrix of hazard functions (one row per grade)
    for gidx,grade in enumerate(keep_grades): #fit model for each grade
        grade_data = cur_data.grade == grade
        naf.fit(lifetimes[grade_data],event_observed=is_observed[grade_data],label=grade,timeline=np.arange(term+1))
        all_hazards[term][gidx,:] = naf.smoothed_hazard_(term_bandwidths[idx]).squeeze()
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="BFH_estimate",
    alpha=None,
    ci_labels=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the estimator by exponentiating a Nelson-Aalen cumulative hazard.

    Parameters
    ----------
    durations: an array, or pd.Series, of length n
        how long each subject was observed
    timeline:
        values at which the estimate is reported (positively increasing)
    event_observed: an array, or pd.Series, of length n
        True if the death was observed, False if the event was lost
        (right-censored). All True when omitted.
    entry: an array, or pd.Series, of length n
        relative time at which each subject entered the study, for
        left-truncated observations. All 0 when omitted.
    label: string
        column name for the estimate.
    alpha: float, optional (default=0.05)
        alpha for the confidence intervals, overriding the instance value
        for this call only.
    ci_labels: iterable
        custom names for the confidence-interval columns as a length-2
        list: [<lower-bound name>, <upper-bound name>].
        Default: <label>_lower_<alpha>

    Returns
    -------
    self, with new properties like ``survival_function_``.
    """
    self._label = label
    effective_alpha = coalesce(alpha, self.alpha)

    hazard_fitter = NelsonAalenFitter(alpha=effective_alpha)
    hazard_fitter.fit(
        durations,
        event_observed=event_observed,
        timeline=timeline,
        label=label,
        entry=entry,
        ci_labels=ci_labels,
    )

    # Mirror the inner fitter's bookkeeping attributes on this instance.
    for attribute in ("durations", "event_observed", "timeline", "entry", "event_table"):
        setattr(self, attribute, getattr(hazard_fitter, attribute))

    # estimation: S(t) = exp(-H(t)), for the estimate and its bounds
    cumulative_hazard = hazard_fitter.cumulative_hazard_
    self.survival_function_ = np.exp(-cumulative_hazard)
    hazard_bounds = hazard_fitter.confidence_interval_
    self.confidence_interval_ = np.exp(-hazard_bounds)

    # estimation methods
    self._estimation_method = self._estimate_name = "survival_function_"
    self._update_docstrings()

    # plotting functions
    self.plot_survival_function = self.plot
    return self
# Plot Kaplan-Meier curve kmf = KaplanMeierFitter() first = 0 for r in cac_ranges: ix = cac_values == r if first == 0: kmf.fit(times[ix], censors[ix], label=r) ax = kmf.plot() first = 1 else: kmf.fit(times[ix], censors[ix], label=r) kmf.plot(ax=ax) elif curve == 'hazard': # Plot hazard curve naf = NelsonAalenFitter() first = 0 for r in cac_ranges: ix = cac_values == r if first == 0: naf.fit(times[ix], censors[ix], label=r) ax = naf.plot() first = 1 else: naf.fit(times[ix], censors[ix], label=r) naf.plot(ax=ax) ax.set_ylabel("%", fontsize=12) ax.set_title(tag, fontsize=14) ax.set_xlabel("Years to event", fontsize=12)
plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/income_States.pdf') plt.show() ax = plt.subplot(111) for r in data['Has_Children'].unique(): ix = data['Has_Children'] == r kmf.fit(data['Duration'].loc[ix], data['Divorce'].loc[ix], label=r) sns.set() ax = kmf.plot(title='Mariage Survival Estimate Based on Children', ax=ax, linewidth=2.5) #Export the figure plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/Children.pdf') plt.show() naf = NelsonAalenFitter() naf.fit(data['Duration'], data['Divorce']) sns.set() naf.plot(title='Cumulative hazard over time', legend=False) print(naf.cumulative_hazard_.head(32)) plt.savefig( '/home/raed/Dropbox/INSE - 6320/Final Project/Cumulative_Hazard_function.pdf' ) plt.show() ax = plt.subplot(111) for r in data['Couple_Race'].unique(): ix = data['Couple_Race'] == r naf.fit(data['Duration'].loc[ix], data['Divorce'].loc[ix], label=r) sns.set() ax = naf.plot(title='Cumulative Hazard by Couple Race ',
kmf.fit(T[ix], E[ix], label=dept) kmf.plot(ax=ax, legend=False) plt.title(dept) plt.xlim(0, 1000) if i == 0: plt.ylabel('Frac. in staying after $n$ years') plt.tight_layout() for i, dept in enumerate(depts): ix = data['dept'] == dept kmf.fit(T[ix], E[ix], label=dept) print(dept, kmf.median_) # Looking at a hazard curve from lifelines import NelsonAalenFitter naf = NelsonAalenFitter() naf.fit(T, event_observed=E) print(naf.cumulative_hazard_.head()) naf.plot() # This hazard curve shows us that there is low hazard of someone leaving starting off, then it gets worse, # once you stay for 500 days you stay at least a bit more, then exponentially it gets worse! # SURVIVAL REGRESSION -- figuring out the influences of other aspects on whether or not someone survives # Can't use regular linear regression. Want to use Cox's model or Aalen's additive model. # Cox's Proportional Hazard model # "The idea behind the model is that the log-hazard of an individual is a linear function of their static covariates # and a population-level baseline hazard that changes over time" - from https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html
seen_data_events.add(time_to_event) data_events = np.append(data_events,np.array([time_to_event]*num_repair)) for v in sales_dict.values(): #investigate why some negative leftovers on certain valid dates , more repairs than sales ??? if v>0: data_events = np.append(data_events,np.zeros(v)) t=[] if len(data_events)==0: all_data.append([0]*19) continue data_events[data_events==0] = 70 C= data_events <70 naf = NelsonAalenFitter() naf.fit(data_events, event_observed=C ) y_h = np.array(naf.cumulative_hazard_).reshape(len(naf.cumulative_hazard_)) x= np.array(naf.cumulative_hazard_.index).astype(int) seen_data_events.add(0) seen_data_events.add(70) if len(y_h) > 14: slope, intercept, r_value, p_value, std_err = stats.linregress(x[len(x)-5:len(x)-1],y_h[len(y_h)-5:len(y_h)-1]) #plt.figure() #plt.plot(x, y_h, 'ko') #plt.plot(x, linear_f(x,slope,intercept ), 'r-') #plt.legend()
seen_data_events.add(time_to_event) data_events = np.append(data_events,np.array([time_to_event]*num_repair)) for v in sales_dict.values(): #investigate why some negative leftovers on certain valid dates , more repairs than sales ??? if v>0: data_events = np.append(data_events,np.zeros(v)) t=[] if len(data_events)==0: all_data.append([0]*19) continue data_events[data_events==0] = 160 C= data_events <160 naf = NelsonAalenFitter() naf.fit(data_events, censorship=C ) y_h = np.array(naf.cumulative_hazard_).reshape(len(naf.cumulative_hazard_)) x= np.array(naf.cumulative_hazard_.index).astype(int) seen_data_events.add(0) seen_data_events.add(160) if len(y_h) > 14: slope, intercept, r_value, p_value, std_err = stats.linregress(x[len(x)-5:len(x)-1],y_h[len(y_h)-5:len(y_h)-1]) #plt.figure() #plt.plot(x, y_h, 'ko') #plt.plot(x, linear_f(x,slope,intercept ), 'r-') #plt.legend()
# WeibullFitter ############################################################ from lifelines import WeibullFitter wf = WeibullFitter() wf.fit(T, E) print(wf.lambda_, wf.rho_) wf.print_summary() wf.plot() ############################################################ # NelsonAalenFitter ############################################################ from lifelines import NelsonAalenFitter naf = NelsonAalenFitter() naf.fit(T, event_observed=E) naf.plot() # univariate analysis: cum hazard # ORIG_CHN ax = plt.subplot() for chn in df_cox.ORIG_CHN.unique(): is_chn = (df_cox.ORIG_CHN == chn) naf.fit(T[is_chn], event_observed=E[is_chn], label=chn) naf.plot(ax=ax) # PURPOSE ax = plt.subplot() for purpose in df_cox.PURPOSE.unique():
1 13 1 miR-137 2 13 1 miR-137 3 13 1 miR-137 4 19 1 miR-137 """ T = df['T'] E = df['E'] # Fit the survival curve kmf = KaplanMeierFitter() kmf.fit(T, event_observed=E) # or, more succiently, kmf.fit(T, E) kmf.plot() # Plot cumulative hazard function naf = NelsonAalenFitter() naf.fit(T, E) naf.plot() #------------------------------------------------------------------------------ # Multiple groups #------------------------------------------------------------------------------ groups = df['group'] ix = (groups == 'miR-137') kmf.fit(T[~ix], E[~ix], label='control') ax = kmf.plot() kmf.fit(T[ix], E[ix], label='miR-137') kmf.plot(ax=ax)
class Node:
    """A node of a survival tree; leaves hold a fitted Nelson-Aalen estimator."""

    # Class-level defaults, overwritten per instance while the tree grows.
    score = 0
    split_val = None
    split_var = None
    lhs = None
    rhs = None
    chf = None
    chf_terminal = None
    terminal = False

    def __init__(self, x, y, tree, f_idxs, n_features, unique_deaths=1, min_leaf=1, random_state=None):
        """
        A Node of the Survival Tree. Construction immediately grows the subtree.

        :param x: The input samples. Should be a Dataframe with the shape [n_samples, n_features].
        :param y: The target values as a Dataframe with the survival time in the first column and the event
            in the second column.
        :param tree: The corresponding Survival Tree
        :param f_idxs: The indices of the features to use.
        :param n_features: The number of features to use.
        :param unique_deaths: The minimum number of unique deaths required to be at a leaf node.
        :param min_leaf: The minimum number of samples required to be at a leaf node. A split point at any depth
            will only be considered if it leaves at least min_leaf training samples in each of the left and
            right branches.
        :param random_state: Optional seed used when drawing the feature subsets of the child nodes.
        """
        self.x = x
        self.y = y
        self.tree = tree
        self.f_idxs = f_idxs
        self.n_features = n_features
        self.unique_deaths = unique_deaths
        self.random_state = random_state
        self.min_leaf = min_leaf
        self.grow_tree()

    def grow_tree(self):
        """
        Grow tree by calculating the Nodes recursively.

        The node becomes terminal when its death count does not exceed ``self.unique_deaths`` or when
        ``splitting.find_split`` finds no split variable.

        :return: self
        """
        # Sum of the event column after dropping duplicated (index, event) rows.
        # ``.iloc[1]`` selects the event column explicitly by position instead of
        # relying on the positional fallback of ``Series[1]``.
        unique_deaths = self.y.iloc[:, 1].reset_index().drop_duplicates().sum().iloc[1]

        if unique_deaths <= self.unique_deaths:
            self.compute_terminal_node()
            return self

        self.score, self.split_val, self.split_var, lhs_idxs_opt, rhs_idxs_opt = splitting.find_split(self)

        if self.split_var is None:
            self.compute_terminal_node()
            return self

        n_columns = self.x.shape[1]
        if self.random_state is None:
            lf_idxs = np.random.permutation(n_columns)[:self.n_features]
            rf_idxs = np.random.permutation(n_columns)[:self.n_features]
        else:
            # NOTE(review): both children are seeded identically, so they draw the same feature subset.
            lf_idxs = np.random.RandomState(seed=self.random_state).permutation(n_columns)[:self.n_features]
            rf_idxs = np.random.RandomState(seed=self.random_state).permutation(n_columns)[:self.n_features]

        # ``unique_deaths`` is handed down explicitly; it used to be omitted, so every child silently
        # fell back to the default of 1 regardless of the value configured on the root.
        self.lhs = Node(self.x.iloc[lhs_idxs_opt, :], self.y.iloc[lhs_idxs_opt, :], self.tree, lf_idxs,
                        self.n_features, unique_deaths=self.unique_deaths, min_leaf=self.min_leaf,
                        random_state=self.random_state)
        self.rhs = Node(self.x.iloc[rhs_idxs_opt, :], self.y.iloc[rhs_idxs_opt, :], self.tree, rf_idxs,
                        self.n_features, unique_deaths=self.unique_deaths, min_leaf=self.min_leaf,
                        random_state=self.random_state)
        return self

    def compute_terminal_node(self):
        """
        Compute the terminal node if condition has reached.

        Fits a NelsonAalenFitter on the durations (first column of ``self.y``) and events (second column),
        evaluated on the tree's timeline, and stores it in ``self.chf``.

        :return: self
        """
        self.terminal = True
        self.chf = NelsonAalenFitter()
        t = self.y.iloc[:, 0]
        e = self.y.iloc[:, 1]
        self.chf.fit(t, event_observed=e, timeline=self.tree.timeline)
        return self

    def predict(self, x):
        """
        Predict the cumulative hazard function if its a terminal node. If not walk through the tree.

        The prediction is also stored on ``self.tree.chf``. It is now returned from every level of the
        recursion; previously only a direct call on a terminal node returned it and inner nodes
        returned None.

        :param x: The input sample.
        :return: Predicted cumulative hazard function (first column of the leaf's cumulative hazard).
        """
        if self.terminal:
            self.tree.chf = self.chf.cumulative_hazard_.iloc[:, 0]
            return self.tree.chf

        child = self.lhs if x[self.split_var] <= self.split_val else self.rhs
        return child.predict(x)
import pandas as pd

df = pd.read_stata("wichert.dta")

# (normalised time, event flag) pairs. Materialised as a list because the
# pairs are both iterated and measured with len(); a bare zip() is a
# one-shot iterator without len() on Python 3.
data_ = list(zip(df.time/max(df.time), df.event.astype(int)))
# Keep only observations whose normalised time is at least config.GAMMA.
data = [(a, b) for (a, b) in data_ if a >= config.GAMMA]
print("[*] Remove #%d outliers" % (len(data_) - len(data)))
N = len(df)  # number of data points (counted before the GAMMA filter)

from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter

kmf = KaplanMeierFitter()
(T, E) = zip(*data)
kmf.fit(T, event_observed=E)

naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)

# Left panel: Nelson-Aalen cumulative hazard; right panel: Kaplan-Meier curve.
ax = pyplot.subplot(121)
naf.plot(ax=ax)
ax = pyplot.subplot(122)
kmf.plot(ax=ax)

# Function-call form prints the same text on Python 2 and Python 3.
print(naf.cumulative_hazard_)
naf.cumulative_hazard_.to_csv("naf.csv")
pyplot.show()

# Normalised times split by event flag.
data0 = [a for (a, b) in data if b == 0]
data1 = [a for (a, b) in data if b == 1]