def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the additive model when rows are identified by (duration_col, id_col).

    Parameters:
      dataframe: a pandas dataframe with covariates, a duration_col, an
         event_col and an id_col. It is copied; the caller's frame is
         not modified.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      id_col: name of the column identifying individuals; it is used,
         together with duration_col, as the row index.
      timeline: if not None, the estimates are reindexed to this timeline
         using forward fill.
      show_progress: if True, update the progress bar after every event
         and print a trailing newline.

    Returns:
      None. The results are stored on self: hazards_, cumulative_hazards_,
      variance_, timeline, data, durations, event_observed and plot.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    # C: True where any event flag is set; T: label at which the running
    # count of non-null observations first reaches its maximum.
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_events = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((n_events, d)),
                            columns=columns, index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(np.zeros((n_events, d)),
                             columns=columns, index=from_tuples(non_censorsed_times))

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)

    ids = wp.minor_axis.values
    progress = progress_bar(n_events)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censorsed_times):
        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        X = wp[time].values

        # perform linear regression step.
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            # Leave this event's row at its initial zeros. Falling through
            # here used to reuse V from an earlier iteration (or raise
            # NameError on the very first one).
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            v = dot(V, 1.0 * relevant_individuals)
            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return
def _fit_static(self, dataframe, duration_col="T", event_col="E", timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
         one row per individual. duration_col refers to how long the individual was
         observed for. event_col is a boolean: 1 if individual 'died', 0 else.
         The dataframe is copied; the caller's frame is not modified.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The results are stored on self: hazards_, cumulative_hazards_,
      variance_, timeline, data, durations, event_observed and plot.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    C = pd.Series(df[event_col].values, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)
    df = df.set_index(id_col)

    # order individuals by duration; the loop below relies on it
    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[event_col]
    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)

    # initialize loop variables.
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for individual_id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop.
            # ids are 0..n-1 in row order, so they double as positions.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        X = df.values
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            # Leave this event's row at its initial zeros. Falling through
            # here used to reuse V from an earlier iteration (or raise
            # NameError on the very first one).
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            v = dot(V, 1.0 * relevant_individuals)
            hazards_.ix[time, individual_id] = v.T
            variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the additive model when rows are identified by (duration_col, id_col).

    Parameters:
      dataframe: a pandas dataframe with covariates, a duration_col, an
         event_col and an id_col. It is copied; the caller's frame is
         not modified.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      id_col: name of the column identifying individuals; it is used,
         together with duration_col, as the row index.
      timeline: if not None, the estimates are reindexed to this timeline
         using forward fill.
      show_progress: if True, update the progress bar after every event
         and print a trailing newline.

    Returns:
      None. The results are stored on self: hazards_, cumulative_hazards_,
      variance_, timeline, data, durations, event_observed and plot.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    # Materialise the pairs: they are measured with len() and walked more
    # than once below, which a one-shot iterator cannot support.
    non_censorsed_times = list(T[C].iteritems())
    n_events = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((n_events, d)),
                            columns=columns, index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(np.zeros((n_events, d)),
                             columns=columns, index=from_tuples(non_censorsed_times))

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)

    ids = wp.minor_axis.values
    progress = progress_bar(n_events)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censorsed_times):
        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        X = wp[time].values

        # perform linear regression step.
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            # Leave this event's row at its initial zeros. Falling through
            # here used to reuse V from an earlier iteration (or raise
            # NameError on the very first one).
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            v = dot(V, 1.0 * relevant_individuals)
            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return
def _fit_static(self, dataframe, duration_col="T", event_col="E", timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
         one row per individual. duration_col refers to how long the individual was
         observed for. event_col is a boolean: 1 if individual 'died', 0 else.
         The dataframe is copied; the caller's frame is not modified.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The results are stored on self: hazards_, cumulative_hazards_,
      variance_, timeline, data, durations, event_observed and plot.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    # Index C and T by the generated ids rather than by the caller's index,
    # so that the id yielded in the loop below always matches `ids`.
    C = pd.Series(df[event_col].values, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)
    df = df.set_index(id_col)

    # order individuals by duration; the loop below relies on it
    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[event_col]
    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframe to store estimates
    # Materialise the pairs: they are measured with len() and passed to
    # from_tuples twice, which a one-shot iterator cannot support.
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)

    # initialize loop variables.
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for individual_id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop.
            # ids are 0..n-1 in row order, so purely positional indexing
            # zeroes exactly those rows (no label/position ambiguity).
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        X = df.values
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            # Leave this event's row at its initial zeros. Falling through
            # here used to reuse V from an earlier iteration (or raise
            # NameError on the very first one).
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            v = dot(V, 1.0 * relevant_individuals)
            hazards_.ix[time, individual_id] = v.T
            variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the additive model when rows are identified by (duration_col, id_col).

    Parameters:
      dataframe: a pandas dataframe with covariates, a duration_col, an
         id_col and (optionally) an event_col. It is copied; the caller's
         frame is not modified.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the
         dataframe. If None, a column "E" filled with 1 is added, i.e.
         every row is treated as an observed event.
      id_col: name of the column identifying individuals; it is used,
         together with duration_col, as the row index.
      timeline: if not None, the estimates are reindexed to this timeline
         using forward fill.
      show_progress: if True, update the progress bar after every event
         and print a trailing newline.

    Returns:
      None. The results are stored on self: hazards_, cumulative_hazards_,
      variance_, timeline, data, durations, event_observed and plot.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df["baseline"] = 1.0

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = "E"
        df[event_col] = 1

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_events = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(
        np.zeros((n_events, d)), columns=columns, index=from_tuples(non_censorsed_times)
    )
    variance_ = pd.DataFrame(
        np.zeros((n_events, d)), columns=columns, index=from_tuples(non_censorsed_times)
    )

    # offset passed to the next regression step; starts at zero
    previous_hazard = np.zeros((d,))
    ids = wp.minor_axis.values
    progress = progress_bar(n_events)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censorsed_times):
        relevant_individuals = ids == individual_id
        assert relevant_individuals.sum() == 1.0

        # perform linear regression step.
        try:
            v, V = lr(
                wp[time].values,
                relevant_individuals,
                c1=self.coef_penalizer,
                c2=self.smoothing_penalizer,
                offset=previous_hazard,
            )
        except LinAlgError:
            # Leave this event's row at its initial zeros and keep the
            # previous offset. Falling through here used to reuse v and V
            # from an earlier iteration (or raise NameError on the first).
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method="ffill")
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method="ffill")
        self.variance_ = self.variance_.reindex(timeline, method="ffill")
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return
def fit(self, dataframe, duration_col="T", event_col="E", timeline=None, id_col=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
         The dataframe is copied; the caller's frame is not modified.

          static covariates:
            one row per individual. duration_col refers to how long the individual was
            observed for. event_col is a boolean: 1 if individual 'died', 0 else. id_col
            should be left as None.

          time-varying covariates:
            For time-varying covariates, an id_col is required to keep track of individuals'
            changing covariates. individual should have a unique id. duration_col refers to how
            long the individual has been observed to up to that point. event_col refers to if
            the event (death) occurred in that period. Censored individuals will not have a 1.
            For example:

                +----+---+---+------+------+
                | id | T | E | var1 | var2 |
                +----+---+---+------+------+
                |  1 | 1 | 0 |    0 |    1 |
                |  1 | 2 | 0 |    0 |    1 |
                |  1 | 3 | 0 |    4 |    3 |
                |  1 | 4 | 1 |    8 |    4 |
                |  2 | 1 | 0 |    1 |    1 |
                |  2 | 2 | 0 |    1 |    2 |
                |  2 | 3 | 0 |    1 |    2 |
                +----+---+---+------+------+

      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      timeline: reformat the estimates index to a new timeline.
      id_col: (only for time-varying covariates) name of the id column in the dataframe
      show_progress: include a fancy progress bar!

    Returns:
      self, with new methods like plot, smoothed_hazards_ and properties like cumulative_hazards_
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # only for time-indp. covariates
    if id_col is None:
        df['id'] = np.arange(df.shape[0])
        id_col = 'id'

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([id_col, duration_col])

    C_panel = df[[event_col]].to_panel().transpose(1, 2, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # bfill will cause problems later, plus it is slow.
    wp = df.to_panel().transpose(1, 2, 0).bfill().fillna(0)

    # Materialise the pairs: they are measured with len() and walked more
    # than once below, which a one-shot iterator cannot support.
    non_censorsed_times = list(T[C].iteritems())
    n_events = len(non_censorsed_times)

    # initialize dataframe to store estimates
    hazards_ = pd.DataFrame(np.zeros((n_events, d)),
                            columns=df.columns, index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(np.zeros((n_events, d)),
                             columns=df.columns, index=from_tuples(non_censorsed_times))

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)
    ids = wp.items
    progress = progress_bar(n_events)

    for i, (individual_id, time) in enumerate(non_censorsed_times):
        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        X = wp.major_xs(time).values.T

        # perform linear regression step.
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            # Leave this event's row at its initial zeros. Falling through
            # here used to reuse V from an earlier iteration (or raise
            # NameError on the very first one).
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            v = dot(V, 1.0 * relevant_individuals)
            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=1).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=1).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self