def _fit_model_to_data_batch(self, X, T, E, weights, show_progress):
    """
    Estimate the hazards, and their variances, at every unique death time.

    Parameters:
      X: (n, d) numpy array of covariates, one row per subject. Not mutated.
      T: length-n array of durations.
      E: length-n boolean array, True where the death was observed.
      weights: length-n array of subject weights; their square roots scale the rows of X
        and the regression target.
      show_progress: if True, print progress roughly every tenth of the death times.

    Returns:
      (hazards_, variance_hazards_, last_iteration): two (n_deaths, d) arrays, one row per
      unique death time, and the number of death times that were actually processed
      (0 if there are no observed deaths). Rows past `last_iteration` are left as zeros.
    """
    n, d = X.shape

    # iterate over all the unique death times
    unique_death_times = np.sort(np.unique(T[E]))
    n_deaths = unique_death_times.shape[0]
    total_observed_exits = 0

    hazards_ = np.zeros((n_deaths, d))
    variance_hazards_ = np.zeros((n_deaths, d))
    v = np.zeros(d)
    # stays 0 when there are no observed deaths, so the return value is always defined.
    last_iteration = 0
    # int(n_deaths / 10) is 0 for fewer than ten death times; clamp it so the
    # modulo in the progress check can never divide by zero.
    progress_step = max(1, n_deaths // 10)
    start = time.time()

    W = np.sqrt(weights)
    # the product allocates a new array, so zeroing rows below never touches the caller's X.
    X = W[:, None] * X

    for i, t in enumerate(unique_death_times):
        exits = T == t
        deaths = exits & E

        try:
            v, V = lr(X, W * deaths, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=v, ix=deaths)
        except LinAlgError:
            warnings.warn(
                "Linear regression error at index=%d, time=%.3f. Try increasing the coef_penalizer value." % (i, t),
                ConvergenceWarning,
            )
            # V may not exist yet (failure on the very first death time), so build
            # the zero variance row directly instead of deriving it from V.
            v = np.zeros_like(v)
            variance = np.zeros(d)
        else:
            variance = (V ** 2).sum(1)

        hazards_[i, :] = v
        variance_hazards_[i, :] = variance

        # subjects who exit at time t leave the risk set for all later times.
        X[exits, :] = 0

        if show_progress and i % progress_step == 0:
            print("\rIteration %d/%d, seconds_since_start = %.2f" % (i + 1, n_deaths, time.time() - start), end="")

        last_iteration = i + 1
        # terminate early when there are less than (3 * d) subjects left, where d does not include the intercept.
        # the value 3 is from R survival lib.
        if (3 * (d - 1)) >= n - total_observed_exits:
            if show_progress:
                print("Terminating early due to too few subjects remaining. This is expected behaviour.")
            break

        total_observed_exits += exits.sum()

    if show_progress:
        print("Convergence completed.")
    return hazards_, variance_hazards_, last_iteration
def _fit_model_to_data_batch(self, X, T, E, weights, show_progress):
    """
    Estimate the hazards, and their variances, at every unique death time.

    Parameters:
      X: (n, d) numpy array of covariates, one row per subject. Not mutated.
      T: length-n array of durations.
      E: length-n boolean array, True where the death was observed.
      weights: length-n array of subject weights; their square roots scale the rows of X
        and the regression target.
      show_progress: if True, print a progress line roughly every tenth of the death times.

    Returns:
      (hazards_, variance_hazards_, last_iteration): two (n_deaths, d) arrays, one row per
      unique death time, and the number of death times that were actually processed
      (0 if there are no observed deaths). Rows past `last_iteration` are left as zeros.
    """
    n, d = X.shape

    # iterate over all the unique death times
    unique_death_times = np.sort(np.unique(T[E]))
    n_deaths = unique_death_times.shape[0]
    total_observed_exits = 0

    hazards_ = np.zeros((n_deaths, d))
    variance_hazards_ = np.zeros((n_deaths, d))
    v = np.zeros(d)
    # stays 0 when there are no observed deaths, so the return value is always defined.
    last_iteration = 0
    # int(n_deaths / 10) is 0 for fewer than ten death times; clamp it so the
    # modulo in the progress check can never divide by zero.
    progress_step = max(1, n_deaths // 10)
    start = time.time()

    W = np.sqrt(weights)
    # the product allocates a new array, so zeroing rows below never touches the caller's X.
    X = W[:, None] * X

    for i, t in enumerate(unique_death_times):
        exits = T == t
        deaths = exits & E

        try:
            v, V = lr(X, W * deaths, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=v, ix=deaths)
        except LinAlgError:
            warnings.warn(
                "Linear regression error at index=%d, time=%.3f. Try increasing the coef_penalizer value." % (i, t),
                ConvergenceWarning,
            )
            # V may not exist yet (failure on the very first death time), so build
            # the zero variance row directly instead of deriving it from V.
            v = np.zeros_like(v)
            variance = np.zeros(d)
        else:
            variance = (V ** 2).sum(1)

        hazards_[i, :] = v
        variance_hazards_[i, :] = variance

        # subjects who exit at time t leave the risk set for all later times.
        X[exits, :] = 0

        if show_progress and i % progress_step == 0:
            print("Iteration %d/%d, seconds_since_start = %.2f" % (i + 1, n_deaths, time.time() - start))

        last_iteration = i + 1
        # terminate early when there are less than (3 * d) subjects left, where d does not include the intercept.
        # the value 3 is from R survival lib.
        if (3 * (d - 1)) >= n - total_observed_exits:
            if show_progress:
                print("Terminating early due to too few subjects remaining. This is expected behaviour.")
            break

        total_observed_exits += exits.sum()

    if show_progress:
        print("Convergence completed.")
    return hazards_, variance_hazards_, last_iteration
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the model on long-format data (several rows per individual, keyed by time and id).

    Parameters:
      dataframe: a pandas dataframe holding the covariates plus duration_col, id_col and
        (optionally) event_col. It is copied, not mutated.
      duration_col: name of the column holding the time of each row.
      event_col: name of the event column; if None, an all-ones 'E' column is added,
        i.e. no censorship is assumed.
      id_col: name of the column identifying the individual.
      timeline: if given, the estimates are reindexed (forward filled) onto it.
      show_progress: if True, update the progress bar after each regression.

    Returns:
      None. Sets hazards_, cumulative_hazards_, variance_, timeline, data, durations and
      event_observed on self, then calls self._compute_confidence_intervals().

    NOTE(review): relies on legacy pandas APIs (`to_panel`, `.ix`, `iteritems`) and so
    needs a pandas version that still ships them.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = 'E'
        df[event_col] = 1

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_events = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((n_events, d)), columns=columns, index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(np.zeros((n_events, d)), columns=columns, index=from_tuples(non_censorsed_times))

    previous_hazard = np.zeros((d,))
    ids = wp.minor_axis.values
    progress = progress_bar(n_events)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (subject_id, event_time) in enumerate(non_censorsed_times):
        relevant_individuals = (ids == subject_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(wp[event_time].values, relevant_individuals,
                      c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")
            # Without a fallback, v and V would be unbound (first iteration) or stale
            # (later iterations). Record a zero increment with zero variance instead.
            v = np.zeros(d)
            variance = np.zeros(d)
        else:
            variance = V[:, relevant_individuals][:, 0] ** 2

        hazards_.ix[subject_id, event_time] = v.T
        variance_.ix[subject_id, event_time] = variance
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # level 1 of the (id, time) index is the event time: sum the increments per time.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
def _fit_static(self, dataframe, duration_col, event_col=None, timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
        one row per individual. duration_col refers to how long the individual was
        observed for. event_col is a boolean: 1 if individual 'died', 0 else.
        The dataframe is copied, not mutated.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe.
        If falsy, every individual is treated as an observed death.
      timeline: reformat the estimates index to a new timeline.
      show_progress: if True, update the progress bar after each regression.

    Returns:
      None. Sets hazards_, cumulative_hazards_, variance_, timeline, data, durations and
      event_observed on self, then calls self._compute_confidence_intervals().

    NOTE(review): relies on legacy pandas APIs (`.ix`, `iteritems`) and so needs a
    pandas version that still ships them.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)

    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d,))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for subject_id, exit_time in T.iteritems():  # should be sorted.

        if t != exit_time:
            assert t < exit_time
            # remove the individuals from the previous loop.
            # ids are 0..n-1 in the original row order, so they double as positions.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = exit_time

        to_remove.append(subject_id)
        if C[subject_id] == 0:
            # censored: leaves the risk set later, but contributes no estimate.
            continue

        relevant_individuals = (ids == subject_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(df.values, relevant_individuals,
                      c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")
            # Without a fallback, v and V would be unbound (first iteration) or stale
            # (later iterations). Record a zero increment with zero variance instead.
            v = np.zeros(d)
            variance = np.zeros(d)
        else:
            variance = V[:, relevant_individuals][:, 0] ** 2

        hazards_.ix[exit_time, subject_id] = v.T
        variance_.ix[exit_time, subject_id] = variance
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    # level 0 of the swapped (time, id) index is the event time.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the model on long-format data (several rows per individual, keyed by time and id).

    Parameters:
      dataframe: a pandas dataframe holding the covariates plus duration_col, id_col and
        (optionally) event_col. It is copied, not mutated.
      duration_col: name of the column holding the time of each row.
      event_col: name of the event column; if None, an all-ones 'E' column is added,
        i.e. no censorship is assumed.
      id_col: name of the column identifying the individual.
      timeline: if given, the estimates are reindexed (forward filled) onto it.
      show_progress: if True, update the progress bar after each regression.

    Returns:
      None. Sets hazards_, cumulative_hazards_, variance_, timeline, data, durations and
      event_observed on self, then calls self._compute_confidence_intervals().

    NOTE(review): relies on legacy pandas APIs (`to_panel`, `.ix`, `iteritems`) and so
    needs a pandas version that still ships them.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = 'E'
        df[event_col] = 1

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_events = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(
        np.zeros((n_events, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(
        np.zeros((n_events, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times))

    previous_hazard = np.zeros((d, ))
    ids = wp.minor_axis.values
    progress = progress_bar(n_events)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (subject_id, event_time) in enumerate(non_censorsed_times):
        relevant_individuals = (ids == subject_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(
                wp[event_time].values,
                relevant_individuals,
                c1=self.coef_penalizer,
                c2=self.smoothing_penalizer,
                offset=previous_hazard)
        except LinAlgError:
            print(
                "Linear regression error. Try increasing the penalizer term."
            )
            # Without a fallback, v and V would be unbound (first iteration) or stale
            # (later iterations). Record a zero increment with zero variance instead.
            v = np.zeros(d)
            variance = np.zeros(d)
        else:
            variance = V[:, relevant_individuals][:, 0]**2

        hazards_.ix[subject_id, event_time] = v.T
        variance_.ix[subject_id, event_time] = variance
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # level 1 of the (id, time) index is the event time: sum the increments per time.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
def _fit_static(self, dataframe, duration_col, event_col=None, timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
        one row per individual. duration_col refers to how long the individual was
        observed for. event_col is a boolean: 1 if individual 'died', 0 else.
        The dataframe is copied, not mutated.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe.
        If falsy, every individual is treated as an observed death.
      timeline: reformat the estimates index to a new timeline.
      show_progress: if True, update the progress bar after each regression.

    Returns:
      None. Sets hazards_, cumulative_hazards_, variance_, timeline, data, durations and
      event_observed on self, then calls self._compute_confidence_intervals().

    NOTE(review): relies on legacy pandas APIs (`.ix`, `iteritems`) and so needs a
    pandas version that still ships them.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)

    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframe to store estimates
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d, ))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for subject_id, exit_time in T.iteritems():  # should be sorted.

        if t != exit_time:
            assert t < exit_time
            # remove the individuals from the previous loop.
            # ids are 0..n-1 in the original row order, so they double as positions.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = exit_time

        to_remove.append(subject_id)
        if C[subject_id] == 0:
            # censored: leaves the risk set later, but contributes no estimate.
            continue

        relevant_individuals = (ids == subject_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(
                df.values,
                relevant_individuals,
                c1=self.coef_penalizer,
                c2=self.smoothing_penalizer,
                offset=previous_hazard)
        except LinAlgError:
            print(
                "Linear regression error. Try increasing the penalizer term."
            )
            # Without a fallback, v and V would be unbound (first iteration) or stale
            # (later iterations). Record a zero increment with zero variance instead.
            v = np.zeros(d)
            variance = np.zeros(d)
        else:
            variance = V[:, relevant_individuals][:, 0]**2

        hazards_.ix[exit_time, subject_id] = v.T
        variance_.ix[exit_time, subject_id] = variance
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    # level 0 of the swapped (time, id) index is the event time.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
def fit(self, event_times, X, timeline=None, censorship=None, columns=None, verbose=True, debug=False):
    """
    Fit the additive hazards model; currently X is a static (n,d) array.

    Parameters:
      event_times: (n,1) array of event times
      X: (n,d) the design matrix, either a numpy matrix or DataFrame. Not mutated.
      timeline: (t,1) timepoints in ascending order; defaults to the sorted event times.
        A leading 0 is inserted when the first timepoint is positive.
      censorship: (n,1) boolean array of censorships: True if observed, False if
        right-censored. By default, assuming all are observed.
      columns: names for the d covariates; defaults to 0..d-1. 'baseline' is appended
        when self.fit_intercept is set.
      verbose: if True, write an iteration counter to stdout.
      debug: if True, drop into pdb at the start of every iteration.

    Fits:
      self.cumulative_hazards_: a (t, d + fit_intercept) dataframe of cumulative hazard coefficients
      self.hazards_: a (n, d + fit_intercept) dataframe of hazard coefficients, indexed by
        the sorted event times

    Returns:
      self
    """
    # deal with the covariate matrix: accept a dataframe or a numpy array
    X = np.asarray(X, dtype=float)
    n, d = X.shape
    n_coefs = d + int(self.fit_intercept)

    ix = event_times.argsort(0)[:, 0]
    # integer-array indexing copies, so zeroing rows below never touches the caller's X.
    X = X[ix, :]
    # append a column of ones for the baseline hazard, but only when an intercept is
    # fitted: every other array below is sized d + fit_intercept.
    if self.fit_intercept:
        X = np.hstack([X, np.ones((n, 1))])
    sorted_event_times = event_times[ix, 0]

    # set the column's names of the dataframe.
    # (always a fresh list: a range cannot be extended and the caller's list must not be)
    columns = list(range(d)) if columns is None else list(columns)
    if self.fit_intercept:
        columns.append('baseline')

    # set the censorship events. 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship[ix].reshape(n)

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times
    timeline = np.unique(np.asarray(timeline).astype(float))
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.)

    zeros = np.zeros((timeline.shape[0], n_coefs))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=timeline, columns=columns)
    # rows are filled in sorted order below, so label them with the sorted times.
    self.hazards_ = pd.DataFrame(
        np.zeros((event_times.shape[0], n_coefs)), index=sorted_event_times, columns=columns)
    self._variance = pd.DataFrame(zeros.copy(), index=timeline, columns=columns)

    # create the penalizer matrix for L2 regression
    penalizer = (self.penalizer * np.eye(n_coefs)).astype(np.float32, copy=False)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((n_coefs, 1))
    v = cum_v.copy()
    n_iters = len(sorted_event_times)
    for i, t in enumerate(sorted_event_times):
        if debug:
            pdb.set_trace()
        relevant_times = (t_0 < timeline) * (timeline <= t)
        if observed[i] == 0:
            # a censored subject contributes no hazard increment.
            X[i, :] = 0

        gram = dot(X.T, X) + penalizer
        try:
            V = dot(np.linalg.inv(gram), X.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur. But sometimes it does...
            V = dot(np.linalg.pinv(gram), X.T)

        # presumably basis(n, i) is the i-th unit column vector of length n -- TODO confirm
        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.loc[relevant_times] = (
            self.cumulative_hazards_.loc[relevant_times].values + cum_v.T)
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        # the diagonal of the outer product of V[:, i] with itself is its elementwise square.
        self._variance.loc[relevant_times] = (
            self._variance.loc[relevant_times].values + V[:, i] ** 2)
        t_0 = t
        # subject i has left the risk set.
        X[i, :] = 0

        if verbose:
            sys.stdout.write("\r iteration %i of %i completed" % (i + 1, n_iters))
            sys.stdout.flush()

    # clean up last iteration: carry the final estimates past the last event time.
    relevant_times = (timeline > t)
    self.hazards_.iloc[i] = v.T
    if relevant_times.any():
        self.cumulative_hazards_.loc[relevant_times] = cum_v.T
        self._variance.loc[relevant_times] = V[:, i] ** 2

    self.timeline = timeline
    self.X = X
    self.censorship = censorship
    self.event_times = event_times
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self