def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
    """Fit the additive hazards regression model.

    currently X is a static (n,d) array

    event_times: (1,n) array of event times
    X: (n,d) the design matrix
    timeline: (1,t) timepoints in ascending order
    censorship: (1,n) boolean array of censorships: True if observed,
        False if right-censored. By default, assuming all are observed.

    Fits:
        self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
        self.hazards_: a (t,d+1) dataframe of hazard coefficients
    """
    n, d = X.shape
    X_ = X.copy()

    # Sort everything by event time; optionally append a column of ones
    # for the baseline hazard.
    ix = event_times.argsort(1)[0, :]
    X_ = X_[ix, :].copy() if not self.fit_intercept \
        else np.c_[X_[ix, :].copy(), np.ones((n, 1))]
    sorted_event_times = event_times[0, ix].copy()

    # Output column names. Only add "baseline" when an intercept column was
    # actually appended -- appending it unconditionally mismatches the data
    # width when fit_intercept is False.
    if columns is None:
        columns = list(range(d))
    else:
        columns = [c for c in columns]
    if self.fit_intercept:
        columns += ["baseline"]

    # Censorship flags must be permuted with the same sort order as the
    # event times, otherwise observed[i] refers to the wrong subject.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship.reshape(n)[ix]

    if timeline is None:
        timeline = sorted_event_times

    zeros = np.zeros((timeline.shape[0], d + self.fit_intercept))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=timeline, columns=columns)
    self.hazards_ = pd.DataFrame(np.zeros((event_times.shape[1], d + self.fit_intercept)),
                                 index=sorted_event_times, columns=columns)
    self._variance = pd.DataFrame(zeros.copy(), index=timeline, columns=columns)

    # Penalizer matrix for ridge (L2) regression. The penalty is *added* to
    # X'X -- subtracting it (as before) de-regularizes the normal equations
    # and makes singular matrices more, not less, likely.
    penalizer = self.penalizer * np.eye(d + self.fit_intercept)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((d + self.fit_intercept, 1))
    # Pre-initialize v and V so the LinAlgError branch below can never hit
    # an unbound local on the first iteration.
    v = cum_v.copy()
    V = np.zeros((d + self.fit_intercept, n))

    for i, time in enumerate(sorted_event_times):
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            X_[i, :] = 0
        try:
            V = dot(inv(dot(X_.T, X_) + penalizer), X_.T)
        except LinAlgError:
            # Singular even after regularization: carry forward the most
            # recent estimates for this timepoint and move on.
            self.cumulative_hazards_.ix[relevant_times] = cum_v.T
            self.hazards_.iloc[i] = v.T
            self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
            X_[i, :] = 0
            t_0 = time
            continue

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + \
            dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        t_0 = time
        X_[i, :] = 0

    # Propagate the final estimates to all timepoints past the last event.
    relevant_times = (timeline > time)
    self.cumulative_hazards_.ix[relevant_times] = cum_v.T
    self.hazards_.iloc[i] = v.T
    self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()

    self.timeline = timeline
    self.censorship = censorship
    self._compute_confidence_intervals()
    return self
def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
    """Fit the additive hazards regression model.

    currently X is a static (n,d) array

    event_times: (n,1) array of event times
    X: (n,d) the design matrix, either a numpy matrix or DataFrame.
    timeline: (t,1) timepoints in ascending order
    censorship: (n,1) boolean array of censorships: True if observed,
        False if right-censored. By default, assuming all are observed.

    Fits:
        self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
        self.hazards_: a (t,d+1) dataframe of hazard coefficients
    """
    # deal with the covariate matrix. Check if it is a dataframe or numpy array
    n, d = X.shape
    if isinstance(X, pd.DataFrame):
        X_ = X.values.copy()
        if columns is None:
            columns = X.columns
    else:
        X_ = X.copy()

    # sort by event time; append a column of ones for the baseline hazard
    ix = event_times.argsort(0)[:, 0].copy()
    X_ = X_[ix, :].copy() if not self.fit_intercept \
        else np.c_[X_[ix, :].copy(), np.ones((n, 1))]
    sorted_event_times = event_times[ix, 0].copy()

    # set the column names of the output dataframes
    if columns is None:
        columns = list(range(d))
    else:
        columns = [c for c in columns]
    if self.fit_intercept:
        columns += ['baseline']

    # set the censorship events, reordered to match the sorted event times.
    # 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship[ix].reshape(n)

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times.copy()
    timeline = np.unique(timeline.astype(float))
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.)
    unique_times = np.unique(timeline)

    zeros = np.zeros((timeline.shape[0], d + self.fit_intercept))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)
    self.hazards_ = pd.DataFrame(np.zeros((event_times.shape[0], d + self.fit_intercept)),
                                 index=event_times[:, 0], columns=columns)
    self._variance = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)

    # create the penalizer matrix for L2 (ridge) regression
    penalizer = self.penalizer * np.eye(d + self.fit_intercept)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((d + self.fit_intercept, 1))
    v = cum_v.copy()
    for i, time in enumerate(sorted_event_times):
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            X_[i, :] = 0
        try:
            V = dot(inv(dot(X_.T, X_) + penalizer), X_.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur. But sometimes it does...
            V = dot(pinv(dot(X_.T, X_) + penalizer), X_.T)

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + \
            dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        t_0 = time
        X_[i, :] = 0

    # clean up last iteration: extend the final estimates past the last event time
    relevant_times = (timeline > time)
    self.hazards_.iloc[i] = v.T
    try:
        self.cumulative_hazards_.ix[relevant_times] = cum_v.T
        self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
    except Exception:
        # best-effort tail fill; narrowed from a bare `except:` so that
        # SystemExit/KeyboardInterrupt are no longer swallowed
        pass

    self.timeline = timeline
    self.X = X
    self.censorship = censorship
    self.event_times = event_times
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self
def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
    """Fit the additive hazards regression model.

    currently X is a static (n,d) array

    event_times: (n,1) array of event times
    X: (n,d) the design matrix, either a numpy matrix or DataFrame.
    timeline: (t,1) timepoints in ascending order
    censorship: (n,1) boolean array of censorships: True if observed,
        False if right-censored. By default, assuming all are observed.

    Fits:
        self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
        self.hazards_: a (t,d+1) dataframe of hazard coefficients
    """
    # deal with the covariate matrix. Check if it is a dataframe or numpy array
    n, d = X.shape
    if isinstance(X, pd.DataFrame):
        X_ = X.values.copy()
        if columns is None:
            columns = X.columns
    else:
        X_ = X.copy()

    # sort by event time; append a column of ones for the baseline hazard
    ix = event_times.argsort()[0, :]
    X_ = X_[ix, :].copy() if not self.fit_intercept \
        else np.c_[X_[ix, :].copy(), np.ones((n, 1))]
    sorted_event_times = event_times[0, ix].copy()

    # set the column names of the output dataframes. Only add "baseline"
    # when an intercept column was actually appended -- appending it
    # unconditionally mismatches the data width when fit_intercept is False.
    if columns is None:
        columns = list(range(d))
    else:
        columns = [c for c in columns]
    if self.fit_intercept:
        columns += ["baseline"]

    # set the censorship events, permuted with the same sort order as the
    # event times (previously left in input order, mismatching observed[i]).
    # 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship.reshape(n)[ix]

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times
    timeline = timeline.astype(float)
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.)

    zeros = np.zeros((timeline.shape[0], d + self.fit_intercept))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=timeline, columns=columns)
    self.hazards_ = pd.DataFrame(np.zeros((event_times.shape[1], d + self.fit_intercept)),
                                 index=sorted_event_times, columns=columns)
    self._variance = pd.DataFrame(zeros.copy(), index=timeline, columns=columns)

    # create the penalizer matrix for L2 (ridge) regression. The penalty is
    # *added* to X'X -- subtracting it de-regularizes the normal equations.
    penalizer = self.penalizer * np.eye(d + self.fit_intercept)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((d + self.fit_intercept, 1))
    v = cum_v.copy()
    # Pre-initialize V so the LinAlgError branch cannot reference an
    # unbound name on the first iteration.
    V = np.zeros((d + self.fit_intercept, n))

    for i, time in enumerate(sorted_event_times):
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            X_[i, :] = 0
        try:
            V = dot(inv(dot(X_.T, X_) + penalizer), X_.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur: carry forward the
            # most recent estimates and move on.
            self.cumulative_hazards_.ix[relevant_times] = cum_v.T
            self.hazards_.iloc[i] = v.T
            self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
            X_[i, :] = 0
            t_0 = time
            continue

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + \
            dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        t_0 = time
        X_[i, :] = 0

    # clean up last iteration: extend the final estimates past the last event time
    relevant_times = (timeline > time)
    self.cumulative_hazards_.ix[relevant_times] = cum_v.T
    self.hazards_.iloc[i] = v.T
    self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()

    self.timeline = timeline
    self.X = X
    self.censorship = censorship
    self._compute_confidence_intervals()
    return self
def fit(self, event_times, X, timeline=None, censorship=None, columns=None,
        verbose=True, debug=False):
    """Fit the additive hazards regression model.

    currently X is a static (n,d) array

    event_times: (n,1) array of event times
    X: (n,d) the design matrix, either a numpy matrix or DataFrame.
    timeline: (t,1) timepoints in ascending order
    censorship: (n,1) boolean array of censorships: True if observed,
        False if right-censored. By default, assuming all are observed.
    verbose: if True, write per-iteration progress to stdout.
    debug: if True, drop into pdb at the top of every iteration.

    Fits:
        self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
        self.hazards_: a (t,d+1) dataframe of hazard coefficients
    """
    # deal with the covariate matrix. Check if it is a dataframe or numpy
    # array (the docstring promises DataFrame support; previously a
    # DataFrame would have crashed on the positional indexing below)
    n, d = X.shape
    if isinstance(X, pd.DataFrame):
        X_ = X.values.copy()
        if columns is None:
            columns = X.columns
    else:
        X_ = X.copy()

    # sort by event time; append a column of ones for the baseline hazard
    # only when an intercept was requested -- appending unconditionally
    # mismatched the (d + fit_intercept)-wide result frames below.
    ix = event_times.argsort(0)[:, 0]
    if self.fit_intercept:
        X_ = np.hstack([X_[ix, :], np.ones((n, 1))])
    else:
        X_ = X_[ix, :].copy()
    sorted_event_times = event_times[ix, 0]

    # set the column names of the output dataframes
    if columns is None:
        columns = list(range(d))
    else:
        columns = [c for c in columns]
    if self.fit_intercept:
        columns += ['baseline']

    # set the censorship events. 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship[ix].reshape(n)

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times
    timeline = np.unique(timeline.astype(float))
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.)
    unique_times = np.unique(timeline)

    zeros = np.zeros((timeline.shape[0], d + self.fit_intercept))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)
    self.hazards_ = pd.DataFrame(np.zeros((event_times.shape[0], d + self.fit_intercept)),
                                 index=event_times[:, 0], columns=columns)
    self._variance = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)

    # create the penalizer matrix for L2 (ridge) regression
    penalizer = (self.penalizer * np.eye(d + self.fit_intercept)).astype(
        np.float32, copy=False)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((d + self.fit_intercept, 1))
    v = cum_v.copy()
    n_iters = len(sorted_event_times)

    for i, time in enumerate(sorted_event_times):
        if debug:
            pdb.set_trace()
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            X_[i, :] = 0
        # NOTE(review): the previous revision replaced this solve with a
        # call to an undefined helper (`lr(df.values, ...)`) and commented
        # out the assignment of V, so every iteration raised NameError.
        # The ridge solve from the prior working revision is restored.
        try:
            V = dot(inv(dot(X_.T, X_) + penalizer), X_.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur. But sometimes it does...
            V = dot(pinv(dot(X_.T, X_) + penalizer), X_.T)

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + \
            dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        t_0 = time
        X_[i, :] = 0

        if verbose:
            sys.stdout.write("\r iteration %i of %i completed" % (i + 1, n_iters))
            sys.stdout.flush()

    # clean up last iteration: extend the final estimates past the last event time
    relevant_times = (timeline > time)
    self.hazards_.iloc[i] = v.T
    try:
        self.cumulative_hazards_.ix[relevant_times] = cum_v.T
        self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
    except Exception:
        # best-effort tail fill; narrowed from a bare `except:` so that
        # SystemExit/KeyboardInterrupt are no longer swallowed
        pass

    self.timeline = timeline
    # store the user's original design matrix (the working copy X_ has been
    # sorted and progressively zeroed, so it is not useful to callers)
    self.X = X
    self.censorship = censorship
    self.event_times = event_times
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self