def _fit_model_to_data_batch(self, X, T, E, weights, show_progress):
        """
        Estimate the hazard increments at every unique observed death time.

        For each unique value of ``T[E]`` (in ascending order) a penalized
        regression is solved with ``lr``; the previous solution is passed as the
        smoothing ``offset``. Subjects whose duration equals the current time are
        zeroed out of the design matrix afterwards so they no longer contribute.

        Parameters:
            X: (n, d) numpy array of covariates. The caller's array is not mutated.
            T: length-n array of durations.
            E: length-n boolean array, True where the death was observed.
            weights: length-n array of case weights; their square roots scale
                     both the rows of X and the regression target.
            show_progress: print progress roughly every tenth of the death times.

        Returns:
            (hazards_, variance_hazards_, last_iteration) where the first two are
            (n_deaths, d) arrays and last_iteration is the number of death times
            actually processed (0 when there are no observed deaths). Rows at or
            beyond last_iteration are left at zero.
        """
        n, d = X.shape

        # iterate over all the unique death times
        unique_death_times = np.sort(np.unique(T[E]))
        n_deaths = unique_death_times.shape[0]
        total_observed_exits = 0

        hazards_ = np.zeros((n_deaths, d))
        variance_hazards_ = np.zeros((n_deaths, d))
        v = np.zeros(d)
        start = time.time()

        W = np.sqrt(weights)
        # the product allocates a fresh array, so the rows zeroed out below never
        # touch the caller's X and no explicit copy is needed.
        X = W[:, None] * X

        # report about ten times in total; never let the step be 0, otherwise the
        # modulo below raises ZeroDivisionError when there are fewer than 10 deaths.
        progress_step = max(1, n_deaths // 10)

        # stays 0 if there are no observed deaths at all.
        last_iteration = 0

        for i, t in enumerate(unique_death_times):

            exits = T == t
            deaths = exits & E
            try:
                v, V = lr(X, W * deaths, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=v, ix=deaths)
            except LinAlgError:
                warnings.warn(
                    "Linear regression error at index=%d, time=%.3f. Try increasing the coef_penalizer value." % (i, t),
                    ConvergenceWarning,
                )
                # record a zero step; this must not depend on a V from an earlier
                # iteration, because the very first regression may be the one failing.
                v = np.zeros_like(v)
                variance_row = np.zeros(d)
            else:
                variance_row = (V ** 2).sum(1)

            hazards_[i, :] = v
            variance_hazards_[i, :] = variance_row

            # everyone leaving at time t stops contributing to later regressions.
            X[exits, :] = 0

            if show_progress and i % progress_step == 0:
                print("\rIteration %d/%d, seconds_since_start = %.2f" % (i + 1, n_deaths, time.time() - start),
                      end="")

            last_iteration = i + 1
            # terminate early when there are less than (3 * d) subjects left, where d does not include the intercept.
            # the value 3 is from R survival lib.
            if (3 * (d - 1)) >= n - total_observed_exits:
                if show_progress:
                    print("Terminating early due to too few subjects remaining. This is expected behaviour.")
                break

            total_observed_exits += exits.sum()

        if show_progress:
            print("Convergence completed.")
        return hazards_, variance_hazards_, last_iteration
Пример #2
0
    def _fit_model_to_data_batch(self, X, T, E, weights, show_progress):
        """
        Estimate the hazard increments at every unique observed death time.

        For each unique value of ``T[E]`` (in ascending order) a penalized
        regression is solved with ``lr``; the previous solution is passed as the
        smoothing ``offset``. Subjects whose duration equals the current time are
        zeroed out of the design matrix afterwards so they no longer contribute.

        Parameters:
            X: (n, d) numpy array of covariates. The caller's array is not mutated.
            T: length-n array of durations.
            E: length-n boolean array, True where the death was observed.
            weights: length-n array of case weights; their square roots scale
                     both the rows of X and the regression target.
            show_progress: print progress roughly every tenth of the death times.

        Returns:
            (hazards_, variance_hazards_, last_iteration) where the first two are
            (n_deaths, d) arrays and last_iteration is the number of death times
            actually processed (0 when there are no observed deaths). Rows at or
            beyond last_iteration are left at zero.
        """
        n, d = X.shape

        # iterate over all the unique death times
        unique_death_times = np.sort(np.unique(T[E]))
        n_deaths = unique_death_times.shape[0]
        total_observed_exits = 0

        hazards_ = np.zeros((n_deaths, d))
        variance_hazards_ = np.zeros((n_deaths, d))
        v = np.zeros(d)
        start = time.time()

        W = np.sqrt(weights)
        # the product allocates a fresh array, so the rows zeroed out below never
        # touch the caller's X and no explicit copy is needed.
        X = W[:, None] * X

        # report about ten times in total; never let the step be 0, otherwise the
        # modulo below raises ZeroDivisionError when there are fewer than 10 deaths.
        progress_step = max(1, n_deaths // 10)

        # stays 0 if there are no observed deaths at all.
        last_iteration = 0

        for i, t in enumerate(unique_death_times):

            exits = T == t
            deaths = exits & E
            try:
                v, V = lr(X, W * deaths, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=v, ix=deaths)
            except LinAlgError:
                warnings.warn(
                    "Linear regression error at index=%d, time=%.3f. Try increasing the coef_penalizer value." % (i, t),
                    ConvergenceWarning,
                )
                # record a zero step; this must not depend on a V from an earlier
                # iteration, because the very first regression may be the one failing.
                v = np.zeros_like(v)
                variance_row = np.zeros(d)
            else:
                variance_row = (V ** 2).sum(1)

            hazards_[i, :] = v
            variance_hazards_[i, :] = variance_row

            # everyone leaving at time t stops contributing to later regressions.
            X[exits, :] = 0

            if show_progress and i % progress_step == 0:
                print("Iteration %d/%d, seconds_since_start = %.2f" % (i + 1, n_deaths, time.time() - start))

            last_iteration = i + 1
            # terminate early when there are less than (3 * d) subjects left, where d does not include the intercept.
            # the value 3 is from R survival lib.
            if (3 * (d - 1)) >= n - total_observed_exits:
                if show_progress:
                    print("Terminating early due to too few subjects remaining. This is expected behaviour.")
                break

            total_observed_exits += exits.sum()

        if show_progress:
            print("Convergence completed.")
        return hazards_, variance_hazards_, last_iteration
Пример #3
0
    def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                     id_col=None, timeline=None, show_progress=True):
        """
        Fit the regression on a dataframe indexed by (duration, individual id).

        Parameters:
            dataframe: a pandas dataframe holding the covariates plus duration_col,
                      id_col and (optionally) event_col. It is copied, not mutated.
            duration_col: name of the column holding the observation times.
            event_col: name of the event indicator column. If None, an 'E' column of
                      ones is added, i.e. all rows are assumed to be non-censored.
            id_col: name of the column identifying each individual.
            timeline: if given, the estimates are reindexed (forward filled) onto it.
            show_progress: update the progress bar after every regression step.

        Returns:
          None. The results are stored on self as hazards_, cumulative_hazards_,
          variance_, timeline, data, durations and event_observed.

        NOTE(review): this relies on DataFrame.to_panel, .ix and Series.iteritems,
        which newer pandas releases no longer provide -- confirm the supported
        pandas version before touching the indexing below.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = 'E'
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        n, d = df.shape

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns, index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns, index=from_tuples(non_censorsed_times))

        previous_hazard = np.zeros((d,))
        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        # local names chosen so the builtin `id` and the name `time` are not shadowed.
        for i, (subject_id, event_time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == subject_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(wp[event_time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # there is no solution for this step: record a zero step instead of
                # falling through to v / V, which are either undefined (first step)
                # or left over from the previous step.
                v = np.zeros_like(previous_hazard)
                variance_row = np.zeros(d)
            else:
                variance_row = V[:, relevant_individuals][:, 0] ** 2

            hazards_.ix[subject_id, event_time] = v.T
            variance_.ix[subject_id, event_time] = variance_row
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
Пример #4
0
    def _fit_static(self, dataframe, duration_col, event_col=None,
                    timeline=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.
                      The dataframe is copied, not mutated.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe.
                      If not given, every individual is assumed to be non-censored.
            timeline: reformat the estimates index to a new timeline.
            show_progress: include a fancy progress bar!

        Returns:
          None. The results are stored on self as hazards_, cumulative_hazards_,
          variance_, timeline, data, durations and event_observed.

        NOTE(review): this relies on .ix and Series.iteritems, which newer pandas
        releases no longer provide -- confirm the supported pandas version.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                 index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d,))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        # local names chosen so the builtin `id` and the name `time` are not shadowed.
        for subject_id, event_time in T.iteritems():  # should be sorted.

            if t != event_time:
                assert t < event_time
                # remove the individuals from the previous loop.
                # ids are 0..n-1 in the original row order, so they double as
                # positional indices into df.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = event_time

            to_remove.append(subject_id)
            if C[subject_id] == 0:
                continue

            relevant_individuals = (ids == subject_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # there is no solution for this step: record a zero step instead of
                # falling through to v / V, which are either undefined (first step)
                # or left over from the previous step.
                v = np.zeros_like(previous_hazard)
                variance_row = np.zeros(d)
            else:
                variance_row = V[:, relevant_individuals][:, 0] ** 2

            hazards_.ix[event_time, subject_id] = v.T
            variance_.ix[event_time, subject_id] = variance_row
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
Пример #5
0
    def _fit_varying(self,
                     dataframe,
                     duration_col="T",
                     event_col="E",
                     id_col=None,
                     timeline=None,
                     show_progress=True):
        """
        Fit the regression on a dataframe indexed by (duration, individual id).

        Parameters:
            dataframe: a pandas dataframe holding the covariates plus duration_col,
                      id_col and (optionally) event_col. It is copied, not mutated.
            duration_col: name of the column holding the observation times.
            event_col: name of the event indicator column. If None, an 'E' column of
                      ones is added, i.e. all rows are assumed to be non-censored.
            id_col: name of the column identifying each individual.
            timeline: if given, the estimates are reindexed (forward filled) onto it.
            show_progress: update the progress bar after every regression step.

        Returns:
          None. The results are stored on self as hazards_, cumulative_hazards_,
          variance_, timeline, data, durations and event_observed.

        NOTE(review): this relies on DataFrame.to_panel, .ix and Series.iteritems,
        which newer pandas releases no longer provide -- confirm the supported
        pandas version before touching the indexing below.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = 'E'
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        n, d = df.shape

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns,
                                index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns,
                                 index=from_tuples(non_censorsed_times))

        previous_hazard = np.zeros((d, ))
        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        # local names chosen so the builtin `id` and the name `time` are not shadowed.
        for i, (subject_id, event_time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == subject_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(wp[event_time].values,
                          relevant_individuals,
                          c1=self.coef_penalizer,
                          c2=self.smoothing_penalizer,
                          offset=previous_hazard)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
                # there is no solution for this step: record a zero step instead of
                # falling through to v / V, which are either undefined (first step)
                # or left over from the previous step.
                v = np.zeros_like(previous_hazard)
                variance_row = np.zeros(d)
            else:
                variance_row = V[:, relevant_individuals][:, 0]**2

            hazards_.ix[subject_id, event_time] = v.T
            variance_.ix[subject_id, event_time] = variance_row
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
Пример #6
0
    def _fit_static(self,
                    dataframe,
                    duration_col,
                    event_col=None,
                    timeline=None,
                    show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.
                      The dataframe is copied, not mutated.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe.
                      If not given, every individual is assumed to be non-censored.
            timeline: reformat the estimates index to a new timeline.
            show_progress: include a fancy progress bar!

        Returns:
          None. The results are stored on self as hazards_, cumulative_hazards_,
          variance_, timeline, data, durations and event_observed.

        NOTE(review): this relies on .ix and Series.iteritems, which newer pandas
        releases no longer provide -- confirm the supported pandas version.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d, ))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        # local names chosen so the builtin `id` and the name `time` are not shadowed.
        for subject_id, event_time in T.iteritems():  # should be sorted.

            if t != event_time:
                assert t < event_time
                # remove the individuals from the previous loop.
                # ids are 0..n-1 in the original row order, so they double as
                # positional indices into df.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = event_time

            to_remove.append(subject_id)
            if C[subject_id] == 0:
                continue

            relevant_individuals = (ids == subject_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values,
                          relevant_individuals,
                          c1=self.coef_penalizer,
                          c2=self.smoothing_penalizer,
                          offset=previous_hazard)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
                # there is no solution for this step: record a zero step instead of
                # falling through to v / V, which are either undefined (first step)
                # or left over from the previous step.
                v = np.zeros_like(previous_hazard)
                variance_row = np.zeros(d)
            else:
                variance_row = V[:, relevant_individuals][:, 0]**2

            hazards_.ix[event_time, subject_id] = v.T
            variance_.ix[event_time, subject_id] = variance_row
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
Пример #7
0
    def fit(self, event_times, X, timeline=None, censorship=None, columns=None, verbose=True, debug=False):
        """currently X is a static (n,d) array

        event_times: (n,1) array of event times
        X: (n,d) the design matrix; anything np.asarray can turn into a 2-d float array.
           The caller's object is not mutated.
        timeline: (t,1) timepoints in ascending order
        censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                    By default, assuming all are observed.
        columns: optional names of the d covariates; defaults to 0..d-1.
        verbose: write a one-line iteration counter to stdout.
        debug: drop into pdb at the start of every iteration.

        Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
              self.hazards_: a (n,d+1) dataframe of hazard coefficients, indexed by the
                             sorted event times
              (the +1 baseline column is only present when self.fit_intercept is set)

        Returns: self
        """
        X = np.asarray(X, dtype=float)
        n, d = X.shape
        n_coefs = d + int(self.fit_intercept)

        # sort the individuals by event time; fancy indexing copies, so zeroing rows
        # below never touches the caller's matrix.
        ix = event_times.argsort(0)[:, 0]
        X = X[ix, :]
        sorted_event_times = event_times[ix, 0]

        # append a column of ones for the baseline hazard -- only when an intercept
        # is fitted, so that X agrees with the (n_coefs, n_coefs) penalizer below.
        if self.fit_intercept:
            X = np.hstack([X, np.ones((n, 1))])

        # set the column's names of the dataframe.
        # always a real list: range(d) cannot be extended with += on Python 3.
        if columns is None:
            columns = list(range(d))
        else:
            columns = list(columns)

        if self.fit_intercept:
            columns += ['baseline']

        # set the censorship events. 1 if the death was observed.
        if censorship is None:
            observed = np.ones(n, dtype=bool)
        else:
            observed = censorship[ix].reshape(n)

        # set the timeline -- this is used as DataFrame index in the results
        if timeline is None:
            timeline = sorted_event_times

        # np.unique sorts, flattens and de-duplicates in one go.
        timeline = np.unique(np.asarray(timeline, dtype=float))
        if timeline[0] > 0:
            timeline = np.insert(timeline, 0, 0.)

        zeros = np.zeros((timeline.shape[0], n_coefs))
        self.cumulative_hazards_ = pd.DataFrame(
            zeros.copy(), index=timeline, columns=columns)
        # rows are filled in sorted order below, so the index must be sorted as well.
        self.hazards_ = pd.DataFrame(
            np.zeros((event_times.shape[0], n_coefs)), index=sorted_event_times, columns=columns)
        self._variance = pd.DataFrame(
            zeros.copy(), index=timeline, columns=columns)

        # create the penalizer matrix for L2 regression
        penalizer = (self.penalizer * np.eye(n_coefs)).astype(
            np.float32, copy=False)

        t_0 = sorted_event_times[0]
        cum_v = np.zeros((n_coefs, 1))
        v = cum_v.copy()
        n_iters = len(sorted_event_times)
        for i, event_time in enumerate(sorted_event_times):
            if debug:
                pdb.set_trace()
            relevant_times = (t_0 < timeline) & (timeline <= event_time)
            if observed[i] == 0:
                X[i, :] = 0

            # ridge-regression projection (X'X + penalizer)^-1 X'
            gram = dot(X.T, X) + penalizer
            try:
                V = dot(np.linalg.inv(gram), X.T)
            except LinAlgError:
                # if penalizer > 0, this should not occur. But sometimes it does...
                V = dot(np.linalg.pinv(gram), X.T)

            v = dot(V, basis(n, i))
            cum_v = cum_v + v
            # the variance contribution of individual i is the squared i-th column of V.
            variance_step = V[:, i] ** 2

            self.cumulative_hazards_.loc[relevant_times] = self.cumulative_hazards_.loc[
                relevant_times].values + cum_v.T
            self.hazards_.iloc[i] = self.hazards_.iloc[i].values + np.ravel(v)
            self._variance.loc[relevant_times] = self._variance.loc[
                relevant_times].values + variance_step
            t_0 = event_time
            # individual i has left the study and no longer contributes.
            X[i, :] = 0

            if verbose:
                sys.stdout.write("\r iteration %i of %i completed" % (i + 1, n_iters))
                sys.stdout.flush()

        # clean up last iteration: carry the final estimates forward over any
        # timeline points after the last event time.
        relevant_times = (timeline > event_time)
        self.hazards_.iloc[i] = np.ravel(v)
        if relevant_times.any():
            try:
                self.cumulative_hazards_.loc[relevant_times] = np.ravel(cum_v)
                self._variance.loc[relevant_times] = V[:, i] ** 2
            except (ValueError, KeyError, IndexError):
                # best effort, as before -- but no longer hides unrelated errors
                # such as KeyboardInterrupt.
                pass
        self.timeline = timeline
        # NOTE: every row of X has been zeroed by the loop above at this point.
        self.X = X
        self.censorship = censorship
        self.event_times = event_times
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)
        return self