def predict_log_partial_hazard(self, X):
    r"""
    Return the log partial hazard, :math:`(x - \bar{x})'\beta`, for each
    individual — "partial" because the baseline hazard is excluded.
    Equivalent to R's ``linear.predictors``.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. If a DataFrame,
        columns can be in any order. If a numpy array, columns must be
        in the same order as the training data.

    Returns
    -------
    DataFrame

    Note
    -----
    If X is a DataFrame, the order of the columns do not matter. But
    if X is an array, then the column ordering is assumed to be the
    same as the training dataset.
    """
    if isinstance(X, pd.DataFrame):
        # Align the covariate columns with the fitted coefficient order.
        X = X[self.params_.index]
        check_for_numeric_dtypes_or_raise(X)
        X = X.astype(float)

    index = _get_index(X)
    centered = normalize(X, self._norm_mean.values, 1)
    return pd.DataFrame(np.dot(centered, self.params_), index=index)
def predict_log_partial_hazard(self, X):
    r"""
    Compute the log partial hazard :math:`(x - \bar{x})'\beta` for each
    individual (the baseline hazard is not included). This matches R's
    ``linear.predictors``.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. If a DataFrame,
        columns can be in any order. If a numpy array, columns must be
        in the same order as the training data.

    Returns
    -------
    DataFrame

    Note
    -----
    If X is a DataFrame, the order of the columns do not matter. But
    if X is an array, then the column ordering is assumed to be the
    same as the training dataset.
    """
    if isinstance(X, pd.DataFrame):
        # Reorder DataFrame columns to the fitted hazard order.
        X = X[self.hazards_.index]
        check_for_numeric_dtypes_or_raise(X)
        X = X.astype(float)

    index = _get_index(X)
    demeaned = normalize(X, self._norm_mean.values, 1)
    return pd.DataFrame(np.dot(demeaned, self.hazards_), index=index)
def _check_values(self, df, T, E, weights, entries):
    """
    Validate the fitting inputs, raising or warning on bad data.

    Parameters
    ----------
    df: DataFrame
        covariate matrix; must be numeric, finite.
    T: Series
        durations; must be finite and positive.
    E: Series
        event indicators; must be finite.
    weights: Series
        per-row weights; must be positive. Non-integer weights without
        ``robust=True`` trigger a StatisticalWarning.
    entries: Series
        late-entry times; rows with entry > duration trigger a warning.

    Raises
    ------
    ValueError
        if any weight is non-positive (only checked when
        ``self.weights_col`` is set).
    """
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(df)
    check_nans_or_infs(T)
    check_nans_or_infs(E)
    check_positivity(T)
    check_complete_separation(df, E, T, self.event_col)

    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    if self.entry_col:
        count_invalid_rows = (entries > T).sum()
        if count_invalid_rows:
            # BUG FIX: the original passed the raw format string to
            # warnings.warn without interpolating the count, so users
            # saw a literal "%d" in the message.
            warnings.warn("""There exist %d rows where entry > duration.""" % count_invalid_rows)
def predict_log_partial_hazard(self, X):
    r"""
    Return the network's log partial hazard for each individual,
    :math:`(x - \bar{x})'\beta` — "partial" since the baseline hazard
    is not included. Equivalent to R's ``linear.predictors``.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. If a DataFrame,
        columns can be in any order. If a numpy array, columns must be
        in the same order as the training data.

    Returns
    -------
    DataFrame

    Note
    -----
    If X is a DataFrame, the order of the columns do not matter. But
    if X is an array, then the column ordering is assumed to be the
    same as the training dataset.
    """
    if isinstance(X, pd.DataFrame):
        check_for_numeric_dtypes_or_raise(X)
        X = X.astype(float)

    # Center on the training means, then run the net in inference mode.
    X = normalize(X, self._norm_mean.values, 1)
    X_pt = torch.tensor(X, dtype=self.type_pt)
    predictions = self.net(X_pt).detach().numpy().ravel()
    return pd.Series(predictions)
def _check_values(self, df, events, start, stop):
    """
    Run the pre-fit validation suite over a time-varying dataset:
    NaN/inf screening, low-variance and complete-separation detection,
    numeric-dtype enforcement, and interval sanity checks.
    """
    # check_for_overlapping_intervals(df) # this is currently too slow for production.
    check_nans_or_infs(df)
    check_low_var(df)
    check_complete_separation_low_variance(df, events, self.event_col)
    check_for_numeric_dtypes_or_raise(df)
    # Interval sanity: no deaths at entry, no zero-length intervals.
    check_for_immediate_deaths(events, start, stop)
    check_for_instantaneous_events(start, stop)
def _check_values(self, df, events, start, stop):
    """
    Validate a time-varying dataset prior to fitting. Delegates to the
    shared check helpers; each raises or warns on its own condition.
    """
    # check_for_overlapping_intervals(df) # this is currently too slow for production.
    check_nans_or_infs(df)
    check_low_var(df)
    check_complete_separation_low_variance(df, events, self.event_col)
    check_for_numeric_dtypes_or_raise(df)
    check_for_immediate_deaths(events, start, stop)
    check_for_instantaneous_events(start, stop)
def _check_values(self, df, T, E, event_col):
    """
    Validate covariates, durations, and event indicators before
    fitting: numeric dtypes, no NaN/inf anywhere, and no complete
    separation. Low-variance columns are only flagged when an
    intercept is being fit.
    """
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(T)
    check_nans_or_infs(E)
    check_nans_or_infs(df)
    check_complete_separation(df, E, T, event_col)

    if self.fit_intercept:
        # Near-constant columns are only problematic with an intercept.
        check_low_var(df)
def _check_values(self, X, T, E):
    """
    Minimal input validation: covariates must be numeric and free of
    NaN/inf, and durations must be free of NaN/inf. ``E`` is accepted
    for signature parity but not checked here.
    """
    check_for_numeric_dtypes_or_raise(X)
    check_nans_or_infs(T)
    check_nans_or_infs(X)
def _check_values(self, X, T, E):
    """
    Sanity-check the fitting inputs. Verifies ``X`` is numeric and
    that neither ``X`` nor ``T`` contains NaN or infinite values;
    ``E`` is unchecked here.
    """
    check_for_numeric_dtypes_or_raise(X)
    check_nans_or_infs(T)
    check_nans_or_infs(X)
def _fit(
    self,
    log_likelihood_function,
    df,
    Ts,
    regressors,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
    entry_col=None,
):
    # Shared fitting routine: extracts event/weight/entry columns from df,
    # validates, normalizes the covariates, runs the optimizer via
    # self._fit_model, and stores coefficients, variance estimates, and
    # confidence intervals on self.
    self._time_fit_was_called = datetime.utcnow().strftime(
        "%Y-%m-%d %H:%M:%S") + " UTC"
    self.weights_col = weights_col
    self.entry_col = entry_col
    self.event_col = event_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust
    self.regressors = regressors  # TODO name

    # Event indicator: popped out of df when event_col is given,
    # otherwise every subject is assumed observed.
    E = (pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
         if (self.event_col is not None)
         else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E"))
    # Per-row weights: default to 1.0 for every subject.
    weights = (pass_for_numeric_dtypes_or_raise_array(
        df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights"))
    # Late-entry (left truncation) times: default to 0.0.
    entries = (pass_for_numeric_dtypes_or_raise_array(
        df.pop(entry_col)).astype(float)
        if (entry_col is not None)
        else pd.Series(np.zeros(self._n_examples, dtype=float), index=df.index, name="entry"))

    check_nans_or_infs(E)
    E = E.astype(bool)
    # Keep copies of the raw series for later inspection by the user.
    self.event_observed = E.copy()
    self.entry = entries.copy()
    self.weights = weights.copy()

    df = df.astype(float)
    # Ts appears to be a pair of duration arrays; prefer Ts[1], falling
    # back to Ts[0] — presumably (start, stop) style; TODO confirm.
    self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(df)

    # Scale covariates by their std for optimizer stability; guard
    # near-constant columns against division by ~0.
    _norm_std = df.std(0)
    _norm_std[_norm_std < 1e-8] = 1.0
    df_normalized = normalize(df, 0, _norm_std)
    Xs = self._create_Xs_dict(df_normalized)
    self._LOOKUP_SLICE = self._create_slicer(Xs)

    # Build a (parameter-name, covariate-name) MultiIndex covering every
    # regressor's columns, and align the per-column stds to it.
    _index = pd.MultiIndex.from_tuples(
        sum(([(name, col) for col in columns]
             for name, columns in regressors.items()), []))
    self._norm_std = pd.Series(
        [_norm_std.loc[variable_name] for _, variable_name in _index],
        index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        log_likelihood_function,
        Ts,
        Xs,
        E.values,
        weights.values,
        entries.values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    # Undo the normalization so coefficients are on the original scale.
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(
        Ts, E.values, weights.values, entries.values, Xs)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_median = self.predict_median(df)