def predict(self, subset: Union[None, pd.core.series.Series] = None, cumulative: bool = True) -> np.ndarray:
    """Use trained LightGBM models to predict the outcome for each observation and time horizon.

    Args:
        subset: A Boolean Series that is True for observations for which
            predictions will be produced. If None, default to all
            observations.
        cumulative: If True, produce cumulative survival probabilities.
            If False, produce marginal survival probabilities (i.e., one
            minus the hazard rate).

    Returns:
        A numpy array of predictions by observation and lead length.
    """
    subset = default_subset_to_all(subset, self.data)
    feature_cols = self.categorical_features + self.numeric_features
    input_frame = self.data[feature_cols][subset]
    # One trained model per lead length; stack and transpose so rows are
    # observations and columns are lead lengths.
    survival_curves = np.array(
        [lead_model.predict(input_frame) for lead_model in self.model]
    ).T
    if cumulative:
        survival_curves = np.cumprod(survival_curves, axis=1)
    return survival_curves
def compute_model_uncertainty(
    self, subset: Union[None, pd.core.series.Series] = None, n_iterations: int = 200
) -> np.ndarray:
    """Predict with dropout as proposed by Gal and Ghahramani (2015).

    See https://arxiv.org/abs/1506.02142.

    Args:
        subset: A Boolean Series that is True for observations for which
            predictions will be produced. If None, default to all
            observations.
        n_iterations: Number of random dropout specifications to obtain
            predictions from.

    Returns:
        A numpy array of predictions by observation, lead length, and
        iteration.
    """
    subset = default_subset_to_all(subset, self.data)
    # The trailing 1.0 is the Keras learning-phase flag: it keeps dropout
    # active at prediction time so each call samples a different sub-network.
    network_inputs = split_categorical_features(
        self.data[subset], self.categorical_features, self.numeric_features
    )
    network_inputs = network_inputs + [1.0]
    sample_predictions = K.function(
        self.model.inputs + [K.learning_phase()], self.model.outputs
    )
    draws = []
    for _ in range(n_iterations):
        draws.append(sample_predictions(network_inputs)[0])
    return np.dstack(draws)
def predict(
    self, subset: Union[None, pd.core.series.Series] = None, cumulative: bool = True
) -> np.ndarray:
    """Map observations to outcome means from their categorical values.

    Map observations with a combination of categorical values not seen in
    the training data to the mean of the outcome in the training set.

    Args:
        subset: A Boolean Series that is True for observations for which
            predictions will be produced. If None, default to all
            observations.
        cumulative: If True, will produce cumulative survival
            probabilities. If False, will produce marginal survival
            probabilities (i.e., one minus the hazard rate).

    Returns:
        A numpy array of outcome means by observation and lead length.
    """
    subset = default_subset_to_all(subset, self.data)
    # Left merge keeps every requested observation; unseen categorical
    # combinations pick up the model's fallback values.
    merged = self.data[subset].merge(
        self.model, how="left", on=self.categorical_features
    )
    predictions = merged[self.model.columns].to_numpy()
    if self.objective == "multiclass":
        # Column-major reshape separates the per-class predictions.
        predictions = predictions.reshape(
            (self.num_class, subset.sum(), self.n_intervals), order="F"
        )
    if cumulative:
        predictions = np.cumprod(predictions, axis=-1)
    return predictions
def format_input_data(
    self,
    data: Union[None, pd.core.frame.DataFrame] = None,
    subset: Union[None, pd.core.series.Series] = None,
) -> List[Union[pd.core.series.Series, pd.core.frame.DataFrame]]:
    """List each categorical feature for input to own embedding layer."""
    source = self.data if data is None else data
    subset = default_subset_to_all(subset, source)
    return split_categorical_features(
        source[subset], self.categorical_features, self.numeric_features
    )
def format_input_data(
    self,
    data: Union[None, pd.core.frame.DataFrame] = None,
    subset: Union[None, pd.core.series.Series] = None,
) -> List[Union[pd.core.series.Series, pd.core.frame.DataFrame]]:
    """Keep only the features and observations desired for model input."""
    if data is None:
        data = self.data
    subset = default_subset_to_all(subset, data)
    model_input = data.drop(self.reserved_cols, axis=1)[subset]
    # Gradient-boosting libraries expect integer codes, not pandas
    # Categorical values.
    for feature in self.categorical_features:
        model_input[feature] = model_input[feature].cat.codes
    return model_input
def compute_shap_values(self, subset: Union[None, pd.core.series.Series] = None
                        ) -> dict:
    """Compute SHAP values by lead length, observation, and feature."""
    subset = default_subset_to_all(subset, self.data)
    explained_data = self.data[
        self.categorical_features + self.numeric_features
    ][subset]
    shap_values = {}
    for lead, lead_model in enumerate(self.model, start=1):
        # Models with no trees (e.g., constant predictions) cannot be
        # explained, so they are skipped.
        if lead_model.num_trees() > 0:
            explainer = shap.TreeExplainer(lead_model)
            shap_values[str(lead) + "_lead"] = explainer.shap_values(
                explained_data
            )[0]
    return shap_values
def compute_shap_values(
    self, subset: Union[None, pd.core.series.Series] = None
) -> dict:
    """Compute SHAP values by lead length, observation, and feature.

    SHAP values for networks with embedding layers are not supported as of
    9 Jun 2020. Compute SHAP values for restricted mean survival time in
    addition to each lead length.

    Args:
        subset: A Boolean Series that is True for observations for which
            the shap values will be computed. If None, default to all
            observations.

    Returns:
        A dictionary of numpy arrays, each of which contains SHAP values
        for the outcome given by its key.
    """
    # This method currently always prints a notice and returns None;
    # everything after the `return None` below is unreachable. It appears
    # to be retained as a reference implementation for when shap supports
    # networks with embedding layers — TODO confirm before removing.
    print(
        "SHAP values for networks with embedding layers are not "
        "supported as of 9 Jun 2020."
    )
    return None
    # --- Unreachable below this line ---
    # Explain marginal (per-interval) predictions first.
    model = make_predictions_marginal(self.model)
    subset = default_subset_to_all(subset, self.data)
    shap_subset = split_categorical_features(
        self.data[subset], self.categorical_features, self.numeric_features
    )
    shap_subset = [np.atleast_2d(i.values) for i in shap_subset]
    interval_shap_values = shap.DeepExplainer(
        Model(model.inputs, model.output), shap_subset
    ).shap_values(shap_subset)
    # Summing cumulative survival probabilities over intervals gives
    # restricted mean survival time (RMST); explain that sum as well.
    model = make_predictions_cumulative(self.model)
    sum_layer = Lambda(lambda x: K.sum(x, axis=1))(model.output)
    rmst_shap_values = shap.DeepExplainer(
        Model(model.inputs, sum_layer), shap_subset
    ).shap_values(shap_subset)
    if self.categorical_features:
        # Each categorical feature feeds a separate input; stack their
        # SHAP arrays back into one feature axis.
        interval_shap_values = [
            np.hstack(arr_list) for arr_list in interval_shap_values
        ]
        rmst_shap_values = np.hstack(rmst_shap_values)
    shap_values = {
        str(i + 1) + "_lead": arr for i, arr in enumerate(interval_shap_values)
    }
    shap_values["RMST"] = rmst_shap_values
    return shap_values