Example #1
0
    def predict(self,
                subset: Union[None, pd.core.series.Series] = None,
                cumulative: bool = True) -> np.ndarray:
        """Use trained LightGBM models to predict observation survival rates.

        Args:
            subset: A Boolean Series that is True for observations for which
                predictions will be produced. If None, default to all
                observations.
            cumulative: If True, produce cumulative survival probabilities.
                If False, produce marginal survival probabilities (i.e., one
                minus the hazard rate).

        Returns:
            A numpy array of survival probabilities by observation and lead
            length.
        """
        subset = survival_modeler.default_subset_to_all(subset, self.data)
        feature_cols = self.categorical_features + self.numeric_features
        predict_data = self.data[feature_cols][subset]
        # One model per lead length; stack lead-specific predictions so that
        # rows are observations and columns are lead lengths.
        per_lead = [lead_specific_model.predict(predict_data)
                    for lead_specific_model in self.model]
        predictions = np.array(per_lead).T
        if cumulative:
            # Chain marginal survival rates into cumulative survival curves.
            predictions = np.cumprod(predictions, axis=1)
        return predictions
Example #2
0
    def predict(self,
                subset: Union[None, pd.core.series.Series] = None,
                cumulative: bool = True) -> np.ndarray:
        """Map observations to survival rates from their categorical values.

        Observations with a combination of categorical values not seen in
        the training data are mapped to the mean survival rate in the
        training set.

        Args:
            subset: A Boolean Series that is True for observations for which
                predictions will be produced. If None, default to all
                observations.
            cumulative: If True, will produce cumulative survival
                probabilities. If False, will produce marginal survival
                probabilities (i.e., one minus the hazard rate).

        Returns:
            A numpy array of survival probabilities by observation and lead
            length.
        """
        subset = survival_modeler.default_subset_to_all(subset, self.data)
        # The model is a table of survival rates indexed by combinations of
        # categorical feature values; look each observation up by its values.
        merged = self.data[subset].merge(self.model,
                                         how="left",
                                         left_on=self.categorical_features,
                                         right_index=True)
        predictions = merged[self.model.columns]
        # Unseen combinations yield NaN rows; fill them with column means.
        predictions = predictions.fillna(predictions.mean()).to_numpy()
        if cumulative:
            predictions = np.cumprod(predictions, axis=1)
        return predictions
Example #3
0
    def compute_model_uncertainty(self,
                                  subset: Union[None,
                                                pd.core.series.Series] = None,
                                  n_iterations: int = 200) -> np.ndarray:
        """Predict with dropout as proposed by Gal and Ghahramani (2015).

        See https://arxiv.org/abs/1506.02142.

        Args:
            subset: A Boolean Series that is True for observations for which
                predictions will be produced. If None, default to all
                observations.
            n_iterations: Number of random dropout specifications to obtain
                predictions from.

        Returns:
            A numpy array of predictions by observation, lead length, and
            iteration.
        """
        subset = survival_modeler.default_subset_to_all(subset, self.data)
        # The trailing 1.0 is the Keras learning-phase flag: it keeps dropout
        # active at prediction time so each pass samples a different network.
        model_inputs = split_categorical_features(
            self.data[subset], self.categorical_features,
            self.numeric_features) + [1.0]
        predict_with_dropout = K.function(
            self.model.inputs + [K.learning_phase()], self.model.outputs)
        draws = [predict_with_dropout(model_inputs)[0]
                 for _ in range(n_iterations)]
        # Stack iterations along a third axis.
        return np.dstack(draws)
Example #4
0
 def format_input_data(
     self,
     data: Union[None, pd.core.frame.DataFrame] = None,
     subset: Union[None, pd.core.series.Series] = None
 ) -> pd.core.frame.DataFrame:
     """Keep only the features and observations desired for model input.

     Args:
         data: The DataFrame to format. If None, default to self.data.
         subset: A Boolean Series that is True for observations to keep.
             If None, default to all observations.

     Returns:
         A DataFrame restricted to the given subset with reserved
         (non-feature) columns dropped.
     """
     if data is None:
         data = self.data
     subset = survival_modeler.default_subset_to_all(subset, data)
     # Reserved columns hold bookkeeping fields rather than model features.
     return data.drop(self.reserved_cols, axis=1)[subset]
Example #5
0
 def format_input_data(
     self,
     data: Union[None, pd.core.frame.DataFrame] = None,
     subset: Union[None, pd.core.series.Series] = None,
 ) -> List[Union[pd.core.series.Series, pd.core.frame.DataFrame]]:
     """List each categorical feature for input to own embedding layer."""
     # Fall back to the modeler's own data when no frame is supplied.
     source = self.data if data is None else data
     subset = survival_modeler.default_subset_to_all(subset, source)
     return split_categorical_features(source[subset],
                                       self.categorical_features,
                                       self.numeric_features)
Example #6
0
 def compute_shap_values(self,
                         subset: Union[None, pd.core.series.Series] = None
                         ) -> dict:
     """Compute SHAP values by lead length, observation, and feature.

     Args:
         subset: A Boolean Series that is True for observations for which
             SHAP values will be computed. If None, default to all
             observations.

     Returns:
         A dictionary mapping '<lead>_lead' keys to arrays of SHAP values.
     """
     subset = survival_modeler.default_subset_to_all(subset, self.data)
     feature_cols = self.categorical_features + self.numeric_features
     shap_subset = self.data[feature_cols][subset]
     shap_values = {}
     # Skip lead lengths whose model trained no trees — TreeExplainer
     # cannot explain an empty ensemble.
     for lead, lead_specific_model in enumerate(self.model, start=1):
         if lead_specific_model.num_trees() > 0:
             explainer = shap.TreeExplainer(lead_specific_model)
             shap_values[str(lead) + '_lead'] = (
                 explainer.shap_values(shap_subset)[0])
     return shap_values
Example #7
0
    def compute_shap_values(self,
                            subset: Union[None, pd.core.series.Series] = None
                            ) -> dict:
        """Compute SHAP values by lead length, observation, and feature.

        SHAP values for networks with embedding layers are not supported as of
        9 Jun 2020.

        Compute SHAP values for restricted mean survival time in addition to
        each lead length.

        Args:
            subset: A Boolean Series that is True for observations for which
                the shap values will be computed. If None, default to all
                observations.

        Returns:
            A dictionary of numpy arrays, each of which contains SHAP values
            for the outcome given by its key.
        """
        print("SHAP values for networks with embedding layers are not "
              "supported as of 9 Jun 2020.")
        return None
        # NOTE(review): everything below is intentionally unreachable —
        # presumably the planned implementation, retained for when the shap
        # library supports networks with embedding layers. Confirm before
        # deleting or re-enabling.
        model = make_predictions_marginal(self.model)
        subset = survival_modeler.default_subset_to_all(subset, self.data)
        shap_subset = split_categorical_features(self.data[subset],
                                                 self.categorical_features,
                                                 self.numeric_features)
        # DeepExplainer takes a list of 2-D arrays, one per model input.
        shap_subset = [np.atleast_2d(i.values) for i in shap_subset]
        interval_shap_values = shap.DeepExplainer(
            Model(model.inputs, model.output),
            shap_subset).shap_values(shap_subset)
        model = make_predictions_cumulative(self.model)
        # RMST: sum the cumulative survival outputs across lead lengths.
        sum_layer = Lambda(lambda x: K.sum(x, axis=1))(model.output)
        rmst_shap_values = shap.DeepExplainer(Model(
            model.inputs, sum_layer), shap_subset).shap_values(shap_subset)
        if self.categorical_features:
            # Each categorical feature is a separate model input; stack their
            # per-input SHAP arrays side by side into one feature matrix.
            interval_shap_values = [
                np.hstack(arr_list) for arr_list in interval_shap_values
            ]
            rmst_shap_values = np.hstack(rmst_shap_values)
        shap_values = {
            str(i + 1) + "_lead": arr
            for i, arr in enumerate(interval_shap_values)
        }
        shap_values["RMST"] = rmst_shap_values
        return shap_values