示例#1
0
    def predict(self,
                subset: Union[None, pd.core.series.Series] = None,
                cumulative: bool = True) -> np.ndarray:
        """Predict survival for each observation and lead length with the trained LightGBM models.

        Args:
            subset: A Boolean Series that is True for observations for which
                predictions will be produced. If None, default to all
                observations.
            cumulative: If True, produce cumulative survival probabilities.
                If False, produce marginal survival probabilities (i.e., one
                minus the hazard rate).

        Returns:
            A numpy array of predictions by observation and lead
            length.
        """
        subset = default_subset_to_all(subset, self.data)
        feature_cols = self.categorical_features + self.numeric_features
        input_data = self.data[feature_cols][subset]
        # One fitted model per lead length; stack their outputs so that rows
        # index observations and columns index lead lengths.
        per_lead = [model.predict(input_data) for model in self.model]
        predictions = np.transpose(np.array(per_lead))
        if not cumulative:
            return predictions
        return np.cumprod(predictions, axis=1)
示例#2
0
    def compute_model_uncertainty(
        self, subset: Union[None, pd.core.series.Series] = None, n_iterations: int = 200
    ) -> np.ndarray:
        """Predict with dropout as proposed by Gal and Ghahramani (2015).

        See https://arxiv.org/abs/1506.02142.

        Args:
            subset: A Boolean Series that is True for observations for which
                predictions will be produced. If None, default to all
                observations.
            n_iterations: Number of random dropout specifications to obtain
                predictions from.

        Returns:
            A numpy array of predictions by observation, lead length, and
            iteration.
        """
        subset = default_subset_to_all(subset, self.data)
        model_inputs = split_categorical_features(
            self.data[subset], self.categorical_features, self.numeric_features
        )
        # Trailing 1.0 feeds the learning-phase flag so dropout stays active
        # at prediction time.
        model_inputs.append(1.0)
        sampler = K.function(
            self.model.inputs + [K.learning_phase()], self.model.outputs
        )
        draws = [sampler(model_inputs)[0] for _ in range(n_iterations)]
        return np.dstack(draws)
示例#3
0
    def predict(
        self, subset: Union[None, pd.core.series.Series] = None, cumulative: bool = True
    ) -> np.ndarray:
        """Map observations to outcome means from their categorical values.

        Map observations with a combination of categorical values not seen in
        the training data to the mean of the outcome in the training set.

        Args:
            subset: A Boolean Series that is True for observations for which
                predictions will be produced. If None, default to all
                observations.
            cumulative: If True, will produce cumulative survival
                probabilities. If False, will produce marginal survival
                probabilities (i.e., one minus the hazard rate).

        Returns:
            A numpy array of outcome means by observation and lead
            length.
        """
        subset = default_subset_to_all(subset, self.data)
        # Left-join the lookup table of outcome means onto the observations.
        merged = self.data[subset].merge(
            self.model, how="left", on=self.categorical_features
        )
        predictions = merged[self.model.columns].to_numpy()
        if self.objective == "multiclass":
            target_shape = (self.num_class, subset.sum(), self.n_intervals)
            predictions = predictions.reshape(target_shape, order="F")
        if cumulative:
            predictions = np.cumprod(predictions, axis=-1)
        return predictions
示例#4
0
 def format_input_data(
     self,
     data: Union[None, pd.core.frame.DataFrame] = None,
     subset: Union[None, pd.core.series.Series] = None,
 ) -> List[Union[pd.core.series.Series, pd.core.frame.DataFrame]]:
     """List each categorical feature for input to own embedding layer."""
     source = self.data if data is None else data
     subset = default_subset_to_all(subset, source)
     return split_categorical_features(
         source[subset], self.categorical_features, self.numeric_features
     )
示例#5
0
 def format_input_data(
     self,
     data: Union[None, pd.core.frame.DataFrame] = None,
     subset: Union[None, pd.core.series.Series] = None,
 ) -> List[Union[pd.core.series.Series, pd.core.frame.DataFrame]]:
     """Keep only the features and observations desired for model input."""
     source = self.data if data is None else data
     subset = default_subset_to_all(subset, source)
     kept = source.drop(self.reserved_cols, axis=1)[subset]
     # Replace pandas categoricals with their integer codes for the model.
     for feature in self.categorical_features:
         kept[feature] = kept[feature].cat.codes
     return kept
示例#6
0
 def compute_shap_values(self,
                         subset: Union[None, pd.core.series.Series] = None
                         ) -> dict:
     """Compute SHAP values by lead length, observation, and feature."""
     subset = default_subset_to_all(subset, self.data)
     feature_cols = self.categorical_features + self.numeric_features
     shap_subset = self.data[feature_cols][subset]
     shap_values = {}
     for lead, lead_specific_model in enumerate(self.model, start=1):
         # Models with no trees have nothing to explain; skip them.
         if lead_specific_model.num_trees() == 0:
             continue
         explainer = shap.TreeExplainer(lead_specific_model)
         shap_values[f"{lead}_lead"] = explainer.shap_values(shap_subset)[0]
     return shap_values
示例#7
0
    def compute_shap_values(
        self, subset: Union[None, pd.core.series.Series] = None
    ) -> dict:
        """Compute SHAP values by lead length, observation, and feature.

        SHAP values for networks with embedding layers are not supported as of
        9 Jun 2020, so this method currently only reports that limitation and
        returns None without computing any values.

        Args:
            subset: A Boolean Series that is True for observations for which
                the shap values will be computed. If None, default to all
                observations. Currently unused because computation is
                disabled.

        Returns:
            None. When support is restored, a dictionary of numpy arrays,
            each of which contains SHAP values for the outcome given by its
            key.
        """
        # NOTE(review): the previous DeepExplainer-based implementation
        # (marginal per-interval values plus an RMST sum layer) sat after this
        # unconditional return and was unreachable dead code; it has been
        # removed. Recover it from version control if shap gains support for
        # embedding layers.
        print(
            "SHAP values for networks with embedding layers are not "
            "supported as of 9 Jun 2020."
        )
        return None