示例#1
0
 def __init__(self,
              preprocessor=None,
              model=None,
              hyperparam_range=None,
              is_trained=False):
     """Initialize the ClassicalTextExplainer.

     :param preprocessor: Custom preprocessor for encoding text into vector
         form. Contains custom parser, vectorizer and tokenizer. Reference
         utils_classical.BOWEncoder for preprocessor's API template.
         A new ``BOWEncoder()`` is created when this is None.
     :type preprocessor: object
     :param model: Linear models with linear coefs mapped to features or
         tree based models with inbuilt feature_importances that are sklearn
         models or follow the sklearn API.
     :type model: sklearn.ensemble or sklearn.linear_model (natively
         supported)
     :param hyperparam_range: Custom hyper parameter range to search over
         when training the input model. Passed to sklearn's GridSearchCV
         as the param_grid argument.
     :type hyperparam_range: dict
     :param is_trained: Whether ``model`` is already trained. Only valid
         together with a custom ``model``.
     :type is_trained: bool
     :raises ValueError: If ``is_trained`` is set without a ``model``, or
         if an untrained custom ``model`` is given without a
         ``hyperparam_range``.
     """
     # Reject inconsistent argument combinations up front, before any
     # state is stored or a default preprocessor is constructed.
     if model is None and is_trained:
         raise ValueError(
             "Is_trained flag can't be set to true, if custom model not provided."
         )
     # A user-defined model that still has to be trained needs a search
     # space to train over.
     if model is not None and not is_trained and hyperparam_range is None:
         raise ValueError(
             "Custom model needs to be supplied with custom hyperparameter range to search over."
         )

     self.parsed_sentence = None
     self.word_importances = None
     self.model = model
     self.is_trained = is_trained
     self.hyperparam_range = hyperparam_range
     self.preprocessor = BOWEncoder() if preprocessor is None else preprocessor
示例#2
0
class ClassicalTextExplainer:
    """The ClassicalTextExplainer for returning explanations for n-gram
    bag-of-words models using sklearn's classifier API.

    Also serves as extensible wrapper with components built to support addition
    of fresh explainers using certain parts of this class.
    """

    def __init__(self,
                 preprocessor=None,
                 model=None,
                 hyperparam_range=None,
                 is_trained=False):
        """Initialize the ClassicalTextExplainer.

        :param preprocessor: Custom preprocessor for encoding text into vector
            form. Contains custom parser, vectorizer and tokenizer. Reference
            utils_classical.BOWEncoder for preprocessor's API template.
            A new ``BOWEncoder()`` is created when this is None.
        :type preprocessor: object
        :param model: Linear models with linear coefs mapped to features or
            tree based models with inbuilt feature_importances that are sklearn
            models or follow the sklearn API.
        :type model: sklearn.ensemble or sklearn.linear_model (natively
            supported)
        :param hyperparam_range: Custom hyper parameter range to search over
            when training the input model. Passed to sklearn's GridSearchCV
            as the param_grid argument.
        :type hyperparam_range: dict
        :param is_trained: Whether ``model`` is already trained. Only valid
            together with a custom ``model``.
        :type is_trained: bool
        :raises ValueError: If ``is_trained`` is set without a ``model``, or
            if an untrained custom ``model`` is given without a
            ``hyperparam_range``.
        """
        # Reject inconsistent argument combinations up front, before any
        # state is stored or a default preprocessor is constructed.
        if model is None and is_trained:
            raise ValueError(
                "Is_trained flag can't be set to true, if custom model not provided."
            )
        # A user-defined model that still has to be trained needs a search
        # space to train over.
        if model is not None and not is_trained and hyperparam_range is None:
            raise ValueError(
                "Custom model needs to be supplied with custom hyperparameter range to search over."
            )

        self.parsed_sentence = None
        self.word_importances = None
        self.model = model
        self.is_trained = is_trained
        self.hyperparam_range = hyperparam_range
        self.preprocessor = BOWEncoder() if preprocessor is None else preprocessor

    def _encode(self, X_str):
        """Encode text strings in X_str as vectors.

        :param X_str: Strings to be encoded.
        :type X_str: array_like (array of strings, ndarray, pandas dataframe)
        :return: The encoded features, i.e. the first element returned by
            the preprocessor's ``encode_features``.
        :rtype: array_like (ndarray, pandas dataframe). Same rows as X_str
        """
        # encode_features returns a pair; only the feature part is needed here.
        X_vec, _ = self.preprocessor.encode_features(X_str)
        return X_vec

    def train(self, *args, **kwargs):
        """Wrapper function for 'fit()'. If the user wants to entirely swap out
        'fit()' with a custom trainer, they can modify train instead.

        All positional and keyword arguments are forwarded to ``fit``.

        :return: List of length 2. The elements are:
            * An sklearn pipeline object containing trained encoder and
              trained model.
            * Dict containing the best hyperparameters.
        :rtype: list
        """
        return self.fit(*args, **kwargs)

    def fit(self, X_str, y_train):
        """Trains the model with training data and labels.

        Includes:
            * Encoding X_str into vector form.
              *Note*: y_train is assumed to be encoded into sklearn compatible
              format (use sklearn's label encoder for this purpose externally).
            * Training the model.
            * Grid search over parameter range.
            * Returns best model and corresponding hyper parameters.

        When the explainer was created with ``is_trained=True`` no training
        happens; the supplied model is used as is and its ``get_params()``
        are reported.

        :param X_str: Dataset of strings to train on.
        :type X_str: array_like (array of strings, ndarray, pandas dataframe)
        :param y_train: Labels in encoded vector form directly sent to
            model.fit().
        :type y_train: array_like
        :return: List of length 2. The elements are:
            * An sklearn pipeline object containing trained encoder and
              trained model.
            * Dict containing the best hyperparameters.
        :rtype: list
        """
        # The inputs are encoded even for a pre-trained model.
        X_train = self._encode(X_str)
        # Truthiness test (rather than ``is False``) so that falsy flags such
        # as None or numpy.bool_(False) are treated as "not trained", matching
        # the checks in __init__.
        if not self.is_trained:
            if self.model is None:
                self.model = LogisticRegression()
                # Hyperparameters were chosen through hyperparameter
                # optimization on MNLI
                self.hyperparam_range = [ExplainerParams.HYPERPARAM_RANGE]
            classifier_CV = GridSearchCV(self.model,
                                         self.hyperparam_range,
                                         cv=3,
                                         scoring="accuracy")
            classifier_CV.fit(X_train, y_train)
            # set model as the best estimator from grid search results
            self.model = classifier_CV.best_estimator_
            best_params = classifier_CV.best_params_
        else:
            best_params = self.model.get_params()

        text_model = Pipeline(steps=[("preprocessor", Encoder(self)),
                                     ("classifier", self.model)])
        return [text_model, best_params]

    def explain_local(self, X, y=None, name=None):
        """Returns an explanation object containing explanations over words
        in the input text string.

        :param X: String to be explained.
        :type X: str
        :param y: The predicted label for the sentence. When None, it is
            derived from the model's prediction via the preprocessor's
            label encoder.
        :type y: string
        :param name: a name for saving the explanation, currently ignored
        :type name: str
        :return: A model explanation object containing importances and
            metadata.
        :rtype: LocalExplanation
        :raises ValueError: If the model exposes neither ``coef_`` nor
            ``feature_importances_``.
        """
        X = _validate_X(X)

        # needs_fit=False: encode with the preprocessor as it currently is.
        encoded_text, _ = self.preprocessor.encode_features(X, needs_fit=False)
        encoded_label = self.model.predict(encoded_text)

        label_encoder = self.preprocessor.labelEncoder
        if y is None:
            y = label_encoder.inverse_transform(encoded_label)

        # convert from vector to scalar
        encoded_label = encoded_label[0]

        # Obtain the feature importances for the selected class label
        if hasattr(self.model, "coef_"):
            label_coefs_all = self.model.coef_
            # With two labels the coefficients are not stored per class, so
            # build a two-row matrix (negated row first) that can be indexed
            # by the encoded label just like the multi-class case.
            if len(label_encoder.classes_) == 2:
                label_coefs_all = np.vstack(
                    (-1 * label_coefs_all, label_coefs_all))
            encoded_imp = label_coefs_all[encoded_label, :]
        elif hasattr(self.model, "feature_importances_"):
            # Tree-style models: one global importance vector, not per label.
            encoded_imp = self.model.feature_importances_
        else:
            raise ValueError(
                "model is missing coef_ or feature_importances_ attribute")

        decoded_imp, parsed_sentence_list = self.preprocessor.decode_imp(
            encoded_imp, X)

        local_explanation = _create_local_explanation(
            classification=True,
            text_explanation=True,
            local_importance_values=np.array(decoded_imp),
            method=str(type(self.model)),
            model_task="classification",
            features=parsed_sentence_list,
            classes=label_encoder.classes_,
            predicted_label=y)
        return local_explanation